sync from github
Browse files
- src/display/about.py +1 -1
- src/display/utils.py +1 -3
- src/leaderboard/read_evals.py +6 -5
- src/populate.py +1 -1
src/display/about.py
CHANGED
|
@@ -10,9 +10,9 @@ The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to me
|
|
| 10 |
|
| 11 |
|
| 12 |
Tasks:
|
| 13 |
-
- **Generation Self-consistancy** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
|
| 14 |
- **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
|
| 15 |
- **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
|
|
|
|
| 16 |
|
| 17 |
Columns and Metrics:
|
| 18 |
- Method: The MOE LLMs inference framework.
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
Tasks:
|
|
|
|
| 13 |
- **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
|
| 14 |
- **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
|
| 15 |
+
- **AI Judgment Scores for Responses to Complex User Queries** -- [Arena_Hard](https://lmsys.org/blog/2024-04-19-arena-hard/)
|
| 16 |
|
| 17 |
Columns and Metrics:
|
| 18 |
- Method: The MOE LLMs inference framework.
|
src/display/utils.py
CHANGED
|
@@ -37,9 +37,7 @@ gpu_metrics_to_name_map = {
|
|
| 37 |
GPU_Mem: GPU_Mem,
|
| 38 |
"batch_size": BATCH_SIZE,
|
| 39 |
"precision": PRECISION,
|
| 40 |
-
GPU_Name: GPU_Name
|
| 41 |
-
MFU: MFU,
|
| 42 |
-
MBU: MBU
|
| 43 |
}
|
| 44 |
|
| 45 |
@dataclass
|
|
|
|
| 37 |
GPU_Mem: GPU_Mem,
|
| 38 |
"batch_size": BATCH_SIZE,
|
| 39 |
"precision": PRECISION,
|
| 40 |
+
GPU_Name: GPU_Name
|
|
|
|
|
|
|
| 41 |
}
|
| 42 |
|
| 43 |
@dataclass
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -65,11 +65,11 @@ class EvalResult:
|
|
| 65 |
if len(org_and_model) == 1:
|
| 66 |
org = None
|
| 67 |
model = org_and_model[0]
|
| 68 |
-
result_key = f"{model}_{precision.value.name}"
|
| 69 |
else:
|
| 70 |
org = org_and_model[0]
|
| 71 |
model = org_and_model[1]
|
| 72 |
-
result_key = f"{org}_{model}_{precision.value.name}"
|
| 73 |
full_model = "/".join(org_and_model)
|
| 74 |
|
| 75 |
still_on_hub, error, model_config = is_model_on_hub(
|
|
@@ -120,12 +120,13 @@ class EvalResult:
|
|
| 120 |
multiplier = 1.0
|
| 121 |
if "batch_" in metric or "Mem" in metric or "Util" in metric:
|
| 122 |
multiplier = 1
|
| 123 |
-
|
| 124 |
-
|
| 125 |
# print('RESULTS', data['results'])
|
| 126 |
# print('XXX', benchmark, metric, value, multiplier)
|
| 127 |
if value == "N/A":
|
| 128 |
-
results[benchmark][metric] =
|
|
|
|
|
|
|
| 129 |
else:
|
| 130 |
results[benchmark][metric] = value * multiplier
|
| 131 |
|
|
|
|
| 65 |
if len(org_and_model) == 1:
|
| 66 |
org = None
|
| 67 |
model = org_and_model[0]
|
| 68 |
+
result_key = f"{model}_{precision.value.name}_{inference_framework}"
|
| 69 |
else:
|
| 70 |
org = org_and_model[0]
|
| 71 |
model = org_and_model[1]
|
| 72 |
+
result_key = f"{org}_{model}_{precision.value.name}_{inference_framework}"
|
| 73 |
full_model = "/".join(org_and_model)
|
| 74 |
|
| 75 |
still_on_hub, error, model_config = is_model_on_hub(
|
|
|
|
| 120 |
multiplier = 1.0
|
| 121 |
if "batch_" in metric or "Mem" in metric or "Util" in metric:
|
| 122 |
multiplier = 1
|
| 123 |
+
|
|
|
|
| 124 |
# print('RESULTS', data['results'])
|
| 125 |
# print('XXX', benchmark, metric, value, multiplier)
|
| 126 |
if value == "N/A":
|
| 127 |
+
results[benchmark][metric] = "-"
|
| 128 |
+
elif value == "auto":
|
| 129 |
+
results[benchmark][metric] = "auto"
|
| 130 |
else:
|
| 131 |
results[benchmark][metric] = value * multiplier
|
| 132 |
|
src/populate.py
CHANGED
|
@@ -75,7 +75,7 @@ def get_leaderboard_df(
|
|
| 75 |
df[col] = np.nan
|
| 76 |
|
| 77 |
if not df.empty:
|
| 78 |
-
df = df.round(decimals=
|
| 79 |
|
| 80 |
# filter out if any of the benchmarks have not been produced
|
| 81 |
# df = df[has_no_nan_values(df, benchmark_cols)]
|
|
|
|
| 75 |
df[col] = np.nan
|
| 76 |
|
| 77 |
if not df.empty:
|
| 78 |
+
df = df.round(decimals=2)
|
| 79 |
|
| 80 |
# filter out if any of the benchmarks have not been produced
|
| 81 |
# df = df[has_no_nan_values(df, benchmark_cols)]
|