revert logic

Sean Cho committed · Commit bd9a9ad · 1 Parent(s): 6f030e8

Files changed:
- src/leaderboard/read_evals.py  +29 -1
- src/populate.py                +0 -1
- src/tools/plots.py             +19 -1
src/leaderboard/read_evals.py
CHANGED
@@ -103,6 +103,13 @@ class EvalResult:
                 results[task.benchmark] = 0.0
                 continue
 
+            # New tasks have been added, we need to skip them if not exists
+            if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
+                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    results[task.benchmark] = 0.0
+                    continue
+
             # We average all scores of a given metric (mostly for mmlu)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):

@@ -144,7 +151,28 @@ class EvalResult:
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
 
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # Skip the two new tasks for now
+        # TODO: safely remove this code when the task results are all added
+        skip_avg_len = 0
+        if self.results['ko_winogrande'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_gsm8k'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_eq_bench'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_inst_follow'] == 0.0:
+            skip_avg_len += 1
+        if self.results['kor_nat_cka'] == 0.0:
+            skip_avg_len += 1
+        if self.results['kor_nat_sva'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_harmlessness'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_helpfulness'] == 0.0:
+            skip_avg_len += 1
+
+        average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
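For readers skimming the diff: the restored skip_avg_len block simply drops new benchmarks that have no score yet (stored as the 0.0 placeholder) from the averaging denominator. A minimal standalone sketch of that calculation, assuming a results dict keyed by benchmark name like self.results; the function name, the NEW_TASKS constant, and the example scores are illustrative, not part of the repository:

NEW_TASKS = [
    "ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow",
    "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness",
]

def average_with_skips(results, num_tasks):
    # Count new tasks whose score is still the 0.0 placeholder and
    # exclude them from the denominator, as skip_avg_len does above.
    skipped = sum(1 for t in NEW_TASKS if results.get(t) == 0.0)
    return sum(v for v in results.values() if v is not None) / (num_tasks - skipped)

# Two of four tasks still unscored -> average over the remaining two.
scores = {"task_a": 60.0, "task_b": 50.0, "ko_winogrande": 0.0, "ko_gsm8k": 0.0}
print(average_with_skips(scores, num_tasks=4))  # (60 + 50) / 2 = 55.0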
src/populate.py
CHANGED
@@ -16,7 +16,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     filter_models(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
-    print(df.to_string())
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
src/tools/plots.py
CHANGED
@@ -36,7 +36,25 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
 
             current_date = row["date"]
             if task.benchmark == "Average":
-                current_score = np.sum(list(row["results"].values())) / len(row["results"])
+                avg_skip_len = 0
+                if row["results"]["ko_winogrande"] == 0.0:
+                    avg_skip_len += 1
+                if row["results"]["ko_gsm8k"] == 0.0:
+                    avg_skip_len += 1
+                if row["results"]["ko_eq_bench"] == 0.0:
+                    avg_skip_len += 1
+                if row["results"]["ko_inst_follow"] == 0.0:
+                    avg_skip_len += 1
+                if row["results"]["kor_nat_cka"] == 0.0:
+                    avg_skip_len += 1
+                if row["results"]["kor_nat_sva"] == 0.0:
+                    avg_skip_len += 1
+                if row["results"]["ko_harmlessness"] == 0.0:
+                    avg_skip_len += 1
+                if row["results"]["ko_helpfulness"] == 0.0:
+                    avg_skip_len += 1
+
+                current_score = np.sum(list(row["results"].values())) / (len(row["results"]) - avg_skip_len)
             else:
                 current_score = row["results"][task.benchmark]
 
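The plotting code applies the same adjustment to the "Average" series. A hedged equivalent of the restored block, written as a loop over the new benchmark keys instead of eight explicit if-statements (it uses .get so missing keys are tolerated, unlike the direct indexing in the diff; the row_results example dict is made up):

import numpy as np

NEW_TASKS = [
    "ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow",
    "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness",
]

# Same idea as avg_skip_len: unscored (0.0) new tasks are removed from
# the denominator of the plotted average.
row_results = {"task_a": 62.0, "task_b": 48.0, "ko_gsm8k": 0.0}
avg_skip_len = sum(1 for t in NEW_TASKS if row_results.get(t) == 0.0)
current_score = np.sum(list(row_results.values())) / (len(row_results) - avg_skip_len)
print(current_score)  # (62 + 48) / (3 - 1) = 55.0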