add 6 new tasks
Sean Cho committed · Commit 3967c9e · 1 parent: af4234d
- src/display/about.py  +9 -3
- src/display/utils.py  +7 -0
- src/leaderboard/read_evals.py  +2 -15
- src/tools/plots.py  +1 -6
src/display/about.py
CHANGED
@@ -33,7 +33,7 @@ Please provide information about the model through an issue! 🤩

 📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.

-We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by
+We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by __HuggingFace [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)__. We have also added a new dataset prepared from scratch.
 - Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
 - Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
 - Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
@@ -41,8 +41,14 @@ We have set up a benchmark using datasets translated into Korean, and applied va
 - Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
 - Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
 - Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
-
-
+- Ko-EQ Bench (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- Ko-InstFollow (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
+- KorNAT-CKA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+- KorNAT-SVA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+- Ko-Harmlessness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+- Ko-Helpfulness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+
+To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, truthfulness and common sense. The final score is the average of the scores from each evaluation dataset.

 GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.

src/display/utils.py
CHANGED
@@ -21,6 +21,13 @@ class Tasks(Enum):
     winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
     gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
     commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
+    eqBench = Task("ko_eq_bench", "acc_norm", "Ko-EQ Bench")
+    instFollow = Task("ko_inst_follow", "acc_norm", "Ko-InstFollow")
+    korNatCka = Task("kor_nat_cka", "acc_norm", "KorNAT-CKA")
+    korNatSva = Task("kor_nat_sva", "acc_norm", "KorNAT-SVA")
+    harmlessness = Task("ko_harmlessness", "acc_norm", "Ko-Harmlessness")
+    helpfulness = Task("ko_helpfulness", "acc_norm", "Ko-Helpfulness")
+

 # These classes are for user facing column names,
 # to avoid having to change them all around the code
src/leaderboard/read_evals.py
CHANGED
@@ -103,11 +103,6 @@ class EvalResult:
                 results[task.benchmark] = 0.0
                 continue

-            # Two new tasks have been added, we need to skip them for now
-            if task.benchmark == "ko_winogrande" or task.benchmark == "ko_gsm8k":
-                results[task.benchmark] = 0.0
-                continue
-
             # We average all scores of a given metric (mostly for mmlu)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
@@ -148,16 +143,8 @@

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-
-
-        # TODO: safely remove this code when the task results are added
-        skip_avg_len = 0
-        if self.results['ko_winogrande'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_gsm8k'] == 0.0:
-            skip_avg_len += 1
-
-        average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
+
+        average = sum([v for v in self.results.values() if v is not None]) / sum([1 for v in self.results.values() if v is not None])
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
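The net effect of the `to_dict` change: the average is now taken only over tasks that actually produced a score, so newly added benchmarks with no results drop out of the denominator instead of being special-cased by name. A minimal sketch of that rule with toy numbers (not the repo's exact code; the empty-list guard is an added assumption, since the committed expression would divide by zero if every result were None):

```python
# Results as parsed per task; None marks a benchmark with no score yet.
results = {"ko_hellaswag": 0.62, "ko_mmlu": 0.41, "ko_eq_bench": None}

scored = [v for v in results.values() if v is not None]
average = sum(scored) / len(scored) if scored else 0.0  # guard added for the all-None case
print(round(average, 3))  # 0.515
```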
src/tools/plots.py
CHANGED
@@ -36,12 +36,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:

             current_date = row["date"]
             if task.benchmark == "Average":
-
-                if row["results"]["ko_winogrande"] == 0.0:
-                    avg_skip_len += 1
-                if row["results"]["ko_gsm8k"] == 0.0:
-                    avg_skip_len += 1
-                current_score = np.sum(list(row["results"].values())) / (len(row["results"]) - avg_skip_len)
+                current_score = np.mean(list(row["results"].values()))
             else:
                 current_score = row["results"][task.benchmark]

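With the skip bookkeeping gone, the "Average" series reduces to a plain mean over the per-task scores in each row. A quick illustrative check with toy numbers (not leaderboard data) that the old expression and the new `np.mean` call agree once nothing is skipped:

```python
import numpy as np

results = {"ko_arc": 0.44, "ko_hellaswag": 0.62, "ko_mmlu": 0.41}

# Old form, with avg_skip_len == 0 (no zero-filled tasks to exclude)
old_score = np.sum(list(results.values())) / (len(results) - 0)
# New form from the diff above
new_score = np.mean(list(results.values()))

assert np.isclose(old_score, new_score)
print(round(float(new_score), 2))  # 0.49
```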