description update
- src/about.py +24 -7
- src/display/utils.py +1 -1
- src/leaderboard/read_evals.py +5 -4
src/about.py
CHANGED
@@ -36,12 +36,18 @@ class Tasks(Enum):
     task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
     task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
     task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
-    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
     task24 = Task("polish_poquad_open_book", "levenshtein,none", "poquad_open_book", "generate_until", 0.0)
     task25 = Task("polish_eq_bench_first_turn", "first_eqbench,none", "eq_bench_first_turn", "generate_until", 0.0)
     task26 = Task("polish_eq_bench", "average_eqbench,none", "eq_bench", "generate_until", 0.0)
+    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
     # task27 = Task("polish_eq_bench", "revised_eqbench,none", "eq_bench_revised", "other", 0.0)

+
+g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
+mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
+all_tasks = g_tasks + mc_tasks
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

@@ -59,7 +65,7 @@ TITLE = """
 """

 # What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
+INTRODUCTION_TEXT = f"""
 The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
 For now, models are tested without theirs templates.

@@ -67,7 +73,14 @@ Almost every task has two versions: regex and multiple choice.
 * _g suffix means that a model needs to generate an answer (only suitable for instructions-based models)
 * _mc suffix means that a model is scored against every possible class (suitable also for base models)

-Average columns are normalized against scores by "Baseline (majority class)".
+Average columns are normalized against scores by "Baseline (majority class)". Tasks taken into account while calculating averages:
+* Average: {', '.join(all_tasks)}
+* Avg g: {', '.join(g_tasks)}
+* Avg mc: {', '.join(mc_tasks)}
+* Acg RAG: {', '.join(rag_tasks)}
+
+* `,chat` suffix means that a model is tested using chat templates
+* `,chat,multiturn` suffix means that a model is tested using chat templates and fewshot examples are treated as a multi-turn conversation

 We gratefully acknowledge Polish high-performance computing infrastructure PLGrid (HPC Centers: ACK Cyfronet AGH) for providing computer facilities and support within computational grant no. PLG/2024/016951.
 """
@@ -85,7 +98,6 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
 * fix long model names
 * add inference time
 * add more tasks
-* use model templates
 * fix scrolling on Firefox

 ## Tasks
@@ -114,12 +126,15 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
 | polqa_open_book_g | ipipan/polqa | levenshtein | generate_until |
 | polqa_closed_book_g | ipipan/polqa | levenshtein | generate_until |
 | poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
+| polish_poquad_open_book | enelpol/poleval2018_task3_test_10k | levenshtein | generate_until |
+| polish_eq_bench_first_turn | speakleash/EQ-Bench-PL | eq_bench | generate_until |
+| polish_eq_bench | speakleash/EQ-Bench-PL | eq_bench | generate_until |

 ## Reproducibility
 To reproduce our results, you need to clone the repository:

 ```
-git clone https://github.com/speakleash/lm-evaluation-harness.git -b
+git clone https://github.com/speakleash/lm-evaluation-harness.git -b polish3
 cd lm-evaluation-harness
 pip install -e .
 ```
@@ -127,8 +142,10 @@ pip install -e .
 and run benchmark for 0-shot and 5-shot:

 ```
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate_few --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
 ```

 ## List of Polish models
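The new module-level lists in src/about.py are what keep the introduction text and the average columns in sync: they are built once by filtering the Tasks enum on task type, then interpolated into the f-string and imported elsewhere. Below is a minimal, self-contained sketch of that pattern; the Task fields are inferred from the attribute accesses visible in this diff (benchmark, metric, col_name, type, baseline) and the enum is truncated to three members, so it is an illustration rather than the repository's exact code.

```python
from dataclasses import dataclass
from enum import Enum

# Simplified stand-in for the Task container in src/about.py; field names match
# the attribute accesses seen in the diff (task.value.benchmark, .type, ...).
@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    type: str
    baseline: float = 0.0

class Tasks(Enum):
    task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677)
    task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0)
    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")

# Module-level lists, as added by this commit: filter the enum by task type.
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
all_tasks = g_tasks + mc_tasks

# The f-string introduction can now list exactly the tasks behind each average.
INTRODUCTION_TEXT = f"""
* Avg g: {', '.join(g_tasks)}
* Avg mc: {', '.join(mc_tasks)}
"""
print(INTRODUCTION_TEXT)
# * Avg g: polish_polqa_open_book
# * Avg mc: polish_polqa_reranking_multiple_choice
```
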
src/display/utils.py
CHANGED
@@ -34,11 +34,11 @@ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average
 auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
 auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
 auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
+auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])

 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])

 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
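The one-line move above matters because, in the stock leaderboard template this Space appears to follow, the order of appends to auto_eval_column_dict is the order in which columns are materialized (the list is collapsed into a frozen dataclass), so "Avg RAG" now lands next to the other average columns instead of after all per-task columns. The sketch below illustrates that mechanism under the assumption that utils.py follows the template; the simplified ColumnContent here is hypothetical.

```python
from dataclasses import dataclass, fields, make_dataclass

# Hypothetical, simplified ColumnContent; the real one in src/display/utils.py
# likely carries more flags (hidden, never_hidden, ...).
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool = True

auto_eval_column_dict = []
auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
# Moved up by this commit: appended before the per-task columns...
auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])
# ...so every task column now comes after the average columns.
auto_eval_column_dict.append(["polqa_open_book", ColumnContent, ColumnContent("polqa_open_book_g", "number", True)])

# In the template, the list becomes a frozen dataclass; field order (and hence
# the displayed column order) follows append order.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print([f.name for f in fields(AutoEvalColumn)])
# ['average_mc', 'average_rag', 'polqa_open_book']
```
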
src/leaderboard/read_evals.py
CHANGED
@@ -8,6 +8,7 @@ from dataclasses import dataclass
 import dateutil
 import numpy as np

+from src.about import all_tasks, g_tasks, mc_tasks, rag_tasks
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, NShotType
 from src.submission.check_validity import is_model_on_hub
@@ -183,10 +184,10 @@ class EvalResult:

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
-        mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
-        rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
-        all_tasks = g_tasks + mc_tasks
+        # g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
+        # mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+        # rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
+        # all_tasks = g_tasks + mc_tasks
        all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]

         baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
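With the task lists now imported from src.about, to_dict() and the introduction text can no longer drift apart. The hunk also shows the baselines dict the averages are normalized against; the averaging code itself sits outside this diff, so the helper below is only a hypothetical illustration of how the imported lists and the baselines could be combined. The function name, formula, and usage lines are assumptions, not the repository's code.

```python
import numpy as np

def normalized_average(scores: dict[str, float], tasks: list[str], baselines: dict[str, float]) -> float:
    """Hypothetical: rescale each task score so that the majority-class baseline
    maps to 0 and a perfect score to 100, then average over the selected tasks."""
    rescaled = []
    for task in tasks:
        score = scores.get(task)
        if score is None:
            continue  # task missing from this run's results
        baseline = baselines.get(task) or 0.0
        rescaled.append(100.0 * (score - baseline) / (100.0 - baseline))
    return float(np.mean(rescaled)) if rescaled else 0.0

# Sketch of usage with the lists imported from src.about and the baselines dict
# built in to_dict() ({task.value.benchmark: task.value.baseline*100 ...}):
# data_dict[AutoEvalColumn.average_g.name] = normalized_average(task_scores, g_tasks, baselines)
# data_dict[AutoEvalColumn.average_rag.name] = normalized_average(task_scores, rag_tasks, baselines)
```
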