description update
- src/about.py +24 -7
- src/display/utils.py +1 -1
- src/leaderboard/read_evals.py +5 -4
src/about.py
CHANGED
@@ -36,12 +36,18 @@ class Tasks(Enum):
     task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
     task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
     task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
-    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
     task24 = Task("polish_poquad_open_book", "levenshtein,none", "poquad_open_book", "generate_until", 0.0)
     task25 = Task("polish_eq_bench_first_turn", "first_eqbench,none", "eq_bench_first_turn", "generate_until", 0.0)
     task26 = Task("polish_eq_bench", "average_eqbench,none", "eq_bench", "generate_until", 0.0)
+    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
     # task27 = Task("polish_eq_bench", "revised_eqbench,none", "eq_bench_revised", "other", 0.0)

+
+g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
+mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
+all_tasks = g_tasks + mc_tasks
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

@@ -59,7 +65,7 @@ TITLE = """
 """

 # What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
+INTRODUCTION_TEXT = f"""
 The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
 For now, models are tested without theirs templates.

@@ -67,7 +73,14 @@ Almost every task has two versions: regex and multiple choice.
 * _g suffix means that a model needs to generate an answer (only suitable for instructions-based models)
 * _mc suffix means that a model is scored against every possible class (suitable also for base models)

-Average columns are normalized against scores by "Baseline (majority class)".
+Average columns are normalized against scores by "Baseline (majority class)". Tasks taken into account while calculating averages:
+* Average: {', '.join(all_tasks)}
+* Avg g: {', '.join(g_tasks)}
+* Avg mc: {', '.join(mc_tasks)}
+* Acg RAG: {', '.join(rag_tasks)}
+
+* `,chat` suffix means that a model is tested using chat templates
+* `,chat,multiturn` suffix means that a model is tested using chat templates and fewshot examples are treated as a multi-turn conversation

 We gratefully acknowledge Polish high-performance computing infrastructure PLGrid (HPC Centers: ACK Cyfronet AGH) for providing computer facilities and support within computational grant no. PLG/2024/016951.
 """
@@ -85,7 +98,6 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
 * fix long model names
 * add inference time
 * add more tasks
-* use model templates
 * fix scrolling on Firefox

 ## Tasks
@@ -114,12 +126,15 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
 | polqa_open_book_g | ipipan/polqa | levenshtein | generate_until |
 | polqa_closed_book_g | ipipan/polqa | levenshtein | generate_until |
 | poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
+| polish_poquad_open_book | enelpol/poleval2018_task3_test_10k | levenshtein | generate_until |
+| polish_eq_bench_first_turn | speakleash/EQ-Bench-PL | eq_bench | generate_until |
+| polish_eq_bench | speakleash/EQ-Bench-PL | eq_bench | generate_until |

 ## Reproducibility
 To reproduce our results, you need to clone the repository:

 ```
-git clone https://github.com/speakleash/lm-evaluation-harness.git -b
+git clone https://github.com/speakleash/lm-evaluation-harness.git -b polish3
 cd lm-evaluation-harness
 pip install -e .
 ```
@@ -127,8 +142,10 @@ pip install -e .
 and run benchmark for 0-shot and 5-shot:

 ```
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate_few --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
 ```

 ## List of Polish models
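The new module-level lists in src/about.py are what keep the introduction text and the average columns in sync: they are built once by filtering the Tasks enum on task type, then interpolated into the f-string and imported elsewhere. Below is a minimal, self-contained sketch of that pattern; the Task fields are inferred from the attribute accesses visible in this diff (benchmark, metric, col_name, type, baseline) and the enum is truncated to three members, so it is an illustration rather than the repository's exact code.

```python
from dataclasses import dataclass
from enum import Enum

# Simplified stand-in for the Task container in src/about.py; field names match
# the attribute accesses seen in the diff (task.value.benchmark, .type, ...).
@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    type: str
    baseline: float = 0.0

class Tasks(Enum):
    task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677)
    task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0)
    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")

# Module-level lists, as added by this commit: filter the enum by task type.
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
all_tasks = g_tasks + mc_tasks

# The f-string introduction can now list exactly the tasks behind each average.
INTRODUCTION_TEXT = f"""
* Avg g: {', '.join(g_tasks)}
* Avg mc: {', '.join(mc_tasks)}
"""
print(INTRODUCTION_TEXT)
# * Avg g: polish_polqa_open_book
# * Avg mc: polish_polqa_reranking_multiple_choice
```
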
src/display/utils.py
CHANGED
@@ -34,11 +34,11 @@ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average
 auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
 auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
 auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
+auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])

 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])

 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
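The one-line move above matters because, in the stock leaderboard template this Space appears to follow, the order of appends to auto_eval_column_dict is the order in which columns are materialized (the list is collapsed into a frozen dataclass), so "Avg RAG" now lands next to the other average columns instead of after all per-task columns. The sketch below illustrates that mechanism under the assumption that utils.py follows the template; the simplified ColumnContent here is hypothetical.

```python
from dataclasses import dataclass, fields, make_dataclass

# Hypothetical, simplified ColumnContent; the real one in src/display/utils.py
# likely carries more flags (hidden, never_hidden, ...).
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool = True

auto_eval_column_dict = []
auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
# Moved up by this commit: appended before the per-task columns...
auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])
# ...so every task column now comes after the average columns.
auto_eval_column_dict.append(["polqa_open_book", ColumnContent, ColumnContent("polqa_open_book_g", "number", True)])

# In the template, the list becomes a frozen dataclass; field order (and hence
# the displayed column order) follows append order.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print([f.name for f in fields(AutoEvalColumn)])
# ['average_mc', 'average_rag', 'polqa_open_book']
```
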
src/leaderboard/read_evals.py
CHANGED
@@ -8,6 +8,7 @@ from dataclasses import dataclass
 import dateutil
 import numpy as np

+from src.about import all_tasks, g_tasks, mc_tasks, rag_tasks
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, NShotType
 from src.submission.check_validity import is_model_on_hub
@@ -183,10 +184,10 @@ class EvalResult:

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
-        mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
-        rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
-        all_tasks = g_tasks + mc_tasks
+        # g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
+        # mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+        # rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
+        # all_tasks = g_tasks + mc_tasks
        all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]

         baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
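With the task lists now imported from src.about, to_dict() and the introduction text can no longer drift apart. The hunk also shows the baselines dict the averages are normalized against; the averaging code itself sits outside this diff, so the helper below is only a hypothetical illustration of how the imported lists and the baselines could be combined. The function name, formula, and usage lines are assumptions, not the repository's code.

```python
import numpy as np

def normalized_average(scores: dict[str, float], tasks: list[str], baselines: dict[str, float]) -> float:
    """Hypothetical: rescale each task score so that the majority-class baseline
    maps to 0 and a perfect score to 100, then average over the selected tasks."""
    rescaled = []
    for task in tasks:
        score = scores.get(task)
        if score is None:
            continue  # task missing from this run's results
        baseline = baselines.get(task) or 0.0
        rescaled.append(100.0 * (score - baseline) / (100.0 - baseline))
    return float(np.mean(rescaled)) if rescaled else 0.0

# Sketch of usage with the lists imported from src.about and the baselines dict
# built in to_dict() ({task.value.benchmark: task.value.baseline*100 ...}):
# data_dict[AutoEvalColumn.average_g.name] = normalized_average(task_scores, g_tasks, baselines)
# data_dict[AutoEvalColumn.average_rag.name] = normalized_average(task_scores, rag_tasks, baselines)
```
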