update results and display
- app.py +6 -2
- src/display/about.py +3 -1
- src/leaderboard/load_results.py +27 -6
app.py
CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import pandas as pd
 import os
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, login
 from apscheduler.schedulers.background import BackgroundScheduler
 
 from src.display.about import (
@@ -20,6 +20,7 @@ from src.leaderboard.load_results import load_data
 
 # clone / pull the lmeh eval data
 TOKEN = os.environ.get("TOKEN", None)
+login(token=TOKEN)
 RESULTS_REPO = f"SeaLLMs/SeaExam-results"
 CACHE_PATH=os.getenv("HF_HOME", ".")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
@@ -33,7 +34,7 @@ def restart_space():
     API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
 
 # Load the data from the csv file
-csv_path = f'{EVAL_RESULTS_PATH}/
+csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240425.csv'
 df_m3exam, df_mmlu, df_avg = load_data(csv_path)
 
 # Searching and filtering
@@ -112,6 +113,7 @@ with demo:
             # datatype=TYPES,
             elem_id="leaderboard-table",
             interactive=False,
+            datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
             visible=True,
             # column_widths=["20%", "6%", "8%", "6%", "8%", "8%", "6%", "6%", "6%", "6%", "6%"],
         )
@@ -149,6 +151,7 @@ with demo:
             value=df_m3exam,
             interactive=False,
             visible=True,
+            datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
         )
 
         hidden_leaderboard_table_for_search = gr.components.Dataframe(
@@ -184,6 +187,7 @@ with demo:
             value=df_mmlu,
             interactive=False,
             visible=True,
+            datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
         )
 
         hidden_leaderboard_table_for_search = gr.components.Dataframe(
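The three datatype lists added above line up one entry per table column; the third entry, 'markdown', is what lets the Model column render HTML links instead of escaped text. A minimal runnable sketch of that behavior (the toy table and column set here are invented for illustration, not the Space's actual layout):

import gradio as gr
import pandas as pd

# Toy leaderboard slice: 'Model' holds an HTML link like the ones
# make_clickable_model() produces in src/leaderboard/load_results.py.
df = pd.DataFrame({
    "R": [1],
    "type": ["base"],
    "Model": ['<a target="_blank" href="https://huggingface.co/org/model">org/model</a>'],
    "avg_sea": [55.1],
})

with gr.Blocks() as demo:
    gr.components.Dataframe(
        value=df,
        # One datatype per column; 'markdown' renders the link,
        # 'str' would display the raw HTML source instead.
        datatype=["number", "str", "markdown", "number"],
        interactive=False,
        visible=True,
    )

if __name__ == "__main__":
    demo.launch()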
src/display/about.py
CHANGED
@@ -53,7 +53,9 @@ How to interpret the leaderboard?
 * Each numerical value represet the accuracy (%).
 * The "M3Exam" and "MMLU" pages show the performance of each model for that dataset.
 * The "🏅 Overall" shows the average results of "M3Exam" and "MMLU".
-* The leaderboard is
+* The leaderboard is ranked by avg_sea, the average score across SEA languages (id, th, and vi).
+* The rank is in "R" column.
+* The "params(B)" column shows the number of parameters of the model in billions.
 
 ## Reproducibility
 To reproduce our results, use the script in [this repo](https://github.com/DAMO-NLP-SG/SeaExam/tree/main). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
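The new bullets describe avg_sea as the average of the three SEA-language scores. The diff itself does not show that computation, but under that reading it is just a row-wise mean over the id, th, and vi columns, for example:

import pandas as pd

# Hypothetical per-language accuracies (%); column names match load_results.py.
df = pd.DataFrame({"id": [60.0, 50.0], "th": [55.0, 45.0], "vi": [65.0, 40.0]})
df["avg_sea"] = df[["id", "th", "vi"]].mean(axis=1).round(2)
print(df["avg_sea"].tolist())  # [60.0, 45.0]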
src/leaderboard/load_results.py
CHANGED
@@ -4,16 +4,19 @@ from huggingface_hub import HfApi
 api = HfApi()
 
 def get_model_size(model_name, precision: str = "BF16", revision: str = "main"):
+    if len(model_name.split("/")) == 1:
+        return None
+
     model_info = api.model_info(repo_id=model_name, revision=revision)
     # model_size = get_model_size(model_info=model_info, precision=precision)
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
-        model_size = round(model_info.safetensors["total"] / 1e9,
+        model_size = round(model_info.safetensors["total"] / 1e9, 1)
     except (AttributeError, TypeError):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3,
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 1)
         except AttributeError:
             return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
@@ -21,11 +24,19 @@ def get_model_size(model_name, precision: str = "BF16", revision: str = "main"):
     model_size = size_factor * model_size
     return model_size
 
+def make_clickable_model(model_name, link=None):
+    if len(model_name.split("/")) == 2:
+        link = "https://huggingface.co/" + model_name
+        return (
+            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
+        )
+    return model_name
+
 def load_data(data_path):
     df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
 
     columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
-    columns_sorted = ['
+    columns_sorted = ['R','type', 'Model', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
 
     # Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
     df_m3exam = df.iloc[:, :11] # M3Exam columns
@@ -40,9 +51,9 @@ def load_data(data_path):
     df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']] = df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']].round(2)
 
     # rank the DataFrames by the 'avg_sea' column
-    df_m3exam['
-    df_mmlu['
-    df_avg['
+    df_m3exam['R'] = df_m3exam['avg_sea'].rank(ascending=False).astype(int)
+    df_mmlu['R'] = df_mmlu['avg_sea'].rank(ascending=False).astype(int)
+    df_avg['R'] = df_avg['avg_sea'].rank(ascending=False).astype(int)
 
     # reorder the columns
     df_m3exam = df_m3exam[columns_sorted]
@@ -64,6 +75,16 @@ def load_data(data_path):
     df_mmlu['type'] = df_mmlu['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
     df_avg['type'] = df_avg['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
 
+    # get the parameters of the models
+    df_m3exam['params(B)'] = df_m3exam['Model'].apply(get_model_size)
+    df_mmlu['params(B)'] = df_mmlu['Model'].apply(get_model_size)
+    df_avg['params(B)'] = df_avg['Model'].apply(get_model_size)
+
+    # make the 'Model' column clickable
+    df_m3exam['Model'] = df_m3exam['Model'].apply(make_clickable_model)
+    df_mmlu['Model'] = df_mmlu['Model'].apply(make_clickable_model)
+    df_avg['Model'] = df_avg['Model'].apply(make_clickable_model)
+
     return df_m3exam, df_mmlu, df_avg
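Taken together, load_data() now ranks each table by avg_sea and links model names to the Hub. A self-contained sketch of those two new steps, using made-up model names and scores (the real pipeline reads these from the results CSV):

import pandas as pd

def make_clickable_model(model_name, link=None):
    # Same rule as the commit: only "org/name" ids become Hub links.
    if len(model_name.split("/")) == 2:
        link = "https://huggingface.co/" + model_name
        return f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
    return model_name

df = pd.DataFrame({
    "Model": ["org-a/model-x", "GPT-4", "org-b/model-y"],  # hypothetical entries
    "avg_sea": [52.3, 61.8, 47.0],
})

# 'R' gives rank 1 to the highest avg_sea, as in the df_*['R'] lines above.
df["R"] = df["avg_sea"].rank(ascending=False).astype(int)
df["Model"] = df["Model"].apply(make_clickable_model)

print(df.sort_values("R")[["R", "Model", "avg_sea"]])
# "GPT-4" has no "/" and stays plain text (the new guard in get_model_size()
# likewise returns None for it); the org/... names become <a> links, which the
# Gradio tables render because their 'Model' column datatype is now 'markdown'.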