update results and display
- app.py +6 -2
- src/display/about.py +3 -1
- src/leaderboard/load_results.py +27 -6
app.py
CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import pandas as pd
 import os
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, login
 from apscheduler.schedulers.background import BackgroundScheduler
 
 from src.display.about import (
@@ -20,6 +20,7 @@ from src.leaderboard.load_results import load_data
 
 # clone / pull the lmeh eval data
 TOKEN = os.environ.get("TOKEN", None)
+login(token=TOKEN)
 RESULTS_REPO = f"SeaLLMs/SeaExam-results"
 CACHE_PATH=os.getenv("HF_HOME", ".")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
@@ -33,7 +34,7 @@ def restart_space():
     API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
 
 # Load the data from the csv file
-csv_path = f'{EVAL_RESULTS_PATH}/
+csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240425.csv'
 df_m3exam, df_mmlu, df_avg = load_data(csv_path)
 
 # Searching and filtering
@@ -112,6 +113,7 @@ with demo:
             # datatype=TYPES,
             elem_id="leaderboard-table",
             interactive=False,
+            datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
             visible=True,
             # column_widths=["20%", "6%", "8%", "6%", "8%", "8%", "6%", "6%", "6%", "6%", "6%"],
         )
@@ -149,6 +151,7 @@ with demo:
             value=df_m3exam,
             interactive=False,
             visible=True,
+            datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
         )
 
         hidden_leaderboard_table_for_search = gr.components.Dataframe(
@@ -184,6 +187,7 @@ with demo:
             value=df_mmlu,
             interactive=False,
             visible=True,
+            datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
         )
 
         hidden_leaderboard_table_for_search = gr.components.Dataframe(
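The three datatype lists added above line up one entry per table column; the third entry, 'markdown', is what lets the Model column render HTML links instead of escaped text. A minimal runnable sketch of that behavior (the toy table and column set here are invented for illustration, not the Space's actual layout):

import gradio as gr
import pandas as pd

# Toy leaderboard slice: 'Model' holds an HTML link like the ones
# make_clickable_model() produces in src/leaderboard/load_results.py.
df = pd.DataFrame({
    "R": [1],
    "type": ["base"],
    "Model": ['<a target="_blank" href="https://huggingface.co/org/model">org/model</a>'],
    "avg_sea": [55.1],
})

with gr.Blocks() as demo:
    gr.components.Dataframe(
        value=df,
        # One datatype per column; 'markdown' renders the link,
        # 'str' would display the raw HTML source instead.
        datatype=["number", "str", "markdown", "number"],
        interactive=False,
        visible=True,
    )

if __name__ == "__main__":
    demo.launch()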
src/display/about.py
CHANGED
@@ -53,7 +53,9 @@ How to interpret the leaderboard?
 * Each numerical value represet the accuracy (%).
 * The "M3Exam" and "MMLU" pages show the performance of each model for that dataset.
 * The "🏅 Overall" shows the average results of "M3Exam" and "MMLU".
-* The leaderboard is
+* The leaderboard is ranked by avg_sea, the average score across SEA languages (id, th, and vi).
+* The rank is in "R" column.
+* The "params(B)" column shows the number of parameters of the model in billions.
 
 ## Reproducibility
 To reproduce our results, use the script in [this repo](https://github.com/DAMO-NLP-SG/SeaExam/tree/main). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
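The new bullets describe avg_sea as the average of the three SEA-language scores. The diff itself does not show that computation, but under that reading it is just a row-wise mean over the id, th, and vi columns, for example:

import pandas as pd

# Hypothetical per-language accuracies (%); column names match load_results.py.
df = pd.DataFrame({"id": [60.0, 50.0], "th": [55.0, 45.0], "vi": [65.0, 40.0]})
df["avg_sea"] = df[["id", "th", "vi"]].mean(axis=1).round(2)
print(df["avg_sea"].tolist())  # [60.0, 45.0]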
src/leaderboard/load_results.py
CHANGED
@@ -4,16 +4,19 @@ from huggingface_hub import HfApi
 api = HfApi()
 
 def get_model_size(model_name, precision: str = "BF16", revision: str = "main"):
+    if len(model_name.split("/")) == 1:
+        return None
+
     model_info = api.model_info(repo_id=model_name, revision=revision)
     # model_size = get_model_size(model_info=model_info, precision=precision)
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
-        model_size = round(model_info.safetensors["total"] / 1e9,
+        model_size = round(model_info.safetensors["total"] / 1e9, 1)
     except (AttributeError, TypeError):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3,
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 1)
         except AttributeError:
             return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
 
@@ -21,11 +24,19 @@ def get_model_size(model_name, precision: str = "BF16", revision: str = "main"):
     model_size = size_factor * model_size
     return model_size
 
+def make_clickable_model(model_name, link=None):
+    if len(model_name.split("/")) == 2:
+        link = "https://huggingface.co/" + model_name
+        return (
+            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
+        )
+    return model_name
+
 def load_data(data_path):
     df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
 
     columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
-    columns_sorted = ['
+    columns_sorted = ['R','type', 'Model', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
 
     # Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
     df_m3exam = df.iloc[:, :11] # M3Exam columns
@@ -40,9 +51,9 @@ def load_data(data_path):
     df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']] = df_tmp[['en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']].round(2)
 
     # rank the DataFrames by the 'avg_sea' column
-    df_m3exam['
-    df_mmlu['
-    df_avg['
+    df_m3exam['R'] = df_m3exam['avg_sea'].rank(ascending=False).astype(int)
+    df_mmlu['R'] = df_mmlu['avg_sea'].rank(ascending=False).astype(int)
+    df_avg['R'] = df_avg['avg_sea'].rank(ascending=False).astype(int)
 
     # reorder the columns
     df_m3exam = df_m3exam[columns_sorted]
@@ -64,6 +75,16 @@ def load_data(data_path):
     df_mmlu['type'] = df_mmlu['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
     df_avg['type'] = df_avg['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
 
+    # get the parameters of the models
+    df_m3exam['params(B)'] = df_m3exam['Model'].apply(get_model_size)
+    df_mmlu['params(B)'] = df_mmlu['Model'].apply(get_model_size)
+    df_avg['params(B)'] = df_avg['Model'].apply(get_model_size)
+
+    # make the 'Model' column clickable
+    df_m3exam['Model'] = df_m3exam['Model'].apply(make_clickable_model)
+    df_mmlu['Model'] = df_mmlu['Model'].apply(make_clickable_model)
+    df_avg['Model'] = df_avg['Model'].apply(make_clickable_model)
+
     return df_m3exam, df_mmlu, df_avg
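Taken together, load_data() now ranks each table by avg_sea and links model names to the Hub. A self-contained sketch of those two new steps, using made-up model names and scores (the real pipeline reads these from the results CSV):

import pandas as pd

def make_clickable_model(model_name, link=None):
    # Same rule as the commit: only "org/name" ids become Hub links.
    if len(model_name.split("/")) == 2:
        link = "https://huggingface.co/" + model_name
        return f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
    return model_name

df = pd.DataFrame({
    "Model": ["org-a/model-x", "GPT-4", "org-b/model-y"],  # hypothetical entries
    "avg_sea": [52.3, 61.8, 47.0],
})

# 'R' gives rank 1 to the highest avg_sea, as in the df_*['R'] lines above.
df["R"] = df["avg_sea"].rank(ascending=False).astype(int)
df["Model"] = df["Model"].apply(make_clickable_model)

print(df.sort_values("R")[["R", "Model", "avg_sea"]])
# "GPT-4" has no "/" and stays plain text (the new guard in get_model_size()
# likewise returns None for it); the org/... names become <a> links, which the
# Gradio tables render because their 'Model' column datatype is now 'markdown'.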