Spaces:
Running
Running
formatting
Browse files- .gitignore +2 -5
- app.py +31 -34
- data_utils.py +2 -5
- model_info.json +1 -0
- update_data.sh +4 -40
- utils_display.py +6 -1
.gitignore
CHANGED
|
@@ -1,6 +1,3 @@
|
|
| 1 |
|
| 2 |
-
*.pyc
|
| 3 |
-
|
| 4 |
-
ZeroEval-main/result_dirs/.DS_Store
|
| 5 |
-
ZeroEval-main/result_dirs/zebra-grid/.DS_Store
|
| 6 |
-
.DS_Store
|
|
|
|
| 1 |
|
| 2 |
+
*.pyc
|
| 3 |
+
*.DS_Store
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -37,59 +37,56 @@ with open("_metrics.md", "r") as f:
|
|
| 37 |
original_df = None
|
| 38 |
# available_models = [] # to be filled in later
|
| 39 |
available_models = list(model_info.keys())
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
def _tab_leaderboard():
|
| 42 |
-
global original_df, available_models
|
| 43 |
with gr.TabItem("π Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
|
| 44 |
default_main_df = original_df.copy()
|
| 45 |
-
default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
|
| 46 |
-
default_main_df_no_task = default_main_df.copy()
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
# gr.Markdown(LEADERBOARD_REMARKS_MAIN, elem_classes="markdown-text-small top-left-LP")
|
| 53 |
-
# with gr.Row():
|
| 54 |
-
# with gr.Column(scale=2):
|
| 55 |
-
# md = gr.Markdown(" ### π More presentation options β¬οΈ", elem_classes="markdown-text")
|
| 56 |
-
|
| 57 |
-
# with gr.Column(scale=3):
|
| 58 |
-
# with gr.Column(scale=2):
|
| 59 |
-
# gr.Markdown(f"""**__πͺ§ Default options:__** K={DEFAULT_K}; Hybrid-Macro; for best corr w/ LMSYS Elo.""", elem_classes="markdown-text")
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
# gr.Markdown(LENGTH_MARGIN_DESC_MD, elem_classes="markdown-text-tiny no_margin")
|
| 63 |
-
with gr.Column(scale=5):
|
| 64 |
-
with gr.Accordion("π¬ Metric explanations", open=False, elem_classes="accordion-label"):
|
| 65 |
-
gr.Markdown(LEADERBOARD_REMARKS_MAIN, elem_classes="markdown-text-small no_margin")
|
| 66 |
-
rank_column_radio = gr.Radio(["π+π― Hybrid", "π Reward-Mix (Pairwise)", "π― Score (Individual)", "π WB Elo (beta)" ], show_label=False, elem_id="rank-column-radio",
|
| 67 |
-
value="π WB Elo (beta)"
|
| 68 |
-
# value="π+π― Hybrid"
|
| 69 |
-
)
|
| 70 |
-
with gr.Column(scale=2):
|
| 71 |
-
with gr.Row():
|
| 72 |
-
checkbox_show_task_categorized = gr.Checkbox(label="π by Task Type", elem_id="show-task-categorized", value=False)
|
| 73 |
-
show_open_source_model_only = gr.Checkbox(label="π Open Models", elem_id="show-open-source-models", value=False)
|
| 74 |
# with gr.Row():
|
| 75 |
# with gr.Column(scale=2):
|
| 76 |
|
| 77 |
leaderboard_table = gr.components.Dataframe(
|
| 78 |
-
value=
|
| 79 |
datatype= ["number", "markdown", "markdown", "number"],
|
| 80 |
# max_rows=None,
|
| 81 |
height=6000,
|
| 82 |
elem_id="leaderboard-table",
|
| 83 |
interactive=False,
|
| 84 |
visible=True,
|
| 85 |
-
column_widths=[50, 260,
|
| 86 |
wrap=True
|
| 87 |
# min_width=60,
|
| 88 |
)
|
| 89 |
# checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
|
| 90 |
# show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
|
| 91 |
# rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
|
| 92 |
-
|
|
|
|
| 93 |
|
| 94 |
|
| 95 |
def _tab_submit():
|
|
|
|
| 37 |
original_df = None
|
| 38 |
# available_models = [] # to be filled in later
|
| 39 |
available_models = list(model_info.keys())
|
| 40 |
+
|
| 41 |
+
def df_filters(mode_selection_radio, show_open_source_model_only):
    """Build the leaderboard table for the selected decoding mode.

    Rows whose "Model" text contains the "β" marker are dropped from the
    module-level ``original_df`` (the global is rebound to the filtered
    frame), the remainder is restricted to the requested "Mode" values, and a
    1-based rank column with an empty header is prepended.

    Args:
        mode_selection_radio: One of "greedy", "sampling (Temp=0.5)" or "all".
        show_open_source_model_only: Accepted so the function fits the
            callback signature; it is not used by the filter.

    Returns:
        A new DataFrame with the rank column first, followed by the columns
        of ``original_df``.

    Raises:
        KeyError: If ``mode_selection_radio`` is not one of the known labels.
    """
    global original_df

    modes = {
        "greedy": ["greedy"],
        "sampling (Temp=0.5)": ["sampling"],
        "all": ["greedy", "sampling"],
    }
    # Resolve the label first so an unknown mode fails before the global is
    # rebound.
    wanted_modes = modes[mode_selection_radio]

    # Match the marker literally (not as a regex). na=False treats a missing
    # model name as "not hidden"; without it the NaN result cannot be negated
    # and the whole callback raises TypeError.
    is_hidden = original_df["Model"].str.contains("β", regex=False, na=False)
    original_df = original_df[~is_hidden]

    # Copy once, before inserting, so the rank column is added to an
    # independent frame rather than to a selection of the global.
    default_main_df = original_df[original_df["Mode"].isin(wanted_modes)].copy()
    default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
    return default_main_df
|
| 55 |
+
|
| 56 |
+
def _gstr(text):
    """Return ``text`` wrapped in a hidden ``gr.Text`` component.

    Lets a constant string be supplied where a component is expected, such as
    an entry in an event listener's ``inputs`` list.
    """
    hidden_text = gr.Text(text, visible=False)
    return hidden_text
|
| 58 |
+
|
| 59 |
def _tab_leaderboard():
    """Build the "Main" leaderboard tab: a decoding-mode selector and the results table.

    The initial table is produced by ``df_filters`` for the default mode, and
    the same function is re-run whenever the mode selector changes.
    Presumably called while a Gradio Blocks/Tabs context is open -- confirm
    against the caller.
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-stripped diff. The table and the event hook are assumed to
    # sit directly under the tab (not inside the Row/Column), and the tab
    # label's leading symbol looks mis-encoded in this view -- confirm both
    # against the real file.
    with gr.TabItem("π Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
        default_mode = "greedy"
        # df_filters builds the frame from the module-level original_df
        # itself, so no preliminary copy of it is needed here.
        default_main_df = df_filters(default_mode, False)
        with gr.Row():
            with gr.Column(scale=5):
                mode_selection_radio = gr.Radio(
                    ["greedy", "sampling (Temp=0.5)", "all"],
                    show_label=False,
                    elem_id="rank-column-radio",
                    value=default_mode,
                )

        leaderboard_table = gr.components.Dataframe(
            value=default_main_df,
            datatype=["number", "markdown", "markdown", "number"],
            height=6000,
            elem_id="leaderboard-table",
            interactive=False,
            visible=True,
            column_widths=[50, 260, 100, 100, 120, 120, 100, 100, 110, 100],
            wrap=True,
        )
        # Rebuild the table when the mode changes. The hidden empty text only
        # fills df_filters' second parameter, which the filter does not use.
        mode_selection_radio.change(
            fn=df_filters,
            inputs=[mode_selection_radio, _gstr("")],
            outputs=[leaderboard_table],
        )
|
| 89 |
+
|
| 90 |
|
| 91 |
|
| 92 |
def _tab_submit():
|
data_utils.py
CHANGED
|
@@ -32,11 +32,8 @@ def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_column
|
|
| 32 |
if col == "Model" and click_url:
|
| 33 |
df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
|
| 34 |
else:
|
| 35 |
-
df[col] = df[col].apply(formatter) # For numerical values
|
| 36 |
-
|
| 37 |
-
df[col] = df[col].replace('-', np.nan).astype(float)
|
| 38 |
-
|
| 39 |
-
|
| 40 |
df.rename(columns=column_names, inplace=True)
|
| 41 |
list_columns = [col for col in ordered_columns if col in df.columns]
|
| 42 |
df = df[list_columns]
|
|
|
|
| 32 |
if col == "Model" and click_url:
|
| 33 |
df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
|
| 34 |
else:
|
| 35 |
+
df[col] = df[col].apply(formatter) # For numerical values
|
| 36 |
+
|
|
|
|
|
|
|
|
|
|
| 37 |
df.rename(columns=column_names, inplace=True)
|
| 38 |
list_columns = [col for col in ordered_columns if col in df.columns]
|
| 39 |
df = df[list_columns]
|
model_info.json
CHANGED
|
@@ -53,6 +53,7 @@
|
|
| 53 |
"deepseek-coder": {"pretty_name": "DeepSeek-Coder-V2", "hf_model_id": "https://platform.deepseek.com/api-docs/api/deepseek-api/", "open": true},
|
| 54 |
"gemma-2-27b-it@nvidia": {"pretty_name": "Gemma-2-27B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-27b-it"},
|
| 55 |
"gemma-2-9b-it@nvidia": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it"},
|
|
|
|
| 56 |
"neo_7b_instruct_v0.1": {"pretty_name": "Neo-7B-Instruct", "hf_model_id": "m-a-p/neo_7b_instruct_v0.1"},
|
| 57 |
"Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B-chat"},
|
| 58 |
"vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
|
|
|
|
| 53 |
"deepseek-coder": {"pretty_name": "DeepSeek-Coder-V2", "hf_model_id": "https://platform.deepseek.com/api-docs/api/deepseek-api/", "open": true},
|
| 54 |
"gemma-2-27b-it@nvidia": {"pretty_name": "Gemma-2-27B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-27b-it"},
|
| 55 |
"gemma-2-9b-it@nvidia": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it"},
|
| 56 |
+
"gemma-2-9b-it": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it", "hidden": true},
|
| 57 |
"neo_7b_instruct_v0.1": {"pretty_name": "Neo-7B-Instruct", "hf_model_id": "m-a-p/neo_7b_instruct_v0.1"},
|
| 58 |
"Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B-chat"},
|
| 59 |
"vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
|
update_data.sh
CHANGED
|
@@ -1,40 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
curl -L -o zeroeval.zip https://github.com/yuchenlin/ZeroEval/archive/refs/heads/main.zip
|
| 6 |
-
unzip zeroeval.zip
|
| 7 |
-
rm zeroeval.zip
|
| 8 |
-
|
| 9 |
-
#!/bin/bash
|
| 10 |
-
|
| 11 |
-
# Define the target directory and the exception folder
|
| 12 |
-
EXCEPTION_FOLDER="result_dirs"
|
| 13 |
-
|
| 14 |
-
# Ensure the target directory exists
|
| 15 |
-
if [ -d "$TARGET_DIR" ]; then
|
| 16 |
-
# Loop through each item in the target directory
|
| 17 |
-
for item in "$TARGET_DIR"/*; do
|
| 18 |
-
# Check if it is not the exception folder
|
| 19 |
-
if [ "$(basename "$item")" != "$EXCEPTION_FOLDER" ]; then
|
| 20 |
-
# Remove the item (file or directory)
|
| 21 |
-
rm -rf "$item"
|
| 22 |
-
echo "Removed: $item"
|
| 23 |
-
fi
|
| 24 |
-
done
|
| 25 |
-
else
|
| 26 |
-
echo "Target directory does not exist: $TARGET_DIR"
|
| 27 |
-
fi
|
| 28 |
-
|
| 29 |
-
# only keep the result_dirs/zebra-grid under result_dirs folder; remove all other sub-folders under result_dirs
|
| 30 |
-
# Remove all subdirectories in result_dirs except zebra-grid
|
| 31 |
-
find "$TARGET_DIR/result_dirs" -maxdepth 1 -type d ! -name 'zebra-grid' ! -name 'result_dirs' -exec rm -rf {} +
|
| 32 |
-
|
| 33 |
-
rm -rf $TARGET_DIR/.github
|
| 34 |
-
rm -rf $TARGET_DIR/.gitignore
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
# tables
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
# bash update_table.sh
|
|
|
|
| 1 |
+
#!/bin/bash
# Download zebra-grid.summary.json from the ZeroEval repository and store it
# at ZeroEval-main/result_dirs/zebra-grid.summary.json.

SUMMARY_URL="https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json"
RESULT_DIR="ZeroEval-main/result_dirs"
SUMMARY_FILE="$RESULT_DIR/zebra-grid.summary.json"
TMP_FILE="$SUMMARY_FILE.tmp"

mkdir -p "$RESULT_DIR"

# wget -O truncates its target before any data arrives, so download into a
# temporary file and move it into place only after a successful transfer.
# A failed download then leaves a previously fetched summary untouched.
if wget "$SUMMARY_URL" -O "$TMP_FILE"; then
    mv "$TMP_FILE" "$SUMMARY_FILE"
else
    status=$?
    rm -f "$TMP_FILE"
    echo "Failed to download $SUMMARY_URL" >&2
    # Propagate wget's own exit code, as the script did before.
    exit "$status"
fi
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils_display.py
CHANGED
|
@@ -7,7 +7,9 @@ def make_clickable_model(model_name):
|
|
| 7 |
global model_info
|
| 8 |
modified_model_name = model_name
|
| 9 |
if model_name in model_info:
|
| 10 |
-
|
|
|
|
|
|
|
| 11 |
link = model_info[model_name]["hf_model_id"]
|
| 12 |
modified_model_name = f'π <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
|
| 13 |
else:
|
|
@@ -16,6 +18,9 @@ def make_clickable_model(model_name):
|
|
| 16 |
if "Neo-7B" in modified_model_name:
|
| 17 |
# models that are fully open source
|
| 18 |
modified_model_name = modified_model_name.replace("π", "ππ")
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
if "π¨</a>" in modified_model_name:
|
| 21 |
modified_model_name = modified_model_name.replace(' π¨</a>', '</a> π¨')
|
|
|
|
| 7 |
global model_info
|
| 8 |
modified_model_name = model_name
|
| 9 |
if model_name in model_info:
|
| 10 |
+
is_open_model = model_info[model_name]["hf_model_id"].startswith("http")
|
| 11 |
+
is_open_model = model_info[model_name].get("open", False)
|
| 12 |
+
if not is_open_model:
|
| 13 |
link = model_info[model_name]["hf_model_id"]
|
| 14 |
modified_model_name = f'π <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
|
| 15 |
else:
|
|
|
|
| 18 |
if "Neo-7B" in modified_model_name:
|
| 19 |
# models that are fully open source
|
| 20 |
modified_model_name = modified_model_name.replace("π", "ππ")
|
| 21 |
+
hidden = model_info[model_name].get("hidden", False)
|
| 22 |
+
if hidden:
|
| 23 |
+
modified_model_name = f'β {modified_model_name}'
|
| 24 |
|
| 25 |
if "π¨</a>" in modified_model_name:
|
| 26 |
modified_model_name = modified_model_name.replace(' π¨</a>', '</a> π¨')
|