Commit: modify paper names and paths to datasets

Files changed:
- README.md +5 -4
- _header.md +1 -2
- app.py +36 -36
- constants.py +31 -32
- eval_utils.py +1 -1
README.md
@@ -10,12 +10,12 @@ pinned: true
 fullWidth: true
 hf_oauth: true
 api: false
-tags:
+tags:
 - leaderboard
-datasets:
+datasets:
 - allenai/ZebraLogicBench
--
-models:
+- WildEval/ZebraLogic
+models:
 - Qwen/Qwen2-72B-Instruct
 - Qwen/Qwen1.5-72B-Chat
 - Qwen/Qwen1.5-7B-Chat
@@ -58,3 +58,4 @@ models:
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 Paper: arxiv.org/abs/2406.04770
+Paper: arxiv.org/abs/2502.01100
_header.md
@@ -1,6 +1,5 @@
 <br/>
 
-# 🦓 ZebraLogic:
+# 🦓 ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning
 <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) | -->
 [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
-
app.py
@@ -12,16 +12,16 @@ import pandas as pd
 from pathlib import Path
 import json
 from constants import *
-from datetime import datetime, timezone
+from datetime import datetime, timezone
 # from datasets import Dataset, load_dataset, concatenate_datasets
-import os, uuid
+import os, uuid
 from utils_display import model_info
 from constants import column_names, LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
 import pytz
 from data_utils import post_processing, get_random_item
 
 # get the last updated time from the elo_ranks.all.jsonl file
-LAST_UPDATED = None
+LAST_UPDATED = None
 # with open("_intro.md", "r") as f:
 #     INTRO_MD = f.read()
 INTRO_MD = ""
@@ -33,11 +33,11 @@ with open("_header.md", "r") as f:
 
 with open("_metrics.md", "r") as f:
     METRICS_MD = f.read()
-
-raw_data = None
-original_df = None
+
+raw_data = None
+original_df = None
 # available_models = [] # to be filled in later
-available_models = list(model_info.keys())
+available_models = list(model_info.keys())
 
 def df_filters(mode_selection_radio, show_open_source_model_only):
     global original_df
@@ -59,19 +59,19 @@ def _gstr(text):
 
 def _tab_leaderboard():
     global original_df, available_models
-    # with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
+    # with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
     if True:
-        default_main_df = original_df.copy()
+        default_main_df = original_df.copy()
         # default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
-        # default_main_df_no_task = default_main_df.copy()
+        # default_main_df_no_task = default_main_df.copy()
         default_mode = "greedy"
         default_main_df = df_filters(default_mode, False)
-        with gr.Row():
-            with gr.Column(scale=5):
+        with gr.Row():
+            with gr.Column(scale=5):
                 mode_selection_radio = gr.Radio(["greedy", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode)
                 # with gr.Row():
                 #     with gr.Column(scale=2):
-
+
         leaderboard_table = gr.components.Dataframe(
             value=default_main_df,
             datatype= ["number", "markdown", "markdown", "number"],
@@ -83,7 +83,7 @@ def _tab_leaderboard():
             column_widths=[50, 260, 100, 100, 120, 120, 100,100,110,100],
             wrap=True
             # min_width=60,
-        )
+        )
         # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
@@ -121,14 +121,14 @@ def _tab_explore():
     # greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
     gr.Markdown("### 🚀 Click below to sample a puzzle. ⬇️ ")
     explore_button = gr.Button("🦓 Sample a Zebra Puzzle!", elem_id="explore-button")
-
+
     puzzle_md = gr.Markdown("### 🦓 Puzzle: \n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
     model_reasoning_md = gr.Markdown("### 🤖 Reasoning: \n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
     model_prediction_md = gr.Markdown("### 💬 Answer: \n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
     turht_solution_md = gr.Markdown("### ✅ Truth Solution: \n\nTo be loaded", elem_id="truth-solution-md", elem_classes="box_md")
     model_eval_md = gr.Markdown("### 🆚 Evaluation: \n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")
-    explore_button.click(fn=sample_explore_item,
-                         inputs=[model_selection, size_H_selection, size_W_selection],
+    explore_button.click(fn=sample_explore_item,
+                         inputs=[model_selection, size_H_selection, size_W_selection],
                          outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md, turht_solution_md])
 
 
@@ -136,8 +136,8 @@ def _tab_explore():
 def _tab_submit():
     markdown_text = """
 Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
-If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
-and apply for the access for the [private dataset](https://huggingface.co/datasets/
+If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
+and apply for the access for the [private dataset](https://huggingface.co/datasets/WildEval/ZebraLogic) that contains the truth solutions.
 """
 
     gr.Markdown("## 🚀 Submit Your Results\n\n" + markdown_text, elem_classes="markdown-text")
@@ -149,33 +149,33 @@ def build_demo():
 
     with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
         gr.HTML(BANNER, elem_id="banner")
-        # convert LAST_UPDATED to the PDT time
+        # convert LAST_UPDATED to the PDT time
         LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
         header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
-        gr.Markdown(header_md_text, elem_classes="markdown-text")
+        gr.Markdown(header_md_text, elem_classes="markdown-text")
 
-        with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
             with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
-                _tab_leaderboard()
+                _tab_leaderboard()
             with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
                 _tab_explore()
             with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
-                _tab_submit()
+                _tab_submit()
 
             with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
                 gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
-
+
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
                 gr.Textbox(
-                    value=CITATION_TEXT,
+                    value=CITATION_TEXT,
                     lines=7,
                     label="Copy the BibTeX snippet to cite this source",
                     elem_id="citation-button",
                     show_copy_button=True)
                 # ).style(show_copy_button=True)
 
-    return demo
+    return demo
 
 
 
@@ -184,11 +184,11 @@ def data_load(result_file):
     print(f"Loading {result_file}")
     column_names_main = column_names.copy()
     # column_names_main.update({})
-    main_ordered_columns = ORDERED_COLUMN_NAMES
-    # filter the data with Total Puzzles == 1000
-
-    click_url = True
-    # read json file from the result_file
+    main_ordered_columns = ORDERED_COLUMN_NAMES
+    # filter the data with Total Puzzles == 1000
+
+    click_url = True
+    # read json file from the result_file
     with open(result_file, "r") as f:
         raw_data = json.load(f)
     # floatify the data, if possible
@@ -201,16 +201,16 @@
     original_df = pd.DataFrame(raw_data)
     original_df = original_df[original_df["Total Puzzles"] == 1000]
     original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
-    # print(original_df.columns)
-
+    # print(original_df.columns)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true")
     parser.add_argument("--result_file", help="Path to results table", default="ZeroEval-main/result_dirs/zebra-grid.summary.json")
-
+
    args = parser.parse_args()
-    data_load(args.result_file)
+    data_load(args.result_file)
     print(original_df)
     demo = build_demo()
     demo.launch(share=args.share, height=3000, width="100%")
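For reference, the loading path that `data_load` implements is small enough to sketch on its own. This is a minimal, illustrative version only: it assumes the summary JSON is a list of per-model records with the keys used above, and it skips `data_utils.post_processing`, which this commit does not touch.

```python
# Minimal sketch of the data_load flow above (assumption: the summary JSON is a
# list of per-model dicts whose keys include "Total Puzzles"). The real app also
# applies data_utils.post_processing, which is not part of this diff.
import json
import pandas as pd

def load_summary(result_file="ZeroEval-main/result_dirs/zebra-grid.summary.json"):
    with open(result_file, "r") as f:
        raw_data = json.load(f)
    df = pd.DataFrame(raw_data)
    # app.py keeps only runs that cover the full 1000-puzzle set
    return df[df["Total Puzzles"] == 1000]

if __name__ == "__main__":
    print(load_summary().head())
```

The Space itself is launched the way the `__main__` block shows, e.g. `python app.py --result_file ZeroEval-main/result_dirs/zebra-grid.summary.json`.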
constants.py
@@ -8,15 +8,15 @@ banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_ba
 BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 
 # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
-
+
 
 CITATION_TEXT = """
 
-@
-title={ZebraLogic:
-author={Bill Yuchen Lin and Ronan Le Bras and Peter Clark and Yejin Choi},
-
-
+@article{zebralogic2025,
+title={ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning},
+author={Bill Yuchen Lin and Ronan Le Bras and Kyle Richardson and Ashish Sabharwal and Radha Poovendran and Peter Clark and Yejin Choi},
+year={2025},
+url={https://arxiv.org/abs/2502.01100},
 }
 
 
@@ -27,15 +27,15 @@ CITATION_TEXT = """
 volume={36},
 year={2024}
 }
-
+
 """
 
 # make column_names as an ordered dict
-
+
 
 
 column_names = OrderedDict({
-    "Model": "Model",
+    "Model": "Model",
     "Mode": "Mode",
     "Puzzle Acc": "Puzzle Acc",
     "Cell Acc": "Cell Acc",
@@ -48,29 +48,29 @@ column_names = OrderedDict({
 
 
 
-LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
+LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
 """
 
 # **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
-# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
+# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
 # **WB Score** individually scores each model based on checklists.
 # Evaluator is GPT-4-Turbo.
-LEADERBOARD_REMARKS_MAIN = """
+LEADERBOARD_REMARKS_MAIN = """
 """
-
+
 RANKING_COLUMN = "Puzzle Acc"
 
 ORDERED_COLUMN_NAMES = [
-    "Model",
+    "Model",
     "Mode",
     "Puzzle Acc",
     "Easy Puzzle Acc",
     "Hard Puzzle Acc",
     "Cell Acc",
-    "No answer",
+    "No answer",
 ]
 
-
+
 js_light = """
 function refresh() {
     const url = new URL(window.location);
@@ -110,15 +110,15 @@ function refresh() {
 
 js_code = """
 function scroll_top() {
-    console.log("Hello from Gradio!");
+    console.log("Hello from Gradio!");
     const bubbles = document.querySelectorAll('.bubble-wrap');
     bubbles.forEach((bubble, index) => {
         setTimeout(() => {
             bubble.scrollTop = 0;
         }, index * 100); // Delay of 100ms between each iteration
     });
-
-}
+
+}
 """
 
 
@@ -126,7 +126,7 @@ TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtW
 
 css = """
 
-
+
 
 code {
     font-size: large;
@@ -179,17 +179,17 @@ td {
 .chat-common{
     height: auto;
     max-height: 400px;
-    min-height: 100px;
+    min-height: 100px;
 }
 .chat-specific{
     height: auto;
     max-height: 600px;
-    min-height: 200px;
+    min-height: 200px;
 }
 #od-benchmark-tab-table-button{
     font-size: 15pt;
     font-weight: bold;
-}
+}
 
 .btn_boderline{
     border: 1px solid #000000;
@@ -197,7 +197,7 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: bold;
+    font-weight: bold;
 }
 
 .btn_boderline_next{
@@ -206,7 +206,7 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: bold;
+    font-weight: bold;
 }
 
 .btn_boderline_gray{
@@ -215,7 +215,7 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: italic;
+    font-weight: italic;
 }
 .btn_boderline_selected{
     border: 2px solid purple;
@@ -224,12 +224,12 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: bold;
+    font-weight: bold;
 }
 .accordion-label button span{
     font-size: 14pt;
     font-weight: bold;
-}
+}
 
 #show-task-categorized span{
     font-size: 13pt;
@@ -269,7 +269,7 @@ button.selected[role="tab"][aria-selected="true"] {
 .plotly-plot{
     height: auto;
     max-height: 600px;
-    min-height: 600px;
+    min-height: 600px;
 }
 
 #length-margin-radio{
@@ -279,12 +279,12 @@ button.selected[role="tab"][aria-selected="true"] {
 }
 
 #show-task-categorized{
-    font-size: 12pt;
+    font-size: 12pt;
     font-decoration: bold;
 }
 
 #show-open-source-models{
-    font-size: 12pt;
+    font-size: 12pt;
     font-decoration: bold;
 }
 
@@ -296,4 +296,3 @@ button.selected[role="tab"][aria-selected="true"] {
     margin: 5px;
 }
 """
-
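For orientation, the constants above drive how the leaderboard table is rendered: `column_names` maps raw result keys to display names, `ORDERED_COLUMN_NAMES` fixes the column order, and `RANKING_COLUMN` picks the sort key. Below is a minimal sketch of how such constants are typically applied to a results frame; the actual logic lives in `data_utils.post_processing`, which is not part of this diff, so treat this as an assumption about its behavior rather than the real implementation.

```python
# Illustrative sketch only: the real renaming/ordering is done by
# data_utils.post_processing, which this commit does not change.
from collections import OrderedDict
import pandas as pd

column_names = OrderedDict({"Model": "Model", "Mode": "Mode",
                            "Puzzle Acc": "Puzzle Acc", "Cell Acc": "Cell Acc"})
ORDERED_COLUMN_NAMES = ["Model", "Mode", "Puzzle Acc", "Easy Puzzle Acc",
                        "Hard Puzzle Acc", "Cell Acc", "No answer"]
RANKING_COLUMN = "Puzzle Acc"

def apply_display_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns=dict(column_names))            # raw key -> display name
    keep = [c for c in ORDERED_COLUMN_NAMES if c in df.columns]
    df = df[keep]
    if RANKING_COLUMN in df.columns:                       # rank best models first
        df = df.sort_values(RANKING_COLUMN, ascending=False)
    return df.reset_index(drop=True)
```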
eval_utils.py
@@ -8,7 +8,7 @@ private_solutions = {}
 
 def load_private_solutions():
     global private_solutions
-    private_zebra_data = load_dataset("
+    private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
     for item in private_zebra_data:
         private_solutions[item["id"]] = item["solution"]
     return
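The submit tab points local testers at the gated truth-solution dataset, and `load_private_solutions` shows how it is read. Below is a minimal sketch of doing the same outside the Space, assuming you have been granted access to `WildEval/ZebraLogic` and are authenticated; the `is_exact_match` helper is hypothetical and only illustrates how the `id -> solution` map can be used, while the leaderboard's real scoring lives in ZeroEval's `zebra_grid_eval.py`.

```python
# Sketch: load the gated truth solutions and build the same id -> solution map
# that eval_utils.load_private_solutions builds. Requires prior access approval
# and a Hugging Face login (e.g. `huggingface-cli login`); `token=True` is
# optional on recent versions of the `datasets` library.
from datasets import load_dataset

private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode",
                                  split="test", token=True)
private_solutions = {item["id"]: item["solution"] for item in private_zebra_data}

def is_exact_match(puzzle_id, predicted_solution):
    """Hypothetical helper: whole-grid exact match against the truth solution."""
    return private_solutions.get(puzzle_id) == predicted_solution
```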