Spaces:
Running
Running
rename the github link
Browse files
- ZeroEval-main/result_dirs/zebra-grid.summary.json +44 -0
- _about_us.md +1 -1
- _header.md +1 -1
- app.py +2 -2
- constants.py +1 -1
- data_utils.py +1 -1
- update_data.sh +3 -3
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
|
@@ -175,6 +175,17 @@
|
|
| 175 |
"Total Puzzles": 1000,
|
| 176 |
"Reason Lens": "855.72"
|
| 177 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
{
|
| 179 |
"Model": "gpt-4-turbo-2024-04-09",
|
| 180 |
"Mode": "sampling",
|
|
@@ -186,6 +197,17 @@
|
|
| 186 |
"Total Puzzles": 1000,
|
| 187 |
"Reason Lens": "1165.90"
|
| 188 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
{
|
| 190 |
"Model": "gemini-1.5-pro-exp-0801",
|
| 191 |
"Mode": "greedy",
|
|
@@ -472,6 +494,17 @@
|
|
| 472 |
"Total Puzzles": 1000,
|
| 473 |
"Reason Lens": "849.84"
|
| 474 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
{
|
| 476 |
"Model": "Meta-Llama-3-8B-Instruct",
|
| 477 |
"Mode": "greedy",
|
|
@@ -604,6 +637,17 @@
|
|
| 604 |
"Total Puzzles": 1000,
|
| 605 |
"Reason Lens": "718.43"
|
| 606 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
{
|
| 608 |
"Model": "gemma-2-2b-it",
|
| 609 |
"Mode": "greedy",
|
|
|
|
| 175 |
"Total Puzzles": 1000,
|
| 176 |
"Reason Lens": "855.72"
|
| 177 |
},
|
| 178 |
+
{
|
| 179 |
+
"Model": "Qwen2.5-72B-Instruct",
|
| 180 |
+
"Mode": "greedy",
|
| 181 |
+
"Puzzle Acc": "26.60",
|
| 182 |
+
"Cell Acc": "40.92",
|
| 183 |
+
"No answer": "11.90",
|
| 184 |
+
"Easy Puzzle Acc": "76.43",
|
| 185 |
+
"Hard Puzzle Acc": "7.22",
|
| 186 |
+
"Total Puzzles": 1000,
|
| 187 |
+
"Reason Lens": "1795.90"
|
| 188 |
+
},
|
| 189 |
{
|
| 190 |
"Model": "gpt-4-turbo-2024-04-09",
|
| 191 |
"Mode": "sampling",
|
|
|
|
| 197 |
"Total Puzzles": 1000,
|
| 198 |
"Reason Lens": "1165.90"
|
| 199 |
},
|
| 200 |
+
{
|
| 201 |
+
"Model": "Qwen2.5-32B-Instruct",
|
| 202 |
+
"Mode": "greedy",
|
| 203 |
+
"Puzzle Acc": "26.10",
|
| 204 |
+
"Cell Acc": "43.39",
|
| 205 |
+
"No answer": "6.30",
|
| 206 |
+
"Easy Puzzle Acc": "77.50",
|
| 207 |
+
"Hard Puzzle Acc": "6.11",
|
| 208 |
+
"Total Puzzles": 1000,
|
| 209 |
+
"Reason Lens": "1333.07"
|
| 210 |
+
},
|
| 211 |
{
|
| 212 |
"Model": "gemini-1.5-pro-exp-0801",
|
| 213 |
"Mode": "greedy",
|
|
|
|
| 494 |
"Total Puzzles": 1000,
|
| 495 |
"Reason Lens": "849.84"
|
| 496 |
},
|
| 497 |
+
{
|
| 498 |
+
"Model": "Qwen2.5-7B-Instruct",
|
| 499 |
+
"Mode": "greedy",
|
| 500 |
+
"Puzzle Acc": "12.00",
|
| 501 |
+
"Cell Acc": "30.67",
|
| 502 |
+
"No answer": "9.50",
|
| 503 |
+
"Easy Puzzle Acc": "38.93",
|
| 504 |
+
"Hard Puzzle Acc": "1.53",
|
| 505 |
+
"Total Puzzles": 1000,
|
| 506 |
+
"Reason Lens": "850.93"
|
| 507 |
+
},
|
| 508 |
{
|
| 509 |
"Model": "Meta-Llama-3-8B-Instruct",
|
| 510 |
"Mode": "greedy",
|
|
|
|
| 637 |
"Total Puzzles": 1000,
|
| 638 |
"Reason Lens": "718.43"
|
| 639 |
},
|
| 640 |
+
{
|
| 641 |
+
"Model": "Qwen2.5-3B-Instruct",
|
| 642 |
+
"Mode": "greedy",
|
| 643 |
+
"Puzzle Acc": "4.80",
|
| 644 |
+
"Cell Acc": "11.44",
|
| 645 |
+
"No answer": "56.70",
|
| 646 |
+
"Easy Puzzle Acc": "17.14",
|
| 647 |
+
"Hard Puzzle Acc": "0.00",
|
| 648 |
+
"Total Puzzles": 1000,
|
| 649 |
+
"Reason Lens": "906.58"
|
| 650 |
+
},
|
| 651 |
{
|
| 652 |
"Model": "gemma-2-2b-it",
|
| 653 |
"Mode": "greedy",
|
_about_us.md
CHANGED
|
@@ -10,6 +10,6 @@ We are from [AllenAI](https://allenai.org/) (AI2), a non-profit research organiz
|
|
| 10 |
### Contact
|
| 11 |
|
| 12 |
Please contact us in the following ways:
|
| 13 |
-
- Github Issues/PRs: [https://github.com/
|
| 14 |
- Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
|
| 15 |
|
|
|
|
| 10 |
### Contact
|
| 11 |
|
| 12 |
Please contact us in the following ways:
|
| 13 |
+
- Github Issues/PRs: [https://github.com/WildEval/ZeroEval/](https://github.com/WildEval/ZeroEval/)
|
| 14 |
- Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
|
| 15 |
|
_header.md
CHANGED
|
@@ -2,5 +2,5 @@
|
|
| 2 |
|
| 3 |
# 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
|
| 4 |
<!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) | -->
|
| 5 |
-
[📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/
|
| 6 |
|
|
|
|
| 2 |
|
| 3 |
# 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
|
| 4 |
<!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) | -->
|
| 5 |
+
[📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
|
| 6 |
|
app.py
CHANGED
|
@@ -135,8 +135,8 @@ def _tab_explore():
|
|
| 135 |
|
| 136 |
def _tab_submit():
|
| 137 |
markdown_text = """
|
| 138 |
-
Please create an issue on our [Github](https://github.com/
|
| 139 |
-
If you would like to do local testing, please read our code [here](https://github.com/
|
| 140 |
and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
|
| 141 |
"""
|
| 142 |
|
|
|
|
| 135 |
|
| 136 |
def _tab_submit():
|
| 137 |
markdown_text = """
|
| 138 |
+
Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
|
| 139 |
+
If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
|
| 140 |
and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
|
| 141 |
"""
|
| 142 |
|
constants.py
CHANGED
|
@@ -4,7 +4,7 @@ from collections import OrderedDict
|
|
| 4 |
DEFAULT_K = "∞"
|
| 5 |
# DEFAULT_K = "1500"
|
| 6 |
|
| 7 |
-
banner_url = "https://github.com/
|
| 8 |
BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
|
| 9 |
|
| 10 |
# TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
|
|
|
|
| 4 |
DEFAULT_K = "∞"
|
| 5 |
# DEFAULT_K = "1500"
|
| 6 |
|
| 7 |
+
banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
|
| 8 |
BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
|
| 9 |
|
| 10 |
# TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
|
data_utils.py
CHANGED
|
@@ -49,7 +49,7 @@ def load_all_data():
|
|
| 49 |
model_summary = json.load(f)
|
| 50 |
model_names = [model["Model"] for model in model_summary]
|
| 51 |
for model_name in model_names:
|
| 52 |
-
download_url = f"https://raw.githubusercontent.com/
|
| 53 |
output_file = os.path.join(result_dir, f"{model_name}.json")
|
| 54 |
# mkdir -p result_dir if not exists
|
| 55 |
os.makedirs(result_dir, exist_ok=True)
|
|
|
|
| 49 |
model_summary = json.load(f)
|
| 50 |
model_names = [model["Model"] for model in model_summary]
|
| 51 |
for model_name in model_names:
|
| 52 |
+
download_url = f"https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
|
| 53 |
output_file = os.path.join(result_dir, f"{model_name}.json")
|
| 54 |
# mkdir -p result_dir if not exists
|
| 55 |
os.makedirs(result_dir, exist_ok=True)
|
update_data.sh
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
# download the file from https://raw.githubusercontent.com/
|
| 2 |
# and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
|
| 3 |
mkdir -p ZeroEval-main/result_dirs/zebra-grid/
|
| 4 |
-
wget https://raw.githubusercontent.com/
|
| 5 |
-
wget https://raw.githubusercontent.com/
|
|
|
|
| 1 |
+
# download the file from https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json
|
| 2 |
# and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
|
| 3 |
mkdir -p ZeroEval-main/result_dirs/zebra-grid/
|
| 4 |
+
wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
|
| 5 |
+
wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json
|