Spaces:

allenai
/

ZebraLogic

Running

App Files Files Community

yuchenlin committed on Oct 22, 2024

Commit

1f16e7a

1 Parent(s): db67fbd

rename the github link

Browse files

Files changed (7) hide show

ZeroEval-main/result_dirs/zebra-grid.summary.json +44 -0
_about_us.md +1 -1
_header.md +1 -1
app.py +2 -2
constants.py +1 -1
data_utils.py +1 -1
update_data.sh +3 -3

ZeroEval-main/result_dirs/zebra-grid.summary.json CHANGED Viewed

@@ -175,6 +175,17 @@
     "Total Puzzles": 1000,
     "Reason Lens": "855.72"
   },
   {
     "Model": "gpt-4-turbo-2024-04-09",
     "Mode": "sampling",
@@ -186,6 +197,17 @@
     "Total Puzzles": 1000,
     "Reason Lens": "1165.90"
   },
   {
     "Model": "gemini-1.5-pro-exp-0801",
     "Mode": "greedy",
@@ -472,6 +494,17 @@
     "Total Puzzles": 1000,
     "Reason Lens": "849.84"
   },
   {
     "Model": "Meta-Llama-3-8B-Instruct",
     "Mode": "greedy",
@@ -604,6 +637,17 @@
     "Total Puzzles": 1000,
     "Reason Lens": "718.43"
   },
   {
     "Model": "gemma-2-2b-it",
     "Mode": "greedy",

     "Total Puzzles": 1000,
     "Reason Lens": "855.72"
   },
+  {
+    "Model": "Qwen2.5-72B-Instruct",
+    "Mode": "greedy",
+    "Puzzle Acc": "26.60",
+    "Cell Acc": "40.92",
+    "No answer": "11.90",
+    "Easy Puzzle Acc": "76.43",
+    "Hard Puzzle Acc": "7.22",
+    "Total Puzzles": 1000,
+    "Reason Lens": "1795.90"
+  },
   {
     "Model": "gpt-4-turbo-2024-04-09",
     "Mode": "sampling",
     "Total Puzzles": 1000,
     "Reason Lens": "1165.90"
   },
+  {
+    "Model": "Qwen2.5-32B-Instruct",
+    "Mode": "greedy",
+    "Puzzle Acc": "26.10",
+    "Cell Acc": "43.39",
+    "No answer": "6.30",
+    "Easy Puzzle Acc": "77.50",
+    "Hard Puzzle Acc": "6.11",
+    "Total Puzzles": 1000,
+    "Reason Lens": "1333.07"
+  },
   {
     "Model": "gemini-1.5-pro-exp-0801",
     "Mode": "greedy",
     "Total Puzzles": 1000,
     "Reason Lens": "849.84"
   },
+  {
+    "Model": "Qwen2.5-7B-Instruct",
+    "Mode": "greedy",
+    "Puzzle Acc": "12.00",
+    "Cell Acc": "30.67",
+    "No answer": "9.50",
+    "Easy Puzzle Acc": "38.93",
+    "Hard Puzzle Acc": "1.53",
+    "Total Puzzles": 1000,
+    "Reason Lens": "850.93"
+  },
   {
     "Model": "Meta-Llama-3-8B-Instruct",
     "Mode": "greedy",
     "Total Puzzles": 1000,
     "Reason Lens": "718.43"
   },
+  {
+    "Model": "Qwen2.5-3B-Instruct",
+    "Mode": "greedy",
+    "Puzzle Acc": "4.80",
+    "Cell Acc": "11.44",
+    "No answer": "56.70",
+    "Easy Puzzle Acc": "17.14",
+    "Hard Puzzle Acc": "0.00",
+    "Total Puzzles": 1000,
+    "Reason Lens": "906.58"
+  },
   {
     "Model": "gemma-2-2b-it",
     "Mode": "greedy",

_about_us.md CHANGED Viewed

@@ -10,6 +10,6 @@ We are from [AllenAI](https://allenai.org/) (AI2), a non-profit research organiz
 ### Contact
 Please contact us in the following ways:
-- Github Issues/PRs: [https://github.com/yuchenlin/ZeroEval/](https://github.com/yuchenlin/ZeroEval/)
 - Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org

 ### Contact
 Please contact us in the following ways:
+- Github Issues/PRs: [https://github.com/WildEval/ZeroEval/](https://github.com/WildEval/ZeroEval/)
 - Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org

_header.md CHANGED Viewed

@@ -2,5 +2,5 @@
 # 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
 <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) |  -->
-[📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/yuchenlin/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**

 # 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
 <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) |  -->
+[📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**

app.py CHANGED Viewed

@@ -135,8 +135,8 @@ def _tab_explore():
 def _tab_submit():
     markdown_text = """
-    Please create an issue on our [Github](https://github.com/yuchenlin/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
-    If you would like to do local testing, please read our code [here](https://github.com/yuchenlin/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
     and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
     """

 def _tab_submit():
     markdown_text = """
+    Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
+    If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
     and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
     """

constants.py CHANGED Viewed

@@ -4,7 +4,7 @@ from collections import OrderedDict
 DEFAULT_K = "∞"
 # DEFAULT_K = "1500"
-banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
 BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"

 DEFAULT_K = "∞"
 # DEFAULT_K = "1500"
+banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
 BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"

data_utils.py CHANGED Viewed

@@ -49,7 +49,7 @@ def load_all_data():
         model_summary = json.load(f)
     model_names = [model["Model"] for model in model_summary]
     for model_name in model_names:
-        download_url = f"https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
         output_file = os.path.join(result_dir, f"{model_name}.json")
         # mkdir -p result_dir if not exists
         os.makedirs(result_dir, exist_ok=True)

         model_summary = json.load(f)
     model_names = [model["Model"] for model in model_summary]
     for model_name in model_names:
+        download_url = f"https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
         output_file = os.path.join(result_dir, f"{model_name}.json")
         # mkdir -p result_dir if not exists
         os.makedirs(result_dir, exist_ok=True)

update_data.sh CHANGED Viewed

@@ -1,5 +1,5 @@
-# download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
 # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
 mkdir -p ZeroEval-main/result_dirs/zebra-grid/
-wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
-wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json

+# download the file from https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json
 # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
 mkdir -p ZeroEval-main/result_dirs/zebra-grid/
+wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
+wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json