Spaces:
Running
Running
update results
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
|
@@ -43,6 +43,17 @@
|
|
| 43 |
"Total Puzzles": 1000,
|
| 44 |
"Reason Lens": "439.96"
|
| 45 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
{
|
| 47 |
"Model": "gpt-4o-2024-05-13",
|
| 48 |
"Mode": "sampling",
|
|
@@ -54,6 +65,28 @@
|
|
| 54 |
"Total Puzzles": 1000,
|
| 55 |
"Reason Lens": "1549.74"
|
| 56 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
{
|
| 58 |
"Model": "Mistral-Large-2",
|
| 59 |
"Mode": "greedy",
|
|
@@ -120,6 +153,17 @@
|
|
| 120 |
"Total Puzzles": 1000,
|
| 121 |
"Reason Lens": "1165.90"
|
| 122 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
{
|
| 124 |
"Model": "Meta-Llama-3.1-70B-Instruct",
|
| 125 |
"Mode": "greedy",
|
|
@@ -142,6 +186,17 @@
|
|
| 142 |
"Total Puzzles": 1000,
|
| 143 |
"Reason Lens": "1260.23"
|
| 144 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
{
|
| 146 |
"Model": "Qwen2-72B-Instruct",
|
| 147 |
"Mode": "greedy",
|
|
@@ -483,6 +538,17 @@
|
|
| 483 |
"Total Puzzles": 1000,
|
| 484 |
"Reason Lens": "1473.23"
|
| 485 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
{
|
| 487 |
"Model": "gemma-2-2b-it",
|
| 488 |
"Mode": "greedy",
|
|
|
|
| 43 |
"Total Puzzles": 1000,
|
| 44 |
"Reason Lens": "439.96"
|
| 45 |
},
|
| 46 |
+
{
|
| 47 |
+
"Model": "gpt-4o-2024-08-06",
|
| 48 |
+
"Mode": "greedy",
|
| 49 |
+
"Puzzle Acc": "31.70",
|
| 50 |
+
"Cell Acc": "50.34",
|
| 51 |
+
"No answer": "3.60",
|
| 52 |
+
"Easy Puzzle Acc": "84.64",
|
| 53 |
+
"Hard Puzzle Acc": "11.11",
|
| 54 |
+
"Total Puzzles": 1000,
|
| 55 |
+
"Reason Lens": "1106.51"
|
| 56 |
+
},
|
| 57 |
{
|
| 58 |
"Model": "gpt-4o-2024-05-13",
|
| 59 |
"Mode": "sampling",
|
|
|
|
| 65 |
"Total Puzzles": 1000,
|
| 66 |
"Reason Lens": "1549.74"
|
| 67 |
},
|
| 68 |
+
{
|
| 69 |
+
"Model": "gemini-1.5-pro-exp-0827",
|
| 70 |
+
"Mode": "greedy",
|
| 71 |
+
"Puzzle Acc": "30.50",
|
| 72 |
+
"Cell Acc": "50.84",
|
| 73 |
+
"No answer": "0.80",
|
| 74 |
+
"Easy Puzzle Acc": "79.64",
|
| 75 |
+
"Hard Puzzle Acc": "11.39",
|
| 76 |
+
"Total Puzzles": 1000,
|
| 77 |
+
"Reason Lens": "1594.47"
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"Model": "chatgpt-4o-latest-24-09-07",
|
| 81 |
+
"Mode": "greedy",
|
| 82 |
+
"Puzzle Acc": "29.90",
|
| 83 |
+
"Cell Acc": "48.83",
|
| 84 |
+
"No answer": "4.20",
|
| 85 |
+
"Easy Puzzle Acc": "81.43",
|
| 86 |
+
"Hard Puzzle Acc": "9.86",
|
| 87 |
+
"Total Puzzles": 1000,
|
| 88 |
+
"Reason Lens": "1539.99"
|
| 89 |
+
},
|
| 90 |
{
|
| 91 |
"Model": "Mistral-Large-2",
|
| 92 |
"Mode": "greedy",
|
|
|
|
| 153 |
"Total Puzzles": 1000,
|
| 154 |
"Reason Lens": "1165.90"
|
| 155 |
},
|
| 156 |
+
{
|
| 157 |
+
"Model": "gemini-1.5-pro-exp-0801",
|
| 158 |
+
"Mode": "greedy",
|
| 159 |
+
"Puzzle Acc": "25.20",
|
| 160 |
+
"Cell Acc": "48.50",
|
| 161 |
+
"No answer": "0.00",
|
| 162 |
+
"Easy Puzzle Acc": "72.50",
|
| 163 |
+
"Hard Puzzle Acc": "6.81",
|
| 164 |
+
"Total Puzzles": 1000,
|
| 165 |
+
"Reason Lens": "1389.75"
|
| 166 |
+
},
|
| 167 |
{
|
| 168 |
"Model": "Meta-Llama-3.1-70B-Instruct",
|
| 169 |
"Mode": "greedy",
|
|
|
|
| 186 |
"Total Puzzles": 1000,
|
| 187 |
"Reason Lens": "1260.23"
|
| 188 |
},
|
| 189 |
+
{
|
| 190 |
+
"Model": "deepseek-v2.5-0908",
|
| 191 |
+
"Mode": "greedy",
|
| 192 |
+
"Puzzle Acc": "22.10",
|
| 193 |
+
"Cell Acc": "38.01",
|
| 194 |
+
"No answer": "12.70",
|
| 195 |
+
"Easy Puzzle Acc": "68.21",
|
| 196 |
+
"Hard Puzzle Acc": "4.17",
|
| 197 |
+
"Total Puzzles": 1000,
|
| 198 |
+
"Reason Lens": "1294.46"
|
| 199 |
+
},
|
| 200 |
{
|
| 201 |
"Model": "Qwen2-72B-Instruct",
|
| 202 |
"Mode": "greedy",
|
|
|
|
| 538 |
"Total Puzzles": 1000,
|
| 539 |
"Reason Lens": "1473.23"
|
| 540 |
},
|
| 541 |
+
{
|
| 542 |
+
"Model": "Phi-3.5-mini-instruct",
|
| 543 |
+
"Mode": "greedy",
|
| 544 |
+
"Puzzle Acc": "6.40",
|
| 545 |
+
"Cell Acc": "5.98",
|
| 546 |
+
"No answer": "80.60",
|
| 547 |
+
"Easy Puzzle Acc": "21.79",
|
| 548 |
+
"Hard Puzzle Acc": "0.42",
|
| 549 |
+
"Total Puzzles": 1000,
|
| 550 |
+
"Reason Lens": "718.43"
|
| 551 |
+
},
|
| 552 |
{
|
| 553 |
"Model": "gemma-2-2b-it",
|
| 554 |
"Mode": "greedy",
|