Spaces:
Running
Running
update o1 results
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
|
@@ -1,4 +1,15 @@
|
|
| 1 |
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
{
|
| 3 |
"Model": "claude-3-5-sonnet-20240620",
|
| 4 |
"Mode": "greedy",
|
|
@@ -22,7 +33,7 @@
|
|
| 22 |
"Reason Lens": "1153.83"
|
| 23 |
},
|
| 24 |
{
|
| 25 |
-
"Model": "Llama-3.1-405B-
|
| 26 |
"Mode": "greedy",
|
| 27 |
"Puzzle Acc": "32.60",
|
| 28 |
"Cell Acc": "45.80",
|
|
@@ -33,7 +44,7 @@
|
|
| 33 |
"Reason Lens": "314.66"
|
| 34 |
},
|
| 35 |
{
|
| 36 |
-
"Model": "Llama-3.1-405B-
|
| 37 |
"Mode": "sampling",
|
| 38 |
"Puzzle Acc": "32.60",
|
| 39 |
"Cell Acc": "47.04",
|
|
@@ -76,6 +87,17 @@
|
|
| 76 |
"Total Puzzles": 1000,
|
| 77 |
"Reason Lens": "1594.47"
|
| 78 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
{
|
| 80 |
"Model": "chatgpt-4o-latest-24-09-07",
|
| 81 |
"Mode": "greedy",
|
|
@@ -164,6 +186,28 @@
|
|
| 164 |
"Total Puzzles": 1000,
|
| 165 |
"Reason Lens": "1389.75"
|
| 166 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
{
|
| 168 |
"Model": "Meta-Llama-3.1-70B-Instruct",
|
| 169 |
"Mode": "greedy",
|
|
@@ -176,7 +220,7 @@
|
|
| 176 |
"Reason Lens": "1483.68"
|
| 177 |
},
|
| 178 |
{
|
| 179 |
-
"Model": "deepseek-chat",
|
| 180 |
"Mode": "greedy",
|
| 181 |
"Puzzle Acc": "22.70",
|
| 182 |
"Cell Acc": "42.46",
|
|
@@ -209,7 +253,7 @@
|
|
| 209 |
"Reason Lens": "1813.82"
|
| 210 |
},
|
| 211 |
{
|
| 212 |
-
"Model": "deepseek-coder",
|
| 213 |
"Mode": "greedy",
|
| 214 |
"Puzzle Acc": "21.10",
|
| 215 |
"Cell Acc": "41.58",
|
|
@@ -220,7 +264,7 @@
|
|
| 220 |
"Reason Lens": "1324.55"
|
| 221 |
},
|
| 222 |
{
|
| 223 |
-
"Model": "
|
| 224 |
"Mode": "greedy",
|
| 225 |
"Puzzle Acc": "20.50",
|
| 226 |
"Cell Acc": "42.35",
|
|
@@ -352,7 +396,7 @@
|
|
| 352 |
"Reason Lens": "391.19"
|
| 353 |
},
|
| 354 |
{
|
| 355 |
-
"Model": "gemma-2-27b-it
|
| 356 |
"Mode": "greedy",
|
| 357 |
"Puzzle Acc": "16.30",
|
| 358 |
"Cell Acc": "41.18",
|
|
@@ -407,7 +451,7 @@
|
|
| 407 |
"Reason Lens": "1043.90"
|
| 408 |
},
|
| 409 |
{
|
| 410 |
-
"Model": "gemma-2-9b-it
|
| 411 |
"Mode": "greedy",
|
| 412 |
"Puzzle Acc": "12.80",
|
| 413 |
"Cell Acc": "36.79",
|
|
|
|
| 1 |
[
|
| 2 |
+
{
|
| 3 |
+
"Model": "o1-mini-2024-09-12",
|
| 4 |
+
"Mode": "greedy",
|
| 5 |
+
"Puzzle Acc": "52.60",
|
| 6 |
+
"Cell Acc": "52.29",
|
| 7 |
+
"No answer": "0.80",
|
| 8 |
+
"Easy Puzzle Acc": "87.14",
|
| 9 |
+
"Hard Puzzle Acc": "39.17",
|
| 10 |
+
"Total Puzzles": 1000,
|
| 11 |
+
"Reason Lens": "993.28"
|
| 12 |
+
},
|
| 13 |
{
|
| 14 |
"Model": "claude-3-5-sonnet-20240620",
|
| 15 |
"Mode": "greedy",
|
|
|
|
| 33 |
"Reason Lens": "1153.83"
|
| 34 |
},
|
| 35 |
{
|
| 36 |
+
"Model": "Llama-3.1-405B-Inst-fp8@together",
|
| 37 |
"Mode": "greedy",
|
| 38 |
"Puzzle Acc": "32.60",
|
| 39 |
"Cell Acc": "45.80",
|
|
|
|
| 44 |
"Reason Lens": "314.66"
|
| 45 |
},
|
| 46 |
{
|
| 47 |
+
"Model": "Llama-3.1-405B-Inst-fp8@together",
|
| 48 |
"Mode": "sampling",
|
| 49 |
"Puzzle Acc": "32.60",
|
| 50 |
"Cell Acc": "47.04",
|
|
|
|
| 87 |
"Total Puzzles": 1000,
|
| 88 |
"Reason Lens": "1594.47"
|
| 89 |
},
|
| 90 |
+
{
|
| 91 |
+
"Model": "Llama-3.1-405B-Inst@sambanova",
|
| 92 |
+
"Mode": "greedy",
|
| 93 |
+
"Puzzle Acc": "30.10",
|
| 94 |
+
"Cell Acc": "39.06",
|
| 95 |
+
"No answer": "24.70",
|
| 96 |
+
"Easy Puzzle Acc": "84.64",
|
| 97 |
+
"Hard Puzzle Acc": "8.89",
|
| 98 |
+
"Total Puzzles": 1000,
|
| 99 |
+
"Reason Lens": "2001.12"
|
| 100 |
+
},
|
| 101 |
{
|
| 102 |
"Model": "chatgpt-4o-latest-24-09-07",
|
| 103 |
"Mode": "greedy",
|
|
|
|
| 186 |
"Total Puzzles": 1000,
|
| 187 |
"Reason Lens": "1389.75"
|
| 188 |
},
|
| 189 |
+
{
|
| 190 |
+
"Model": "Llama-3.1-405B-Inst@hyperbolic",
|
| 191 |
+
"Mode": "greedy",
|
| 192 |
+
"Puzzle Acc": "25.00",
|
| 193 |
+
"Cell Acc": "46.62",
|
| 194 |
+
"No answer": "6.25",
|
| 195 |
+
"Easy Puzzle Acc": "66.67",
|
| 196 |
+
"Hard Puzzle Acc": "15.38",
|
| 197 |
+
"Total Puzzles": 16,
|
| 198 |
+
"Reason Lens": "1517.13"
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"Model": "gemini-1.5-flash-exp-0827",
|
| 202 |
+
"Mode": "greedy",
|
| 203 |
+
"Puzzle Acc": "25.00",
|
| 204 |
+
"Cell Acc": "43.56",
|
| 205 |
+
"No answer": "8.50",
|
| 206 |
+
"Easy Puzzle Acc": "70.71",
|
| 207 |
+
"Hard Puzzle Acc": "7.22",
|
| 208 |
+
"Total Puzzles": 1000,
|
| 209 |
+
"Reason Lens": "1705.11"
|
| 210 |
+
},
|
| 211 |
{
|
| 212 |
"Model": "Meta-Llama-3.1-70B-Instruct",
|
| 213 |
"Mode": "greedy",
|
|
|
|
| 220 |
"Reason Lens": "1483.68"
|
| 221 |
},
|
| 222 |
{
|
| 223 |
+
"Model": "deepseek-v2-chat-0628",
|
| 224 |
"Mode": "greedy",
|
| 225 |
"Puzzle Acc": "22.70",
|
| 226 |
"Cell Acc": "42.46",
|
|
|
|
| 253 |
"Reason Lens": "1813.82"
|
| 254 |
},
|
| 255 |
{
|
| 256 |
+
"Model": "deepseek-v2-coder-0614",
|
| 257 |
"Mode": "greedy",
|
| 258 |
"Puzzle Acc": "21.10",
|
| 259 |
"Cell Acc": "41.58",
|
|
|
|
| 264 |
"Reason Lens": "1324.55"
|
| 265 |
},
|
| 266 |
{
|
| 267 |
+
"Model": "deepseek-v2-coder-0724",
|
| 268 |
"Mode": "greedy",
|
| 269 |
"Puzzle Acc": "20.50",
|
| 270 |
"Cell Acc": "42.35",
|
|
|
|
| 396 |
"Reason Lens": "391.19"
|
| 397 |
},
|
| 398 |
{
|
| 399 |
+
"Model": "gemma-2-27b-it",
|
| 400 |
"Mode": "greedy",
|
| 401 |
"Puzzle Acc": "16.30",
|
| 402 |
"Cell Acc": "41.18",
|
|
|
|
| 451 |
"Reason Lens": "1043.90"
|
| 452 |
},
|
| 453 |
{
|
| 454 |
+
"Model": "gemma-2-9b-it",
|
| 455 |
"Mode": "greedy",
|
| 456 |
"Puzzle Acc": "12.80",
|
| 457 |
"Cell Acc": "36.79",
|