Spaces:
Running
Running
update results
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
|
@@ -120,6 +120,17 @@
|
|
| 120 |
"Total Puzzles": 1000,
|
| 121 |
"Reason Lens": "1324.55"
|
| 122 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
{
|
| 124 |
"Model": "gemini-1.5-pro",
|
| 125 |
"Mode": "sampling",
|
|
@@ -361,5 +372,16 @@
|
|
| 361 |
"Hard Puzzle Acc": "0.00",
|
| 362 |
"Total Puzzles": 1000,
|
| 363 |
"Reason Lens": "1592.60"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
}
|
| 365 |
]
|
|
|
|
| 120 |
"Total Puzzles": 1000,
|
| 121 |
"Reason Lens": "1324.55"
|
| 122 |
},
|
| 123 |
+
{
|
| 124 |
+
"Model": "gpt-4o-mini-2024-07-18",
|
| 125 |
+
"Mode": "greedy",
|
| 126 |
+
"Puzzle Acc": "20.10",
|
| 127 |
+
"Cell Acc": "41.26",
|
| 128 |
+
"No answer": "0.10",
|
| 129 |
+
"Easy Puzzle Acc": "62.50",
|
| 130 |
+
"Hard Puzzle Acc": "3.61",
|
| 131 |
+
"Total Puzzles": 1000,
|
| 132 |
+
"Reason Lens": "943.52"
|
| 133 |
+
},
|
| 134 |
{
|
| 135 |
"Model": "gemini-1.5-pro",
|
| 136 |
"Mode": "sampling",
|
|
|
|
| 372 |
"Hard Puzzle Acc": "0.00",
|
| 373 |
"Total Puzzles": 1000,
|
| 374 |
"Reason Lens": "1592.60"
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"Model": "gemma-2-27b-it@vllm",
|
| 378 |
+
"Mode": "greedy",
|
| 379 |
+
"Puzzle Acc": "0.47",
|
| 380 |
+
"Cell Acc": "0.31",
|
| 381 |
+
"No answer": "96.23",
|
| 382 |
+
"Easy Puzzle Acc": "2.08",
|
| 383 |
+
"Hard Puzzle Acc": "0.00",
|
| 384 |
+
"Total Puzzles": 212,
|
| 385 |
+
"Reason Lens": "1280.62"
|
| 386 |
}
|
| 387 |
]
|
model_info.json
CHANGED
|
@@ -32,6 +32,7 @@
|
|
| 32 |
"gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
|
| 33 |
"gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
|
| 34 |
"gpt-4o-2024-05-13": {"pretty_name": "gpt-4o-2024-05-13", "hf_model_id": "https://platform.openai.com/"},
|
|
|
|
| 35 |
"gpt-4-turbo-2024-04-09": {"pretty_name": "gpt-4-turbo-2024-04-09", "hf_model_id": "https://platform.openai.com/"},
|
| 36 |
"gpt-4-0314": {"pretty_name": "gpt-4-0314", "hf_model_id": "https://platform.openai.com/"},
|
| 37 |
"tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "allenai/tulu-2-dpo-70b"},
|
|
|
|
| 32 |
"gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
|
| 33 |
"gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
|
| 34 |
"gpt-4o-2024-05-13": {"pretty_name": "gpt-4o-2024-05-13", "hf_model_id": "https://platform.openai.com/"},
|
| 35 |
+
"gpt-4o-mini-2024-07-18": {"pretty_name": "gpt-4o-mini-2024-07-18", "hf_model_id": "https://platform.openai.com/"},
|
| 36 |
"gpt-4-turbo-2024-04-09": {"pretty_name": "gpt-4-turbo-2024-04-09", "hf_model_id": "https://platform.openai.com/"},
|
| 37 |
"gpt-4-0314": {"pretty_name": "gpt-4-0314", "hf_model_id": "https://platform.openai.com/"},
|
| 38 |
"tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "allenai/tulu-2-dpo-70b"},
|