Commit
·
b605a32
1
Parent(s):
a2d5ea0
Add Humanity's Last Exam, LiveBench and LiveCodeBench; Remove Codeforces; Update Simple Bench
Browse files- app.py +255 -30
- codeforces_leaderboard.jsonl +0 -6
- humanitys_last_exam.jsonl +5 -0
- livebench.jsonl +56 -0
- livebench_coding.jsonl +56 -0
- livebench_data_analysis.jsonl +56 -0
- livebench_if.jsonl +56 -0
- livebench_language.jsonl +56 -0
- livebench_mathematics.jsonl +56 -0
- livebench_reasoning.jsonl +56 -0
- livecodebench.jsonl +26 -0
- models.jsonl +56 -0
- simple_bench_leaderboard.jsonl +5 -1
app.py
CHANGED
|
@@ -187,16 +187,25 @@ with gr.Blocks() as demo:
|
|
| 187 |
|
| 188 |
| Benchmark | Top Score |
|
| 189 |
|-----------|-----------|
|
|
|
|
| 190 |
| BigCodeBench | 🟠 36% |
|
| 191 |
| Simple Bench | 🟠 42% |
|
| 192 |
| PlanBench | 🟠 53% |
|
| 193 |
| GAIA | 🟡 65% |
|
|
|
|
|
|
|
|
|
|
| 194 |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
|
|
|
|
| 195 |
| GPQA | 🟡 76% |
|
|
|
|
| 196 |
| ZebraLogic | 🟡 81% |
|
|
|
|
| 197 |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
|
|
|
|
| 198 |
| ZeroEval | 🟡 86% |
|
| 199 |
| MATH-L5 | 🟡 89% |
|
|
|
|
| 200 |
| MMLU-Redux | 🟢 93% |
|
| 201 |
| CRUX | 🟢 96% |
|
| 202 |
|
|
@@ -209,6 +218,11 @@ with gr.Blocks() as demo:
|
|
| 209 |
| 🟡 Yellow | 60% to 90% |
|
| 210 |
| 🟢 Green | Above 90% |"""
|
| 211 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
|
| 213 |
bigcodebench_plot: gr.Plot = gr.Plot()
|
| 214 |
bigcodebench_markdown: gr.Markdown = gr.Markdown(
|
|
@@ -229,6 +243,21 @@ with gr.Blocks() as demo:
|
|
| 229 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
| 230 |
value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
|
| 231 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
|
| 233 |
with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
|
| 234 |
arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
|
|
@@ -237,16 +266,36 @@ with gr.Blocks() as demo:
|
|
| 237 |
arc_agi_markdown: gr.Markdown = gr.Markdown(
|
| 238 |
value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
|
| 239 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
with gr.Tab("🟡 GPQA") as gpqa_tab:
|
| 241 |
gpqa_plot: gr.Plot = gr.Plot()
|
| 242 |
gpqa_markdown: gr.Markdown = gr.Markdown(
|
| 243 |
value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
|
| 244 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
|
| 246 |
zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
|
| 247 |
zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
|
| 248 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
| 249 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
|
| 251 |
zeroeval_average_plot: gr.Plot = gr.Plot()
|
| 252 |
zeroeval_average_markdown: gr.Markdown = gr.Markdown(
|
|
@@ -257,6 +306,11 @@ with gr.Blocks() as demo:
|
|
| 257 |
zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
|
| 258 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
| 259 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
|
| 261 |
zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
|
| 262 |
zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
|
|
@@ -267,8 +321,6 @@ with gr.Blocks() as demo:
|
|
| 267 |
zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
|
| 268 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
| 269 |
)
|
| 270 |
-
with gr.Tab("Codeforces") as codeforces_tab:
|
| 271 |
-
codeforces_plot: gr.Plot = gr.Plot()
|
| 272 |
with gr.Tab("OpenCompass", visible=False):
|
| 273 |
opencompass_plot: gr.Plot = gr.Plot()
|
| 274 |
opencompass_markdown: gr.Markdown = gr.Markdown(
|
|
@@ -284,6 +336,107 @@ with gr.Blocks() as demo:
|
|
| 284 |
webarena_markdown: gr.Markdown = gr.Markdown(
|
| 285 |
value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
|
| 286 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
with gr.Tab("Finance") as finance_tab:
|
| 288 |
with gr.Tab("Big Tech Capex") as big_five_capex_tab:
|
| 289 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
|
@@ -292,24 +445,30 @@ with gr.Blocks() as demo:
|
|
| 292 |
big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
|
| 293 |
arc_agi_public_eval_tab.select(fn=create_simple_plot,
|
| 294 |
inputs=[gr.State("arc_agi_leaderboard.jsonl"),
|
| 295 |
-
gr.State(
|
| 296 |
-
|
|
|
|
|
|
|
| 297 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
| 298 |
gr.State(0), gr.State(100),
|
| 299 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
| 300 |
outputs=arc_agi_public_eval_plot)
|
| 301 |
arc_agi_tab.select(fn=create_simple_plot,
|
| 302 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
| 303 |
-
gr.State(
|
| 304 |
-
|
|
|
|
|
|
|
| 305 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
| 306 |
gr.State(0), gr.State(100),
|
| 307 |
gr.State({"MTurkers": 77})],
|
| 308 |
outputs=arc_agi_semi_private_eval_plot)
|
| 309 |
arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
|
| 310 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
| 311 |
-
gr.State(
|
| 312 |
-
|
|
|
|
|
|
|
| 313 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
| 314 |
gr.State(0), gr.State(100),
|
| 315 |
gr.State({"MTurkers": 77})],
|
|
@@ -318,35 +477,31 @@ with gr.Blocks() as demo:
|
|
| 318 |
simple_bench_tab.select(fn=create_simple_plot,
|
| 319 |
inputs=[gr.State("simple_bench_leaderboard.jsonl"),
|
| 320 |
gr.State("Simple Bench Score"),
|
| 321 |
-
gr.State(
|
| 322 |
-
|
|
|
|
| 323 |
gr.State(0), gr.State(100),
|
| 324 |
gr.State({"Humans": 83.7})],
|
| 325 |
outputs=simple_bench_plot)
|
| 326 |
-
codeforces_tab.select(fn=create_simple_plot,
|
| 327 |
-
inputs=[gr.State("codeforces_leaderboard.jsonl"),
|
| 328 |
-
gr.State("Codeforces Rating"),
|
| 329 |
-
gr.State("\"[Codeforces] is a platform where [programming] contests are held regularly, the participant's skills are reflected by their rating [...] The rating is a modification of Elo rating\" (Mirzayanov, 2011)"),
|
| 330 |
-
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
| 331 |
-
gr.State(0), gr.State(4000),
|
| 332 |
-
gr.State({"Pupil": 1200, "Specialist": 1400, "Expert": 1600, "Candidate Master": 1900, "Master": 2100, "International Master": 2300, "Grandmaster": 2400, "International Grandmaster": 2600, "Legendary Grandmaster": 3000})],
|
| 333 |
-
outputs=codeforces_plot)
|
| 334 |
planbench_tab.select(fn=create_simple_plot,
|
| 335 |
inputs=[gr.State("planbench_leaderboard.jsonl"),
|
| 336 |
gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
|
| 337 |
-
gr.State(
|
|
|
|
| 338 |
gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
|
| 339 |
outputs=planbench_plot)
|
| 340 |
bigcodebench_tab.select(fn=create_simple_plot,
|
| 341 |
inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
|
| 342 |
gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
|
| 343 |
-
gr.State(
|
|
|
|
| 344 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
|
| 345 |
outputs=bigcodebench_plot)
|
| 346 |
gaia_tab.select(fn=create_simple_plot,
|
| 347 |
inputs=[gr.State("gaia_leaderboard.jsonl"),
|
| 348 |
gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
|
| 349 |
-
gr.State(
|
|
|
|
| 350 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
|
| 351 |
gr.State(0), gr.State(100),
|
| 352 |
gr.State({"Humans": 92})],
|
|
@@ -354,7 +509,8 @@ with gr.Blocks() as demo:
|
|
| 354 |
gpqa_tab.select(fn=create_simple_plot,
|
| 355 |
inputs=[gr.State("gpqa_leaderboard.jsonl"),
|
| 356 |
gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
|
| 357 |
-
gr.State(
|
|
|
|
| 358 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
|
| 359 |
gr.State(25), gr.State(100),
|
| 360 |
gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
|
|
@@ -362,34 +518,103 @@ with gr.Blocks() as demo:
|
|
| 362 |
zeroeval_average_tab.select(fn=create_simple_plot,
|
| 363 |
inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
|
| 364 |
gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
|
| 365 |
-
gr.State(
|
|
|
|
| 366 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 367 |
outputs=zeroeval_average_plot)
|
| 368 |
zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
|
| 369 |
inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
|
| 370 |
-
gr.State(
|
| 371 |
-
|
|
|
|
|
|
|
| 372 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 373 |
outputs=zeroeval_mmlu_redux_plot)
|
| 374 |
zeroeval_zebralogic_tab.select(fn=create_simple_plot,
|
| 375 |
inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
|
| 376 |
gr.State("ZeroEval ZebraLogic Score"),
|
| 377 |
-
gr.State(
|
|
|
|
| 378 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 379 |
outputs=zeroeval_zebralogic_plot)
|
| 380 |
zeroeval_crux_tab.select(fn=create_simple_plot,
|
| 381 |
inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
|
| 382 |
-
gr.State(
|
| 383 |
-
|
|
|
|
|
|
|
| 384 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 385 |
outputs=zeroeval_crux_plot)
|
| 386 |
zeroeval_math_l5_tab.select(fn=create_simple_plot,
|
| 387 |
inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
|
| 388 |
gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
|
| 389 |
-
gr.State(
|
|
|
|
| 390 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 391 |
outputs=zeroeval_math_l5_plot)
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
if __name__ == "__main__":
|
| 395 |
demo.launch()
|
|
|
|
| 187 |
|
| 188 |
| Benchmark | Top Score |
|
| 189 |
|-----------|-----------|
|
| 190 |
+
| Humanity's Last Exam | 🔴 7% |
|
| 191 |
| BigCodeBench | 🟠 36% |
|
| 192 |
| Simple Bench | 🟠 42% |
|
| 193 |
| PlanBench | 🟠 53% |
|
| 194 |
| GAIA | 🟡 65% |
|
| 195 |
+
| LiveBench Language | 🟡 65% |
|
| 196 |
+
| LiveBench Data Analysis | 🟡 71% |
|
| 197 |
+
| LiveCodeBench | 🟡 73% |
|
| 198 |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
|
| 199 |
+
| LiveBench | 🟡 76% |
|
| 200 |
| GPQA | 🟡 76% |
|
| 201 |
+
| LiveBench Mathematics | 🟡 81% |
|
| 202 |
| ZebraLogic | 🟡 81% |
|
| 203 |
+
| LiveBench Coding | 🟡 83% |
|
| 204 |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
|
| 205 |
+
| LiveBench IF | 🟡 86% |
|
| 206 |
| ZeroEval | 🟡 86% |
|
| 207 |
| MATH-L5 | 🟡 89% |
|
| 208 |
+
| LiveBench Reasoning | 🟢 92% |
|
| 209 |
| MMLU-Redux | 🟢 93% |
|
| 210 |
| CRUX | 🟢 96% |
|
| 211 |
|
|
|
|
| 218 |
| 🟡 Yellow | 60% to 90% |
|
| 219 |
| 🟢 Green | Above 90% |"""
|
| 220 |
)
|
| 221 |
+
with gr.Tab("🔴 Humanity's Last Exam") as humanitys_last_exam_tab:
|
| 222 |
+
humanitys_last_exam_plot: gr.Plot = gr.Plot()
|
| 223 |
+
humanitys_last_exam_markdown: gr.Markdown = gr.Markdown(
|
| 224 |
+
value="""Source: [Humanity's Last Exam Quantitative Results](https://lastexam.ai/)"""
|
| 225 |
+
)
|
| 226 |
with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
|
| 227 |
bigcodebench_plot: gr.Plot = gr.Plot()
|
| 228 |
bigcodebench_markdown: gr.Markdown = gr.Markdown(
|
|
|
|
| 243 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
| 244 |
value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
|
| 245 |
)
|
| 246 |
+
with gr.Tab("🟡 LiveBench Language") as livebench_language_tab:
|
| 247 |
+
livebench_language_plot: gr.Plot = gr.Plot()
|
| 248 |
+
livebench_language_markdown: gr.Markdown = gr.Markdown(
|
| 249 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
| 250 |
+
)
|
| 251 |
+
with gr.Tab("🟡 LiveBench Data Analysis") as livebench_data_analysis_tab:
|
| 252 |
+
livebench_data_analysis_plot: gr.Plot = gr.Plot()
|
| 253 |
+
livebench_data_analysis_markdown: gr.Markdown = gr.Markdown(
|
| 254 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
| 255 |
+
)
|
| 256 |
+
with gr.Tab("🟡 LiveCodeBench") as livecodebench_tab:
|
| 257 |
+
livecodebench_plot: gr.Plot = gr.Plot()
|
| 258 |
+
livecodebench_markdown: gr.Markdown = gr.Markdown(
|
| 259 |
+
value="""Source: [LiveCodeBench Leaderboard](https://livecodebench.github.io/leaderboard.html)"""
|
| 260 |
+
)
|
| 261 |
with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
|
| 262 |
with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
|
| 263 |
arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
|
|
|
|
| 266 |
arc_agi_markdown: gr.Markdown = gr.Markdown(
|
| 267 |
value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
|
| 268 |
)
|
| 269 |
+
with gr.Tab("🟡 LiveBench") as livebench_tab:
|
| 270 |
+
livebench_plot: gr.Plot = gr.Plot()
|
| 271 |
+
livebench_markdown: gr.Markdown = gr.Markdown(
|
| 272 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
| 273 |
+
)
|
| 274 |
with gr.Tab("🟡 GPQA") as gpqa_tab:
|
| 275 |
gpqa_plot: gr.Plot = gr.Plot()
|
| 276 |
gpqa_markdown: gr.Markdown = gr.Markdown(
|
| 277 |
value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
|
| 278 |
)
|
| 279 |
+
with gr.Tab("🟡 LiveBench Mathematics") as livebench_mathematics_tab:
|
| 280 |
+
livebench_mathematics_plot: gr.Plot = gr.Plot()
|
| 281 |
+
livebench_mathematics_markdown: gr.Markdown = gr.Markdown(
|
| 282 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
| 283 |
+
)
|
| 284 |
with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
|
| 285 |
zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
|
| 286 |
zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
|
| 287 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
| 288 |
)
|
| 289 |
+
with gr.Tab("🟡 LiveBench Coding") as livebench_coding_tab:
|
| 290 |
+
livebench_coding_plot: gr.Plot = gr.Plot()
|
| 291 |
+
livebench_coding_markdown: gr.Markdown = gr.Markdown(
|
| 292 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
| 293 |
+
)
|
| 294 |
+
with gr.Tab("🟡 LiveBench IF") as livebench_if_tab:
|
| 295 |
+
livebench_if_plot: gr.Plot = gr.Plot()
|
| 296 |
+
livebench_if_markdown: gr.Markdown = gr.Markdown(
|
| 297 |
+
value="""Source: [LiveBench IF](https://livebench.ai/)"""
|
| 298 |
+
)
|
| 299 |
with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
|
| 300 |
zeroeval_average_plot: gr.Plot = gr.Plot()
|
| 301 |
zeroeval_average_markdown: gr.Markdown = gr.Markdown(
|
|
|
|
| 306 |
zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
|
| 307 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
| 308 |
)
|
| 309 |
+
with gr.Tab("🟢 LiveBench Reasoning") as livebench_reasoning_tab:
|
| 310 |
+
livebench_reasoning_plot: gr.Plot = gr.Plot()
|
| 311 |
+
livebench_reasoning_markdown: gr.Markdown = gr.Markdown(
|
| 312 |
+
value="""Source: [LiveBench Leaderboard](https://livebench.ai/)"""
|
| 313 |
+
)
|
| 314 |
with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
|
| 315 |
zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
|
| 316 |
zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
|
|
|
|
| 321 |
zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
|
| 322 |
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
| 323 |
)
|
|
|
|
|
|
|
| 324 |
with gr.Tab("OpenCompass", visible=False):
|
| 325 |
opencompass_plot: gr.Plot = gr.Plot()
|
| 326 |
opencompass_markdown: gr.Markdown = gr.Markdown(
|
|
|
|
| 336 |
webarena_markdown: gr.Markdown = gr.Markdown(
|
| 337 |
value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
|
| 338 |
)
|
| 339 |
+
with gr.Tab("OSWorld", visible=False):
|
| 340 |
+
osworld_plot: gr.Plot = gr.Plot()
|
| 341 |
+
osworld_markdown: gr.Markdown = gr.Markdown(
|
| 342 |
+
value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
|
| 343 |
+
)
|
| 344 |
+
with gr.Tab("EMMA-Mini", visible=False):
|
| 345 |
+
emma_plot: gr.Plot = gr.Plot()
|
| 346 |
+
emma_markdown: gr.Markdown = gr.Markdown(
|
| 347 |
+
value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
|
| 348 |
+
)
|
| 349 |
+
with gr.Tab("MathVista", visible=False):
|
| 350 |
+
mathvista_plot: gr.Plot = gr.Plot()
|
| 351 |
+
mathvista_markdown: gr.Markdown = gr.Markdown(
|
| 352 |
+
value="""Source: [Leaderboard on MathVista](https://mathvista.github.io/#leaderboard)"""
|
| 353 |
+
)
|
| 354 |
+
with gr.Tab("DABStep", visible=False):
|
| 355 |
+
dabstep_plot: gr.Plot = gr.Plot()
|
| 356 |
+
dabstep_markdown: gr.Markdown = gr.Markdown(
|
| 357 |
+
value="""Source: [DABStep Leaderboard](https://huggingface.co/spaces/adyen/DABstep)"""
|
| 358 |
+
)
|
| 359 |
+
with gr.Tab("lineage-bench", visible=False):
|
| 360 |
+
lineage_bench_plot: gr.Plot = gr.Plot()
|
| 361 |
+
lineage_bench_markdown: gr.Markdown = gr.Markdown(
|
| 362 |
+
value="""Source: [lineage-bench Results](https://github.com/fairydreaming/lineage-bench)"""
|
| 363 |
+
)
|
| 364 |
+
with gr.Tab("Step-Game", visible=False):
|
| 365 |
+
step_game_plot: gr.Plot = gr.Plot()
|
| 366 |
+
step_game_markdown: gr.Markdown = gr.Markdown(
|
| 367 |
+
value="""Source: [Step-Game TrueSkill Leaderboard](https://github.com/lechmazur/step_game)"""
|
| 368 |
+
)
|
| 369 |
+
with gr.Tab("HHEM", visible=False):
|
| 370 |
+
hhem_plot: gr.Plot = gr.Plot()
|
| 371 |
+
hhem_markdown: gr.Markdown = gr.Markdown(
|
| 372 |
+
value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
|
| 373 |
+
)
|
| 374 |
+
with gr.Tab("NYT Connections", visible=False):
|
| 375 |
+
nyt_connections_exam_plot: gr.Plot = gr.Plot()
|
| 376 |
+
nyt_connections_exam_markdown: gr.Markdown = gr.Markdown(
|
| 377 |
+
value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
|
| 378 |
+
)
|
| 379 |
+
with gr.Tab("USACO", visible=False):
|
| 380 |
+
usaco_plot: gr.Plot = gr.Plot()
|
| 381 |
+
usaco_markdown: gr.Markdown = gr.Markdown(
|
| 382 |
+
value="""Source: [USACO Leaderboard](https://hal.cs.princeton.edu/usaco)"""
|
| 383 |
+
)
|
| 384 |
+
with gr.Tab("AppWorld", visible=False):
|
| 385 |
+
appworld_plot: gr.Plot = gr.Plot()
|
| 386 |
+
appworld_markdown: gr.Markdown = gr.Markdown(
|
| 387 |
+
value="""Source: [AppWorld Agent Scores](https://appworld.dev/leaderboard)"""
|
| 388 |
+
)
|
| 389 |
+
with gr.Tab("CORE-Bench", visible=False):
|
| 390 |
+
core_bench_plot: gr.Plot = gr.Plot()
|
| 391 |
+
core_bench_markdown: gr.Markdown = gr.Markdown(
|
| 392 |
+
value="""Source: [HAL Leaderboards](https://hal.cs.princeton.edu/#leaderboards)"""
|
| 393 |
+
)
|
| 394 |
+
with gr.Tab("Cybench", visible=False):
|
| 395 |
+
cybench_plot: gr.Plot = gr.Plot()
|
| 396 |
+
cybench_markdown: gr.Markdown = gr.Markdown(
|
| 397 |
+
value="""Source: [Cybench Leaderboard](https://hal.cs.princeton.edu/cybench)"""
|
| 398 |
+
)
|
| 399 |
+
with gr.Tab("MultiChallenge", visible=False):
|
| 400 |
+
multichallenge_plot: gr.Plot = gr.Plot()
|
| 401 |
+
multichallenge_markdown: gr.Markdown = gr.Markdown(
|
| 402 |
+
value="""Source: [SEAL Leaderboard: MultiChallenge](https://scale.com/leaderboard/multichallenge)"""
|
| 403 |
+
)
|
| 404 |
+
with gr.Tab("VISTA", visible=False):
|
| 405 |
+
vista_plot: gr.Plot = gr.Plot()
|
| 406 |
+
vista_markdown: gr.Markdown = gr.Markdown(
|
| 407 |
+
value="""Source: [SEAL Leaderboard: Visual-Language Understanding](https://scale.com/leaderboard/visual_language_understanding)"""
|
| 408 |
+
)
|
| 409 |
+
with gr.Tab("ToolComp", visible=False):
|
| 410 |
+
with gr.Tab("Enterprise"):
|
| 411 |
+
toolcomp_enterprise_plot: gr.Plot = gr.Plot()
|
| 412 |
+
toolcomp_enterprise_markdown: gr.Markdown = gr.Markdown(
|
| 413 |
+
value="""Source: [SEAL Leaderboard: Agentic Tool Use (Enterprise)](https://scale.com/leaderboard/tool_use_enterprise)"""
|
| 414 |
+
)
|
| 415 |
+
with gr.Tab("Chat"):
|
| 416 |
+
toolcomp_chat_plot: gr.Plot = gr.Plot()
|
| 417 |
+
toolcomp_chat_markdown: gr.Markdown = gr.Markdown(
|
| 418 |
+
value="""Source: [SEAL Leaderboard: Agentic Tool Use (Chat)](https://scale.com/leaderboard/tool_use_chat)"""
|
| 419 |
+
)
|
| 420 |
+
with gr.Tab("BFCL", visible=False):
|
| 421 |
+
bfcl_plot: gr.Plot = gr.Plot()
|
| 422 |
+
bfcl_markdown: gr.Markdown = gr.Markdown(
|
| 423 |
+
value="""Source: [BFCL Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html)"""
|
| 424 |
+
)
|
| 425 |
+
with gr.Tab("EvalPlus", visible=False):
|
| 426 |
+
evalplus_plot: gr.Plot = gr.Plot()
|
| 427 |
+
evalplus_markdown: gr.Markdown = gr.Markdown(
|
| 428 |
+
value="""Source: [EvalPlus Leaderboard](https://evalplus.github.io/leaderboard.html)"""
|
| 429 |
+
)
|
| 430 |
+
with gr.Tab("Aider Polyglot", visible=False):
|
| 431 |
+
aider_plot: gr.Plot = gr.Plot()
|
| 432 |
+
aider_markdown: gr.Markdown = gr.Markdown(
|
| 433 |
+
value="""Source: [Aider LLM Leaderboards](https://aider.chat/docs/leaderboards/)"""
|
| 434 |
+
)
|
| 435 |
+
with gr.Tab("QuALITY", visible=False):
|
| 436 |
+
quality_plot: gr.Plot = gr.Plot()
|
| 437 |
+
quality_markdown: gr.Markdown = gr.Markdown(
|
| 438 |
+
value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
|
| 439 |
+
)
|
| 440 |
with gr.Tab("Finance") as finance_tab:
|
| 441 |
with gr.Tab("Big Tech Capex") as big_five_capex_tab:
|
| 442 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
|
|
|
| 445 |
big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
|
| 446 |
arc_agi_public_eval_tab.select(fn=create_simple_plot,
|
| 447 |
inputs=[gr.State("arc_agi_leaderboard.jsonl"),
|
| 448 |
+
gr.State(
|
| 449 |
+
"ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
| 450 |
+
gr.State(
|
| 451 |
+
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
| 452 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
| 453 |
gr.State(0), gr.State(100),
|
| 454 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
| 455 |
outputs=arc_agi_public_eval_plot)
|
| 456 |
arc_agi_tab.select(fn=create_simple_plot,
|
| 457 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
| 458 |
+
gr.State(
|
| 459 |
+
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
| 460 |
+
gr.State(
|
| 461 |
+
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
| 462 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
| 463 |
gr.State(0), gr.State(100),
|
| 464 |
gr.State({"MTurkers": 77})],
|
| 465 |
outputs=arc_agi_semi_private_eval_plot)
|
| 466 |
arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
|
| 467 |
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
| 468 |
+
gr.State(
|
| 469 |
+
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
| 470 |
+
gr.State(
|
| 471 |
+
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
| 472 |
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
| 473 |
gr.State(0), gr.State(100),
|
| 474 |
gr.State({"MTurkers": 77})],
|
|
|
|
| 477 |
simple_bench_tab.select(fn=create_simple_plot,
|
| 478 |
inputs=[gr.State("simple_bench_leaderboard.jsonl"),
|
| 479 |
gr.State("Simple Bench Score"),
|
| 480 |
+
gr.State(
|
| 481 |
+
"\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
|
| 482 |
+
gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1)),
|
| 483 |
gr.State(0), gr.State(100),
|
| 484 |
gr.State({"Humans": 83.7})],
|
| 485 |
outputs=simple_bench_plot)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
planbench_tab.select(fn=create_simple_plot,
|
| 487 |
inputs=[gr.State("planbench_leaderboard.jsonl"),
|
| 488 |
gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
|
| 489 |
+
gr.State(
|
| 490 |
+
"\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
|
| 491 |
gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
|
| 492 |
outputs=planbench_plot)
|
| 493 |
bigcodebench_tab.select(fn=create_simple_plot,
|
| 494 |
inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
|
| 495 |
gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
|
| 496 |
+
gr.State(
|
| 497 |
+
"\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
|
| 498 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
|
| 499 |
outputs=bigcodebench_plot)
|
| 500 |
gaia_tab.select(fn=create_simple_plot,
|
| 501 |
inputs=[gr.State("gaia_leaderboard.jsonl"),
|
| 502 |
gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
|
| 503 |
+
gr.State(
|
| 504 |
+
"\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
|
| 505 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
|
| 506 |
gr.State(0), gr.State(100),
|
| 507 |
gr.State({"Humans": 92})],
|
|
|
|
| 509 |
gpqa_tab.select(fn=create_simple_plot,
|
| 510 |
inputs=[gr.State("gpqa_leaderboard.jsonl"),
|
| 511 |
gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
|
| 512 |
+
gr.State(
|
| 513 |
+
"\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
|
| 514 |
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
|
| 515 |
gr.State(25), gr.State(100),
|
| 516 |
gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
|
|
|
|
| 518 |
zeroeval_average_tab.select(fn=create_simple_plot,
|
| 519 |
inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
|
| 520 |
gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
|
| 521 |
+
gr.State(
|
| 522 |
+
"\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
|
| 523 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 524 |
outputs=zeroeval_average_plot)
|
| 525 |
zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
|
| 526 |
inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
|
| 527 |
+
gr.State(
|
| 528 |
+
"ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
|
| 529 |
+
gr.State(
|
| 530 |
+
"\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
|
| 531 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 532 |
outputs=zeroeval_mmlu_redux_plot)
|
| 533 |
zeroeval_zebralogic_tab.select(fn=create_simple_plot,
|
| 534 |
inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
|
| 535 |
gr.State("ZeroEval ZebraLogic Score"),
|
| 536 |
+
gr.State(
|
| 537 |
+
"\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
|
| 538 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 539 |
outputs=zeroeval_zebralogic_plot)
|
| 540 |
zeroeval_crux_tab.select(fn=create_simple_plot,
|
| 541 |
inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
|
| 542 |
+
gr.State(
|
| 543 |
+
"ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
|
| 544 |
+
gr.State(
|
| 545 |
+
"\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
|
| 546 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 547 |
outputs=zeroeval_crux_plot)
|
| 548 |
zeroeval_math_l5_tab.select(fn=create_simple_plot,
|
| 549 |
inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
|
| 550 |
gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
|
| 551 |
+
gr.State(
|
| 552 |
+
"\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
|
| 553 |
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
| 554 |
outputs=zeroeval_math_l5_plot)
|
| 555 |
+
livebench_tab.select(fn=create_simple_plot,
|
| 556 |
+
inputs=[gr.State("livebench.jsonl"),
|
| 557 |
+
gr.State("LiveBench-2024-11-25: Global Average Score"),
|
| 558 |
+
gr.State(
|
| 559 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
| 560 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
| 561 |
+
outputs=livebench_plot)
|
| 562 |
+
livebench_reasoning_tab.select(fn=create_simple_plot,
|
| 563 |
+
inputs=[gr.State("livebench_reasoning.jsonl"),
|
| 564 |
+
gr.State("LiveBench-2024-11-25: Reasoning Average Score"),
|
| 565 |
+
gr.State(
|
| 566 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
| 567 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
| 568 |
+
outputs=livebench_reasoning_plot)
|
| 569 |
+
livebench_coding_tab.select(fn=create_simple_plot,
|
| 570 |
+
inputs=[gr.State("livebench_coding.jsonl"),
|
| 571 |
+
gr.State("LiveBench-2024-11-25: Coding Average Score"),
|
| 572 |
+
gr.State(
|
| 573 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
| 574 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
| 575 |
+
outputs=livebench_coding_plot)
|
| 576 |
+
livebench_mathematics_tab.select(fn=create_simple_plot,
|
| 577 |
+
inputs=[gr.State("livebench_mathematics.jsonl"),
|
| 578 |
+
gr.State("LiveBench-2024-11-25: Mathematics Average Score"),
|
| 579 |
+
gr.State(
|
| 580 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
| 581 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
| 582 |
+
outputs=livebench_mathematics_plot)
|
| 583 |
+
livebench_data_analysis_tab.select(fn=create_simple_plot,
|
| 584 |
+
inputs=[gr.State("livebench_data_analysis.jsonl"),
|
| 585 |
+
gr.State("LiveBench-2024-11-25: Data Analysis Average Score"),
|
| 586 |
+
gr.State(
|
| 587 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
| 588 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
| 589 |
+
outputs=livebench_data_analysis_plot)
|
| 590 |
+
livebench_language_tab.select(fn=create_simple_plot,
|
| 591 |
+
inputs=[gr.State("livebench_language.jsonl"),
|
| 592 |
+
gr.State("LiveBench-2024-11-25: Language Average Score"),
|
| 593 |
+
gr.State(
|
| 594 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
| 595 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
| 596 |
+
outputs=livebench_language_plot)
|
| 597 |
+
livebench_if_tab.select(fn=create_simple_plot,
|
| 598 |
+
inputs=[gr.State("livebench_if.jsonl"),
|
| 599 |
+
gr.State("LiveBench-2024-11-25: IF Average Score"),
|
| 600 |
+
gr.State(
|
| 601 |
+
"\"LiveBench is designed to limit potential contamination by releasing new questions regularly [...] Each question has verifiable, objective ground-truth answers\" (White et al. 2024)"),
|
| 602 |
+
gr.State(date(2024, 2, 29)), gr.State(date(2025, 2, 1))],
|
| 603 |
+
outputs=livebench_if_plot)
|
| 604 |
+
humanitys_last_exam_tab.select(fn=create_simple_plot,
|
| 605 |
+
inputs=[gr.State("humanitys_last_exam.jsonl"),
|
| 606 |
+
gr.State("Humanity's Last Exam (Multi-Modal Models Only) Score"),
|
| 607 |
+
gr.State(
|
| 608 |
+
"\"multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage\" (Phan et al. 2025)"),
|
| 609 |
+
gr.State(date(2024, 5, 13)), gr.State(date(2025, 2, 11))],
|
| 610 |
+
outputs=humanitys_last_exam_plot)
|
| 611 |
+
livecodebench_tab.select(fn=create_simple_plot,
|
| 612 |
+
inputs=[gr.State("livecodebench.jsonl"),
|
| 613 |
+
gr.State("LiveCodeBench (7/1/2024 to 2/1/2025) Score"),
|
| 614 |
+
gr.State(
|
| 615 |
+
"\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
|
| 616 |
+
gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
|
| 617 |
+
outputs=livecodebench_plot)
|
| 618 |
|
| 619 |
if __name__ == "__main__":
|
| 620 |
demo.launch()
|
codeforces_leaderboard.jsonl
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
{"model": "o3", "score": 2400}
|
| 2 |
-
{"model": "o3-mini", "score": 2073}
|
| 3 |
-
{"model": "o1", "score": 1673}
|
| 4 |
-
{"model": "o1-mini", "score": 1650}
|
| 5 |
-
{"model": "o1-preview", "score": 1258}
|
| 6 |
-
{"model": "gpt-4o", "score": 808}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
humanitys_last_exam.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "gpt-4o", "score": 3.1}
|
| 2 |
+
{"model": "grok-2", "score": 3.9}
|
| 3 |
+
{"model": "claude-3-5-sonnet", "score": 4.8}
|
| 4 |
+
{"model": "gemini-2.0-flash-thinking", "score": 7.2}
|
| 5 |
+
{"model": "o1", "score": 7.2}
|
livebench.jsonl
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 75.88}
|
| 2 |
+
{"model": "o1-2024-12-17-high", "score": 75.67}
|
| 3 |
+
{"model": "deepseek-r1", "score": 71.57}
|
| 4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 70.01}
|
| 5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 66.92}
|
| 6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 65.13}
|
| 7 |
+
{"model": "gemini-exp-1206", "score": 64.09}
|
| 8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 62.45}
|
| 9 |
+
{"model": "qwen2.5-max", "score": 62.29}
|
| 10 |
+
{"model": "gemini-2.0-flash", "score": 61.47}
|
| 11 |
+
{"model": "deepseek-v3", "score": 60.45}
|
| 12 |
+
{"model": "gemini-2.0-flash-exp", "score": 59.26}
|
| 13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 59.03}
|
| 14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 57.79}
|
| 15 |
+
{"model": "o1-mini-2024-09-12", "score": 57.76}
|
| 16 |
+
{"model": "step-2-16k-202411", "score": 56.02}
|
| 17 |
+
{"model": "gpt-4o-2024-08-06", "score": 55.33}
|
| 18 |
+
{"model": "gemini-1.5-pro-002", "score": 54.33}
|
| 19 |
+
{"model": "grok-2-1212", "score": 54.30}
|
| 20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 53.24}
|
| 21 |
+
{"model": "dracarys2-72b-instruct", "score": 52.64}
|
| 22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 52.36}
|
| 23 |
+
{"model": "gpt-4o-2024-11-20", "score": 52.19}
|
| 24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 52.19}
|
| 25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 51.66}
|
| 26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 51.44}
|
| 27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 50.40}
|
| 28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 50.16}
|
| 29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 49.66}
|
| 30 |
+
{"model": "grok-beta", "score": 49.18}
|
| 31 |
+
{"model": "claude-3-opus-20240229", "score": 49.16}
|
| 32 |
+
{"model": "mistral-large-2411", "score": 48.43}
|
| 33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 46.23}
|
| 34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 46.21}
|
| 35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 44.89}
|
| 36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 43.53}
|
| 37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 43.45}
|
| 38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 42.93}
|
| 39 |
+
{"model": "mistral-small-2501", "score": 42.55}
|
| 40 |
+
{"model": "phi-4", "score": 41.61}
|
| 41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 41.26}
|
| 42 |
+
{"model": "qwq-32b-preview", "score": 40.25}
|
| 43 |
+
{"model": "gemma-2-27b-it", "score": 38.18}
|
| 44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 36.35}
|
| 45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 34.90}
|
| 46 |
+
{"model": "mistral-small-2409", "score": 33.42}
|
| 47 |
+
{"model": "command-r-plus-08-2024", "score": 31.76}
|
| 48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 29.59}
|
| 49 |
+
{"model": "gemma-2-9b-it", "score": 28.66}
|
| 50 |
+
{"model": "command-r-08-2024", "score": 27.48}
|
| 51 |
+
{"model": "command-r-plus-04-2024", "score": 27.11}
|
| 52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 25.97}
|
| 53 |
+
{"model": "phi-3-small-8k-instruct", "score": 24.03}
|
| 54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 22.36}
|
| 55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 22.12}
|
| 56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 22.08}
|
livebench_coding.jsonl
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 82.74}
|
| 2 |
+
{"model": "o1-2024-12-17-high", "score": 69.69}
|
| 3 |
+
{"model": "deepseek-r1", "score": 66.74}
|
| 4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 65.38}
|
| 5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 53.49}
|
| 6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 63.49}
|
| 7 |
+
{"model": "gemini-exp-1206", "score": 63.41}
|
| 8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 61.46}
|
| 9 |
+
{"model": "qwen2.5-max", "score": 64.41}
|
| 10 |
+
{"model": "gemini-2.0-flash", "score": 53.92}
|
| 11 |
+
{"model": "deepseek-v3", "score": 61.77}
|
| 12 |
+
{"model": "gemini-2.0-flash-exp", "score": 54.36}
|
| 13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 67.13}
|
| 14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 60.56}
|
| 15 |
+
{"model": "o1-mini-2024-09-12", "score": 48.05}
|
| 16 |
+
{"model": "step-2-16k-202411", "score": 47.19}
|
| 17 |
+
{"model": "gpt-4o-2024-08-06", "score": 51.44}
|
| 18 |
+
{"model": "gemini-1.5-pro-002", "score": 48.80}
|
| 19 |
+
{"model": "grok-2-1212", "score": 46.44}
|
| 20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 43.80}
|
| 21 |
+
{"model": "dracarys2-72b-instruct", "score": 58.92}
|
| 22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 42.65}
|
| 23 |
+
{"model": "gpt-4o-2024-11-20", "score": 46.08}
|
| 24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 46.87}
|
| 25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 47.44}
|
| 26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 57.64}
|
| 27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 49.00}
|
| 28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 36.59}
|
| 29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 50.97}
|
| 30 |
+
{"model": "grok-beta", "score": 45.15}
|
| 31 |
+
{"model": "claude-3-opus-20240229", "score": 38.59}
|
| 32 |
+
{"model": "mistral-large-2411", "score": 47.08}
|
| 33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 56.85}
|
| 34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 36.31}
|
| 35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 33.49}
|
| 36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 38.15}
|
| 37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 51.36}
|
| 38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 32.85}
|
| 39 |
+
{"model": "mistral-small-2501", "score": 35.31}
|
| 40 |
+
{"model": "phi-4", "score": 30.67}
|
| 41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 43.15}
|
| 42 |
+
{"model": "qwq-32b-preview", "score": 37.20}
|
| 43 |
+
{"model": "gemma-2-27b-it", "score": 35.95}
|
| 44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 27.46}
|
| 45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 38.37}
|
| 46 |
+
{"model": "mistral-small-2409", "score": 25.74}
|
| 47 |
+
{"model": "command-r-plus-08-2024", "score": 19.14}
|
| 48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 20.18}
|
| 49 |
+
{"model": "gemma-2-9b-it", "score": 22.46}
|
| 50 |
+
{"model": "command-r-08-2024", "score": 17.90}
|
| 51 |
+
{"model": "command-r-plus-04-2024", "score": 19.46}
|
| 52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 18.74}
|
| 53 |
+
{"model": "phi-3-small-8k-instruct", "score": 20.26}
|
| 54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 15.04}
|
| 55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 10.41}
|
| 56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 15.54}
|
livebench_data_analysis.jsonl
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 70.64}
|
| 2 |
+
{"model": "o1-2024-12-17-high", "score": 65.47}
|
| 3 |
+
{"model": "deepseek-r1", "score": 69.78}
|
| 4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 66.56}
|
| 5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 69.37}
|
| 6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 68.02}
|
| 7 |
+
{"model": "gemini-exp-1206", "score": 63.16}
|
| 8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 62.04}
|
| 9 |
+
{"model": "qwen2.5-max", "score": 67.93}
|
| 10 |
+
{"model": "gemini-2.0-flash", "score": 67.55}
|
| 11 |
+
{"model": "deepseek-v3", "score": 60.94}
|
| 12 |
+
{"model": "gemini-2.0-flash-exp", "score": 61.67}
|
| 13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 55.03}
|
| 14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 66.00}
|
| 15 |
+
{"model": "o1-mini-2024-09-12", "score": 57.92}
|
| 16 |
+
{"model": "step-2-16k-202411", "score": 63.72}
|
| 17 |
+
{"model": "gpt-4o-2024-08-06", "score": 60.91}
|
| 18 |
+
{"model": "gemini-1.5-pro-002", "score": 54.97}
|
| 19 |
+
{"model": "grok-2-1212", "score": 54.45}
|
| 20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 57.47}
|
| 21 |
+
{"model": "dracarys2-72b-instruct", "score": 55.51}
|
| 22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 55.85}
|
| 23 |
+
{"model": "gpt-4o-2024-11-20", "score": 56.15}
|
| 24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 54.97}
|
| 25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 57.93}
|
| 26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 51.91}
|
| 27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 54.36}
|
| 28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 49.49}
|
| 29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 55.93}
|
| 30 |
+
{"model": "grok-beta", "score": 54.27}
|
| 31 |
+
{"model": "claude-3-opus-20240229", "score": 57.89}
|
| 32 |
+
{"model": "mistral-large-2411", "score": 50.15}
|
| 33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 49.87}
|
| 34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 53.98}
|
| 35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 53.75}
|
| 36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 48.31}
|
| 37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 48.45}
|
| 38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 45.41}
|
| 39 |
+
{"model": "mistral-small-2501", "score": 53.69}
|
| 40 |
+
{"model": "phi-4", "score": 45.17}
|
| 41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 49.96}
|
| 42 |
+
{"model": "qwq-32b-preview", "score": 31.62}
|
| 43 |
+
{"model": "gemma-2-27b-it", "score": 47.87}
|
| 44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 37.23}
|
| 45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 35.22}
|
| 46 |
+
{"model": "mistral-small-2409", "score": 42.73}
|
| 47 |
+
{"model": "command-r-plus-08-2024", "score": 38.06}
|
| 48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 33.95}
|
| 49 |
+
{"model": "gemma-2-9b-it", "score": 36.39}
|
| 50 |
+
{"model": "command-r-08-2024", "score": 33.34}
|
| 51 |
+
{"model": "command-r-plus-04-2024", "score": 25.48}
|
| 52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 32.82}
|
| 53 |
+
{"model": "phi-3-small-8k-instruct", "score": 30.29}
|
| 54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 34.69}
|
| 55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 20.60}
|
| 56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 30.21}
|
livebench_if.jsonl
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 84.36}
|
| 2 |
+
{"model": "o1-2024-12-17-high", "score": 81.55}
|
| 3 |
+
{"model": "deepseek-r1", "score": 80.51}
|
| 4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 83.16}
|
| 5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 82.47}
|
| 6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 83.38}
|
| 7 |
+
{"model": "gemini-exp-1206", "score": 77.34}
|
| 8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 80.06}
|
| 9 |
+
{"model": "qwen2.5-max", "score": 75.35}
|
| 10 |
+
{"model": "gemini-2.0-flash", "score": 85.79}
|
| 11 |
+
{"model": "deepseek-v3", "score": 75.25}
|
| 12 |
+
{"model": "gemini-2.0-flash-exp", "score": 81.86}
|
| 13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 69.30}
|
| 14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 65.07}
|
| 15 |
+
{"model": "o1-mini-2024-09-12", "score": 65.40}
|
| 16 |
+
{"model": "step-2-16k-202411", "score": 79.88}
|
| 17 |
+
{"model": "gpt-4o-2024-08-06", "score": 68.58}
|
| 18 |
+
{"model": "gemini-1.5-pro-002", "score": 70.78}
|
| 19 |
+
{"model": "grok-2-1212", "score": 69.63}
|
| 20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 78.28}
|
| 21 |
+
{"model": "dracarys2-72b-instruct", "score": 65.22}
|
| 22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 75.90}
|
| 23 |
+
{"model": "gpt-4o-2024-11-20", "score": 64.94}
|
| 24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 68.16}
|
| 25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 66.37}
|
| 26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 64.39}
|
| 27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 60.85}
|
| 28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 82.67}
|
| 29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 41.55}
|
| 30 |
+
{"model": "grok-beta", "score": 69.62}
|
| 31 |
+
{"model": "claude-3-opus-20240229", "score": 63.89}
|
| 32 |
+
{"model": "mistral-large-2411", "score": 67.93}
|
| 33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 58.69}
|
| 34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 63.24}
|
| 35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 68.98}
|
| 36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 67.13}
|
| 37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 61.88}
|
| 38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 40.92}
|
| 39 |
+
{"model": "mistral-small-2501", "score": 59.54}
|
| 40 |
+
{"model": "phi-4", "score": 58.38}
|
| 41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 56.80}
|
| 42 |
+
{"model": "qwq-32b-preview", "score": 35.59}
|
| 43 |
+
{"model": "gemma-2-27b-it", "score": 58.10}
|
| 44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 54.13}
|
| 45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 52.11}
|
| 46 |
+
{"model": "mistral-small-2409", "score": 53.23}
|
| 47 |
+
{"model": "command-r-plus-08-2024", "score": 57.61}
|
| 48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 48.04}
|
| 49 |
+
{"model": "gemma-2-9b-it", "score": 52.62}
|
| 50 |
+
{"model": "command-r-08-2024", "score": 55.62}
|
| 51 |
+
{"model": "command-r-plus-04-2024", "score": 59.47}
|
| 52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 54.90}
|
| 53 |
+
{"model": "phi-3-small-8k-instruct", "score": 47.20}
|
| 54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 39.08}
|
| 55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 60.56}
|
| 56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 36.36}
|
livebench_language.jsonl
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 50.68}
|
| 2 |
+
{"model": "o1-2024-12-17-high", "score": 65.39}
|
| 3 |
+
{"model": "deepseek-r1", "score": 48.53}
|
| 4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 46.26}
|
| 5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 42.18}
|
| 6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 44.85}
|
| 7 |
+
{"model": "gemini-exp-1206", "score": 51.29}
|
| 8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 38.25}
|
| 9 |
+
{"model": "qwen2.5-max", "score": 56.28}
|
| 10 |
+
{"model": "gemini-2.0-flash", "score": 40.69}
|
| 11 |
+
{"model": "deepseek-v3", "score": 47.48}
|
| 12 |
+
{"model": "gemini-2.0-flash-exp", "score": 38.22}
|
| 13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 53.76}
|
| 14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 49.14}
|
| 15 |
+
{"model": "o1-mini-2024-09-12", "score": 40.89}
|
| 16 |
+
{"model": "step-2-16k-202411", "score": 44.39}
|
| 17 |
+
{"model": "gpt-4o-2024-08-06", "score": 47.59}
|
| 18 |
+
{"model": "gemini-1.5-pro-002", "score": 43.29}
|
| 19 |
+
{"model": "grok-2-1212", "score": 45.58}
|
| 20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 34.28}
|
| 21 |
+
{"model": "dracarys2-72b-instruct", "score": 34.12}
|
| 22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 45.46}
|
| 23 |
+
{"model": "gpt-4o-2024-11-20", "score": 47.37}
|
| 24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 41.98}
|
| 25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 45.30}
|
| 26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 34.99}
|
| 27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 44.26}
|
| 28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 39.20}
|
| 29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 23.81}
|
| 30 |
+
{"model": "grok-beta", "score": 43.16}
|
| 31 |
+
{"model": "claude-3-opus-20240229", "score": 50.39}
|
| 32 |
+
{"model": "mistral-large-2411", "score": 39.39}
|
| 33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 23.25}
|
| 34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 38.78}
|
| 35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 35.42}
|
| 36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 36.96}
|
| 37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 35.37}
|
| 38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 26.82}
|
| 39 |
+
{"model": "mistral-small-2501", "score": 30.46}
|
| 40 |
+
{"model": "phi-4", "score": 25.61}
|
| 41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 28.61}
|
| 42 |
+
{"model": "qwq-32b-preview", "score": 21.09}
|
| 43 |
+
{"model": "gemma-2-27b-it", "score": 32.62}
|
| 44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 25.93}
|
| 45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 15.80}
|
| 46 |
+
{"model": "mistral-small-2409", "score": 24.49}
|
| 47 |
+
{"model": "command-r-plus-08-2024", "score": 29.73}
|
| 48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 15.78}
|
| 49 |
+
{"model": "gemma-2-9b-it", "score": 25.53}
|
| 50 |
+
{"model": "command-r-08-2024", "score": 16.72}
|
| 51 |
+
{"model": "command-r-plus-04-2024", "score": 19.70}
|
| 52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 17.71}
|
| 53 |
+
{"model": "phi-3-small-8k-instruct", "score": 12.94}
|
| 54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 9.15}
|
| 55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 11.16}
|
| 56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 8.56}
|
livebench_mathematics.jsonl
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 77.29}
|
| 2 |
+
{"model": "o1-2024-12-17-high", "score": 80.32}
|
| 3 |
+
{"model": "deepseek-r1", "score": 80.71}
|
| 4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 72.37}
|
| 5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 75.85}
|
| 6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 70.97}
|
| 7 |
+
{"model": "gemini-exp-1206", "score": 72.36}
|
| 8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 63.06}
|
| 9 |
+
{"model": "qwen2.5-max", "score": 58.35}
|
| 10 |
+
{"model": "gemini-2.0-flash", "score": 65.62}
|
| 11 |
+
{"model": "deepseek-v3", "score": 60.54}
|
| 12 |
+
{"model": "gemini-2.0-flash-exp", "score": 60.39}
|
| 13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 52.28}
|
| 14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 48.02}
|
| 15 |
+
{"model": "o1-mini-2024-09-12", "score": 61.99}
|
| 16 |
+
{"model": "step-2-16k-202411", "score": 48.77}
|
| 17 |
+
{"model": "gpt-4o-2024-08-06", "score": 49.54}
|
| 18 |
+
{"model": "gemini-1.5-pro-002", "score": 59.07}
|
| 19 |
+
{"model": "grok-2-1212", "score": 54.88}
|
| 20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 55.54}
|
| 21 |
+
{"model": "dracarys2-72b-instruct", "score": 54.66}
|
| 22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 41.07}
|
| 23 |
+
{"model": "gpt-4o-2024-11-20", "score": 42.87}
|
| 24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 57.75}
|
| 25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 42.45}
|
| 26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 54.29}
|
| 27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 43.02}
|
| 28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 42.24}
|
| 29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 58.11}
|
| 30 |
+
{"model": "grok-beta", "score": 45.84}
|
| 31 |
+
{"model": "claude-3-opus-20240229", "score": 43.62}
|
| 32 |
+
{"model": "mistral-large-2411", "score": 42.55}
|
| 33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 46.61}
|
| 34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 40.30}
|
| 35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 34.72}
|
| 36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 38.04}
|
| 37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 35.54}
|
| 38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 59.36}
|
| 39 |
+
{"model": "mistral-small-2501", "score": 39.89}
|
| 40 |
+
{"model": "phi-4", "score": 41.98}
|
| 41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 36.31}
|
| 42 |
+
{"model": "qwq-32b-preview", "score": 58.26}
|
| 43 |
+
{"model": "gemma-2-27b-it", "score": 26.46}
|
| 44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 36.70}
|
| 45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 39.51}
|
| 46 |
+
{"model": "mistral-small-2409", "score": 24.42}
|
| 47 |
+
{"model": "command-r-plus-08-2024", "score": 21.27}
|
| 48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 34.49}
|
| 49 |
+
{"model": "gemma-2-9b-it", "score": 19.80}
|
| 50 |
+
{"model": "command-r-08-2024", "score": 19.39}
|
| 51 |
+
{"model": "command-r-plus-04-2024", "score": 17.99}
|
| 52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 18.31}
|
| 53 |
+
{"model": "phi-3-small-8k-instruct", "score": 17.58}
|
| 54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 15.72}
|
| 55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 13.64}
|
| 56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 14.96}
|
livebench_reasoning.jsonl
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "o3-mini-2025-01-31-high", "score": 89.58}
|
| 2 |
+
{"model": "o1-2024-12-17-high", "score": 91.58}
|
| 3 |
+
{"model": "deepseek-r1", "score": 83.17}
|
| 4 |
+
{"model": "o3-mini-2025-01-31-medium", "score": 86.33}
|
| 5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 78.17}
|
| 6 |
+
{"model": "gemini-2.0-pro-exp-02-05", "score": 60.08}
|
| 7 |
+
{"model": "gemini-exp-1206", "score": 57.00}
|
| 8 |
+
{"model": "o3-mini-2025-01-31-low", "score": 69.83}
|
| 9 |
+
{"model": "qwen2.5-max", "score": 51.42}
|
| 10 |
+
{"model": "gemini-2.0-flash", "score": 55.25}
|
| 11 |
+
{"model": "deepseek-v3", "score": 56.75}
|
| 12 |
+
{"model": "gemini-2.0-flash-exp", "score": 59.08}
|
| 13 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 56.67}
|
| 14 |
+
{"model": "chatgpt-4o-latest-2025-01-29", "score": 57.92}
|
| 15 |
+
{"model": "o1-mini-2024-09-12", "score": 72.33}
|
| 16 |
+
{"model": "step-2-16k-202411", "score": 52.17}
|
| 17 |
+
{"model": "gpt-4o-2024-08-06", "score": 53.92}
|
| 18 |
+
{"model": "gemini-1.5-pro-002", "score": 49.08}
|
| 19 |
+
{"model": "grok-2-1212", "score": 54.83}
|
| 20 |
+
{"model": "gemini-2.0-flash-lite-preview-02-05", "score": 50.08}
|
| 21 |
+
{"model": "dracarys2-72b-instruct", "score": 47.38}
|
| 22 |
+
{"model": "meta-llama-3.1-405b-instruct-turbo", "score": 53.25}
|
| 23 |
+
{"model": "gpt-4o-2024-11-20", "score": 55.75}
|
| 24 |
+
{"model": "learnlm-1.5-pro-experimental", "score": 43.42}
|
| 25 |
+
{"model": "chatgpt-4o-latest-0903", "score": 50.50}
|
| 26 |
+
{"model": "qwen2.5-72b-instruct-turbo", "score": 45.42}
|
| 27 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 50.92}
|
| 28 |
+
{"model": "llama-3.3-70b-instruct-turbo", "score": 50.75}
|
| 29 |
+
{"model": "deepseek-r1-distill-llama-70b", "score": 67.58}
|
| 30 |
+
{"model": "grok-beta", "score": 37.00}
|
| 31 |
+
{"model": "claude-3-opus-20240229", "score": 40.58}
|
| 32 |
+
{"model": "mistral-large-2411", "score": 43.50}
|
| 33 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 42.08}
|
| 34 |
+
{"model": "dracarys2-llama-3.1-70b-instruct", "score": 44.67}
|
| 35 |
+
{"model": "meta-llama-3.1-70b-instruct-turbo", "score": 43.00}
|
| 36 |
+
{"model": "amazon.nova-pro-v1:0", "score": 32.58}
|
| 37 |
+
{"model": "claude-3-5-haiku-20241022", "score": 28.08}
|
| 38 |
+
{"model": "deepseek-r1-distill-qwen-32b", "score": 52.25}
|
| 39 |
+
{"model": "mistral-small-2501", "score": 36.42}
|
| 40 |
+
{"model": "phi-4", "score": 47.83}
|
| 41 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 32.75}
|
| 42 |
+
{"model": "qwq-32b-preview", "score": 57.71}
|
| 43 |
+
{"model": "gemma-2-27b-it", "score": 28.08}
|
| 44 |
+
{"model": "amazon.nova-lite-v1:0", "score": 36.67}
|
| 45 |
+
{"model": "qwen2.5-7b-instruct-turbo", "score": 28.42}
|
| 46 |
+
{"model": "mistral-small-2409", "score": 29.92}
|
| 47 |
+
{"model": "command-r-plus-08-2024", "score": 24.75}
|
| 48 |
+
{"model": "amazon.nova-micro-v1:0", "score": 25.08}
|
| 49 |
+
{"model": "gemma-2-9b-it", "score": 15.17}
|
| 50 |
+
{"model": "command-r-08-2024", "score": 21.92}
|
| 51 |
+
{"model": "command-r-plus-04-2024", "score": 20.58}
|
| 52 |
+
{"model": "meta-llama-3.1-8b-instruct-turbo", "score": 13.33}
|
| 53 |
+
{"model": "phi-3-small-8k-instruct", "score": 15.92}
|
| 54 |
+
{"model": "phi-3-mini-128k-instruct", "score": 20.50}
|
| 55 |
+
{"model": "olmo-2-1124-13b-instruct", "score": 16.33}
|
| 56 |
+
{"model": "phi-3-mini-4k-instruct", "score": 26.83}
|
livecodebench.jsonl
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "o1-2024-12-17 (high)", "score": 73.1}
|
| 2 |
+
{"model": "o3-mini-2025-01-31 (high)", "score": 71.6}
|
| 3 |
+
{"model": "o3-mini-2025-01-31 (medium)", "score": 68.8}
|
| 4 |
+
{"model": "o1-2024-12-17 (medium)", "score": 65.4}
|
| 5 |
+
{"model": "deepseek-r1-preview", "score": 64.3}
|
| 6 |
+
{"model": "o1-2024-12-17 (low)", "score": 62.7}
|
| 7 |
+
{"model": "o3-mini-2025-01-31 (low)", "score": 62.7}
|
| 8 |
+
{"model": "o1-mini-2024-09-12", "score": 54.1}
|
| 9 |
+
{"model": "deepseek-r1-lite-preview", "score": 50.4}
|
| 10 |
+
{"model": "gemini-flash-2.0-thinking-01-21", "score": 45}
|
| 11 |
+
{"model": "qwq-32b-preview", "score": 44}
|
| 12 |
+
{"model": "gemini-flash-2.0-thinking-12-19", "score": 43.4}
|
| 13 |
+
{"model": "o1-preview-2024-09-12", "score": 42.5}
|
| 14 |
+
{"model": "claude-3.5-sonnet-20241022", "score": 37.1}
|
| 15 |
+
{"model": "deepseek-v3", "score": 36.3}
|
| 16 |
+
{"model": "gpt-4o-2024-05-13", "score": 33}
|
| 17 |
+
{"model": "claude-3.5-sonnet-20240620", "score": 32}
|
| 18 |
+
{"model": "gemini-flash-2.0-exp", "score": 32}
|
| 19 |
+
{"model": "gemini-pro-1.5-002", "score": 30.9}
|
| 20 |
+
{"model": "gpt-4o-2024-08-06", "score": 30.5}
|
| 21 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 29.6}
|
| 22 |
+
{"model": "gemini-flash-1.5-002", "score": 28.4}
|
| 23 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 27.7}
|
| 24 |
+
{"model": "mistral-large", "score": 27.6}
|
| 25 |
+
{"model": "codestral-latest", "score": 23.8}
|
| 26 |
+
{"model": "claude-3-haiku", "score": 17.1}
|
models.jsonl
CHANGED
|
@@ -1,3 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
{"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 2 |
{"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 3 |
{"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
@@ -12,6 +58,7 @@
|
|
| 12 |
{"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 13 |
{"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 14 |
{"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 15 |
{"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 16 |
{"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 17 |
{"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
@@ -22,6 +69,7 @@
|
|
| 22 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 23 |
{"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 24 |
{"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 25 |
{"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 26 |
{"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 27 |
{"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
@@ -30,6 +78,7 @@
|
|
| 30 |
{"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 31 |
{"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 32 |
{"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 33 |
{"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 34 |
{"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 35 |
{"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
@@ -37,6 +86,7 @@
|
|
| 37 |
{"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 38 |
{"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
| 39 |
{"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 40 |
{"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 41 |
{"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 42 |
{"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
@@ -83,6 +133,7 @@
|
|
| 83 |
{"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 84 |
{"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 85 |
{"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 86 |
{"Name": "deepseek-coder-v2", "Release Date": "2024-06-17", "Total Parameters": 236, "Active Parameters": 21, "API Cost": 0}
|
| 87 |
{"Name": "jamba-1.5-mini", "Release Date": "2024-08-22", "Total Parameters": 52, "Active Parameters": 12, "API Cost": 0}
|
| 88 |
{"Name": "llama-3.1-8b-instruct", "Release Date": "2024-07-23", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
|
@@ -90,6 +141,7 @@
|
|
| 90 |
{"Name": "gpt-4-0613", "Release Date": "2023-06-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 91 |
{"Name": "qwen1.5-110b-chat", "Release Date": "2024-02-04", "Total Parameters": 110, "Active Parameters": 110, "API Cost": 0}
|
| 92 |
{"Name": "mistral-large-2402", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 93 |
{"Name": "yi-1.5-34b-chat", "Release Date": "2024-05-13", "Total Parameters": 34, "Active Parameters": 34, "API Cost": 0}
|
| 94 |
{"Name": "reka-flash-21b-20240226-online", "Release Date": "2024-02-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 95 |
{"Name": "llama-3-8b-instruct", "Release Date": "2024-04-18", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
|
@@ -187,6 +239,8 @@
|
|
| 187 |
{"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 188 |
{"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 189 |
{"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
|
|
| 190 |
{"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 191 |
{"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 192 |
{"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
@@ -202,6 +256,8 @@
|
|
| 202 |
{"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 203 |
{"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 204 |
{"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
|
|
| 205 |
{"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 206 |
{"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 207 |
{"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 1 |
+
{"Name": "o3-mini-2025-01-31-high", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 2 |
+
{"Name": "o3-mini-2025-01-31 (high)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 3 |
+
{"Name": "o3-mini-2025-01-31 (medium)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 4 |
+
{"Name": "o3-mini-2025-01-31 (low)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 5 |
+
{"Name": "o1-2024-12-17-high", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 6 |
+
{"Name": "o1-2024-12-17 (high)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 7 |
+
{"Name": "o1-2024-12-17 (medium)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 8 |
+
{"Name": "o1-2024-12-17 (low)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 9 |
+
{"Name": "deepseek-r1", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 10 |
+
{"Name": "deepseek-r1-preview", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 11 |
+
{"Name": "deepseek-r1-lite-preview", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 12 |
+
{"Name": "o3-mini-2025-01-31-medium", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 13 |
+
{"Name": "gemini-2.0-flash-thinking-exp-01-21", "Release Date": "2025-01-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 14 |
+
{"Name": "gemini-flash-2.0-thinking-01-21", "Release Date": "2025-01-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 15 |
+
{"Name": "gemini-2.0-pro-exp-02-05", "Release Date": "2025-02-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 16 |
+
{"Name": "o3-mini-2025-01-31-low", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 17 |
+
{"Name": "qwen2.5-max", "Release Date": "2025-01-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 18 |
+
{"Name": "gemini-2.0-flash", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 19 |
+
{"Name": "gemini-2.0-flash-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 20 |
+
{"Name": "gemini-flash-2.0-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 21 |
+
{"Name": "deepseek-v3", "Release Date": "2024-12-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 22 |
+
{"Name": "chatgpt-4o-latest-2025-01-29", "Release Date": "2025-01-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 23 |
+
{"Name": "step-2-16k-202411", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 24 |
+
{"Name": "gemini-2.0-flash-lite-preview-02-05", "Release Date": "2025-02-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 25 |
+
{"Name": "dracarys2-72b-instruct", "Release Date": "2024-09-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 26 |
+
{"Name": "meta-llama-3.1-405b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 27 |
+
{"Name": "learnlm-1.5-pro-experimental", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 28 |
+
{"Name": "chatgpt-4o-latest-0903", "Release Date": "2024-09-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 29 |
+
{"Name": "qwen2.5-72b-instruct-turbo", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 30 |
+
{"Name": "llama-3.3-70b-instruct-turbo", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 31 |
+
{"Name": "deepseek-r1-distill-llama-70b", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 32 |
+
{"Name": "mistral-large-2411", "Release Date": "2024-11-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 33 |
+
{"Name": "dracarys2-llama-3.1-70b-instruct", "Release Date": "2024-08-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 34 |
+
{"Name": "meta-llama-3.1-70b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 35 |
+
{"Name": "amazon.nova-pro-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 36 |
+
{"Name": "deepseek-r1-distill-qwen-32b", "Release Date": "2025-01-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 37 |
+
{"Name": "mistral-small-2501", "Release Date": "2025-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 38 |
+
{"Name": "phi-4", "Release Date": "2024-12-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 39 |
+
{"Name": "qwq-32b-preview", "Release Date": "2024-11-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 40 |
+
{"Name": "amazon.nova-lite-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 41 |
+
{"Name": "qwen2.5-7b-instruct-turbo", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 42 |
+
{"Name": "mistral-small-2409", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 43 |
+
{"Name": "amazon.nova-micro-v1:0", "Release Date": "2024-12-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 44 |
+
{"Name": "command-r-plus-04-2024", "Release Date": "2024-04-04", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 45 |
+
{"Name": "meta-llama-3.1-8b-instruct-turbo", "Release Date": "2024-07-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 46 |
+
{"Name": "olmo-2-1124-13b-instruct", "Release Date": "2024-11-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 47 |
{"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 48 |
{"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 49 |
{"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 58 |
{"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 59 |
{"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 60 |
{"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 61 |
+
{"Name": "gemini-pro-1.5-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 62 |
{"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 63 |
{"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 64 |
{"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 69 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 70 |
{"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 71 |
{"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 72 |
+
{"Name": "gemini-flash-1.5-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 73 |
{"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 74 |
{"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 75 |
{"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 78 |
{"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 79 |
{"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 80 |
{"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 81 |
+
{"Name": "grok-2", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 82 |
{"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 83 |
{"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 84 |
{"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 86 |
{"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 87 |
{"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
| 88 |
{"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 89 |
+
{"Name": "claude-3-5-sonnet", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 90 |
{"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 91 |
{"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 92 |
{"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 133 |
{"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 134 |
{"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 135 |
{"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 136 |
+
{"Name": "claude-3-haiku", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 137 |
{"Name": "deepseek-coder-v2", "Release Date": "2024-06-17", "Total Parameters": 236, "Active Parameters": 21, "API Cost": 0}
|
| 138 |
{"Name": "jamba-1.5-mini", "Release Date": "2024-08-22", "Total Parameters": 52, "Active Parameters": 12, "API Cost": 0}
|
| 139 |
{"Name": "llama-3.1-8b-instruct", "Release Date": "2024-07-23", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
|
|
|
| 141 |
{"Name": "gpt-4-0613", "Release Date": "2023-06-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 142 |
{"Name": "qwen1.5-110b-chat", "Release Date": "2024-02-04", "Total Parameters": 110, "Active Parameters": 110, "API Cost": 0}
|
| 143 |
{"Name": "mistral-large-2402", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 144 |
+
{"Name": "mistral-large", "Release Date": "2024-02-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 145 |
{"Name": "yi-1.5-34b-chat", "Release Date": "2024-05-13", "Total Parameters": 34, "Active Parameters": 34, "API Cost": 0}
|
| 146 |
{"Name": "reka-flash-21b-20240226-online", "Release Date": "2024-02-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 147 |
{"Name": "llama-3-8b-instruct", "Release Date": "2024-04-18", "Total Parameters": 8, "Active Parameters": 8, "API Cost": 0}
|
|
|
|
| 239 |
{"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 240 |
{"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 241 |
{"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 242 |
+
{"Name": "gemini-flash-2.0-thinking-12-19", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 243 |
+
{"Name": "gemini-2.0-flash-thinking", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 244 |
{"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 245 |
{"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 246 |
{"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
| 256 |
{"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 257 |
{"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 258 |
{"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 259 |
+
{"Name": "codestral-2501", "Release Date": "2025-01-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 260 |
+
{"Name": "codestral-latest", "Release Date": "2025-01-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 261 |
{"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 262 |
{"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
| 263 |
{"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
simple_bench_leaderboard.jsonl
CHANGED
|
@@ -1,15 +1,19 @@
|
|
| 1 |
{"model": "o1-preview-2024-09-12", "score": 41.7}
|
| 2 |
{"model": "claude-3-5-sonnet-20241022", "score": 41.4}
|
| 3 |
-
{"model": "o1-2024-12-17", "score":
|
|
|
|
| 4 |
{"model": "gemini-exp-1206", "score": 31.1}
|
|
|
|
| 5 |
{"model": "claude-3-5-sonnet-20240620", "score": 27.5}
|
| 6 |
{"model": "gemini-1.5-pro-002", "score": 27.1}
|
| 7 |
{"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
|
| 8 |
{"model": "claude-3-opus-20240229", "score": 23.5}
|
| 9 |
{"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
|
|
|
|
| 10 |
{"model": "grok-beta", "score": 22.7}
|
| 11 |
{"model": "mistral-large-2407", "score": 22.5}
|
| 12 |
{"model": "llama-3.3-70b-instruct", "score": 19.9}
|
|
|
|
| 13 |
{"model": "gemini-2.0-flash-exp", "score": 18.9}
|
| 14 |
{"model": "o1-mini-2024-09-12", "score": 18.1}
|
| 15 |
{"model": "gpt-4o-2024-08-06", "score": 17.8}
|
|
|
|
| 1 |
{"model": "o1-preview-2024-09-12", "score": 41.7}
|
| 2 |
{"model": "claude-3-5-sonnet-20241022", "score": 41.4}
|
| 3 |
+
{"model": "o1-2024-12-17 (high)", "score": 40.1}
|
| 4 |
+
{"model": "o1-2024-12-17 (medium)", "score": 36.7}
|
| 5 |
{"model": "gemini-exp-1206", "score": 31.1}
|
| 6 |
+
{"model": "deepseek-r1", "score": 30.9}
|
| 7 |
{"model": "claude-3-5-sonnet-20240620", "score": 27.5}
|
| 8 |
{"model": "gemini-1.5-pro-002", "score": 27.1}
|
| 9 |
{"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
|
| 10 |
{"model": "claude-3-opus-20240229", "score": 23.5}
|
| 11 |
{"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
|
| 12 |
+
{"model": "o3-mini-2025-01-31 (high)", "score": 22.8}
|
| 13 |
{"model": "grok-beta", "score": 22.7}
|
| 14 |
{"model": "mistral-large-2407", "score": 22.5}
|
| 15 |
{"model": "llama-3.3-70b-instruct", "score": 19.9}
|
| 16 |
+
{"model": "deepseek-v3", "score": 18.9}
|
| 17 |
{"model": "gemini-2.0-flash-exp", "score": 18.9}
|
| 18 |
{"model": "o1-mini-2024-09-12", "score": 18.1}
|
| 19 |
{"model": "gpt-4o-2024-08-06", "score": 17.8}
|