Commit · 064c980
1 Parent(s): b605a32

Add EMMA benchmark

Files changed:
- app.py (+29 -5)
- emma_mini.jsonl (+4 -0)
- models.jsonl (+1 -0)
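In app.py the commit promotes EMMA-Mini from a hidden placeholder tab to a visible leaderboard tab with its own plot and select handler, adds hidden placeholder tabs for SWE-bench Multimodal, MMVU, and PhysBench, and lists EMMA-Mini in the scores table; emma_mini.jsonl ships the benchmark scores and models.jsonl gains the one model entry those scores need.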
app.py
CHANGED
@@ -190,6 +190,7 @@ with gr.Blocks() as demo:
 | Humanity's Last Exam | 🔴 7% |
 | BigCodeBench | 🟠 36% |
 | Simple Bench | 🟠 42% |
+| EMMA-Mini | 🟠 48% |
 | PlanBench | 🟠 53% |
 | GAIA | 🟡 65% |
 | LiveBench Language | 🟡 65% |
@@ -233,6 +234,11 @@ with gr.Blocks() as demo:
         simple_bench_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
         )
+        with gr.Tab("🏆 EMMA-Mini") as emma_tab:
+            emma_plot: gr.Plot = gr.Plot()
+            emma_markdown: gr.Markdown = gr.Markdown(
+                value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
+            )
         with gr.Tab("🏆 PlanBench") as planbench_tab:
             planbench_plot: gr.Plot = gr.Plot()
             planbench_markdown: gr.Markdown = gr.Markdown(
@@ -331,6 +337,11 @@ with gr.Blocks() as demo:
         swe_bench_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
         )
+        with gr.Tab("SWE-bench Multimodal", visible=False):
+            swe_bench_multimodal_plot: gr.Plot = gr.Plot()
+            swe_bench_multimodal_markdown: gr.Markdown = gr.Markdown(
+                value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/#multimodal)"""
+            )
         with gr.Tab("WebArena", visible=False):
             webarena_plot: gr.Plot = gr.Plot()
             webarena_markdown: gr.Markdown = gr.Markdown(
@@ -341,11 +352,6 @@ with gr.Blocks() as demo:
         osworld_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
         )
-        with gr.Tab("EMMA-Mini", visible=False):
-            emma_plot: gr.Plot = gr.Plot()
-            emma_markdown: gr.Markdown = gr.Markdown(
-                value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
-            )
         with gr.Tab("MathVista", visible=False):
             mathvista_plot: gr.Plot = gr.Plot()
             mathvista_markdown: gr.Markdown = gr.Markdown(
@@ -437,6 +443,16 @@ with gr.Blocks() as demo:
         quality_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
         )
+        with gr.Tab("MMVU", visible=False):
+            mmvu_plot: gr.Plot = gr.Plot()
+            mmvu_markdown: gr.Markdown = gr.Markdown(
+                value="""Source: [MMVU Leaderboard](https://mmvu-benchmark.github.io/#leaderboard)"""
+            )
+        with gr.Tab("PhysBench", visible=False):
+            physbench_plot: gr.Plot = gr.Plot()
+            physbench_markdown: gr.Markdown = gr.Markdown(
+                value="""Source: [PhysBench Leaderboard](https://physbench.github.io/#leaderboard)"""
+            )
     with gr.Tab("Finance") as finance_tab:
         with gr.Tab("Big Tech Capex") as big_five_capex_tab:
             big_five_capex_plot: gr.Plot = gr.Plot()
@@ -615,6 +631,14 @@ with gr.Blocks() as demo:
                             "\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
                             gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
                     outputs=livecodebench_plot)
+    emma_tab.select(fn=create_simple_plot,
+                    inputs=[gr.State("emma_mini.jsonl"),
+                            gr.State("EMMA-Mini (Enhanced MultiModal ReAsoning) Score"),
+                            gr.State("\"benchmark targeting organic multimodal reasoning across mathematics, physics, chemistry, and coding\" (Hao et al. 2025)"),
+                            gr.State(date(2024, 9, 17)), gr.State(date(2025, 2, 1)),
+                            gr.State(22.75), gr.State(100),
+                            gr.State({"Human experts": 77.75})],
+                    outputs=emma_plot)

 if __name__ == "__main__":
     demo.launch()
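The new emma_tab.select call reuses create_simple_plot, which already exists elsewhere in app.py and is not part of this diff. Below is a minimal sketch of a helper consistent with that call, purely to illustrate what the eight gr.State inputs map to; the parameter names, the pandas/Plotly choices, and the merge against models.jsonl are assumptions, not the repository's actual implementation.

```python
# Hypothetical sketch only: create_simple_plot is not shown in this diff, so the
# body below is inferred from the gr.State inputs passed in emma_tab.select.
from datetime import date

import pandas as pd
import plotly.express as px


def create_simple_plot(filename: str, title: str, description: str,
                       start_date: date, end_date: date,
                       y_min: float = 0, y_max: float = 100,
                       baselines: dict | None = None):
    """Scatter a benchmark's scores against each model's release date."""
    # Per-benchmark scores: one JSON object per line, e.g. {"model": ..., "score": ...}.
    scores = pd.read_json(filename, lines=True)
    # Shared model metadata: keyed by "Name", carries the "Release Date" column.
    models = pd.read_json("models.jsonl", lines=True)
    df = scores.merge(models, left_on="model", right_on="Name", how="left")
    df["Release Date"] = pd.to_datetime(df["Release Date"])

    fig = px.scatter(df, x="Release Date", y="score", text="model", title=title,
                     range_x=[start_date, end_date], range_y=[y_min, y_max])
    fig.update_traces(textposition="top center")
    # Optional horizontal reference lines, e.g. {"Human experts": 77.75}.
    for name, value in (baselines or {}).items():
        fig.add_hline(y=value, line_dash="dash", annotation_text=name)
    # The quoted benchmark description becomes a caption below the axes.
    fig.add_annotation(text=description, xref="paper", yref="paper",
                       x=0, y=-0.2, showarrow=False)
    return fig
```

With defaults on the last three parameters, a helper of this shape would also cover shorter calls such as the livecodebench wiring visible in the final hunk's context lines.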
emma_mini.jsonl
ADDED
@@ -0,0 +1,4 @@
+{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 48.00}
+{"model": "o1-2024-12-17", "score": 45.75}
+{"model": "gemini-2.0-flash-thinking-exp-1219", "score": 43.50}
+{"model": "qwen2-vl-72b-instruct", "score": 37.25}
models.jsonl
CHANGED
@@ -1,3 +1,4 @@
+{"Name": "qwen2-vl-72b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 {"Name": "o3-mini-2025-01-31-high", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 {"Name": "o3-mini-2025-01-31 (high)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 {"Name": "o3-mini-2025-01-31 (medium)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
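emma_mini.jsonl scores four models, yet models.jsonl gains only the qwen2-vl-72b-instruct row; the other three presumably already have entries further down the file, outside this hunk's three context lines. A quick consistency check along those lines (a sketch assuming both files sit in the app's working directory; note the score file keys models as "model" while models.jsonl uses "Name"):

```python
import pandas as pd

scores = pd.read_json("emma_mini.jsonl", lines=True)   # columns: model, score
models = pd.read_json("models.jsonl", lines=True)      # columns: Name, Release Date, ...

# Any benchmarked model missing from models.jsonl has no release date and
# therefore no x-coordinate on the timeline plot.
missing = sorted(set(scores["model"]) - set(models["Name"]))
print(missing or "every EMMA-Mini model has a models.jsonl entry")
```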