Commit · 064c980
1 Parent(s): b605a32

Add EMMA benchmark

Files changed:
- app.py (+29 -5)
- emma_mini.jsonl (+4 -0)
- models.jsonl (+1 -0)
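In app.py the commit promotes EMMA-Mini from a hidden placeholder tab to a visible leaderboard tab with its own plot and select handler, adds hidden placeholder tabs for SWE-bench Multimodal, MMVU, and PhysBench, and lists EMMA-Mini in the scores table; emma_mini.jsonl ships the benchmark scores and models.jsonl gains the one model entry those scores need.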
app.py
CHANGED
@@ -190,6 +190,7 @@ with gr.Blocks() as demo:
 | Humanity's Last Exam | 🔴 7% |
 | BigCodeBench | 🟠 36% |
 | Simple Bench | 🟠 42% |
+| EMMA-Mini | 🟠 48% |
 | PlanBench | 🟠 53% |
 | GAIA | 🟡 65% |
 | LiveBench Language | 🟡 65% |
@@ -233,6 +234,11 @@ with gr.Blocks() as demo:
         simple_bench_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
         )
+        with gr.Tab("🏆 EMMA-Mini") as emma_tab:
+            emma_plot: gr.Plot = gr.Plot()
+            emma_markdown: gr.Markdown = gr.Markdown(
+                value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
+            )
         with gr.Tab("🏆 PlanBench") as planbench_tab:
             planbench_plot: gr.Plot = gr.Plot()
             planbench_markdown: gr.Markdown = gr.Markdown(
@@ -331,6 +337,11 @@ with gr.Blocks() as demo:
         swe_bench_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
         )
+        with gr.Tab("SWE-bench Multimodal", visible=False):
+            swe_bench_multimodal_plot: gr.Plot = gr.Plot()
+            swe_bench_multimodal_markdown: gr.Markdown = gr.Markdown(
+                value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/#multimodal)"""
+            )
         with gr.Tab("WebArena", visible=False):
             webarena_plot: gr.Plot = gr.Plot()
             webarena_markdown: gr.Markdown = gr.Markdown(
@@ -341,11 +352,6 @@ with gr.Blocks() as demo:
         osworld_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [OSWorld Benchmark](https://os-world.github.io/)"""
         )
-        with gr.Tab("EMMA-Mini", visible=False):
-            emma_plot: gr.Plot = gr.Plot()
-            emma_markdown: gr.Markdown = gr.Markdown(
-                value="""Source: [EMMA Leaderboard](https://emma-benchmark.github.io/#leaderboard)"""
-            )
         with gr.Tab("MathVista", visible=False):
             mathvista_plot: gr.Plot = gr.Plot()
             mathvista_markdown: gr.Markdown = gr.Markdown(
@@ -437,6 +443,16 @@ with gr.Blocks() as demo:
         quality_markdown: gr.Markdown = gr.Markdown(
             value="""Source: [QuALITY Leaderboard](https://nyu-mll.github.io/quality/)"""
         )
+        with gr.Tab("MMVU", visible=False):
+            mmvu_plot: gr.Plot = gr.Plot()
+            mmvu_markdown: gr.Markdown = gr.Markdown(
+                value="""Source: [MMVU Leaderboard](https://mmvu-benchmark.github.io/#leaderboard)"""
+            )
+        with gr.Tab("PhysBench", visible=False):
+            physbench_plot: gr.Plot = gr.Plot()
+            physbench_markdown: gr.Markdown = gr.Markdown(
+                value="""Source: [PhysBench Leaderboard](https://physbench.github.io/#leaderboard)"""
+            )
     with gr.Tab("Finance") as finance_tab:
         with gr.Tab("Big Tech Capex") as big_five_capex_tab:
             big_five_capex_plot: gr.Plot = gr.Plot()
@@ -615,6 +631,14 @@ with gr.Blocks() as demo:
                             "\"comprehensive and contamination-free evaluation of LLMs for code, which continuously collects new problems over time from contests across three competition platforms\" (Jain et al. 2024)"),
                             gr.State(date(2024, 4, 9)), gr.State(date(2025, 2, 1))],
                     outputs=livecodebench_plot)
+    emma_tab.select(fn=create_simple_plot,
+                    inputs=[gr.State("emma_mini.jsonl"),
+                            gr.State("EMMA-Mini (Enhanced MultiModal ReAsoning) Score"),
+                            gr.State("\"benchmark targeting organic multimodal reasoning across mathematics, physics, chemistry, and coding\" (Hao et al. 2025)"),
+                            gr.State(date(2024, 9, 17)), gr.State(date(2025, 2, 1)),
+                            gr.State(22.75), gr.State(100),
+                            gr.State({"Human experts": 77.75})],
+                    outputs=emma_plot)

 if __name__ == "__main__":
     demo.launch()
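The new emma_tab.select call reuses create_simple_plot, which already exists elsewhere in app.py and is not part of this diff. Below is a minimal sketch of a helper consistent with that call, purely to illustrate what the eight gr.State inputs map to; the parameter names, the pandas/Plotly choices, and the merge against models.jsonl are assumptions, not the repository's actual implementation.

```python
# Hypothetical sketch only: create_simple_plot is not shown in this diff, so the
# body below is inferred from the gr.State inputs passed in emma_tab.select.
from datetime import date

import pandas as pd
import plotly.express as px


def create_simple_plot(filename: str, title: str, description: str,
                       start_date: date, end_date: date,
                       y_min: float = 0, y_max: float = 100,
                       baselines: dict | None = None):
    """Scatter a benchmark's scores against each model's release date."""
    # Per-benchmark scores: one JSON object per line, e.g. {"model": ..., "score": ...}.
    scores = pd.read_json(filename, lines=True)
    # Shared model metadata: keyed by "Name", carries the "Release Date" column.
    models = pd.read_json("models.jsonl", lines=True)
    df = scores.merge(models, left_on="model", right_on="Name", how="left")
    df["Release Date"] = pd.to_datetime(df["Release Date"])

    fig = px.scatter(df, x="Release Date", y="score", text="model", title=title,
                     range_x=[start_date, end_date], range_y=[y_min, y_max])
    fig.update_traces(textposition="top center")
    # Optional horizontal reference lines, e.g. {"Human experts": 77.75}.
    for name, value in (baselines or {}).items():
        fig.add_hline(y=value, line_dash="dash", annotation_text=name)
    # The quoted benchmark description becomes a caption below the axes.
    fig.add_annotation(text=description, xref="paper", yref="paper",
                       x=0, y=-0.2, showarrow=False)
    return fig
```

With defaults on the last three parameters, a helper of this shape would also cover shorter calls such as the livecodebench wiring visible in the final hunk's context lines.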
emma_mini.jsonl
ADDED
@@ -0,0 +1,4 @@
+{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 48.00}
+{"model": "o1-2024-12-17", "score": 45.75}
+{"model": "gemini-2.0-flash-thinking-exp-1219", "score": 43.50}
+{"model": "qwen2-vl-72b-instruct", "score": 37.25}
models.jsonl
CHANGED
@@ -1,3 +1,4 @@
+{"Name": "qwen2-vl-72b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 {"Name": "o3-mini-2025-01-31-high", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 {"Name": "o3-mini-2025-01-31 (high)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 {"Name": "o3-mini-2025-01-31 (medium)", "Release Date": "2025-01-31", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
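emma_mini.jsonl scores four models, yet models.jsonl gains only the qwen2-vl-72b-instruct row; the other three presumably already have entries further down the file, outside this hunk's three context lines. A quick consistency check along those lines (a sketch assuming both files sit in the app's working directory; note the score file keys models as "model" while models.jsonl uses "Name"):

```python
import pandas as pd

scores = pd.read_json("emma_mini.jsonl", lines=True)   # columns: model, score
models = pd.read_json("models.jsonl", lines=True)      # columns: Name, Release Date, ...

# Any benchmarked model missing from models.jsonl has no release date and
# therefore no x-coordinate on the timeline plot.
missing = sorted(set(scores["model"]) - set(models["Name"]))
print(missing or "every EMMA-Mini model has a models.jsonl entry")
```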