Spaces:

kaizuberbuehler
/

ai-progress-charts

Running

App Files Community

kaizuberbuehler committed on Feb 14

Commit

5c2a615

1 Parent(s): 064c980

Add NYT Connections benchmark

Browse files

Files changed (2) hide show

app.py +13 -7
nyt_connections.jsonl +7 -0

app.py CHANGED Viewed

@@ -244,6 +244,11 @@ with gr.Blocks() as demo:
             planbench_markdown: gr.Markdown = gr.Markdown(
                 value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
             )
         with gr.Tab("🟡 GAIA") as gaia_tab:
             gaia_plot: gr.Plot = gr.Plot()
             gaia_markdown: gr.Markdown = gr.Markdown(
@@ -377,11 +382,6 @@ with gr.Blocks() as demo:
             hhem_markdown: gr.Markdown = gr.Markdown(
                 value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
             )
-        with gr.Tab("NYT Connections", visible=False):
-            nyt_connections_exam_plot: gr.Plot = gr.Plot()
-            nyt_connections_exam_markdown: gr.Markdown = gr.Markdown(
-                value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
-            )
         with gr.Tab("USACO", visible=False):
             usaco_plot: gr.Plot = gr.Plot()
             usaco_markdown: gr.Markdown = gr.Markdown(
@@ -465,7 +465,7 @@ with gr.Blocks() as demo:
                                                "ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                            gr.State(
                                                "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
-                                           gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                            gr.State(0), gr.State(100),
                                            gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
                                    outputs=arc_agi_public_eval_plot)
@@ -475,7 +475,7 @@ with gr.Blocks() as demo:
                                    "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                gr.State(
                                    "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
-                               gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                gr.State(0), gr.State(100),
                                gr.State({"MTurkers": 77})],
                        outputs=arc_agi_semi_private_eval_plot)
@@ -639,6 +639,12 @@ with gr.Blocks() as demo:
                             gr.State(22.75), gr.State(100),
                             gr.State({"Human experts": 77.75})],
                     outputs=emma_plot)
 if __name__ == "__main__":
     demo.launch()

             planbench_markdown: gr.Markdown = gr.Markdown(
                 value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
             )
+        with gr.Tab("🟡 NYT Connections") as nyt_connections_tab:
+            nyt_connections_plot: gr.Plot = gr.Plot()
+            nyt_connections_markdown: gr.Markdown = gr.Markdown(
+                value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
+            )
         with gr.Tab("🟡 GAIA") as gaia_tab:
             gaia_plot: gr.Plot = gr.Plot()
             gaia_markdown: gr.Markdown = gr.Markdown(
             hhem_markdown: gr.Markdown = gr.Markdown(
                 value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
             )
         with gr.Tab("USACO", visible=False):
             usaco_plot: gr.Plot = gr.Plot()
             usaco_markdown: gr.Markdown = gr.Markdown(
                                                "ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                            gr.State(
                                                "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
+                                           gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
                                            gr.State(0), gr.State(100),
                                            gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
                                    outputs=arc_agi_public_eval_plot)
                                    "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                gr.State(
                                    "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
+                               gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
                                gr.State(0), gr.State(100),
                                gr.State({"MTurkers": 77})],
                        outputs=arc_agi_semi_private_eval_plot)
                             gr.State(22.75), gr.State(100),
                             gr.State({"Human experts": 77.75})],
                     outputs=emma_plot)
+    nyt_connections_tab.select(fn=create_simple_plot,
+                               inputs=[gr.State("nyt_connections.jsonl"),
+                                       gr.State("NYT Connections (Extended Version, Newest 100 Puzzles) Score"),
+                                       gr.State("\"NYT Connections puzzles [...] To increase difficulty, Extended Connections adds up to four extra trick words to each puzzle.\" (Mazur, 2025)"),
+                                       gr.State(date(2024, 7, 23)), gr.State(date(2025, 2, 1))],
+                               outputs=nyt_connections_plot)
 if __name__ == "__main__":
     demo.launch()

nyt_connections.jsonl ADDED Viewed

	@@ -0,0 +1,7 @@

+{"model": "o1-2024-12-17", "score": 60.0}
+{"model": "o3-mini-2025-01-31", "score": 42.8}
+{"model": "deepseek-r1", "score": 28.7}
+{"model": "o1-mini-2024-09-12", "score": 18.8}
+{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 15.2}
+{"model": "qwen2.5-max", "score": 13.8}
+{"model": "llama-3.1-405b-instruct", "score": 13.2}