Commit
·
5c2a615
1
Parent(s):
064c980
Add NYT Connections benchmark
Browse files
- app.py +13 -7
- nyt_connections.jsonl +7 -0
app.py
CHANGED
|
@@ -244,6 +244,11 @@ with gr.Blocks() as demo:
|
|
| 244 |
planbench_markdown: gr.Markdown = gr.Markdown(
|
| 245 |
value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
|
| 246 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
with gr.Tab("🟡 GAIA") as gaia_tab:
|
| 248 |
gaia_plot: gr.Plot = gr.Plot()
|
| 249 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
|
@@ -377,11 +382,6 @@ with gr.Blocks() as demo:
|
|
| 377 |
hhem_markdown: gr.Markdown = gr.Markdown(
|
| 378 |
value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
|
| 379 |
)
|
| 380 |
-
with gr.Tab("NYT Connections", visible=False):
|
| 381 |
-
nyt_connections_exam_plot: gr.Plot = gr.Plot()
|
| 382 |
-
nyt_connections_exam_markdown: gr.Markdown = gr.Markdown(
|
| 383 |
-
value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
|
| 384 |
-
)
|
| 385 |
with gr.Tab("USACO", visible=False):
|
| 386 |
usaco_plot: gr.Plot = gr.Plot()
|
| 387 |
usaco_markdown: gr.Markdown = gr.Markdown(
|
|
@@ -465,7 +465,7 @@ with gr.Blocks() as demo:
|
|
| 465 |
"ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
| 466 |
gr.State(
|
| 467 |
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
| 468 |
-
gr.State(date(2024,
|
| 469 |
gr.State(0), gr.State(100),
|
| 470 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
| 471 |
outputs=arc_agi_public_eval_plot)
|
|
@@ -475,7 +475,7 @@ with gr.Blocks() as demo:
|
|
| 475 |
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
| 476 |
gr.State(
|
| 477 |
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
| 478 |
-
gr.State(date(2024,
|
| 479 |
gr.State(0), gr.State(100),
|
| 480 |
gr.State({"MTurkers": 77})],
|
| 481 |
outputs=arc_agi_semi_private_eval_plot)
|
|
@@ -639,6 +639,12 @@ with gr.Blocks() as demo:
|
|
| 639 |
gr.State(22.75), gr.State(100),
|
| 640 |
gr.State({"Human experts": 77.75})],
|
| 641 |
outputs=emma_plot)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
|
| 643 |
if __name__ == "__main__":
|
| 644 |
demo.launch()
|
|
|
|
| 244 |
planbench_markdown: gr.Markdown = gr.Markdown(
|
| 245 |
value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
|
| 246 |
)
|
| 247 |
+
with gr.Tab("🟡 NYT Connections") as nyt_connections_tab:
|
| 248 |
+
nyt_connections_plot: gr.Plot = gr.Plot()
|
| 249 |
+
nyt_connections_markdown: gr.Markdown = gr.Markdown(
|
| 250 |
+
value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
|
| 251 |
+
)
|
| 252 |
with gr.Tab("🟡 GAIA") as gaia_tab:
|
| 253 |
gaia_plot: gr.Plot = gr.Plot()
|
| 254 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
|
|
|
| 382 |
hhem_markdown: gr.Markdown = gr.Markdown(
|
| 383 |
value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
|
| 384 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
with gr.Tab("USACO", visible=False):
|
| 386 |
usaco_plot: gr.Plot = gr.Plot()
|
| 387 |
usaco_markdown: gr.Markdown = gr.Markdown(
|
|
|
|
| 465 |
"ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
| 466 |
gr.State(
|
| 467 |
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
| 468 |
+
gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
|
| 469 |
gr.State(0), gr.State(100),
|
| 470 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
| 471 |
outputs=arc_agi_public_eval_plot)
|
|
|
|
| 475 |
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
| 476 |
gr.State(
|
| 477 |
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
| 478 |
+
gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
|
| 479 |
gr.State(0), gr.State(100),
|
| 480 |
gr.State({"MTurkers": 77})],
|
| 481 |
outputs=arc_agi_semi_private_eval_plot)
|
|
|
|
| 639 |
gr.State(22.75), gr.State(100),
|
| 640 |
gr.State({"Human experts": 77.75})],
|
| 641 |
outputs=emma_plot)
|
| 642 |
+
nyt_connections_tab.select(fn=create_simple_plot,
|
| 643 |
+
inputs=[gr.State("nyt_connections.jsonl"),
|
| 644 |
+
gr.State("NYT Connections (Extended Version, Newest 100 Puzzles) Score"),
|
| 645 |
+
gr.State("\"NYT Connections puzzles [...] To increase difficulty, Extended Connections adds up to four extra trick words to each puzzle.\" (Mazur, 2025)"),
|
| 646 |
+
gr.State(date(2024, 7, 23)), gr.State(date(2025, 2, 1))],
|
| 647 |
+
outputs=nyt_connections_plot)
|
| 648 |
|
| 649 |
if __name__ == "__main__":
|
| 650 |
demo.launch()
|
nyt_connections.jsonl
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"model": "o1-2024-12-17", "score": 60.0}
|
| 2 |
+
{"model": "o3-mini-2025-01-31", "score": 42.8}
|
| 3 |
+
{"model": "deepseek-r1", "score": 28.7}
|
| 4 |
+
{"model": "o1-mini-2024-09-12", "score": 18.8}
|
| 5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 15.2}
|
| 6 |
+
{"model": "qwen2.5-max", "score": 13.8}
|
| 7 |
+
{"model": "llama-3.1-405b-instruct", "score": 13.2}
|