Update space

- app.py +76 -8
- src/about.py +18 -1
app.py CHANGED
@@ -11,6 +11,8 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    SUB_TITLE,
+    EXTERNAL_LINKS,
     COMING_SOON_TEXT
 )
 from src.display.css_html_js import custom_css
@@ -99,7 +101,8 @@ def init_leaderboard(dataframe):
 
 # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
 # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
-model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
+# model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
+model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
 # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 
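This hunk simply points the app at a newer results snapshot: `model_result_path` is a JSONL file that `get_model_leaderboard_df` (defined elsewhere in the repo, not part of this diff) turns into the leaderboard table. A minimal sketch of such a loader, assuming one JSON object per line with a model name and per-dimension scores; the field names here are hypothetical:

```python
import json

import pandas as pd


def load_leaderboard_df(jsonl_path: str) -> pd.DataFrame:
    """Hypothetical stand-in for get_model_leaderboard_df.

    Assumes each line looks like:
    {"model": "gpt-4o", "Overall": 82.1, "Math": 75.3, ...}
    """
    records = []
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines
                records.append(json.loads(line))
    df = pd.DataFrame(records)
    if "Overall" in df.columns:  # rank by the overall score when present
        df = df.sort_values("Overall", ascending=False).reset_index(drop=True)
    return df


# df = load_leaderboard_df("./src/results/models_2024-10-08-17:39:21.001582.jsonl")
```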
@@ -124,6 +127,8 @@ def overall_leaderboard(dataframe):
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
+    gr.HTML(SUB_TITLE)
+    gr.HTML(EXTERNAL_LINKS)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
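The new `SUB_TITLE` and `EXTERNAL_LINKS` constants (added in src/about.py below) are rendered as raw HTML above the introduction text. A stripped-down sketch of the resulting page skeleton, with placeholder constants standing in for the imports from `src.about`:

```python
import gradio as gr

# Placeholder stand-ins for the constants imported from src.about
TITLE = '<h1 align="center" id="space-title">Decentralized Arena Leaderboard</h1>'
SUB_TITLE = '<h3 align="center" id="space-subtitle">Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions</h3>'
EXTERNAL_LINKS = '<h3 align="center"><a href="https://de-arena.maitrix.org/">Blog</a></h3>'
INTRODUCTION_TEXT = "Welcome to the Decentralized Arena leaderboard."

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)           # main heading
    gr.HTML(SUB_TITLE)       # new: tagline under the title
    gr.HTML(EXTERNAL_LINKS)  # new: blog / GitHub / video links
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("🎯 Overall", id=1):
            gr.Markdown("Leaderboard table goes here.")

if __name__ == "__main__":
    demo.launch()
```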
@@ -147,6 +152,12 @@ with demo:
             )
 
         with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
+            DESCRIPTION_TEXT = """
+            The Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
+            We start with diverse questions from the widely used [MT-Bench](https://arxiv.org/abs/2306.05685), covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
+
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
 
             leaderboard = overall_leaderboard(
                 get_model_leaderboard_df(
@@ -164,7 +175,21 @@ with demo:
             ))
 
         with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
-
+            DESCRIPTION_TEXT = """
+            Algebra, Geometry, and Probability are currently the three main math domains in the leaderboard.
+            To mitigate the potential impact of data contamination, we have carefully selected the datasets from various sources.
+            We prioritize recent math datasets and focus on questions at the college level and beyond.
+            The current datasets include
+            [MATH](https://arxiv.org/abs/2103.03874),
+            [MATH-500](https://github.com/openai/prm800k/tree/main/prm800k/math_splits),
+            [Omni](https://omni-math.github.io/),
+            [MathQA](https://arxiv.org/abs/1905.13319),
+            [MathBench](https://arxiv.org/abs/2405.12209),
+            [SciBench](https://arxiv.org/abs/2307.10635), and more!
+            We plan to include more math domains, such as calculus and number theory, in the future.
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
+
             # leaderboard = init_leaderboard(LEADERBOARD_DF)
             with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
                 leaderboard = overall_leaderboard(
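The Math tab nests per-domain subtabs (Algebra, and later Geometry and Probability), each built from the same results file filtered to one dimension. The helpers `overall_leaderboard` and `get_model_leaderboard_df` are not shown in this diff, so the sketch below is hedged: `gr.Dataframe` stands in for whatever component the real helper returns, and the filter function's signature is hypothetical.

```python
import gradio as gr
import pandas as pd


def overall_leaderboard(df: pd.DataFrame):
    """Stand-in for the repo's helper: render a ranking table."""
    return gr.Dataframe(value=df, interactive=False)


def get_model_leaderboard_df(path: str, domain: str) -> pd.DataFrame:
    """Hypothetical filter: return per-domain scores from the results file."""
    return pd.DataFrame({"model": ["model-a", "model-b"], domain: [71.2, 64.5]})


with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("🔢 Math", id=2):
            gr.Markdown("Math description ...", elem_classes="markdown-text")
            # The diff nests subtab TabItems inside the Math tab; an explicit inner Tabs is used here for clarity.
            with gr.Tabs():
                for i, domain in enumerate(["Algebra", "Geometry", "Probability"]):
                    with gr.TabItem(domain, id=i, elem_classes="subtab"):
                        overall_leaderboard(get_model_leaderboard_df("results.jsonl", domain))
```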
@@ -217,8 +242,27 @@ with demo:
             )
             )
 
-
         with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
+            DESCRIPTION_TEXT = """
+            Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective at distinguishing between modern LLMs.
+            Our current focus is on two challenging types of reasoning: logical reasoning and social reasoning, both of which present more meaningful and sophisticated ways to assess LLM performance.
+
+            For logical reasoning, we collect datasets from
+            [BigBench Hard (BBH)](https://arxiv.org/abs/2210.09261),
+            [FOLIO](https://arxiv.org/abs/2209.00840),
+            [LogiQA2.0](https://github.com/csitfun/LogiQA2.0),
+            [PrOntoQA](https://arxiv.org/abs/2210.01240),
+            [ReClor](https://arxiv.org/abs/2002.04326).
+
+            For social reasoning, we collect datasets from
+            [MMToM-QA](https://arxiv.org/abs/2401.08743),
+            [BigToM](https://arxiv.org/abs/2306.15448),
+            [Adv-CSFB](https://arxiv.org/abs/2305.14763),
+            [SocialIQA](https://arxiv.org/abs/1904.09728),
+            [NormBank](https://arxiv.org/abs/2305.17008).
+
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
 
             with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
                 leaderboard = overall_leaderboard(
@@ -254,13 +298,37 @@ with demo:
             )
             )
 
-
-
-
+        with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
+            CURRENT_TEXT = """
+            # Coming soon!
+            We are working on adding more tasks in scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
+            We have collected a diverse range of recent science datasets, including but not limited to
+            [GPQA](https://arxiv.org/abs/2311.12022),
+            [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
+            [MMLU-Pro](https://arxiv.org/abs/2406.01574),
+            [OlympiadBench](https://arxiv.org/abs/2402.14008),
+            [SciBench](https://arxiv.org/abs/2307.10635),
+            [SciEval](https://arxiv.org/abs/2308.13149).
+            """
+            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
+
+
+        with gr.TabItem("</> Coding", elem_id="coding-tab-table", id=5):
+            CURRENT_TEXT = """
+            # Coming soon!
+            We are working on adding more tasks in coding domains to the leaderboard.
+            The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
+            We collect a variety of recent coding datasets, including
+            [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
+            [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp),
+            [HumanEvalFix](https://huggingface.co/datasets/bigcode/humanevalpack),
+            [newly crawled LeetCode data](https://leetcode.com/problemset/),
+            filtered code-related queries from [Arena-Hard-Auto](https://github.com/lmarena/arena-hard-auto), and more!
+            Our efforts also include synthesizing new code-related queries to ensure diversity.
+            """
+            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
 
 
-        with gr.TabItem("🔬 Science", elem_id="science-table", id=5):
-            gr.Markdown(COMING_SOON_TEXT, elem_classes="markdown-text")
 
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
src/about.py CHANGED
@@ -53,7 +53,19 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Decentralized Arena</h1>"""
+TITLE = """<h1 align="center" id="space-title">Decentralized Arena Leaderboard</h1>"""
+
+SUB_TITLE = """<h3 align="center" id="space-subtitle">Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions</h3>"""
+
+EXTERNAL_LINKS = """
+<h3 align="center" id="space-links">
+    <a href="https://de-arena.maitrix.org/" target="_blank">Blog</a> |
+    <a href="https://github.com/maitrix-org/de-arena" target="_blank">GitHub</a> |
+    <a href="https://de-arena.maitrix.org/images/Heading.mp4" target="">Video</a> |
+    <a href="https://maitrix.org/" target="_blank">@Maitrix.org</a> |
+    <a href="https://www.llm360.ai/" target="_blank">@LLM360</a>
+</h3>
+"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
@@ -110,4 +122,9 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@misc{decentralized2024,
+    title={Decentralized Arena via Collective LLM Intelligence: Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions},
+    author={Yanbin Yin, Zhen Wang, Kun Zhou, Xiangdong Zhang, Shibo Hao, Yi Gu, Jieyuan Liu, Somanshu Singla, Tianyang Liu, Eric P. Xing, Zhengzhong Liu, Haojian Jin, Zhiting Hu},
+    year=2024
+}
 """
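This hunk fills the previously empty `CITATION_BUTTON_TEXT` with a BibTeX entry. In the stock leaderboard template such constants are usually surfaced at the bottom of the page as a copyable textbox inside an accordion; the wiring below is a hedged sketch of that assumed layout, which is not part of this diff:

```python
import gradio as gr

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{decentralized2024,
    title={Decentralized Arena via Collective LLM Intelligence: ...},
    year=2024
}"""

with gr.Blocks() as demo:
    with gr.Accordion("📙 Citation", open=False):
        gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=6,
            elem_id="citation-button",
            show_copy_button=True,  # lets users copy the BibTeX in one click
        )
```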