Update space

- app.py +76 -8
- src/about.py +18 -1
app.py CHANGED
@@ -11,6 +11,8 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    SUB_TITLE,
+    EXTERNAL_LINKS,
     COMING_SOON_TEXT
 )
 from src.display.css_html_js import custom_css
@@ -99,7 +101,8 @@ def init_leaderboard(dataframe):
 
 # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
 # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
-model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
+# model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
+model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
 # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 
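This hunk simply points the app at a newer results snapshot: `model_result_path` is a JSONL file that `get_model_leaderboard_df` (defined elsewhere in the repo, not part of this diff) turns into the leaderboard table. A minimal sketch of such a loader, assuming one JSON object per line with a model name and per-dimension scores; the field names here are hypothetical:

```python
import json

import pandas as pd


def load_leaderboard_df(jsonl_path: str) -> pd.DataFrame:
    """Hypothetical stand-in for get_model_leaderboard_df.

    Assumes each line looks like:
    {"model": "gpt-4o", "Overall": 82.1, "Math": 75.3, ...}
    """
    records = []
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines
                records.append(json.loads(line))
    df = pd.DataFrame(records)
    if "Overall" in df.columns:  # rank by the overall score when present
        df = df.sort_values("Overall", ascending=False).reset_index(drop=True)
    return df


# df = load_leaderboard_df("./src/results/models_2024-10-08-17:39:21.001582.jsonl")
```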
@@ -124,6 +127,8 @@ def overall_leaderboard(dataframe):
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
+    gr.HTML(SUB_TITLE)
+    gr.HTML(EXTERNAL_LINKS)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
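The new `SUB_TITLE` and `EXTERNAL_LINKS` constants (added in src/about.py below) are rendered as raw HTML above the introduction text. A stripped-down sketch of the resulting page skeleton, with placeholder constants standing in for the imports from `src.about`:

```python
import gradio as gr

# Placeholder stand-ins for the constants imported from src.about
TITLE = '<h1 align="center" id="space-title">Decentralized Arena Leaderboard</h1>'
SUB_TITLE = '<h3 align="center" id="space-subtitle">Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions</h3>'
EXTERNAL_LINKS = '<h3 align="center"><a href="https://de-arena.maitrix.org/">Blog</a></h3>'
INTRODUCTION_TEXT = "Welcome to the Decentralized Arena leaderboard."

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)           # main heading
    gr.HTML(SUB_TITLE)       # new: tagline under the title
    gr.HTML(EXTERNAL_LINKS)  # new: blog / GitHub / video links
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("🎯 Overall", id=1):
            gr.Markdown("Leaderboard table goes here.")

if __name__ == "__main__":
    demo.launch()
```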
@@ -147,6 +152,12 @@ with demo:
             )
 
         with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
+            DESCRIPTION_TEXT = """
+            The Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
+            We start with diverse questions from the widely used [MT-Bench](https://arxiv.org/abs/2306.05685), covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
+
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
 
             leaderboard = overall_leaderboard(
                 get_model_leaderboard_df(
@@ -164,7 +175,21 @@ with demo:
             ))
 
         with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
-
+            DESCRIPTION_TEXT = """
+            Algebra, Geometry, and Probability are currently the three main math domains in the leaderboard.
+            To mitigate the potential impact of data contamination, we have carefully selected the datasets from various sources.
+            We prioritize recent math datasets and focus on questions at the college level and beyond.
+            The current datasets include
+            [MATH](https://arxiv.org/abs/2103.03874),
+            [MATH-500](https://github.com/openai/prm800k/tree/main/prm800k/math_splits),
+            [Omni](https://omni-math.github.io/),
+            [MathQA](https://arxiv.org/abs/1905.13319),
+            [MathBench](https://arxiv.org/abs/2405.12209),
+            [SciBench](https://arxiv.org/abs/2307.10635), and more!
+            We plan to include more math domains, such as calculus and number theory, in the future.
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
+
             # leaderboard = init_leaderboard(LEADERBOARD_DF)
             with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
                 leaderboard = overall_leaderboard(
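The Math tab nests per-domain subtabs (Algebra, and later Geometry and Probability), each built from the same results file filtered to one dimension. The helpers `overall_leaderboard` and `get_model_leaderboard_df` are not shown in this diff, so the sketch below is hedged: `gr.Dataframe` stands in for whatever component the real helper returns, and the filter function's signature is hypothetical.

```python
import gradio as gr
import pandas as pd


def overall_leaderboard(df: pd.DataFrame):
    """Stand-in for the repo's helper: render a ranking table."""
    return gr.Dataframe(value=df, interactive=False)


def get_model_leaderboard_df(path: str, domain: str) -> pd.DataFrame:
    """Hypothetical filter: return per-domain scores from the results file."""
    return pd.DataFrame({"model": ["model-a", "model-b"], domain: [71.2, 64.5]})


with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("🔢 Math", id=2):
            gr.Markdown("Math description ...", elem_classes="markdown-text")
            # The diff nests subtab TabItems inside the Math tab; an explicit inner Tabs is used here for clarity.
            with gr.Tabs():
                for i, domain in enumerate(["Algebra", "Geometry", "Probability"]):
                    with gr.TabItem(domain, id=i, elem_classes="subtab"):
                        overall_leaderboard(get_model_leaderboard_df("results.jsonl", domain))
```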
@@ -217,8 +242,27 @@ with demo:
             )
             )
 
-
         with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
+            DESCRIPTION_TEXT = """
+            Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective at distinguishing between modern LLMs.
+            Our current focus is on two challenging types of reasoning: logical reasoning and social reasoning, both of which present more meaningful and sophisticated ways to assess LLM performance.
+
+            For logical reasoning, we collect datasets from
+            [BigBench Hard (BBH)](https://arxiv.org/abs/2210.09261),
+            [FOLIO](https://arxiv.org/abs/2209.00840),
+            [LogiQA2.0](https://github.com/csitfun/LogiQA2.0),
+            [PrOntoQA](https://arxiv.org/abs/2210.01240),
+            [ReClor](https://arxiv.org/abs/2002.04326).
+
+            For social reasoning, we collect datasets from
+            [MMToM-QA](https://arxiv.org/abs/2401.08743),
+            [BigToM](https://arxiv.org/abs/2306.15448),
+            [Adv-CSFB](https://arxiv.org/abs/2305.14763),
+            [SocialIQA](https://arxiv.org/abs/1904.09728),
+            [NormBank](https://arxiv.org/abs/2305.17008).
+
+            """
+            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
 
             with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
                 leaderboard = overall_leaderboard(
@@ -254,13 +298,37 @@ with demo:
             )
             )
 
-
-
-
+        with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
+            CURRENT_TEXT = """
+            # Coming soon!
+            We are working on adding more tasks in scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
+            We have collected a diverse range of recent science datasets, including but not limited to
+            [GPQA](https://arxiv.org/abs/2311.12022),
+            [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
+            [MMLU-Pro](https://arxiv.org/abs/2406.01574),
+            [OlympiadBench](https://arxiv.org/abs/2402.14008),
+            [SciBench](https://arxiv.org/abs/2307.10635),
+            [SciEval](https://arxiv.org/abs/2308.13149).
+            """
+            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
+
+
+        with gr.TabItem("</> Coding", elem_id="coding-tab-table", id=5):
+            CURRENT_TEXT = """
+            # Coming soon!
+            We are working on adding more tasks in coding domains to the leaderboard.
+            The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
+            We collect a variety of recent coding datasets, including
+            [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
+            [MBPP](https://huggingface.co/datasets/google-research-datasets/mbpp),
+            [HumanEvalFix](https://huggingface.co/datasets/bigcode/humanevalpack),
+            [newly crawled LeetCode data](https://leetcode.com/problemset/),
+            filtered code-related queries from [Arena-Hard-Auto](https://github.com/lmarena/arena-hard-auto), and more!
+            Our efforts also include synthesizing new code-related queries to ensure diversity.
+            """
+            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
 
 
-        with gr.TabItem("🔬 Science", elem_id="science-table", id=5):
-            gr.Markdown(COMING_SOON_TEXT, elem_classes="markdown-text")
 
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
src/about.py CHANGED
@@ -53,7 +53,19 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Decentralized Arena</h1>"""
+TITLE = """<h1 align="center" id="space-title">Decentralized Arena Leaderboard</h1>"""
+
+SUB_TITLE = """<h3 align="center" id="space-subtitle">Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions</h3>"""
+
+EXTERNAL_LINKS = """
+<h3 align="center" id="space-links">
+    <a href="https://de-arena.maitrix.org/" target="_blank">Blog</a> |
+    <a href="https://github.com/maitrix-org/de-arena" target="_blank">GitHub</a> |
+    <a href="https://de-arena.maitrix.org/images/Heading.mp4" target="">Video</a> |
+    <a href="https://maitrix.org/" target="_blank">@Maitrix.org</a> |
+    <a href="https://www.llm360.ai/" target="_blank">@LLM360</a>
+</h3>
+"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
@@ -110,4 +122,9 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@misc{decentralized2024,
+    title={Decentralized Arena via Collective LLM Intelligence: Building Automated, Robust, and Transparent LLM Evaluation for Numerous Dimensions},
+    author={Yanbin Yin, Zhen Wang, Kun Zhou, Xiangdong Zhang, Shibo Hao, Yi Gu, Jieyuan Liu, Somanshu Singla, Tianyang Liu, Eric P. Xing, Zhengzhong Liu, Haojian Jin, Zhiting Hu},
+    year=2024
+}
 """
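This hunk fills the previously empty `CITATION_BUTTON_TEXT` with a BibTeX entry. In the stock leaderboard template such constants are usually surfaced at the bottom of the page as a copyable textbox inside an accordion; the wiring below is a hedged sketch of that assumed layout, which is not part of this diff:

```python
import gradio as gr

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{decentralized2024,
    title={Decentralized Arena via Collective LLM Intelligence: ...},
    year=2024
}"""

with gr.Blocks() as demo:
    with gr.Accordion("📙 Citation", open=False):
        gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=6,
            elem_id="citation-button",
            show_copy_button=True,  # lets users copy the BibTeX in one click
        )
```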