Spaces: Running on CPU Upgrade

Clémentine committed · Commit beb2b32 · 1 Parent(s): aa85eec

update v2

Browse files:
- README.md +4 -2
- app.py +404 -25
- app_bkp.py +0 -316
- gif.gif +0 -3
- pyproject.toml +4 -2
- requirements.txt +7 -2
- src/display/about.py +85 -2
- src/display/css_html_js.py +39 -15
- src/display/formatting.py +1 -1
- src/display/utils.py +67 -50
- src/envs.py +6 -4
- src/leaderboard/filter_models.py +8 -118
- src/populate.py +4 -3
- src/submission/submit.py +2 -0
- src/tools/plots.py +1 -1
- src/voting/vote_system.py +151 -0
README.md CHANGED

@@ -1,15 +1,17 @@
 ---
-title: Open LLM Leaderboard
+title: Open LLM Leaderboard 2
 emoji: 🏆
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.36.1
 app_file: app.py
 pinned: true
 license: apache-2.0
+duplicated_from: open-llm-leaderboard/open_llm_leaderboard
 fullWidth: true
 startup_duration_timeout: 1h
+hf_oauth: true
 space_ci:
   private: true
   secrets:
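Two of the new keys matter for the rest of this commit: hf_oauth: true enables Hugging Face OAuth for the Space (used by the new voting tab in app.py), and the bumped sdk_version matches the gradio "oauth" extra pinned in pyproject.toml below. As a hedged illustration of what the flag unlocks — not code from this commit — a Gradio app can then declare a gr.OAuthProfile parameter and have it injected automatically:

import gradio as gr

def login_status(profile: gr.OAuthProfile | None) -> str:
    # With hf_oauth enabled, Gradio injects the OAuth profile (or None
    # when the visitor is not signed in) for parameters annotated this way.
    return f"Signed in as {profile.username}" if profile else "Not signed in"

with gr.Blocks() as demo:
    gr.LoginButton()  # renders the "Sign in with Hugging Face" button
    status = gr.Textbox(label="Login status")
    demo.load(login_status, inputs=None, outputs=status)

This is the same pattern app.py uses in its check_login helper and oauth-styled login button.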
app.py CHANGED

@@ -1,12 +1,17 @@
 import os
 import logging
 import time
+import schedule
 import datetime
 import gradio as gr
+from threading import Thread
 import datasets
 from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns

+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+
 from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
@@ -27,6 +32,7 @@ from src.display.utils import (
     Precision,
     WeightType,
     fields,
+    EvalQueueColumn
 )
 from src.envs import (
     API,
@@ -35,35 +41,343 @@ from src.envs import (
     HF_TOKEN,
     QUEUE_REPO,
     REPO_ID,
+    VOTES_REPO,
+    VOTES_PATH,
     HF_HOME,
 )
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
+from src.voting.vote_system import VoteManager, run_scheduler

-
-
-
-
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+
+# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
+# This controls whether a full initialization should be performed.
+DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
+LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
+LEADERBOARD_DF = None
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
+
+
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+
+    return wrapper

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+@time_diff_wrapper
+def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
+    """Download dataset with exponential backoff retries."""
+    attempt = 0
+    while attempt < max_attempts:
+        try:
+            logging.info(f"Downloading {repo_id} to {local_dir}")
+            snapshot_download(
+                repo_id=repo_id,
+                local_dir=local_dir,
+                repo_type=repo_type,
+                tqdm_class=None,
+                etag_timeout=30,
+                max_workers=8,
+            )
+            logging.info("Download successful")
+            return
+        except Exception as e:
+            wait_time = backoff_factor**attempt
+            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
+            time.sleep(wait_time)
+            attempt += 1
+    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
+
+def get_latest_data_leaderboard(leaderboard_initial_df = None):
+    current_time = datetime.datetime.now()
+    global LAST_UPDATE_LEADERBOARD
+    if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
+        return leaderboard_initial_df
+    LAST_UPDATE_LEADERBOARD = current_time
+    leaderboard_dataset = datasets.load_dataset(
+        AGGREGATED_REPO,
+        "default",
+        split="train",
+        cache_dir=HF_HOME,
+        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+        verification_mode="no_checks"
+    )
+
+    global LEADERBOARD_DF
+    LEADERBOARD_DF = get_leaderboard_df(
+        leaderboard_dataset=leaderboard_dataset,
+        cols=COLS,
+        benchmark_cols=BENCHMARK_COLS,
     )
-
-
+
+    return LEADERBOARD_DF
+
+def get_latest_data_queue():
+    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    return eval_queue_dfs
+
+def init_space():
+    """Initializes the application space, loading only necessary data."""
+    if DO_FULL_INIT:
+        # These downloads only occur on full initialization
+        try:
+            download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+            download_dataset(VOTES_REPO, VOTES_PATH)
+        except Exception:
+            restart_space()
+
+    # Always redownload the leaderboard DataFrame
+    global LEADERBOARD_DF
+    LEADERBOARD_DF = get_latest_data_leaderboard()
+
+    # Evaluation queue DataFrame retrieval is independent of initialization detail level
+    eval_queue_dfs = get_latest_data_queue()
+
+    return LEADERBOARD_DF, eval_queue_dfs
+
+# Initialize VoteManager
+vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
+
+
+# Schedule the upload_votes method to run every 15 minutes
+schedule.every(15).minutes.do(vote_manager.upload_votes)
+
+# Start the scheduler in a separate thread
+scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
+scheduler_thread.start()
+
+# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
+# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
+LEADERBOARD_DF, eval_queue_dfs = init_space()
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
+
+
+# Data processing for plots now only on demand in the respective Gradio tab
+def load_and_create_plots():
+    plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
+    return plot_df
+
+# Function to check if a user is logged in
+def check_login(profile: gr.OAuthProfile | None) -> bool:
+    if profile is None:
+        return False
+    return True
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            ),
+            ColumnFilter(
+                AutoEvalColumn.merged.name, type="boolean", label="Merge/MoErge", default=True
+            ),
+            ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
+            ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
+            ColumnFilter(AutoEvalColumn.maintainers_highlight.name, type="boolean", label="Show only maintainer's highlight", default=False),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+main_block = gr.Blocks(css=custom_css)
+with main_block:
+    with gr.Row(elem_id="header-row"):
+        gr.HTML(TITLE)
+
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Row():
+                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+                with gr.Row():
+                    with gr.Column():
+                        model_name_textbox = gr.Textbox(label="Model name")
+                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="latest")
+                        with gr.Row():
+                            model_type = gr.Dropdown(
+                                choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                                label="Model type",
+                                multiselect=False,
+                                value=ModelType.FT.to_str(" : "),
+                                interactive=True,
+                            )
+                            chat_template_toggle = gr.Checkbox(
+                                label="Use chat template",
+                                value=False,
+                                info="Is your model a chat model?",
+                            )
+
+                    with gr.Column():
+                        precision = gr.Dropdown(
+                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                            label="Precision",
+                            multiselect=False,
+                            value="float16",
+                            interactive=True,
+                        )
+                        weight_type = gr.Dropdown(
+                            choices=[i.value.name for i in WeightType],
+                            label="Weights type",
+                            multiselect=False,
+                            value="Original",
+                            interactive=True,
+                        )
+                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            with gr.Column():
+                with gr.Accordion(
+                    f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        finished_eval_table = gr.components.Dataframe(
+                            value=finished_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+                with gr.Accordion(
+                    f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        running_eval_table = gr.components.Dataframe(
+                            value=running_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
+                with gr.Accordion(
+                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        pending_eval_table = gr.components.Dataframe(
+                            value=pending_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+
+            # The chat template checkbox update function
+            def update_chat_checkbox(model_type_value):
+                return ModelType.from_str(model_type_value) == ModelType.chat
+
+            model_type.change(
+                fn=update_chat_checkbox,
+                inputs=[model_type],  # Pass the current checkbox value
+                outputs=chat_template_toggle,
+            )
+
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                    chat_template_toggle,
+                ],
+                submission_result,
+            )
+
+        # Ensure the values in 'pending_eval_queue_df' are correct and ready for the DataFrame component
+        with gr.TabItem("🆙 Model Vote"):
+            with gr.Row():
+                gr.Markdown(
+                    "## Vote for the models which should be evaluated first! \nYou'll need to sign in with the button above first. All votes are recorded.",
+                    elem_classes="markdown-text"
+                )
+                login_button = gr.LoginButton(elem_id="oauth-button")
+
+
+            with gr.Row():
+                pending_models = pending_eval_queue_df[EvalQueueColumn.model_name.name].to_list()
+
+                with gr.Column():
+                    selected_model = gr.Dropdown(
+                        choices=pending_models,
+                        label="Models",
+                        multiselect=False,
+                        value="str",
+                        interactive=True,
+                    )
+
+                    vote_button = gr.Button("Vote", variant="primary")
+
+            with gr.Row():
+                with gr.Accordion(
+                    f"Available models pending ({len(pending_eval_queue_df)})",
+                    open=True,
+                ):
+                    with gr.Row():
+                        pending_eval_table_votes = gr.components.Dataframe(
+                            value=vote_manager.create_request_vote_df(
+                                pending_eval_queue_df
+                            ),
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                            interactive=False
+                        )
+
+            # Set the click event for the vote button
+            vote_button.click(
+                vote_manager.add_vote,
+                inputs=[selected_model, pending_eval_table],
+                outputs=[pending_eval_table_votes]
+            )
+

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -75,4 +389,69 @@ with demo:
                 show_copy_button=True,
             )

-
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
+    leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
+    pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
+
+main_block.queue(default_concurrency_limit=40)
+
+
+def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
+    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
+    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
+    # ht to Lucain!
+    if SPACE_ID is None:
+        print("Not in a Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+
+    if IS_EPHEMERAL_SPACE:
+        print("In an ephemeral Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+
+    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
+    config = card.data.get("space_ci", {})
+    print(f"Enabling Space CI with config from README: {config}")
+
+    return configure_space_ci(
+        blocks=ui,
+        trusted_authors=config.get("trusted_authors"),
+        private=config.get("private", "auto"),
+        variables=config.get("variables", "auto"),
+        secrets=config.get("secrets"),
+        hardware=config.get("hardware"),
+        storage=config.get("storage"),
+    )
+
+# Create webhooks server (with CI url if in Space and not ephemeral)
+webhooks_server = enable_space_ci_and_return_server(ui=main_block)
+
+# Add webhooks
+@webhooks_server.add_webhook
+def update_leaderboard(payload: WebhookPayload) -> None:
+    """Redownloads the leaderboard dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        datasets.load_dataset(
+            AGGREGATED_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
+            verification_mode="no_checks"
+        )
+
+# The below code is not used at the moment, as we can manage the queue file locally
+LAST_UPDATE_QUEUE = datetime.datetime.now()
+@webhooks_server.add_webhook
+def update_queue(payload: WebhookPayload) -> None:
+    """Redownloads the queue dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        current_time = datetime.datetime.now()
+        global LAST_UPDATE_QUEUE
+        if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
+            print("Would have updated the queue")
+            # We only redownload is last update was more than 10 minutes ago, as the queue is
+            # updated regularly and heavy to download
+            #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+        LAST_UPDATE_QUEUE = datetime.datetime.now()
+
+webhooks_server.launch()
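The new src/voting/vote_system.py (+151 lines) is not rendered on this page, but the calls above pin down its public surface: VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO), add_vote, create_request_vote_df, upload_votes, and run_scheduler. A hedged sketch of that interface follows; every body (and the exact signatures beyond what app.py passes) is an assumption, not the committed code:

# Hypothetical outline of src/voting/vote_system.py, inferred solely from
# how app.py uses it; the committed implementation may differ.
import time
import schedule
import pandas as pd

class VoteManager:
    def __init__(self, votes_path: str, eval_requests_path: str, repo_id: str):
        self.votes_path = votes_path                  # local vote store
        self.eval_requests_path = eval_requests_path  # local queue snapshot
        self.repo_id = repo_id                        # VOTES_REPO dataset on the Hub

    def add_vote(self, selected_model: str, pending_df: pd.DataFrame) -> pd.DataFrame:
        """Record one vote for `selected_model`, then return the refreshed vote table."""
        ...

    def create_request_vote_df(self, pending_df: pd.DataFrame) -> pd.DataFrame:
        """Join accumulated vote counts onto the pending-requests table for display."""
        ...

    def upload_votes(self) -> None:
        """Push the local vote store to the Hub (scheduled every 15 minutes in app.py)."""
        ...

def run_scheduler(vote_manager: VoteManager) -> None:
    # Runs in the daemon thread that app.py starts: the `schedule` library
    # never fires jobs on its own, so its registry must be polled.
    while True:
        schedule.run_pending()
        time.sleep(1)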
app_bkp.py DELETED

@@ -1,316 +0,0 @@
-import os
-import logging
-import time
-import datetime
-import gradio as gr
-import datasets
-from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-
-from src.display.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    FAQ_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    Precision,
-    WeightType,
-    fields,
-)
-from src.envs import (
-    API,
-    EVAL_REQUESTS_PATH,
-    AGGREGATED_REPO,
-    HF_TOKEN,
-    QUEUE_REPO,
-    REPO_ID,
-    HF_HOME,
-)
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-
-
-# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
-# This controls whether a full initialization should be performed.
-DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
-LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
-
-
-def time_diff_wrapper(func):
-    def wrapper(*args, **kwargs):
-        start_time = time.time()
-        result = func(*args, **kwargs)
-        end_time = time.time()
-        diff = end_time - start_time
-        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
-        return result
-
-    return wrapper
-
-
-@time_diff_wrapper
-def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
-    """Download dataset with exponential backoff retries."""
-    attempt = 0
-    while attempt < max_attempts:
-        try:
-            logging.info(f"Downloading {repo_id} to {local_dir}")
-            snapshot_download(
-                repo_id=repo_id,
-                local_dir=local_dir,
-                repo_type=repo_type,
-                tqdm_class=None,
-                etag_timeout=30,
-                max_workers=8,
-            )
-            logging.info("Download successful")
-            return
-        except Exception as e:
-            wait_time = backoff_factor**attempt
-            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
-            time.sleep(wait_time)
-            attempt += 1
-    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
-
-def get_latest_data_leaderboard(leaderboard_initial_df = None):
-    current_time = datetime.datetime.now()
-    global LAST_UPDATE_LEADERBOARD
-    if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
-        return leaderboard_initial_df
-    LAST_UPDATE_LEADERBOARD = current_time
-    leaderboard_dataset = datasets.load_dataset(
-        AGGREGATED_REPO,
-        "default",
-        split="train",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
-        verification_mode="no_checks"
-    )
-
-    leaderboard_df = get_leaderboard_df(
-        leaderboard_dataset=leaderboard_dataset,
-        cols=COLS,
-        benchmark_cols=BENCHMARK_COLS,
-    )
-
-    return leaderboard_df
-
-def get_latest_data_queue():
-    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-    return eval_queue_dfs
-
-def init_space():
-    """Initializes the application space, loading only necessary data."""
-    if DO_FULL_INIT:
-        # These downloads only occur on full initialization
-        try:
-            download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        except Exception:
-            restart_space()
-
-    # Always redownload the leaderboard DataFrame
-    leaderboard_df = get_latest_data_leaderboard()
-
-    # Evaluation queue DataFrame retrieval is independent of initialization detail level
-    eval_queue_dfs = get_latest_data_queue()
-
-    return leaderboard_df, eval_queue_dfs
-
-
-# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
-# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-leaderboard_df, eval_queue_dfs = init_space()
-finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
-
-
-# Data processing for plots now only on demand in the respective Gradio tab
-def load_and_create_plots():
-    plot_df = create_plot_df(create_scores_df(leaderboard_df))
-    return plot_df
-
-def init_leaderboard(dataframe):
-    return Leaderboard(
-        value = dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
-            ),
-            ColumnFilter(
-                AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
-            ),
-            ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
-            ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(leaderboard_df)
-
-        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
-            with gr.Row():
-                with gr.Column():
-                    plot_df = load_and_create_plots()
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        [AutoEvalColumn.average.name],
-                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
-                with gr.Column():
-                    plot_df = load_and_create_plots()
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        BENCHMARK_COLS,
-                        title="Top Scores and Human Baseline Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
-            gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit? ", elem_id="llm-benchmark-tab-table", id=5):
-            countdown = gr.HTML(
-                """<div align="center">
-                <div position: relative>
-                <img
-                    src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
-                    allowtransparency="true"
-                    style="display:block;width:100%;height:auto;"
-                />
-                <iframe
-                    src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&timezone=Europe%2FParis&width=&style=circles&uid=815898&loc=https://logwork.com/countdown-fxmc&language=en&textcolor=&background=%23ffd21e&date=2024-06-26%2015%3A00%3A00&digitscolor=%23ff9d00&unitscolor=&"
-                    style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
-                    scrolling="no"
-                    allowtransparency="true"
-                    frameborder="0"
-                    allowfullscreen
-                />
-                </div>
-                </div>"""
-            )
-            #gif = gr.Image(value="./gif.gif", interactive=False)
-            gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
-demo.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
-
-
-demo.queue(default_concurrency_limit=40)
-
-# Start ephemeral Spaces on PRs (see config in README.md)
-from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
-
-def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
-    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
-    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
-    # ht to Lucain!
-    if SPACE_ID is None:
-        print("Not in a Space: Space CI disabled.")
-        return WebhooksServer(ui=demo)
-
-    if IS_EPHEMERAL_SPACE:
-        print("In an ephemeral Space: Space CI disabled.")
-        return WebhooksServer(ui=demo)
-
-    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
-    config = card.data.get("space_ci", {})
-    print(f"Enabling Space CI with config from README: {config}")
-
-    return configure_space_ci(
-        blocks=ui,
-        trusted_authors=config.get("trusted_authors"),
-        private=config.get("private", "auto"),
-        variables=config.get("variables", "auto"),
-        secrets=config.get("secrets"),
-        hardware=config.get("hardware"),
-        storage=config.get("storage"),
-    )
-
-# Create webhooks server (with CI url if in Space and not ephemeral)
-webhooks_server = enable_space_ci_and_return_server(ui=demo)
-
-# Add webhooks
-@webhooks_server.add_webhook
-def update_leaderboard(payload: WebhookPayload) -> None:
-    """Redownloads the leaderboard dataset each time it updates"""
-    if payload.repo.type == "dataset" and payload.event.action == "update":
-        datasets.load_dataset(
-            AGGREGATED_REPO,
-            "default",
-            split="train",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
-            verification_mode="no_checks"
-        )
-
-# The below code is not used at the moment, as we can manage the queue file locally
-LAST_UPDATE_QUEUE = datetime.datetime.now()
-@webhooks_server.add_webhook
-def update_queue(payload: WebhookPayload) -> None:
-    """Redownloads the queue dataset each time it updates"""
-    if payload.repo.type == "dataset" and payload.event.action == "update":
-        current_time = datetime.datetime.now()
-        global LAST_UPDATE_QUEUE
-        if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
-            print("Would have updated the queue")
-            # We only redownload is last update was more than 10 minutes ago, as the queue is
-            # updated regularly and heavy to download
-            #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        LAST_UPDATE_QUEUE = datetime.datetime.now()
-
-webhooks_server.launch()
gif.gif DELETED (Git LFS)
pyproject.toml CHANGED

@@ -38,16 +38,18 @@ numpy = "1.26.0"
 pandas = "2.2.2"
 plotly = "5.14.1"
 python-dateutil = "2.8.2"
-requests = "2.28.2"
 sentencepiece = "^0.2.0"
 tqdm = "4.65.0"
 transformers = "4.41.1"
 tokenizers = ">=0.15.0"
 gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.3"}
-gradio = " 4.20.0"
 isort = "^5.13.2"
 ruff = "^0.3.5"
 gradio-leaderboard = "0.0.8"
+gradio = {extras = ["oauth"], version = "^4.36.1"}
+requests = "^2.31.0"
+requests-oauthlib = "^1.3.1"
+schedule = "^1.2.2"

 [build-system]
 requires = ["poetry-core"]
requirements.txt CHANGED

@@ -8,11 +8,16 @@ numpy==1.26.0
 pandas==2.2.2
 plotly==5.14.1
 python-dateutil==2.8.2
-requests==2.28.2
 sentencepiece
 tqdm==4.65.0
 transformers==4.41.1
 tokenizers>=0.15.0
 gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
-
+isort
+ruff
+gradio==4.31.0
+gradio[oauth]
 gradio_leaderboard==0.0.9
+requests==2.31.0
+requests-oauthlib== 1.3.1
+schedule == 1.2.2
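Among the new pins, schedule is the one with non-obvious semantics: it is a pure job registry, and schedule.every(15).minutes.do(...) only records the job; nothing executes until some loop polls run_pending(), which is why app.py dedicates a daemon thread to it. A minimal, self-contained illustration (the job body is a stand-in, not the commit's code):

import schedule

def job():
    print("stand-in for vote_manager.upload_votes")

schedule.every(15).minutes.do(job)  # registers the job, runs nothing yet

schedule.run_pending()              # executes job only if its interval elapsed
print(schedule.idle_seconds())      # seconds until the next scheduled run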
src/display/about.py CHANGED

@@ -219,6 +219,89 @@ CITATION_BUTTON_TEXT = r"""
 publisher = {Hugging Face},
 howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
 }
-
-
+@software{eval-harness,
+  author       = {Gao, Leo and
+                  Tow, Jonathan and
+                  Biderman, Stella and
+                  Black, Sid and
+                  DiPofi, Anthony and
+                  Foster, Charles and
+                  Golding, Laurence and
+                  Hsu, Jeffrey and
+                  McDonell, Kyle and
+                  Muennighoff, Niklas and
+                  Phang, Jason and
+                  Reynolds, Laria and
+                  Tang, Eric and
+                  Thite, Anish and
+                  Wang, Ben and
+                  Wang, Kevin and
+                  Zou, Andy},
+  title        = {A framework for few-shot language model evaluation},
+  month        = sep,
+  year         = 2021,
+  publisher    = {Zenodo},
+  version      = {v0.0.1},
+  doi          = {10.5281/zenodo.5371628},
+  url          = {https://doi.org/10.5281/zenodo.5371628}
+}
+@misc{clark2018think,
+  title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+  author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+  year={2018},
+  eprint={1803.05457},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI}
+}
+@misc{zellers2019hellaswag,
+  title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+  author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
+  year={2019},
+  eprint={1905.07830},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{hendrycks2021measuring,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+  year={2021},
+  eprint={2009.03300},
+  archivePrefix={arXiv},
+  primaryClass={cs.CY}
+}
+@misc{lin2022truthfulqa,
+  title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
+  author={Stephanie Lin and Jacob Hilton and Owain Evans},
+  year={2022},
+  eprint={2109.07958},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1907-10641,
+  title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
+  author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+  year={2019},
+  eprint={1907.10641},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-2110-14168,
+  title={Training Verifiers to Solve Math Word Problems},
+  author={Karl Cobbe and
+          Vineet Kosaraju and
+          Mohammad Bavarian and
+          Mark Chen and
+          Heewoo Jun and
+          Lukasz Kaiser and
+          Matthias Plappert and
+          Jerry Tworek and
+          Jacob Hilton and
+          Reiichiro Nakano and
+          Christopher Hesse and
+          John Schulman},
+  year={2021},
+  eprint={2110.14168},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
 """
src/display/css_html_js.py CHANGED

@@ -9,7 +9,7 @@ table th:first-child {

 /* Full width space */
 .gradio-container {
-
+    max-width: 95% !important;
 }

 /* Text style and margins */
@@ -48,7 +48,7 @@ table th:first-child {
 }

 /* Filters style */
-#filter_type{
+#filter_type {
     border: 0;
     padding-left: 0;
     padding-top: 0;
@@ -56,29 +56,53 @@ table th:first-child {
 #filter_type label {
     display: flex;
 }
-#filter_type label > span{
+#filter_type label > span {
     margin-top: var(--spacing-lg);
     margin-right: 0.5em;
 }
-#filter_type label > .wrap{
+#filter_type label > .wrap {
     width: 103px;
 }
-#filter_type label > .wrap .wrap-inner{
+#filter_type label > .wrap .wrap-inner {
     padding: 2px;
 }
-#filter_type label > .wrap .wrap-inner input{
-    width: 1px
+#filter_type label > .wrap .wrap-inner input {
+    width: 1px;
 }
-#filter-columns-type{
-    border:0;
-    padding:0.5;
+#filter-columns-type {
+    border: 0;
+    padding: 0.5;
+}
+#filter-columns-size {
+    border: 0;
+    padding: 0.5;
 }
-#filter
-    border:0;
-    padding:0.5;
+#box-filter > .form {
+    border: 0;
 }
-
-
+
+/* Header styles */
+#header-title {
+    text-align: left;
+    display: inline-block;
+}
+
+#header-row {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+}
+
+#header-row .gradio-html {
+    flex-grow: 1;
+}
+
+#oauth-button {
+    height: auto;
+    min-width: max-content;
+    white-space: nowrap;
+    padding: 10px 20px;
+    border-radius: 4px;
 }
 """
src/display/formatting.py CHANGED

@@ -11,7 +11,7 @@ def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"

     details_model_name = model_name.replace("/", "__")
-    details_link = f"https://huggingface.co/datasets/open-llm-leaderboard
+    details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/{details_model_name}-details"

     return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
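For reference, the repaired f-string builds the details link like this (the model name below is hypothetical):

# Worked example of the make_clickable_model transformation above:
model_name = "some-org/some-model-7b"               # hypothetical input
details_model_name = model_name.replace("/", "__")  # "some-org__some-model-7b"
details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/{details_model_name}-details"
# -> https://huggingface.co/datasets/open-llm-leaderboard/some-org__some-model-7b-details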
src/display/utils.py
CHANGED
|
@@ -49,12 +49,23 @@ class Task:
|
|
| 49 |
|
| 50 |
|
| 51 |
class Tasks(Enum):
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
# These classes are for user facing column names,
|
|
@@ -77,7 +88,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
| 77 |
# Scores
|
| 78 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 79 |
for task in Tasks:
|
| 80 |
-
|
|
|
|
| 81 |
# Model information
|
| 82 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 83 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
|
@@ -94,7 +106,10 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
|
|
| 94 |
auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
|
| 95 |
auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
|
| 96 |
auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
| 98 |
auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
|
| 99 |
|
| 100 |
# We use make dataclass to dynamically fill the scores from Tasks
|
|
@@ -103,30 +118,31 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
|
|
| 103 |
|
| 104 |
@dataclass(frozen=True)
|
| 105 |
class EvalQueueColumn: # Queue column
|
| 106 |
-
|
|
|
|
| 107 |
revision = ColumnContent("revision", "str", True)
|
| 108 |
-
private = ColumnContent("private", "bool", True)
|
| 109 |
precision = ColumnContent("precision", "str", True)
|
| 110 |
-
weight_type = ColumnContent("weight_type", "str", "Original")
|
| 111 |
status = ColumnContent("status", "str", True)
|
| 112 |
|
| 113 |
|
| 114 |
-
baseline_row = {
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
}
|
| 130 |
|
| 131 |
# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
|
| 132 |
# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
|
|
@@ -136,22 +152,22 @@ baseline_row = {
|
|
| 136 |
# Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
|
| 137 |
# GSM8K: paper
|
| 138 |
# Define the human baselines
|
| 139 |
-
human_baseline_row = {
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
}
|
| 155 |
|
| 156 |
|
| 157 |
@dataclass
|
|
@@ -166,22 +182,22 @@ class ModelType(Enum):
|
|
| 166 |
FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
|
| 167 |
chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
|
| 168 |
merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
|
| 169 |
-
Unknown = ModelDetails(name="", symbol="
|
| 170 |
|
| 171 |
def to_str(self, separator=" "):
|
| 172 |
return f"{self.value.symbol}{separator}{self.value.name}"
|
| 173 |
|
| 174 |
@staticmethod
|
| 175 |
-
def from_str(
|
| 176 |
-
if
|
| 177 |
return ModelType.FT
|
| 178 |
-
if "
|
| 179 |
return ModelType.CPT
|
| 180 |
-
if "pretrained" in
|
| 181 |
return ModelType.PT
|
| 182 |
-
if any([k in
|
| 183 |
return ModelType.chat
|
| 184 |
-
if "merge" in
|
| 185 |
return ModelType.merges
|
| 186 |
return ModelType.Unknown
|
| 187 |
|
|
@@ -200,6 +216,7 @@ class Precision(Enum):
|
|
| 200 |
qt_GPTQ = ModelDetails("GPTQ")
|
| 201 |
Unknown = ModelDetails("?")
|
| 202 |
|
|
|
|
| 203 |
def from_str(precision):
|
| 204 |
if precision in ["torch.float16", "float16"]:
|
| 205 |
return Precision.float16
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
class Tasks(Enum):
|
| 52 |
+
ifeval = Task("leaderboard_ifeval", "strict_acc,none", "IFEval")
|
| 53 |
+
ifeval_raw = Task("leaderboard_ifeval", "strict_acc,none", "IFEval Raw")
|
| 54 |
+
|
| 55 |
+
bbh = Task("leaderboard_bbh", "acc_norm,none", "BBH")
|
| 56 |
+
bbh_raw = Task("leaderboard_bbh", "acc_norm,none", "BBH Raw")
|
| 57 |
+
|
| 58 |
+
math = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5")
|
| 59 |
+
math_raw = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5 Raw")
|
| 60 |
+
|
| 61 |
+
gpqa = Task("leaderboard_gpqa", "acc_norm,none", "GPQA")
|
| 62 |
+
gpqa_raw = Task("leaderboard_gpqa", "acc_norm,none", "GPQA Raw")
|
| 63 |
+
|
| 64 |
+
musr = Task("leaderboard_musr", "acc_norm,none", "MUSR")
|
| 65 |
+
musr_raw = Task("leaderboard_musr", "acc_norm,none", "MUSR Raw")
|
| 66 |
+
|
| 67 |
+
mmlu_pro = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO")
|
| 68 |
+
mmlu_pro_raw = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO Raw")
|
| 69 |
|
| 70 |
|
| 71 |
# These classes are for user facing column names,
|
|
|
|
| 88 |
# Scores
|
| 89 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 90 |
for task in Tasks:
|
| 91 |
+
displayed_by_default = not task.name.endswith("_raw")
|
| 92 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", displayed_by_default=displayed_by_default)])
|
| 93 |
# Model information
|
| 94 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 95 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
|
|
|
| 106 |
auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
|
| 107 |
auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
|
| 108 |
auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
|
| 109 |
+
auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Chat Template", "bool", False)])
|
| 110 |
+
auto_eval_column_dict.append(["maintainers_highlight", ColumnContent, ColumnContent("Maintainer's Highlight", "bool", False, hidden=True)])
|
| 111 |
+
|
| 112 |
+
# fullname structure: <user>/<model_name>
|
| 113 |
auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
|
| 114 |
|
| 115 |
# We use make dataclass to dynamically fill the scores from Tasks
|
|
|
|
| 118 |
|
| 119 |
@dataclass(frozen=True)
|
| 120 |
class EvalQueueColumn: # Queue column
|
| 121 |
+
model_link = ColumnContent("model_link", "markdown", True)
|
| 122 |
+
model_name = ColumnContent("model_name", "str", True)
|
| 123 |
revision = ColumnContent("revision", "str", True)
|
| 124 |
+
#private = ColumnContent("private", "bool", True) # Should not be displayed
|
| 125 |
precision = ColumnContent("precision", "str", True)
|
| 126 |
+
#weight_type = ColumnContent("weight_type", "str", "Original") # Might be confusing, to think about
|
| 127 |
status = ColumnContent("status", "str", True)
|
| 128 |
|
| 129 |
|
+# baseline_row = {
+#     AutoEvalColumn.model.name: "<p>Baseline</p>",
+#     AutoEvalColumn.revision.name: "N/A",
+#     AutoEvalColumn.precision.name: None,
+#     AutoEvalColumn.merged.name: False,
+#     AutoEvalColumn.average.name: 31.0,
+#     AutoEvalColumn.arc.name: 25.0,
+#     AutoEvalColumn.hellaswag.name: 25.0,
+#     AutoEvalColumn.mmlu.name: 25.0,
+#     AutoEvalColumn.truthfulqa.name: 25.0,
+#     AutoEvalColumn.winogrande.name: 50.0,
+#     AutoEvalColumn.gsm8k.name: 0.21,
+#     AutoEvalColumn.fullname.name: "baseline",
+#     AutoEvalColumn.model_type.name: "",
+#     AutoEvalColumn.not_flagged.name: False,
+# }

 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
...
 # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
 # GSM8K: paper
 # Define the human baselines
+# human_baseline_row = {
+#     AutoEvalColumn.model.name: "<p>Human performance</p>",
+#     AutoEvalColumn.revision.name: "N/A",
+#     AutoEvalColumn.precision.name: None,
+#     AutoEvalColumn.average.name: 92.75,
+#     AutoEvalColumn.merged.name: False,
+#     AutoEvalColumn.arc.name: 80.0,
+#     AutoEvalColumn.hellaswag.name: 95.0,
+#     AutoEvalColumn.mmlu.name: 89.8,
+#     AutoEvalColumn.truthfulqa.name: 94.0,
+#     AutoEvalColumn.winogrande.name: 94.0,
+#     AutoEvalColumn.gsm8k.name: 100,
+#     AutoEvalColumn.fullname.name: "human_baseline",
+#     AutoEvalColumn.model_type.name: "",
+#     AutoEvalColumn.not_flagged.name: False,
+# }

 @dataclass
...
     FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
     chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
     merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
+    Unknown = ModelDetails(name="❓ other", symbol="❓")

     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"

     @staticmethod
+    def from_str(m_type):
+        if any([k for k in m_type if k in ["fine-tuned", "🔶", "finetuned"]]):
             return ModelType.FT
+        if "continuously pretrained" in m_type or "🟩" in m_type:
             return ModelType.CPT
+        if "pretrained" in m_type or "🟢" in m_type:
             return ModelType.PT
+        if any([k in m_type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
             return ModelType.chat
+        if "merge" in m_type or "🤝" in m_type:
             return ModelType.merges
         return ModelType.Unknown

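For orientation, a few examples of how the new from_str resolves model-type strings (a sketch; results follow the branch order above). Note that the first branch iterates the characters of m_type, so in practice it is the 🔶 symbol, not the "fine-tuned" substring, that triggers the fine-tuned match:

ModelType.from_str("🔶 fine-tuned on domain-specific datasets")  # -> ModelType.FT
ModelType.from_str("continuously pretrained")                    # -> ModelType.CPT (checked before "pretrained")
ModelType.from_str("pretrained")                                 # -> ModelType.PT
ModelType.from_str("💬 chat models (RLHF, DPO, IFT, ...)")       # -> ModelType.chat
ModelType.from_str("some unknown label")                         # -> ModelType.Unknown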
...
     qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")

+    @staticmethod
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
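The @staticmethod added here marks the converter as a utility that takes no self or member argument, so call sites can invoke it uniformly on the class, e.g. (sketch):

Precision.from_str("float16")        # -> Precision.float16
Precision.from_str("torch.float16")  # -> Precision.float16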
src/envs.py
CHANGED

@@ -4,9 +4,10 @@ from huggingface_hub import HfApi
 # clone / pull the lmeh eval data
 HF_TOKEN = os.environ.get("HF_TOKEN", None)

-REPO_ID = "open-llm-leaderboard
-QUEUE_REPO = "open-llm-leaderboard
-AGGREGATED_REPO = "open-llm-leaderboard
+REPO_ID = "open-llm-leaderboard/open_llm_leaderboard_v2"
+QUEUE_REPO = "open-llm-leaderboard/requests"
+AGGREGATED_REPO = "open-llm-leaderboard/contents"
+VOTES_REPO = "open-llm-leaderboard/votes"

 HF_HOME = os.getenv("HF_HOME", ".")

@@ -20,11 +21,12 @@ if not os.access(HF_HOME, os.W_OK):
 else:
     print("Write access confirmed for HF_HOME")

+VOTES_PATH = os.path.join(HF_HOME, "model-votes")
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")

 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
-HAS_HIGHER_RATE_LIMIT = [
+HAS_HIGHER_RATE_LIMIT = []

 API = HfApi(token=HF_TOKEN)
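A sketch of how the new constants are consumed (the instantiation below is assumed to happen in app.py; it matches the VoteManager signature introduced later in this commit):

from src.envs import EVAL_REQUESTS_PATH, VOTES_PATH, VOTES_REPO
from src.voting.vote_system import VoteManager

# Local JSONL cache under HF_HOME, periodically synced to the votes dataset repo.
vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)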
src/leaderboard/filter_models.py
CHANGED

@@ -4,122 +4,8 @@ from src.display.utils import AutoEvalColumn

 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
-
-
-    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/202",
-    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/207",
-    "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/213",
-    "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/236",
-    "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/237",
-    "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/215",
-    "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-    "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/444",
-    "jan-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "janai-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "cookinai/BruinHermes": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "rwitz2/pee": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/503",
-    "dillfrescott/trinity-medium": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "udkai/Garrulus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/526",
-    "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
-    "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "alnrg2arg/test2_3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    # Merges not indicated
-    "gagan3012/MetaModelv2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "gagan3012/MetaModelv3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "rwitz/go-bruins-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "rwitz/go-bruins": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "elinas/chronos007-70b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "udkai/Turdus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cookinai/OpenCM-14": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "jan-hq/supermario-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    # MoErges
-    "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    # Other - contamination mostly
-    "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/566",
-    "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/556",
-    "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
-    "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
-}
+# None for the v2 so far!
+FLAGGED_MODELS = {}

 # Models which have been requested by orgs to not be submitted on the leaderboard
 DO_NOT_SUBMIT_MODELS = [

@@ -133,12 +19,16 @@ DO_NOT_SUBMIT_MODELS = [
 def flag_models(leaderboard_data: list[dict]):
     """Flags models based on external criteria or flagged status."""
     for model_data in leaderboard_data:
+        # Skip flagging if maintainers_highlight is True
+        if model_data.get(AutoEvalColumn.maintainers_highlight.name, False):
+            model_data[AutoEvalColumn.not_flagged.name] = True
+            continue
+
         # If a model is not flagged, use its "fullname" as a key
         if model_data[AutoEvalColumn.not_flagged.name]:
             flag_key = model_data[AutoEvalColumn.fullname.name]
         else:
-
-            flag_key = "merged"
+            flag_key = None

         # Reverse the logic: Check for non-flagged models instead
         if flag_key in FLAGGED_MODELS:
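To make the new control flow concrete, a small sketch with hypothetical rows (the keys are the display names defined in src/display/utils.py; note that the "Flagged" column actually stores the not_flagged boolean, hence the inverted checks):

hl = AutoEvalColumn.maintainers_highlight.name  # "Maintainer's Highlight"
nf = AutoEvalColumn.not_flagged.name            # "Flagged"
fn = AutoEvalColumn.fullname.name               # "fullname"

leaderboard_data = [
    {fn: "org/highlighted-model", nf: False, hl: True},  # hypothetical entries
    {fn: "org/regular-model", nf: True, hl: False},
]
flag_models(leaderboard_data)
# org/highlighted-model: forced to not_flagged=True and skipped entirely
# org/regular-model: looked up in FLAGGED_MODELS, which is empty in v2, so no flag is applied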
src/populate.py
CHANGED

@@ -2,14 +2,15 @@ import pathlib
 import pandas as pd
 from datasets import Dataset
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.filter_models import filter_models_flags
 from src.display.utils import load_json_data


 def _process_model_data(entry, model_name_key="model", revision_key="revision"):
     """Enrich model data with clickable links and revisions."""
-    entry[EvalQueueColumn.
+    entry[EvalQueueColumn.model_name.name] = entry.get(model_name_key, "")
+    entry[EvalQueueColumn.model_link.name] = make_clickable_model(entry.get(model_name_key, ""))
     entry[EvalQueueColumn.revision.name] = entry.get(revision_key, "main")
     return entry

@@ -50,4 +51,4 @@ def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols:
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return df
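A quick sketch of the enriched queue entry (hypothetical values; make_clickable_model comes from src.display.formatting and renders a link to the model page):

entry = {"model": "org/model-a", "revision": "abc123"}
entry = _process_model_data(entry)
# entry["model_name"] -> "org/model-a"
# entry["model_link"] -> output of make_clickable_model("org/model-a"), a clickable link
# entry["revision"]   -> "abc123"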
src/submission/submit.py
CHANGED

@@ -32,6 +32,7 @@ def add_new_eval(
     precision: str,
     weight_type: str,
     model_type: str,
+    use_chat_template: bool,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES

@@ -129,6 +130,7 @@ def add_new_eval(
         "model_type": model_type,
         "job_id": -1,
         "job_start_time": None,
+        "use_chat_template": use_chat_template,
     }

     supplementary_info = {
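With this hunk, the request record written to the queue dataset also captures whether evaluation should apply the model's chat template. Roughly (a sketch with illustrative values; fields not shown in this hunk are elided):

eval_entry = {
    # ... model, precision, weight_type, etc. (from earlier in the function)
    "model_type": "💬 chat models (RLHF, DPO, IFT, ...)",
    "job_id": -1,
    "job_start_time": None,
    "use_chat_template": True,  # new field, passed in via the new add_new_eval argument
}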
src/tools/plots.py
CHANGED

@@ -4,7 +4,7 @@ import plotly.express as px
 from plotly.graph_objs import Figure

 from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
-from src.display.utils import human_baseline_row as HUMAN_BASELINE
+# from src.display.utils import human_baseline_row as HUMAN_BASELINE
 from src.leaderboard.filter_models import FLAGGED_MODELS
src/voting/vote_system.py
ADDED

@@ -0,0 +1,151 @@
+import json
+import logging
+import pathlib
+import pandas as pd
+import gradio as gr
+import schedule
+import time
+from datetime import datetime, timezone
+from src.display.utils import EvalQueueColumn
+
+from src.envs import API
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class VoteManager:
+    def __init__(self, votes_path, eval_requests_path, repo_id):
+        self.votes_path = votes_path
+        self.eval_requests_path = eval_requests_path
+        self.repo_id = repo_id
+        self.vote_dataset = self.read_vote_dataset()
+        self.vote_check_set = self.make_check_set(self.vote_dataset)
+        self.votes_to_upload = []
+
+    def init_vote_dataset(self):
+        self.vote_dataset = self.read_vote_dataset()
+        self.vote_check_set = self.make_check_set(self.vote_dataset)
+
+    def read_vote_dataset(self):
+        result = []
+        votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+        if votes_file.exists():
+            with open(votes_file, "r") as f:
+                for line in f:
+                    data = json.loads(line.strip())
+                    result.append(data)
+        result = pd.DataFrame(result)
+        return result
+
+    def make_check_set(self, vote_dataset: pd.DataFrame):
+        result = list()
+        for row in vote_dataset.itertuples(index=False, name='vote'):
+            result.append((row.model, row.revision, row.username))
+        return set(result)
+
+    def get_model_revision(self, selected_model: str) -> str:
+        """Fetch the revision for the given model from the request files."""
+        for user_folder in pathlib.Path(self.eval_requests_path).iterdir():
+            if user_folder.is_dir():
+                for file in user_folder.glob("*.json"):
+                    with open(file, "r") as f:
+                        data = json.load(f)
+                        if data.get("model") == selected_model:
+                            return data.get("revision", "main")
+        return "main"
+
+    def create_request_vote_df(self, pending_models_df: gr.Dataframe):
+        if pending_models_df.empty or "model_name" not in pending_models_df.columns:
+            return pending_models_df
+        self.vote_dataset = self.read_vote_dataset()
+        vote_counts = self.vote_dataset.groupby(['model', 'revision']).size().reset_index(name='vote_count')
+
+        pending_models_df_votes = pd.merge(
+            pending_models_df,
+            vote_counts,
+            left_on=["model_name", 'revision'],
+            right_on=['model', 'revision'],
+            how='left'
+        )
+        # Filling empty votes
+        pending_models_df_votes['vote_count'] = pending_models_df_votes['vote_count'].fillna(0)
+        pending_models_df_votes = pending_models_df_votes.sort_values(by=["vote_count", "model_name"], ascending=[False, True])
+        # Removing useless columns
+        pending_models_df_votes = pending_models_df_votes.drop(["model_name", "model"], axis=1)
+        return pending_models_df_votes
+
+    # Function to be called when a user votes for a model
+    def add_vote(
+        self,
+        selected_model: str,
+        pending_models_df: gr.Dataframe,
+        profile: gr.OAuthProfile | None
+    ):
+        logger.debug(f"Type of list before usage: {type(list)}")
+        # model_name, revision, user_id, timestamp
+        if selected_model in ["str", ""]:
+            gr.Warning("No model selected")
+            return
+
+        if profile is None:
+            gr.Warning("Hub Login required")
+            return
+
+        vote_username = profile.username
+        model_revision = self.get_model_revision(selected_model)
+
+        # tuple (immutable) for checking that the user already voted for this model
+        check_tuple = (selected_model, model_revision, vote_username)
+        if check_tuple in self.vote_check_set:
+            gr.Warning("Already voted for this model")
+            return
+
+        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+        vote_obj = {
+            "model": selected_model,
+            "revision": model_revision,
+            "username": vote_username,
+            "timestamp": current_time
+        }
+
+        # Append the vote to the JSONL file
+        try:
+            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+            with open(votes_file, "a") as f:
+                f.write(json.dumps(vote_obj) + "\n")
+            logger.info(f"Vote added locally: {vote_obj}")
+
+            self.votes_to_upload.append(vote_obj)
+        except Exception as e:
+            logger.error(f"Failed to write vote to file: {e}")
+            gr.Warning("Failed to record vote. Please try again")
+            return
+
+        self.vote_check_set.add(check_tuple)
+        gr.Info(f"Voted for {selected_model}")
+
+        return self.create_request_vote_df(pending_models_df)
+
+    def upload_votes(self):
+        if self.votes_to_upload:
+            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+            try:
+                with open(votes_file, "rb") as f:
+                    API.upload_file(
+                        path_or_fileobj=f,
+                        path_in_repo="votes_data.jsonl",
+                        repo_id=self.repo_id,
+                        repo_type="dataset",
+                        commit_message="Updating votes_data.jsonl with new votes",
+                    )
+                logger.info("Votes uploaded to votes repository")
+                self.votes_to_upload.clear()
+            except Exception as e:
+                logger.error(f"Failed to upload votes to repository: {e}")
+
+def run_scheduler(vote_manager):
+    while True:
+        schedule.run_pending()
+        time.sleep(1)
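run_scheduler only drains the schedule queue; the periodic upload job itself has to be registered by the caller (app.py in this commit). A minimal sketch of the assumed wiring, with the loop on a daemon thread so the Gradio app stays responsive (the 15-minute interval is an assumption, not taken from this commit):

from threading import Thread

import schedule

from src.envs import EVAL_REQUESTS_PATH, VOTES_PATH, VOTES_REPO
from src.voting.vote_system import VoteManager, run_scheduler

vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
schedule.every(15).minutes.do(vote_manager.upload_votes)  # assumed interval

scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
scheduler_thread.start()

Writing votes to a local JSONL first and batching the upload keeps each user click fast and avoids one Hub commit per vote.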