Spaces: Running on CPU Upgrade

Clémentine committed · Commit beb2b32 · 1 Parent(s): aa85eec

update v2

Browse files:
- README.md +4 -2
- app.py +404 -25
- app_bkp.py +0 -316
- gif.gif +0 -3
- pyproject.toml +4 -2
- requirements.txt +7 -2
- src/display/about.py +85 -2
- src/display/css_html_js.py +39 -15
- src/display/formatting.py +1 -1
- src/display/utils.py +67 -50
- src/envs.py +6 -4
- src/leaderboard/filter_models.py +8 -118
- src/populate.py +4 -3
- src/submission/submit.py +2 -0
- src/tools/plots.py +1 -1
- src/voting/vote_system.py +151 -0
README.md CHANGED

@@ -1,15 +1,17 @@
 ---
-title: Open LLM Leaderboard
+title: Open LLM Leaderboard 2
 emoji: 🏆
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.36.1
 app_file: app.py
 pinned: true
 license: apache-2.0
+duplicated_from: open-llm-leaderboard/open_llm_leaderboard
 fullWidth: true
 startup_duration_timeout: 1h
+hf_oauth: true
 space_ci:
   private: true
   secrets:
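Two of the new keys matter for the rest of this commit: hf_oauth: true enables Hugging Face OAuth for the Space (used by the new voting tab in app.py), and the bumped sdk_version matches the gradio "oauth" extra pinned in pyproject.toml below. As a hedged illustration of what the flag unlocks — not code from this commit — a Gradio app can then declare a gr.OAuthProfile parameter and have it injected automatically:

import gradio as gr

def login_status(profile: gr.OAuthProfile | None) -> str:
    # With hf_oauth enabled, Gradio injects the OAuth profile (or None
    # when the visitor is not signed in) for parameters annotated this way.
    return f"Signed in as {profile.username}" if profile else "Not signed in"

with gr.Blocks() as demo:
    gr.LoginButton()  # renders the "Sign in with Hugging Face" button
    status = gr.Textbox(label="Login status")
    demo.load(login_status, inputs=None, outputs=status)

This is the same pattern app.py uses in its check_login helper and oauth-styled login button.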
app.py CHANGED

@@ -1,12 +1,17 @@
 import os
 import logging
 import time
+import schedule
 import datetime
 import gradio as gr
+from threading import Thread
 import datasets
 from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns

+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+
 from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
@@ -27,6 +32,7 @@ from src.display.utils import (
     Precision,
     WeightType,
     fields,
+    EvalQueueColumn
 )
 from src.envs import (
     API,
@@ -35,35 +41,343 @@ from src.envs import (
     HF_TOKEN,
     QUEUE_REPO,
     REPO_ID,
+    VOTES_REPO,
+    VOTES_PATH,
     HF_HOME,
 )
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
+from src.voting.vote_system import VoteManager, run_scheduler

-
-
-
-
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+
+# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
+# This controls whether a full initialization should be performed.
+DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
+LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
+LEADERBOARD_DF = None
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
+
+
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+
+    return wrapper

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+@time_diff_wrapper
+def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
+    """Download dataset with exponential backoff retries."""
+    attempt = 0
+    while attempt < max_attempts:
+        try:
+            logging.info(f"Downloading {repo_id} to {local_dir}")
+            snapshot_download(
+                repo_id=repo_id,
+                local_dir=local_dir,
+                repo_type=repo_type,
+                tqdm_class=None,
+                etag_timeout=30,
+                max_workers=8,
+            )
+            logging.info("Download successful")
+            return
+        except Exception as e:
+            wait_time = backoff_factor**attempt
+            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
+            time.sleep(wait_time)
+            attempt += 1
+    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
+
+def get_latest_data_leaderboard(leaderboard_initial_df = None):
+    current_time = datetime.datetime.now()
+    global LAST_UPDATE_LEADERBOARD
+    if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
+        return leaderboard_initial_df
+    LAST_UPDATE_LEADERBOARD = current_time
+    leaderboard_dataset = datasets.load_dataset(
+        AGGREGATED_REPO,
+        "default",
+        split="train",
+        cache_dir=HF_HOME,
+        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+        verification_mode="no_checks"
+    )
+
+    global LEADERBOARD_DF
+    LEADERBOARD_DF = get_leaderboard_df(
+        leaderboard_dataset=leaderboard_dataset,
+        cols=COLS,
+        benchmark_cols=BENCHMARK_COLS,
     )
-
-
+
+    return LEADERBOARD_DF
+
+def get_latest_data_queue():
+    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    return eval_queue_dfs
+
+def init_space():
+    """Initializes the application space, loading only necessary data."""
+    if DO_FULL_INIT:
+        # These downloads only occur on full initialization
+        try:
+            download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+            download_dataset(VOTES_REPO, VOTES_PATH)
+        except Exception:
+            restart_space()
+
+    # Always redownload the leaderboard DataFrame
+    global LEADERBOARD_DF
+    LEADERBOARD_DF = get_latest_data_leaderboard()
+
+    # Evaluation queue DataFrame retrieval is independent of initialization detail level
+    eval_queue_dfs = get_latest_data_queue()
+
+    return LEADERBOARD_DF, eval_queue_dfs
+
+# Initialize VoteManager
+vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
+
+
+# Schedule the upload_votes method to run every 15 minutes
+schedule.every(15).minutes.do(vote_manager.upload_votes)
+
+# Start the scheduler in a separate thread
+scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
+scheduler_thread.start()
+
+# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
+# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
+LEADERBOARD_DF, eval_queue_dfs = init_space()
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
+
+
+# Data processing for plots now only on demand in the respective Gradio tab
+def load_and_create_plots():
+    plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
+    return plot_df
+
+# Function to check if a user is logged in
+def check_login(profile: gr.OAuthProfile | None) -> bool:
+    if profile is None:
+        return False
+    return True
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            ),
+            ColumnFilter(
+                AutoEvalColumn.merged.name, type="boolean", label="Merge/MoErge", default=True
+            ),
+            ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
+            ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
+            ColumnFilter(AutoEvalColumn.maintainers_highlight.name, type="boolean", label="Show only maintainer's highlight", default=False),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+main_block = gr.Blocks(css=custom_css)
+with main_block:
+    with gr.Row(elem_id="header-row"):
+        gr.HTML(TITLE)
+
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Row():
+                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+                with gr.Row():
+                    with gr.Column():
+                        model_name_textbox = gr.Textbox(label="Model name")
+                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="latest")
+                        with gr.Row():
+                            model_type = gr.Dropdown(
+                                choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                                label="Model type",
+                                multiselect=False,
+                                value=ModelType.FT.to_str(" : "),
+                                interactive=True,
+                            )
+                            chat_template_toggle = gr.Checkbox(
+                                label="Use chat template",
+                                value=False,
+                                info="Is your model a chat model?",
+                            )
+
+                    with gr.Column():
+                        precision = gr.Dropdown(
+                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                            label="Precision",
+                            multiselect=False,
+                            value="float16",
+                            interactive=True,
+                        )
+                        weight_type = gr.Dropdown(
+                            choices=[i.value.name for i in WeightType],
+                            label="Weights type",
+                            multiselect=False,
+                            value="Original",
+                            interactive=True,
+                        )
+                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            with gr.Column():
+                with gr.Accordion(
+                    f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        finished_eval_table = gr.components.Dataframe(
+                            value=finished_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+                with gr.Accordion(
+                    f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        running_eval_table = gr.components.Dataframe(
+                            value=running_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
+                with gr.Accordion(
+                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        pending_eval_table = gr.components.Dataframe(
+                            value=pending_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+
+            # The chat template checkbox update function
+            def update_chat_checkbox(model_type_value):
+                return ModelType.from_str(model_type_value) == ModelType.chat
+
+            model_type.change(
+                fn=update_chat_checkbox,
+                inputs=[model_type],  # Pass the current checkbox value
+                outputs=chat_template_toggle,
+            )
+
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                    chat_template_toggle,
+                ],
+                submission_result,
+            )
+
+        # Ensure the values in 'pending_eval_queue_df' are correct and ready for the DataFrame component
+        with gr.TabItem("🆙 Model Vote"):
+            with gr.Row():
+                gr.Markdown(
+                    "## Vote for the models which should be evaluated first! \nYou'll need to sign in with the button above first. All votes are recorded.",
+                    elem_classes="markdown-text"
+                )
+                login_button = gr.LoginButton(elem_id="oauth-button")
+
+
+            with gr.Row():
+                pending_models = pending_eval_queue_df[EvalQueueColumn.model_name.name].to_list()
+
+                with gr.Column():
+                    selected_model = gr.Dropdown(
+                        choices=pending_models,
+                        label="Models",
+                        multiselect=False,
+                        value="str",
+                        interactive=True,
+                    )
+
+                    vote_button = gr.Button("Vote", variant="primary")
+
+            with gr.Row():
+                with gr.Accordion(
+                    f"Available models pending ({len(pending_eval_queue_df)})",
+                    open=True,
+                ):
+                    with gr.Row():
+                        pending_eval_table_votes = gr.components.Dataframe(
+                            value=vote_manager.create_request_vote_df(
+                                pending_eval_queue_df
+                            ),
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                            interactive=False
+                        )
+
+            # Set the click event for the vote button
+            vote_button.click(
+                vote_manager.add_vote,
+                inputs=[selected_model, pending_eval_table],
+                outputs=[pending_eval_table_votes]
+            )
+

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -75,4 +389,69 @@ with demo:
                 show_copy_button=True,
             )

-
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
+    leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
+    pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
+
+main_block.queue(default_concurrency_limit=40)
+
+
+def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
+    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
+    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
+    # ht to Lucain!
+    if SPACE_ID is None:
+        print("Not in a Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+
+    if IS_EPHEMERAL_SPACE:
+        print("In an ephemeral Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+
+    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
+    config = card.data.get("space_ci", {})
+    print(f"Enabling Space CI with config from README: {config}")
+
+    return configure_space_ci(
+        blocks=ui,
+        trusted_authors=config.get("trusted_authors"),
+        private=config.get("private", "auto"),
+        variables=config.get("variables", "auto"),
+        secrets=config.get("secrets"),
+        hardware=config.get("hardware"),
+        storage=config.get("storage"),
+    )
+
+# Create webhooks server (with CI url if in Space and not ephemeral)
+webhooks_server = enable_space_ci_and_return_server(ui=main_block)
+
+# Add webhooks
+@webhooks_server.add_webhook
+def update_leaderboard(payload: WebhookPayload) -> None:
+    """Redownloads the leaderboard dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        datasets.load_dataset(
+            AGGREGATED_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
+            verification_mode="no_checks"
+        )
+
+# The below code is not used at the moment, as we can manage the queue file locally
+LAST_UPDATE_QUEUE = datetime.datetime.now()
+@webhooks_server.add_webhook
+def update_queue(payload: WebhookPayload) -> None:
+    """Redownloads the queue dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        current_time = datetime.datetime.now()
+        global LAST_UPDATE_QUEUE
+        if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
+            print("Would have updated the queue")
+            # We only redownload is last update was more than 10 minutes ago, as the queue is
+            # updated regularly and heavy to download
+            #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+        LAST_UPDATE_QUEUE = datetime.datetime.now()
+
+webhooks_server.launch()
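The new src/voting/vote_system.py (+151 lines) is not rendered on this page, but the calls above pin down its public surface: VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO), add_vote, create_request_vote_df, upload_votes, and run_scheduler. A hedged sketch of that interface follows; every body (and the exact signatures beyond what app.py passes) is an assumption, not the committed code:

# Hypothetical outline of src/voting/vote_system.py, inferred solely from
# how app.py uses it; the committed implementation may differ.
import time
import schedule
import pandas as pd

class VoteManager:
    def __init__(self, votes_path: str, eval_requests_path: str, repo_id: str):
        self.votes_path = votes_path                  # local vote store
        self.eval_requests_path = eval_requests_path  # local queue snapshot
        self.repo_id = repo_id                        # VOTES_REPO dataset on the Hub

    def add_vote(self, selected_model: str, pending_df: pd.DataFrame) -> pd.DataFrame:
        """Record one vote for `selected_model`, then return the refreshed vote table."""
        ...

    def create_request_vote_df(self, pending_df: pd.DataFrame) -> pd.DataFrame:
        """Join accumulated vote counts onto the pending-requests table for display."""
        ...

    def upload_votes(self) -> None:
        """Push the local vote store to the Hub (scheduled every 15 minutes in app.py)."""
        ...

def run_scheduler(vote_manager: VoteManager) -> None:
    # Runs in the daemon thread that app.py starts: the `schedule` library
    # never fires jobs on its own, so its registry must be polled.
    while True:
        schedule.run_pending()
        time.sleep(1)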
app_bkp.py DELETED

@@ -1,316 +0,0 @@
-import os
-import logging
-import time
-import datetime
-import gradio as gr
-import datasets
-from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-
-from src.display.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    FAQ_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    Precision,
-    WeightType,
-    fields,
-)
-from src.envs import (
-    API,
-    EVAL_REQUESTS_PATH,
-    AGGREGATED_REPO,
-    HF_TOKEN,
-    QUEUE_REPO,
-    REPO_ID,
-    HF_HOME,
-)
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-
-
-# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
-# This controls whether a full initialization should be performed.
-DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
-LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
-
-
-def time_diff_wrapper(func):
-    def wrapper(*args, **kwargs):
-        start_time = time.time()
-        result = func(*args, **kwargs)
-        end_time = time.time()
-        diff = end_time - start_time
-        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
-        return result
-
-    return wrapper
-
-
-@time_diff_wrapper
-def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
-    """Download dataset with exponential backoff retries."""
-    attempt = 0
-    while attempt < max_attempts:
-        try:
-            logging.info(f"Downloading {repo_id} to {local_dir}")
-            snapshot_download(
-                repo_id=repo_id,
-                local_dir=local_dir,
-                repo_type=repo_type,
-                tqdm_class=None,
-                etag_timeout=30,
-                max_workers=8,
-            )
-            logging.info("Download successful")
-            return
-        except Exception as e:
-            wait_time = backoff_factor**attempt
-            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
-            time.sleep(wait_time)
-            attempt += 1
-    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
-
-def get_latest_data_leaderboard(leaderboard_initial_df = None):
-    current_time = datetime.datetime.now()
-    global LAST_UPDATE_LEADERBOARD
-    if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
-        return leaderboard_initial_df
-    LAST_UPDATE_LEADERBOARD = current_time
-    leaderboard_dataset = datasets.load_dataset(
-        AGGREGATED_REPO,
-        "default",
-        split="train",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
-        verification_mode="no_checks"
-    )
-
-    leaderboard_df = get_leaderboard_df(
-        leaderboard_dataset=leaderboard_dataset,
-        cols=COLS,
-        benchmark_cols=BENCHMARK_COLS,
-    )
-
-    return leaderboard_df
-
-def get_latest_data_queue():
-    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-    return eval_queue_dfs
-
-def init_space():
-    """Initializes the application space, loading only necessary data."""
-    if DO_FULL_INIT:
-        # These downloads only occur on full initialization
-        try:
-            download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        except Exception:
-            restart_space()
-
-    # Always redownload the leaderboard DataFrame
-    leaderboard_df = get_latest_data_leaderboard()
-
-    # Evaluation queue DataFrame retrieval is independent of initialization detail level
-    eval_queue_dfs = get_latest_data_queue()
-
-    return leaderboard_df, eval_queue_dfs
-
-
-# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
-# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-leaderboard_df, eval_queue_dfs = init_space()
-finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
-
-
-# Data processing for plots now only on demand in the respective Gradio tab
-def load_and_create_plots():
-    plot_df = create_plot_df(create_scores_df(leaderboard_df))
-    return plot_df
-
-def init_leaderboard(dataframe):
-    return Leaderboard(
-        value = dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
-            ),
-            ColumnFilter(
-                AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
-            ),
-            ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
-            ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(leaderboard_df)
-
-        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
-            with gr.Row():
-                with gr.Column():
-                    plot_df = load_and_create_plots()
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        [AutoEvalColumn.average.name],
-                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
-                with gr.Column():
-                    plot_df = load_and_create_plots()
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        BENCHMARK_COLS,
-                        title="Top Scores and Human Baseline Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
-            gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit? ", elem_id="llm-benchmark-tab-table", id=5):
-            countdown = gr.HTML(
-                """<div align="center">
-                <div position: relative>
-                <img
-                    src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
-                    allowtransparency="true"
-                    style="display:block;width:100%;height:auto;"
-                />
-                <iframe
-                    src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&timezone=Europe%2FParis&width=&style=circles&uid=815898&loc=https://logwork.com/countdown-fxmc&language=en&textcolor=&background=%23ffd21e&date=2024-06-26%2015%3A00%3A00&digitscolor=%23ff9d00&unitscolor=&"
-                    style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
-                    scrolling="no"
-                    allowtransparency="true"
-                    frameborder="0"
-                    allowfullscreen
-                />
-                </div>
-                </div>"""
-            )
-            #gif = gr.Image(value="./gif.gif", interactive=False)
-            gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
-demo.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
-
-
-demo.queue(default_concurrency_limit=40)
-
-# Start ephemeral Spaces on PRs (see config in README.md)
-from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
-
-def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
-    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
-    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
-    # ht to Lucain!
-    if SPACE_ID is None:
-        print("Not in a Space: Space CI disabled.")
-        return WebhooksServer(ui=demo)
-
-    if IS_EPHEMERAL_SPACE:
-        print("In an ephemeral Space: Space CI disabled.")
-        return WebhooksServer(ui=demo)
-
-    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
-    config = card.data.get("space_ci", {})
-    print(f"Enabling Space CI with config from README: {config}")
-
-    return configure_space_ci(
-        blocks=ui,
-        trusted_authors=config.get("trusted_authors"),
-        private=config.get("private", "auto"),
-        variables=config.get("variables", "auto"),
-        secrets=config.get("secrets"),
-        hardware=config.get("hardware"),
-        storage=config.get("storage"),
-    )
-
-# Create webhooks server (with CI url if in Space and not ephemeral)
-webhooks_server = enable_space_ci_and_return_server(ui=demo)
-
-# Add webhooks
-@webhooks_server.add_webhook
-def update_leaderboard(payload: WebhookPayload) -> None:
-    """Redownloads the leaderboard dataset each time it updates"""
-    if payload.repo.type == "dataset" and payload.event.action == "update":
-        datasets.load_dataset(
-            AGGREGATED_REPO,
-            "default",
-            split="train",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
-            verification_mode="no_checks"
-        )
-
-# The below code is not used at the moment, as we can manage the queue file locally
-LAST_UPDATE_QUEUE = datetime.datetime.now()
-@webhooks_server.add_webhook
-def update_queue(payload: WebhookPayload) -> None:
-    """Redownloads the queue dataset each time it updates"""
-    if payload.repo.type == "dataset" and payload.event.action == "update":
-        current_time = datetime.datetime.now()
-        global LAST_UPDATE_QUEUE
-        if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
-            print("Would have updated the queue")
-            # We only redownload is last update was more than 10 minutes ago, as the queue is
-            # updated regularly and heavy to download
-            #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        LAST_UPDATE_QUEUE = datetime.datetime.now()
-
-webhooks_server.launch()
gif.gif DELETED (Git LFS)
pyproject.toml CHANGED

@@ -38,16 +38,18 @@ numpy = "1.26.0"
 pandas = "2.2.2"
 plotly = "5.14.1"
 python-dateutil = "2.8.2"
-requests = "2.28.2"
 sentencepiece = "^0.2.0"
 tqdm = "4.65.0"
 transformers = "4.41.1"
 tokenizers = ">=0.15.0"
 gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.3"}
-gradio = " 4.20.0"
 isort = "^5.13.2"
 ruff = "^0.3.5"
 gradio-leaderboard = "0.0.8"
+gradio = {extras = ["oauth"], version = "^4.36.1"}
+requests = "^2.31.0"
+requests-oauthlib = "^1.3.1"
+schedule = "^1.2.2"

 [build-system]
 requires = ["poetry-core"]
requirements.txt CHANGED

@@ -8,11 +8,16 @@ numpy==1.26.0
 pandas==2.2.2
 plotly==5.14.1
 python-dateutil==2.8.2
-requests==2.28.2
 sentencepiece
 tqdm==4.65.0
 transformers==4.41.1
 tokenizers>=0.15.0
 gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
-
+isort
+ruff
+gradio==4.31.0
+gradio[oauth]
 gradio_leaderboard==0.0.9
+requests==2.31.0
+requests-oauthlib== 1.3.1
+schedule == 1.2.2
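Among the new pins, schedule is the one with non-obvious semantics: it is a pure job registry, and schedule.every(15).minutes.do(...) only records the job; nothing executes until some loop polls run_pending(), which is why app.py dedicates a daemon thread to it. A minimal, self-contained illustration (the job body is a stand-in, not the commit's code):

import schedule

def job():
    print("stand-in for vote_manager.upload_votes")

schedule.every(15).minutes.do(job)  # registers the job, runs nothing yet

schedule.run_pending()              # executes job only if its interval elapsed
print(schedule.idle_seconds())      # seconds until the next scheduled run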
src/display/about.py CHANGED

@@ -219,6 +219,89 @@ CITATION_BUTTON_TEXT = r"""
 publisher = {Hugging Face},
 howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
 }
-
-
+@software{eval-harness,
+  author       = {Gao, Leo and
+                  Tow, Jonathan and
+                  Biderman, Stella and
+                  Black, Sid and
+                  DiPofi, Anthony and
+                  Foster, Charles and
+                  Golding, Laurence and
+                  Hsu, Jeffrey and
+                  McDonell, Kyle and
+                  Muennighoff, Niklas and
+                  Phang, Jason and
+                  Reynolds, Laria and
+                  Tang, Eric and
+                  Thite, Anish and
+                  Wang, Ben and
+                  Wang, Kevin and
+                  Zou, Andy},
+  title        = {A framework for few-shot language model evaluation},
+  month        = sep,
+  year         = 2021,
+  publisher    = {Zenodo},
+  version      = {v0.0.1},
+  doi          = {10.5281/zenodo.5371628},
+  url          = {https://doi.org/10.5281/zenodo.5371628}
+}
+@misc{clark2018think,
+  title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+  author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+  year={2018},
+  eprint={1803.05457},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI}
+}
+@misc{zellers2019hellaswag,
+  title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+  author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
+  year={2019},
+  eprint={1905.07830},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{hendrycks2021measuring,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+  year={2021},
+  eprint={2009.03300},
+  archivePrefix={arXiv},
+  primaryClass={cs.CY}
+}
+@misc{lin2022truthfulqa,
+  title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
+  author={Stephanie Lin and Jacob Hilton and Owain Evans},
+  year={2022},
+  eprint={2109.07958},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1907-10641,
+  title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
+  author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+  year={2019},
+  eprint={1907.10641},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-2110-14168,
+  title={Training Verifiers to Solve Math Word Problems},
+  author={Karl Cobbe and
+          Vineet Kosaraju and
+          Mohammad Bavarian and
+          Mark Chen and
+          Heewoo Jun and
+          Lukasz Kaiser and
+          Matthias Plappert and
+          Jerry Tworek and
+          Jacob Hilton and
+          Reiichiro Nakano and
+          Christopher Hesse and
+          John Schulman},
+  year={2021},
+  eprint={2110.14168},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
 """
src/display/css_html_js.py CHANGED

@@ -9,7 +9,7 @@ table th:first-child {

 /* Full width space */
 .gradio-container {
-
+    max-width: 95% !important;
 }

 /* Text style and margins */
@@ -48,7 +48,7 @@ table th:first-child {
 }

 /* Filters style */
-#filter_type{
+#filter_type {
     border: 0;
     padding-left: 0;
     padding-top: 0;
@@ -56,29 +56,53 @@ table th:first-child {
 #filter_type label {
     display: flex;
 }
-#filter_type label > span{
+#filter_type label > span {
     margin-top: var(--spacing-lg);
     margin-right: 0.5em;
 }
-#filter_type label > .wrap{
+#filter_type label > .wrap {
     width: 103px;
 }
-#filter_type label > .wrap .wrap-inner{
+#filter_type label > .wrap .wrap-inner {
     padding: 2px;
 }
-#filter_type label > .wrap .wrap-inner input{
-    width: 1px
+#filter_type label > .wrap .wrap-inner input {
+    width: 1px;
 }
-#filter-columns-type{
-    border:0;
-    padding:0.5;
+#filter-columns-type {
+    border: 0;
+    padding: 0.5;
+}
+#filter-columns-size {
+    border: 0;
+    padding: 0.5;
 }
-#filter
-    border:0;
-    padding:0.5;
+#box-filter > .form {
+    border: 0;
 }
-
-
+
+/* Header styles */
+#header-title {
+    text-align: left;
+    display: inline-block;
+}
+
+#header-row {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+}
+
+#header-row .gradio-html {
+    flex-grow: 1;
+}
+
+#oauth-button {
+    height: auto;
+    min-width: max-content;
+    white-space: nowrap;
+    padding: 10px 20px;
+    border-radius: 4px;
 }
 """
src/display/formatting.py CHANGED

@@ -11,7 +11,7 @@ def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"

     details_model_name = model_name.replace("/", "__")
-    details_link = f"https://huggingface.co/datasets/open-llm-leaderboard
+    details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/{details_model_name}-details"

     return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
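For reference, the repaired f-string builds the details link like this (the model name below is hypothetical):

# Worked example of the make_clickable_model transformation above:
model_name = "some-org/some-model-7b"               # hypothetical input
details_model_name = model_name.replace("/", "__")  # "some-org__some-model-7b"
details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/{details_model_name}-details"
# -> https://huggingface.co/datasets/open-llm-leaderboard/some-org__some-model-7b-details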
src/display/utils.py
CHANGED
|
@@ -49,12 +49,23 @@ class Task:
|
|
| 49 |
|
| 50 |
|
| 51 |
class Tasks(Enum):
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
# These classes are for user facing column names,
|
|
@@ -77,7 +88,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
| 77 |
# Scores
|
| 78 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 79 |
for task in Tasks:
|
| 80 |
-
|
|
|
|
| 81 |
# Model information
|
| 82 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 83 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
|
@@ -94,7 +106,10 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
|
|
| 94 |
auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
|
| 95 |
auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
|
| 96 |
auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
| 98 |
auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
|
| 99 |
|
| 100 |
# We use make dataclass to dynamically fill the scores from Tasks
|
|
@@ -103,30 +118,31 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
|
|
| 103 |
|
| 104 |
@dataclass(frozen=True)
|
| 105 |
class EvalQueueColumn: # Queue column
|
| 106 |
-
|
|
|
|
| 107 |
revision = ColumnContent("revision", "str", True)
|
| 108 |
-
private = ColumnContent("private", "bool", True)
|
| 109 |
precision = ColumnContent("precision", "str", True)
|
| 110 |
-
weight_type = ColumnContent("weight_type", "str", "Original")
|
| 111 |
status = ColumnContent("status", "str", True)
|
| 112 |
|
| 113 |
|
| 114 |
-
baseline_row = {
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
}
|
| 130 |
|
| 131 |
# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
|
| 132 |
# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
|
|
@@ -136,22 +152,22 @@ baseline_row = {
|
|
| 136 |
# Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
|
| 137 |
# GSM8K: paper
|
| 138 |
# Define the human baselines
|
| 139 |
-
human_baseline_row = {
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
}
|
| 155 |
|
| 156 |
|
| 157 |
@dataclass
|
|
@@ -166,22 +182,22 @@ class ModelType(Enum):
|
|
| 166 |
FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
|
| 167 |
chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
|
| 168 |
merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
|
| 169 |
-
Unknown = ModelDetails(name="", symbol="
|
| 170 |
|
| 171 |
def to_str(self, separator=" "):
|
| 172 |
return f"{self.value.symbol}{separator}{self.value.name}"
|
| 173 |
|
| 174 |
@staticmethod
|
| 175 |
-
def from_str(
|
| 176 |
-
if
|
| 177 |
return ModelType.FT
|
| 178 |
-
if "
|
| 179 |
return ModelType.CPT
|
| 180 |
-
if "pretrained" in
|
| 181 |
return ModelType.PT
|
| 182 |
-
if any([k in
|
| 183 |
return ModelType.chat
|
| 184 |
-
if "merge" in
|
| 185 |
return ModelType.merges
|
| 186 |
return ModelType.Unknown
|
| 187 |
|
|
@@ -200,6 +216,7 @@ class Precision(Enum):
|
|
| 200 |
qt_GPTQ = ModelDetails("GPTQ")
|
| 201 |
Unknown = ModelDetails("?")
|
| 202 |
|
|
|
|
| 203 |
def from_str(precision):
|
| 204 |
if precision in ["torch.float16", "float16"]:
|
| 205 |
return Precision.float16
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
class Tasks(Enum):
|
| 52 |
+
ifeval = Task("leaderboard_ifeval", "strict_acc,none", "IFEval")
|
| 53 |
+
ifeval_raw = Task("leaderboard_ifeval", "strict_acc,none", "IFEval Raw")
|
| 54 |
+
|
| 55 |
+
bbh = Task("leaderboard_bbh", "acc_norm,none", "BBH")
|
| 56 |
+
bbh_raw = Task("leaderboard_bbh", "acc_norm,none", "BBH Raw")
|
| 57 |
+
|
| 58 |
+
math = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5")
|
| 59 |
+
math_raw = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5 Raw")
|
| 60 |
+
|
| 61 |
+
gpqa = Task("leaderboard_gpqa", "acc_norm,none", "GPQA")
|
| 62 |
+
gpqa_raw = Task("leaderboard_gpqa", "acc_norm,none", "GPQA Raw")
|
| 63 |
+
|
| 64 |
+
musr = Task("leaderboard_musr", "acc_norm,none", "MUSR")
|
| 65 |
+
musr_raw = Task("leaderboard_musr", "acc_norm,none", "MUSR Raw")
|
| 66 |
+
|
| 67 |
+
mmlu_pro = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO")
|
| 68 |
+
mmlu_pro_raw = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO Raw")
|
| 69 |
|
| 70 |
|
| 71 |
# These classes are for user facing column names,
|
|
|
|
| 88 |
# Scores
|
| 89 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 90 |
for task in Tasks:
|
| 91 |
+
displayed_by_default = not task.name.endswith("_raw")
|
| 92 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", displayed_by_default=displayed_by_default)])
|
| 93 |
# Model information
|
| 94 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 95 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
|
|
|
| 106 |
auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
|
| 107 |
auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
|
| 108 |
auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
|
| 109 |
+
auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Chat Template", "bool", False)])
|
| 110 |
+
auto_eval_column_dict.append(["maintainers_highlight", ColumnContent, ColumnContent("Maintainer's Highlight", "bool", False, hidden=True)])
|
| 111 |
+
|
| 112 |
+
# fullname structure: <user>/<model_name>
|
| 113 |
auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
|
| 114 |
|
| 115 |
# We use make dataclass to dynamically fill the scores from Tasks
|
|
|
|
| 118 |
|
| 119 |
@dataclass(frozen=True)
|
| 120 |
class EvalQueueColumn: # Queue column
|
| 121 |
+
model_link = ColumnContent("model_link", "markdown", True)
|
| 122 |
+
model_name = ColumnContent("model_name", "str", True)
|
| 123 |
revision = ColumnContent("revision", "str", True)
|
| 124 |
+
#private = ColumnContent("private", "bool", True) # Should not be displayed
|
| 125 |
precision = ColumnContent("precision", "str", True)
|
| 126 |
+
#weight_type = ColumnContent("weight_type", "str", "Original") # Might be confusing, to think about
|
| 127 |
status = ColumnContent("status", "str", True)
|
| 128 |
|
| 129 |
|
+# baseline_row = {
+#     AutoEvalColumn.model.name: "<p>Baseline</p>",
+#     AutoEvalColumn.revision.name: "N/A",
+#     AutoEvalColumn.precision.name: None,
+#     AutoEvalColumn.merged.name: False,
+#     AutoEvalColumn.average.name: 31.0,
+#     AutoEvalColumn.arc.name: 25.0,
+#     AutoEvalColumn.hellaswag.name: 25.0,
+#     AutoEvalColumn.mmlu.name: 25.0,
+#     AutoEvalColumn.truthfulqa.name: 25.0,
+#     AutoEvalColumn.winogrande.name: 50.0,
+#     AutoEvalColumn.gsm8k.name: 0.21,
+#     AutoEvalColumn.fullname.name: "baseline",
+#     AutoEvalColumn.model_type.name: "",
+#     AutoEvalColumn.not_flagged.name: False,
+# }

 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
...
 # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
 # GSM8K: paper
 # Define the human baselines
+# human_baseline_row = {
+#     AutoEvalColumn.model.name: "<p>Human performance</p>",
+#     AutoEvalColumn.revision.name: "N/A",
+#     AutoEvalColumn.precision.name: None,
+#     AutoEvalColumn.average.name: 92.75,
+#     AutoEvalColumn.merged.name: False,
+#     AutoEvalColumn.arc.name: 80.0,
+#     AutoEvalColumn.hellaswag.name: 95.0,
+#     AutoEvalColumn.mmlu.name: 89.8,
+#     AutoEvalColumn.truthfulqa.name: 94.0,
+#     AutoEvalColumn.winogrande.name: 94.0,
+#     AutoEvalColumn.gsm8k.name: 100,
+#     AutoEvalColumn.fullname.name: "human_baseline",
+#     AutoEvalColumn.model_type.name: "",
+#     AutoEvalColumn.not_flagged.name: False,
+# }

 @dataclass
...
     FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
     chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
     merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
+    Unknown = ModelDetails(name="❓ other", symbol="❓")

     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"

     @staticmethod
+    def from_str(m_type):
+        if any([k for k in m_type if k in ["fine-tuned", "🔶", "finetuned"]]):
             return ModelType.FT
+        if "continuously pretrained" in m_type or "🟩" in m_type:
             return ModelType.CPT
+        if "pretrained" in m_type or "🟢" in m_type:
             return ModelType.PT
+        if any([k in m_type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
             return ModelType.chat
+        if "merge" in m_type or "🤝" in m_type:
             return ModelType.merges
         return ModelType.Unknown

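For orientation, a few examples of how the new from_str resolves model-type strings (a sketch; results follow the branch order above). Note that the first branch iterates the characters of m_type, so in practice it is the 🔶 symbol, not the "fine-tuned" substring, that triggers the fine-tuned match:

ModelType.from_str("🔶 fine-tuned on domain-specific datasets")  # -> ModelType.FT
ModelType.from_str("continuously pretrained")                    # -> ModelType.CPT (checked before "pretrained")
ModelType.from_str("pretrained")                                 # -> ModelType.PT
ModelType.from_str("💬 chat models (RLHF, DPO, IFT, ...)")       # -> ModelType.chat
ModelType.from_str("some unknown label")                         # -> ModelType.Unknown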
...
     qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")

+    @staticmethod
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
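The @staticmethod added here marks the converter as a utility that takes no self or member argument, so call sites can invoke it uniformly on the class, e.g. (sketch):

Precision.from_str("float16")        # -> Precision.float16
Precision.from_str("torch.float16")  # -> Precision.float16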
src/envs.py
CHANGED

@@ -4,9 +4,10 @@ from huggingface_hub import HfApi
 # clone / pull the lmeh eval data
 HF_TOKEN = os.environ.get("HF_TOKEN", None)

-REPO_ID = "open-llm-leaderboard
-QUEUE_REPO = "open-llm-leaderboard
-AGGREGATED_REPO = "open-llm-leaderboard
+REPO_ID = "open-llm-leaderboard/open_llm_leaderboard_v2"
+QUEUE_REPO = "open-llm-leaderboard/requests"
+AGGREGATED_REPO = "open-llm-leaderboard/contents"
+VOTES_REPO = "open-llm-leaderboard/votes"

 HF_HOME = os.getenv("HF_HOME", ".")

@@ -20,11 +21,12 @@ if not os.access(HF_HOME, os.W_OK):
 else:
     print("Write access confirmed for HF_HOME")

+VOTES_PATH = os.path.join(HF_HOME, "model-votes")
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")

 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
-HAS_HIGHER_RATE_LIMIT = [
+HAS_HIGHER_RATE_LIMIT = []

 API = HfApi(token=HF_TOKEN)
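A sketch of how the new constants are consumed (the instantiation below is assumed to happen in app.py; it matches the VoteManager signature introduced later in this commit):

from src.envs import EVAL_REQUESTS_PATH, VOTES_PATH, VOTES_REPO
from src.voting.vote_system import VoteManager

# Local JSONL cache under HF_HOME, periodically synced to the votes dataset repo.
vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)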
src/leaderboard/filter_models.py
CHANGED

@@ -4,122 +4,8 @@ from src.display.utils import AutoEvalColumn

 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
-
-
-    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/202",
-    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/207",
-    "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/213",
-    "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/236",
-    "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/237",
-    "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/215",
-    "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-    "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/444",
-    "jan-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "janai-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "cookinai/BruinHermes": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "rwitz2/pee": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/503",
-    "dillfrescott/trinity-medium": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "udkai/Garrulus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/526",
-    "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
-    "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "alnrg2arg/test2_3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    # Merges not indicated
-    "gagan3012/MetaModelv2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "gagan3012/MetaModelv3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "rwitz/go-bruins-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "rwitz/go-bruins": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "elinas/chronos007-70b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "udkai/Turdus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cookinai/OpenCM-14": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "jan-hq/supermario-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    # MoErges
-    "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    # Other - contamination mostly
-    "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/566",
-    "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/556",
-    "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
-    "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
-}
+# None for the v2 so far!
+FLAGGED_MODELS = {}

 # Models which have been requested by orgs to not be submitted on the leaderboard
 DO_NOT_SUBMIT_MODELS = [

@@ -133,12 +19,16 @@ DO_NOT_SUBMIT_MODELS = [
 def flag_models(leaderboard_data: list[dict]):
     """Flags models based on external criteria or flagged status."""
     for model_data in leaderboard_data:
+        # Skip flagging if maintainers_highlight is True
+        if model_data.get(AutoEvalColumn.maintainers_highlight.name, False):
+            model_data[AutoEvalColumn.not_flagged.name] = True
+            continue
+
         # If a model is not flagged, use its "fullname" as a key
         if model_data[AutoEvalColumn.not_flagged.name]:
             flag_key = model_data[AutoEvalColumn.fullname.name]
         else:
-
-            flag_key = "merged"
+            flag_key = None

         # Reverse the logic: Check for non-flagged models instead
         if flag_key in FLAGGED_MODELS:
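To make the new control flow concrete, a small sketch with hypothetical rows (the keys are the display names defined in src/display/utils.py; note that the "Flagged" column actually stores the not_flagged boolean, hence the inverted checks):

hl = AutoEvalColumn.maintainers_highlight.name  # "Maintainer's Highlight"
nf = AutoEvalColumn.not_flagged.name            # "Flagged"
fn = AutoEvalColumn.fullname.name               # "fullname"

leaderboard_data = [
    {fn: "org/highlighted-model", nf: False, hl: True},  # hypothetical entries
    {fn: "org/regular-model", nf: True, hl: False},
]
flag_models(leaderboard_data)
# org/highlighted-model: forced to not_flagged=True and skipped entirely
# org/regular-model: looked up in FLAGGED_MODELS, which is empty in v2, so no flag is applied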
src/populate.py
CHANGED

@@ -2,14 +2,15 @@ import pathlib
 import pandas as pd
 from datasets import Dataset
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.filter_models import filter_models_flags
 from src.display.utils import load_json_data


 def _process_model_data(entry, model_name_key="model", revision_key="revision"):
     """Enrich model data with clickable links and revisions."""
-    entry[EvalQueueColumn.
+    entry[EvalQueueColumn.model_name.name] = entry.get(model_name_key, "")
+    entry[EvalQueueColumn.model_link.name] = make_clickable_model(entry.get(model_name_key, ""))
     entry[EvalQueueColumn.revision.name] = entry.get(revision_key, "main")
     return entry

@@ -50,4 +51,4 @@ def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols:
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return df
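A quick sketch of the enriched queue entry (hypothetical values; make_clickable_model comes from src.display.formatting and renders a link to the model page):

entry = {"model": "org/model-a", "revision": "abc123"}
entry = _process_model_data(entry)
# entry["model_name"] -> "org/model-a"
# entry["model_link"] -> output of make_clickable_model("org/model-a"), a clickable link
# entry["revision"]   -> "abc123"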
src/submission/submit.py
CHANGED

@@ -32,6 +32,7 @@ def add_new_eval(
     precision: str,
     weight_type: str,
     model_type: str,
+    use_chat_template: bool,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES

@@ -129,6 +130,7 @@ def add_new_eval(
         "model_type": model_type,
         "job_id": -1,
         "job_start_time": None,
+        "use_chat_template": use_chat_template,
     }

     supplementary_info = {
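With this hunk, the request record written to the queue dataset also captures whether evaluation should apply the model's chat template. Roughly (a sketch with illustrative values; fields not shown in this hunk are elided):

eval_entry = {
    # ... model, precision, weight_type, etc. (from earlier in the function)
    "model_type": "💬 chat models (RLHF, DPO, IFT, ...)",
    "job_id": -1,
    "job_start_time": None,
    "use_chat_template": True,  # new field, passed in via the new add_new_eval argument
}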
src/tools/plots.py
CHANGED

@@ -4,7 +4,7 @@ import plotly.express as px
 from plotly.graph_objs import Figure

 from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
-from src.display.utils import human_baseline_row as HUMAN_BASELINE
+# from src.display.utils import human_baseline_row as HUMAN_BASELINE
 from src.leaderboard.filter_models import FLAGGED_MODELS
src/voting/vote_system.py
ADDED

@@ -0,0 +1,151 @@
+import json
+import logging
+import pathlib
+import pandas as pd
+import gradio as gr
+import schedule
+import time
+from datetime import datetime, timezone
+from src.display.utils import EvalQueueColumn
+
+from src.envs import API
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class VoteManager:
+    def __init__(self, votes_path, eval_requests_path, repo_id):
+        self.votes_path = votes_path
+        self.eval_requests_path = eval_requests_path
+        self.repo_id = repo_id
+        self.vote_dataset = self.read_vote_dataset()
+        self.vote_check_set = self.make_check_set(self.vote_dataset)
+        self.votes_to_upload = []
+
+    def init_vote_dataset(self):
+        self.vote_dataset = self.read_vote_dataset()
+        self.vote_check_set = self.make_check_set(self.vote_dataset)
+
+    def read_vote_dataset(self):
+        result = []
+        votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+        if votes_file.exists():
+            with open(votes_file, "r") as f:
+                for line in f:
+                    data = json.loads(line.strip())
+                    result.append(data)
+        result = pd.DataFrame(result)
+        return result
+
+    def make_check_set(self, vote_dataset: pd.DataFrame):
+        result = list()
+        for row in vote_dataset.itertuples(index=False, name='vote'):
+            result.append((row.model, row.revision, row.username))
+        return set(result)
+
+    def get_model_revision(self, selected_model: str) -> str:
+        """Fetch the revision for the given model from the request files."""
+        for user_folder in pathlib.Path(self.eval_requests_path).iterdir():
+            if user_folder.is_dir():
+                for file in user_folder.glob("*.json"):
+                    with open(file, "r") as f:
+                        data = json.load(f)
+                        if data.get("model") == selected_model:
+                            return data.get("revision", "main")
+        return "main"
+
+    def create_request_vote_df(self, pending_models_df: gr.Dataframe):
+        if pending_models_df.empty or "model_name" not in pending_models_df.columns:
+            return pending_models_df
+        self.vote_dataset = self.read_vote_dataset()
+        vote_counts = self.vote_dataset.groupby(['model', 'revision']).size().reset_index(name='vote_count')
+
+        pending_models_df_votes = pd.merge(
+            pending_models_df,
+            vote_counts,
+            left_on=["model_name", 'revision'],
+            right_on=['model', 'revision'],
+            how='left'
+        )
+        # Filling empty votes
+        pending_models_df_votes['vote_count'] = pending_models_df_votes['vote_count'].fillna(0)
+        pending_models_df_votes = pending_models_df_votes.sort_values(by=["vote_count", "model_name"], ascending=[False, True])
+        # Removing useless columns
+        pending_models_df_votes = pending_models_df_votes.drop(["model_name", "model"], axis=1)
+        return pending_models_df_votes
+
+    # Function to be called when a user votes for a model
+    def add_vote(
+        self,
+        selected_model: str,
+        pending_models_df: gr.Dataframe,
+        profile: gr.OAuthProfile | None
+    ):
+        logger.debug(f"Type of list before usage: {type(list)}")
+        # model_name, revision, user_id, timestamp
+        if selected_model in ["str", ""]:
+            gr.Warning("No model selected")
+            return
+
+        if profile is None:
+            gr.Warning("Hub Login required")
+            return
+
+        vote_username = profile.username
+        model_revision = self.get_model_revision(selected_model)
+
+        # tuple (immutable) for checking that the user already voted for this model
+        check_tuple = (selected_model, model_revision, vote_username)
+        if check_tuple in self.vote_check_set:
+            gr.Warning("Already voted for this model")
+            return
+
+        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+        vote_obj = {
+            "model": selected_model,
+            "revision": model_revision,
+            "username": vote_username,
+            "timestamp": current_time
+        }
+
+        # Append the vote to the JSONL file
+        try:
+            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+            with open(votes_file, "a") as f:
+                f.write(json.dumps(vote_obj) + "\n")
+            logger.info(f"Vote added locally: {vote_obj}")
+
+            self.votes_to_upload.append(vote_obj)
+        except Exception as e:
+            logger.error(f"Failed to write vote to file: {e}")
+            gr.Warning("Failed to record vote. Please try again")
+            return
+
+        self.vote_check_set.add(check_tuple)
+        gr.Info(f"Voted for {selected_model}")
+
+        return self.create_request_vote_df(pending_models_df)
+
+    def upload_votes(self):
+        if self.votes_to_upload:
+            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+            try:
+                with open(votes_file, "rb") as f:
+                    API.upload_file(
+                        path_or_fileobj=f,
+                        path_in_repo="votes_data.jsonl",
+                        repo_id=self.repo_id,
+                        repo_type="dataset",
+                        commit_message="Updating votes_data.jsonl with new votes",
+                    )
+                logger.info("Votes uploaded to votes repository")
+                self.votes_to_upload.clear()
+            except Exception as e:
+                logger.error(f"Failed to upload votes to repository: {e}")
+
+def run_scheduler(vote_manager):
+    while True:
+        schedule.run_pending()
+        time.sleep(1)
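run_scheduler only drains the schedule queue; the periodic upload job itself has to be registered by the caller (app.py in this commit). A minimal sketch of the assumed wiring, with the loop on a daemon thread so the Gradio app stays responsive (the 15-minute interval is an assumption, not taken from this commit):

from threading import Thread

import schedule

from src.envs import EVAL_REQUESTS_PATH, VOTES_PATH, VOTES_REPO
from src.voting.vote_system import VoteManager, run_scheduler

vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
schedule.every(15).minutes.do(vote_manager.upload_votes)  # assumed interval

scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
scheduler_thread.start()

Writing votes to a local JSONL first and batching the upload keeps each user click fast and avoids one Hub commit per vote.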