Print first-timer-friendly score in the table
leaderboard/src/leaderboard/app.py
CHANGED
@@ -13,7 +13,6 @@ from dotenv import load_dotenv
 from leaderboard.data_loader import (
     load_benchmark_data,
     get_unique_values,
-    get_first_timer_friendly_models,
 )
 from leaderboard.formatters import apply_formatting
 
@@ -115,23 +114,10 @@ def create_leaderboard_ui():
             "Please set the environment variable to load benchmark data."
         )
 
-
-
-
-
-            "and quick to run. Perfect for getting started with Transformers.js.\n\n"
-            "**Showing top 3 models per task type.**"
-        )
-
-        first_timer_models = get_first_timer_friendly_models(df, limit_per_task=3)
-        formatted_first_timer = format_dataframe(first_timer_models)
-
-        first_timer_table = gr.DataFrame(
-            value=formatted_first_timer,
-            label="Top First-Timer-Friendly Models (by Task)",
-            interactive=False,
-            wrap=True,
-        )
+        gr.Markdown(
+            "💡 **Tip:** Sort by the **first_timer_score** column to find models that are "
+            "popular, fast to load, and quick to run - perfect for getting started!"
+        )
 
         with gr.Row():
             refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")
@@ -187,7 +173,11 @@ def create_leaderboard_ui():
             "- **p50/p90**: 50th and 90th percentile values\n\n"
             "**HuggingFace Metrics:**\n"
             "- **downloads**: Total downloads from HuggingFace Hub\n"
-            "- **likes**: Number of likes on HuggingFace Hub"
+            "- **likes**: Number of likes on HuggingFace Hub\n\n"
+            "**First-Timer Score:**\n"
+            "- **first_timer_score**: 0-100 score combining popularity (40%), load time (30%), and inference time (30%)\n"
+            "- Higher score = better for first-timers (normalized per task)\n"
+            "- ⭐⭐⭐ Excellent (80+), ⭐⭐ Good (60+), ⭐ Fair (40+)"
         )
 
     def update_data():
@@ -195,13 +185,8 @@ def create_leaderboard_ui():
         new_df = load_data()
         formatted_new_df = format_dataframe(new_df)
 
-        # Update first-timer-friendly models (3 per task)
-        new_first_timer = get_first_timer_friendly_models(new_df, limit_per_task=3)
-        formatted_new_first_timer = format_dataframe(new_first_timer)
-
         return (
             new_df,  # Update cached raw data
-            formatted_new_first_timer,
             formatted_new_df,
             gr.update(choices=get_unique_values(new_df, "task")),
             gr.update(choices=get_unique_values(new_df, "platform")),
@@ -221,7 +206,6 @@ def create_leaderboard_ui():
            fn=update_data,
            outputs=[
                raw_data_state,
-                first_timer_table,
                results_table,
                task_filter,
                platform_filter,
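The two removals at the end of this diff have to move together: Gradio maps `update_data`'s return tuple onto the `click` handler's `outputs` list by position, so dropping `first_timer_table` from `outputs` only stays correct because `formatted_new_first_timer` left the return tuple at the same index. A minimal self-contained sketch of that positional contract (the helper bodies below are hypothetical stand-ins, not the app's real code):

import gradio as gr
import pandas as pd

# Hypothetical stand-ins for the app's helpers.
def load_data() -> pd.DataFrame:
    return pd.DataFrame({"task": ["fill-mask"], "platform": ["wasm"]})

def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    return df

def get_unique_values(df: pd.DataFrame, col: str) -> list:
    return sorted(df[col].unique().tolist())

def update_data():
    new_df = load_data()
    formatted_new_df = format_dataframe(new_df)
    return (
        new_df,                                                    # -> raw_data_state
        formatted_new_df,                                          # -> results_table
        gr.update(choices=get_unique_values(new_df, "task")),      # -> task_filter
        gr.update(choices=get_unique_values(new_df, "platform")),  # -> platform_filter
    )

with gr.Blocks() as demo:
    raw_data_state = gr.State()
    results_table = gr.DataFrame()
    task_filter = gr.Dropdown(choices=[])
    platform_filter = gr.Dropdown(choices=[])
    refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")
    # Each returned value lands in the component at the same index of `outputs`.
    refresh_btn.click(
        fn=update_data,
        outputs=[raw_data_state, results_table, task_filter, platform_filter],
    )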
leaderboard/src/leaderboard/data_loader.py
CHANGED
@@ -74,6 +74,9 @@ def load_benchmark_data(
     # Enrich with HuggingFace model metadata
     df = enrich_with_hf_metadata(df)
 
+    # Add first-timer-friendly score
+    df = add_first_timer_score(df)
+
     # Sort by model name and timestamp
     if "modelId" in df.columns and "timestamp" in df.columns:
         df = df.sort_values(["modelId", "timestamp"], ascending=[True, False])
@@ -198,6 +201,97 @@ def enrich_with_hf_metadata(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
+def add_first_timer_score(df: pd.DataFrame) -> pd.DataFrame:
+    """Add first-timer-friendly score to all rows in the dataframe.
+
+    The score is calculated per task, normalized from 0-100 where:
+    - Higher score = better for first-timers
+    - Based on: downloads (25%), likes (15%), load time (30%), inference time (30%)
+
+    Args:
+        df: DataFrame containing benchmark results
+
+    Returns:
+        DataFrame with added 'first_timer_score' column
+    """
+    if df.empty:
+        return df
+
+    # Filter only successful benchmarks
+    filtered = df[df["status"] == "completed"].copy() if "status" in df.columns else df.copy()
+
+    if filtered.empty:
+        # Add empty score column for failed benchmarks
+        df["first_timer_score"] = None
+        return df
+
+    # Check if task column exists
+    if "task" not in filtered.columns:
+        df["first_timer_score"] = None
+        return df
+
+    # Calculate score per task
+    for task in filtered["task"].unique():
+        task_mask = filtered["task"] == task
+        task_df = filtered[task_mask].copy()
+
+        if task_df.empty:
+            continue
+
+        # Normalize metrics within this task (0-1 scale)
+
+        # Downloads score (0-1, higher is better)
+        if "downloads" in task_df.columns:
+            max_downloads = task_df["downloads"].max()
+            downloads_score = task_df["downloads"] / max_downloads if max_downloads > 0 else 0
+        else:
+            downloads_score = 0
+
+        # Likes score (0-1, higher is better)
+        if "likes" in task_df.columns:
+            max_likes = task_df["likes"].max()
+            likes_score = task_df["likes"] / max_likes if max_likes > 0 else 0
+        else:
+            likes_score = 0
+
+        # Load time score (0-1, lower time is better)
+        if "load_ms_p50" in task_df.columns:
+            max_load = task_df["load_ms_p50"].max()
+            load_score = 1 - (task_df["load_ms_p50"] / max_load) if max_load > 0 else 0
+        else:
+            load_score = 0
+
+        # Inference time score (0-1, lower time is better)
+        if "first_infer_ms_p50" in task_df.columns:
+            max_infer = task_df["first_infer_ms_p50"].max()
+            infer_score = 1 - (task_df["first_infer_ms_p50"] / max_infer) if max_infer > 0 else 0
+        else:
+            infer_score = 0
+
+        # Calculate weighted score and scale to 0-100
+        weighted_score = (
+            (downloads_score * 0.25) +
+            (likes_score * 0.15) +
+            (load_score * 0.30) +
+            (infer_score * 0.30)
+        ) * 100
+
+        # Assign scores back to the filtered dataframe
+        filtered.loc[task_mask, "first_timer_score"] = weighted_score
+
+    # Merge scores back to original dataframe
+    if "first_timer_score" in filtered.columns:
+        df = df.merge(
+            filtered[["id", "first_timer_score"]],
+            on="id",
+            how="left"
+        )
+    else:
+        df["first_timer_score"] = None
+
+    return df
+
+
 def get_first_timer_friendly_models(df: pd.DataFrame, limit_per_task: int = 3) -> pd.DataFrame:
     """Identify first-timer-friendly models based on popularity and performance, grouped by task.
 
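Since all four components are normalized against the per-task maximum, the weighting in add_first_timer_score is easiest to see on concrete numbers. Below is a worked example with two hypothetical models in one task (made-up figures, assuming the function is importable from leaderboard.data_loader and that benchmark rows carry the "id" column the final merge relies on). Two side effects of max-normalization worth knowing: the slowest model in a task gets zero credit for both timing components, and a task with only one benchmarked model caps out at 40, since its load and inference scores are both 1 - max/max = 0.

import pandas as pd
from leaderboard.data_loader import add_first_timer_score

# Two hypothetical models benchmarked on the same task (made-up numbers).
df = pd.DataFrame({
    "id": [1, 2],
    "status": ["completed", "completed"],
    "task": ["text-classification", "text-classification"],
    "downloads": [1000, 500],
    "likes": [50, 10],
    "load_ms_p50": [200.0, 400.0],
    "first_infer_ms_p50": [50.0, 100.0],
})

scored = add_first_timer_score(df)

# Model 1: 1.00*0.25 + 1.00*0.15 + (1 - 200/400)*0.30 + (1 - 50/100)*0.30 = 0.700 -> 70.0
# Model 2: 0.50*0.25 + 0.20*0.15 + (1 - 400/400)*0.30 + (1 - 100/100)*0.30 = 0.155 -> 15.5
print(scored["first_timer_score"].tolist())  # [70.0, 15.5]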
leaderboard/src/leaderboard/formatters.py
CHANGED
@@ -244,6 +244,31 @@ def format_likes(likes: Optional[int]) -> str:
     return f"{emoji} {likes}"
 
 
+def format_first_timer_score(score: Optional[float]) -> str:
+    """Format first-timer-friendly score with emoji.
+
+    Args:
+        score: First-timer score (0-100)
+
+    Returns:
+        Formatted string with emoji
+    """
+    if score is None:
+        return "-"
+
+    # Format based on score (0-100 scale)
+    if score >= 80:
+        emoji = "⭐⭐⭐"  # Excellent
+    elif score >= 60:
+        emoji = "⭐⭐"  # Good
+    elif score >= 40:
+        emoji = "⭐"  # Fair
+    else:
+        emoji = "·"  # Below average
+
+    return f"{emoji} {score:.0f}"
+
+
 def apply_formatting(df_dict: dict) -> dict:
     """Apply emoji formatting to a benchmark result dictionary.
 
@@ -314,4 +339,8 @@ def apply_formatting(df_dict: dict) -> dict:
     if "likes" in formatted:
         formatted["likes"] = format_likes(formatted["likes"])
 
+    # Format first-timer score
+    if "first_timer_score" in formatted:
+        formatted["first_timer_score"] = format_first_timer_score(formatted["first_timer_score"])
+
     return formatted
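The star thresholds here match the legend added to app.py's help text. A quick usage sketch on plain floats (hypothetical values, assuming the function is importable from leaderboard.formatters):

from leaderboard.formatters import format_first_timer_score

print(format_first_timer_score(None))  # "-"
print(format_first_timer_score(86.4))  # "⭐⭐⭐ 86"
print(format_first_timer_score(61.0))  # "⭐⭐ 61"
print(format_first_timer_score(42.0))  # "⭐ 42"
print(format_first_timer_score(15.5))  # "· 16"

One caveat: pandas usually surfaces a missing score as NaN rather than None; NaN fails every >= comparison and falls into the "·" branch, rendering as "· nan". Guarding with pd.isna(score) instead of `score is None` would cover both cases.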