whitphx (HF Staff) committed
Commit 4d725bc · 1 Parent(s): 9616a74

Print first-timer-friendly score in the table

leaderboard/src/leaderboard/app.py CHANGED
@@ -13,7 +13,6 @@ from dotenv import load_dotenv
 from leaderboard.data_loader import (
     load_benchmark_data,
     get_unique_values,
-    get_first_timer_friendly_models,
 )
 from leaderboard.formatters import apply_formatting
 
@@ -115,23 +114,10 @@ def create_leaderboard_ui():
115
  "Please set the environment variable to load benchmark data."
116
  )
117
 
118
- # First-timer-friendly models section
119
- with gr.Accordion(" First-Timer-Friendly Models", open=True):
120
- gr.Markdown(
121
- "These models are great for first-timers! They're popular, fast to load, "
122
- "and quick to run. Perfect for getting started with Transformers.js.\n\n"
123
- "**Showing top 3 models per task type.**"
124
- )
125
-
126
- first_timer_models = get_first_timer_friendly_models(df, limit_per_task=3)
127
- formatted_first_timer = format_dataframe(first_timer_models)
128
-
129
- first_timer_table = gr.DataFrame(
130
- value=formatted_first_timer,
131
- label="Top First-Timer-Friendly Models (by Task)",
132
- interactive=False,
133
- wrap=True,
134
- )
135
 
136
  with gr.Row():
137
  refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")
@@ -187,7 +173,11 @@ def create_leaderboard_ui():
             "- **p50/p90**: 50th and 90th percentile values\n\n"
             "**HuggingFace Metrics:**\n"
             "- **downloads**: Total downloads from HuggingFace Hub\n"
-            "- **likes**: Number of likes on HuggingFace Hub"
+            "- **likes**: Number of likes on HuggingFace Hub\n\n"
+            "**First-Timer Score:**\n"
+            "- **first_timer_score**: 0-100 score combining popularity (40%), load time (30%), and inference time (30%)\n"
+            "- Higher score = better for first-timers (normalized per task)\n"
+            "- ⭐⭐⭐ Excellent (80+), ⭐⭐ Good (60+), ⭐ Fair (40+)"
         )
 
     def update_data():
@@ -195,13 +185,8 @@ def create_leaderboard_ui():
         new_df = load_data()
         formatted_new_df = format_dataframe(new_df)
 
-        # Update first-timer-friendly models (3 per task)
-        new_first_timer = get_first_timer_friendly_models(new_df, limit_per_task=3)
-        formatted_new_first_timer = format_dataframe(new_first_timer)
-
         return (
             new_df,  # Update cached raw data
-            formatted_new_first_timer,
             formatted_new_df,
             gr.update(choices=get_unique_values(new_df, "task")),
             gr.update(choices=get_unique_values(new_df, "platform")),
@@ -221,7 +206,6 @@ def create_leaderboard_ui():
         fn=update_data,
         outputs=[
             raw_data_state,
-            first_timer_table,
             results_table,
             task_filter,
             platform_filter,
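The last two hunks have to move in lockstep: Gradio matches the values returned by the callback positionally against the components listed in `outputs`, which is why dropping `first_timer_table` from `outputs` also drops `formatted_new_first_timer` from `update_data`'s return tuple. A minimal, self-contained sketch of that pattern (component names here are invented, not this app's):

import gradio as gr

with gr.Blocks() as demo:
    table = gr.DataFrame(label="Results")
    status = gr.Markdown()
    refresh_btn = gr.Button("🔄 Refresh Data")

    def refresh():
        # One return value per component in `outputs`, in the same order;
        # removing a component from `outputs` means removing its value here too.
        return [[1, 2], [3, 4]], "Data refreshed."

    refresh_btn.click(fn=refresh, outputs=[table, status])

demo.launch()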
 
leaderboard/src/leaderboard/data_loader.py CHANGED
@@ -74,6 +74,9 @@ def load_benchmark_data(
     # Enrich with HuggingFace model metadata
     df = enrich_with_hf_metadata(df)
 
+    # Add first-timer-friendly score
+    df = add_first_timer_score(df)
+
     # Sort by model name and timestamp
     if "modelId" in df.columns and "timestamp" in df.columns:
         df = df.sort_values(["modelId", "timestamp"], ascending=[True, False])
@@ -198,6 +201,97 @@ def enrich_with_hf_metadata(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
+def add_first_timer_score(df: pd.DataFrame) -> pd.DataFrame:
+    """Add first-timer-friendly score to all rows in the dataframe.
+
+    The score is calculated per task, normalized from 0-100 where:
+    - Higher score = better for first-timers
+    - Based on: downloads (25%), likes (15%), load time (30%), inference time (30%)
+
+    Args:
+        df: DataFrame containing benchmark results
+
+    Returns:
+        DataFrame with added 'first_timer_score' column
+    """
+    if df.empty:
+        return df
+
+    # Filter only successful benchmarks
+    filtered = df[df["status"] == "completed"].copy() if "status" in df.columns else df.copy()
+
+    if filtered.empty:
+        # Add empty score column for failed benchmarks
+        df["first_timer_score"] = None
+        return df
+
+    # Check if task column exists
+    if "task" not in filtered.columns:
+        df["first_timer_score"] = None
+        return df
+
+    # Calculate score per task
+    for task in filtered["task"].unique():
+        task_mask = filtered["task"] == task
+        task_df = filtered[task_mask].copy()
+
+        if task_df.empty:
+            continue
+
+        # Normalize metrics within this task (0-1 scale)
+
+        # Downloads score (0-1, higher is better)
+        if "downloads" in task_df.columns:
+            max_downloads = task_df["downloads"].max()
+            downloads_score = task_df["downloads"] / max_downloads if max_downloads > 0 else 0
+        else:
+            downloads_score = 0
+
+        # Likes score (0-1, higher is better)
+        if "likes" in task_df.columns:
+            max_likes = task_df["likes"].max()
+            likes_score = task_df["likes"] / max_likes if max_likes > 0 else 0
+        else:
+            likes_score = 0
+
+        # Load time score (0-1, lower time is better)
+        if "load_ms_p50" in task_df.columns:
+            max_load = task_df["load_ms_p50"].max()
+            load_score = 1 - (task_df["load_ms_p50"] / max_load) if max_load > 0 else 0
+        else:
+            load_score = 0
+
+        # Inference time score (0-1, lower time is better)
+        if "first_infer_ms_p50" in task_df.columns:
+            max_infer = task_df["first_infer_ms_p50"].max()
+            infer_score = 1 - (task_df["first_infer_ms_p50"] / max_infer) if max_infer > 0 else 0
+        else:
+            infer_score = 0
+
+        # Calculate weighted score and scale to 0-100
+        weighted_score = (
+            (downloads_score * 0.25) +
+            (likes_score * 0.15) +
+            (load_score * 0.30) +
+            (infer_score * 0.30)
+        ) * 100
+
+        # Assign scores back to the filtered dataframe
+        filtered.loc[task_mask, "first_timer_score"] = weighted_score
+
+    # Merge scores back to original dataframe
+    if "first_timer_score" in filtered.columns:
+        df = df.merge(
+            filtered[["id", "first_timer_score"]],
+            on="id",
+            how="left"
+        )
+    else:
+        df["first_timer_score"] = None
+
+    return df
+
+
 def get_first_timer_friendly_models(df: pd.DataFrame, limit_per_task: int = 3) -> pd.DataFrame:
     """Identify first-timer-friendly models based on popularity and performance, grouped by task.
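To make the weighting in `add_first_timer_score` concrete, here is a worked example with invented inputs for a model that leads its task in downloads and likes but sits mid-pack on speed:

# Hypothetical per-task normalized metrics (0-1), mirroring the code above.
downloads_score = 1.0   # downloads / max downloads within the task
likes_score = 1.0       # likes / max likes within the task
load_score = 0.5        # 1 - load_ms_p50 / max load_ms_p50
infer_score = 0.5       # 1 - first_infer_ms_p50 / max first_infer_ms_p50

first_timer_score = (
    downloads_score * 0.25
    + likes_score * 0.15
    + load_score * 0.30
    + infer_score * 0.30
) * 100
print(first_timer_score)  # 70.0 -> rendered as "⭐⭐ 70" by the formatter

Note that the UI legend's "popularity (40%)" is the sum of the downloads (25%) and likes (15%) weights here, so popularity alone can contribute at most 40 points.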
 
leaderboard/src/leaderboard/formatters.py CHANGED
@@ -244,6 +244,31 @@ def format_likes(likes: Optional[int]) -> str:
     return f"{emoji} {likes}"
 
 
+def format_first_timer_score(score: Optional[float]) -> str:
+    """Format first-timer-friendly score with emoji.
+
+    Args:
+        score: First-timer score (0-100)
+
+    Returns:
+        Formatted string with emoji
+    """
+    if score is None:
+        return "-"
+
+    # Format based on score (0-100 scale)
+    if score >= 80:
+        emoji = "⭐⭐⭐"  # Excellent
+    elif score >= 60:
+        emoji = "⭐⭐"  # Good
+    elif score >= 40:
+        emoji = "⭐"  # Fair
+    else:
+        emoji = "·"  # Below average
+
+    return f"{emoji} {score:.0f}"
+
+
 def apply_formatting(df_dict: dict) -> dict:
     """Apply emoji formatting to a benchmark result dictionary.
 
@@ -314,4 +339,8 @@ def apply_formatting(df_dict: dict) -> dict:
     if "likes" in formatted:
         formatted["likes"] = format_likes(formatted["likes"])
 
+    # Format first-timer score
+    if "first_timer_score" in formatted:
+        formatted["first_timer_score"] = format_first_timer_score(formatted["first_timer_score"])
+
     return formatted
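Assuming the new `format_first_timer_score` is importable from `leaderboard.formatters`, a quick spot check of the star thresholds (scores invented):

from leaderboard.formatters import format_first_timer_score

for score in (85.0, 72.4, 55.0, 12.0, None):
    print(format_first_timer_score(score))
# ⭐⭐⭐ 85
# ⭐⭐ 72
# ⭐ 55
# · 12
# -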