Print first-timer-friendly score in the table
leaderboard/src/leaderboard/app.py
CHANGED
@@ -13,7 +13,6 @@ from dotenv import load_dotenv
 from leaderboard.data_loader import (
     load_benchmark_data,
     get_unique_values,
-    get_first_timer_friendly_models,
 )
 from leaderboard.formatters import apply_formatting
 
@@ -115,23 +114,10 @@ def create_leaderboard_ui():
             "Please set the environment variable to load benchmark data."
         )
 
-
-
-
-
-            "and quick to run. Perfect for getting started with Transformers.js.\n\n"
-            "**Showing top 3 models per task type.**"
-        )
-
-        first_timer_models = get_first_timer_friendly_models(df, limit_per_task=3)
-        formatted_first_timer = format_dataframe(first_timer_models)
-
-        first_timer_table = gr.DataFrame(
-            value=formatted_first_timer,
-            label="Top First-Timer-Friendly Models (by Task)",
-            interactive=False,
-            wrap=True,
-        )
+        gr.Markdown(
+            "💡 **Tip:** Sort by the **first_timer_score** column to find models that are "
+            "popular, fast to load, and quick to run - perfect for getting started!"
+        )
 
         with gr.Row():
             refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")
@@ -187,7 +173,11 @@ def create_leaderboard_ui():
             "- **p50/p90**: 50th and 90th percentile values\n\n"
             "**HuggingFace Metrics:**\n"
             "- **downloads**: Total downloads from HuggingFace Hub\n"
-            "- **likes**: Number of likes on HuggingFace Hub"
+            "- **likes**: Number of likes on HuggingFace Hub\n\n"
+            "**First-Timer Score:**\n"
+            "- **first_timer_score**: 0-100 score combining popularity (40%), load time (30%), and inference time (30%)\n"
+            "- Higher score = better for first-timers (normalized per task)\n"
+            "- ⭐⭐⭐ Excellent (80+), ⭐⭐ Good (60+), ⭐ Fair (40+)"
         )
 
     def update_data():
@@ -195,13 +185,8 @@ def create_leaderboard_ui():
         new_df = load_data()
         formatted_new_df = format_dataframe(new_df)
 
-        # Update first-timer-friendly models (3 per task)
-        new_first_timer = get_first_timer_friendly_models(new_df, limit_per_task=3)
-        formatted_new_first_timer = format_dataframe(new_first_timer)
-
         return (
             new_df,  # Update cached raw data
-            formatted_new_first_timer,
             formatted_new_df,
             gr.update(choices=get_unique_values(new_df, "task")),
             gr.update(choices=get_unique_values(new_df, "platform")),
@@ -221,7 +206,6 @@ def create_leaderboard_ui():
            fn=update_data,
            outputs=[
                raw_data_state,
-                first_timer_table,
                results_table,
                task_filter,
                platform_filter,
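The two removals at the end of this diff have to move together: Gradio maps `update_data`'s return tuple onto the `click` handler's `outputs` list by position, so dropping `first_timer_table` from `outputs` only stays correct because `formatted_new_first_timer` left the return tuple at the same index. A minimal self-contained sketch of that positional contract (the helper bodies below are hypothetical stand-ins, not the app's real code):

import gradio as gr
import pandas as pd

# Hypothetical stand-ins for the app's helpers.
def load_data() -> pd.DataFrame:
    return pd.DataFrame({"task": ["fill-mask"], "platform": ["wasm"]})

def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    return df

def get_unique_values(df: pd.DataFrame, col: str) -> list:
    return sorted(df[col].unique().tolist())

def update_data():
    new_df = load_data()
    formatted_new_df = format_dataframe(new_df)
    return (
        new_df,                                                    # -> raw_data_state
        formatted_new_df,                                          # -> results_table
        gr.update(choices=get_unique_values(new_df, "task")),      # -> task_filter
        gr.update(choices=get_unique_values(new_df, "platform")),  # -> platform_filter
    )

with gr.Blocks() as demo:
    raw_data_state = gr.State()
    results_table = gr.DataFrame()
    task_filter = gr.Dropdown(choices=[])
    platform_filter = gr.Dropdown(choices=[])
    refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")
    # Each returned value lands in the component at the same index of `outputs`.
    refresh_btn.click(
        fn=update_data,
        outputs=[raw_data_state, results_table, task_filter, platform_filter],
    )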
leaderboard/src/leaderboard/data_loader.py
CHANGED
@@ -74,6 +74,9 @@ def load_benchmark_data(
     # Enrich with HuggingFace model metadata
     df = enrich_with_hf_metadata(df)
 
+    # Add first-timer-friendly score
+    df = add_first_timer_score(df)
+
     # Sort by model name and timestamp
     if "modelId" in df.columns and "timestamp" in df.columns:
         df = df.sort_values(["modelId", "timestamp"], ascending=[True, False])
@@ -198,6 +201,97 @@ def enrich_with_hf_metadata(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
+def add_first_timer_score(df: pd.DataFrame) -> pd.DataFrame:
+    """Add first-timer-friendly score to all rows in the dataframe.
+
+    The score is calculated per task, normalized from 0-100 where:
+    - Higher score = better for first-timers
+    - Based on: downloads (25%), likes (15%), load time (30%), inference time (30%)
+
+    Args:
+        df: DataFrame containing benchmark results
+
+    Returns:
+        DataFrame with added 'first_timer_score' column
+    """
+    if df.empty:
+        return df
+
+    # Filter only successful benchmarks
+    filtered = df[df["status"] == "completed"].copy() if "status" in df.columns else df.copy()
+
+    if filtered.empty:
+        # Add empty score column for failed benchmarks
+        df["first_timer_score"] = None
+        return df
+
+    # Check if task column exists
+    if "task" not in filtered.columns:
+        df["first_timer_score"] = None
+        return df
+
+    # Calculate score per task
+    for task in filtered["task"].unique():
+        task_mask = filtered["task"] == task
+        task_df = filtered[task_mask].copy()
+
+        if task_df.empty:
+            continue
+
+        # Normalize metrics within this task (0-1 scale)
+
+        # Downloads score (0-1, higher is better)
+        if "downloads" in task_df.columns:
+            max_downloads = task_df["downloads"].max()
+            downloads_score = task_df["downloads"] / max_downloads if max_downloads > 0 else 0
+        else:
+            downloads_score = 0
+
+        # Likes score (0-1, higher is better)
+        if "likes" in task_df.columns:
+            max_likes = task_df["likes"].max()
+            likes_score = task_df["likes"] / max_likes if max_likes > 0 else 0
+        else:
+            likes_score = 0
+
+        # Load time score (0-1, lower time is better)
+        if "load_ms_p50" in task_df.columns:
+            max_load = task_df["load_ms_p50"].max()
+            load_score = 1 - (task_df["load_ms_p50"] / max_load) if max_load > 0 else 0
+        else:
+            load_score = 0
+
+        # Inference time score (0-1, lower time is better)
+        if "first_infer_ms_p50" in task_df.columns:
+            max_infer = task_df["first_infer_ms_p50"].max()
+            infer_score = 1 - (task_df["first_infer_ms_p50"] / max_infer) if max_infer > 0 else 0
+        else:
+            infer_score = 0
+
+        # Calculate weighted score and scale to 0-100
+        weighted_score = (
+            (downloads_score * 0.25) +
+            (likes_score * 0.15) +
+            (load_score * 0.30) +
+            (infer_score * 0.30)
+        ) * 100
+
+        # Assign scores back to the filtered dataframe
+        filtered.loc[task_mask, "first_timer_score"] = weighted_score
+
+    # Merge scores back to original dataframe
+    if "first_timer_score" in filtered.columns:
+        df = df.merge(
+            filtered[["id", "first_timer_score"]],
+            on="id",
+            how="left"
+        )
+    else:
+        df["first_timer_score"] = None
+
+    return df
+
+
 def get_first_timer_friendly_models(df: pd.DataFrame, limit_per_task: int = 3) -> pd.DataFrame:
     """Identify first-timer-friendly models based on popularity and performance, grouped by task.
 
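Since all four components are normalized against the per-task maximum, the weighting in add_first_timer_score is easiest to see on concrete numbers. Below is a worked example with two hypothetical models in one task (made-up figures, assuming the function is importable from leaderboard.data_loader and that benchmark rows carry the "id" column the final merge relies on). Two side effects of max-normalization worth knowing: the slowest model in a task gets zero credit for both timing components, and a task with only one benchmarked model caps out at 40, since its load and inference scores are both 1 - max/max = 0.

import pandas as pd
from leaderboard.data_loader import add_first_timer_score

# Two hypothetical models benchmarked on the same task (made-up numbers).
df = pd.DataFrame({
    "id": [1, 2],
    "status": ["completed", "completed"],
    "task": ["text-classification", "text-classification"],
    "downloads": [1000, 500],
    "likes": [50, 10],
    "load_ms_p50": [200.0, 400.0],
    "first_infer_ms_p50": [50.0, 100.0],
})

scored = add_first_timer_score(df)

# Model 1: 1.00*0.25 + 1.00*0.15 + (1 - 200/400)*0.30 + (1 - 50/100)*0.30 = 0.700 -> 70.0
# Model 2: 0.50*0.25 + 0.20*0.15 + (1 - 400/400)*0.30 + (1 - 100/100)*0.30 = 0.155 -> 15.5
print(scored["first_timer_score"].tolist())  # [70.0, 15.5]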
leaderboard/src/leaderboard/formatters.py
CHANGED
@@ -244,6 +244,31 @@ def format_likes(likes: Optional[int]) -> str:
     return f"{emoji} {likes}"
 
 
+def format_first_timer_score(score: Optional[float]) -> str:
+    """Format first-timer-friendly score with emoji.
+
+    Args:
+        score: First-timer score (0-100)
+
+    Returns:
+        Formatted string with emoji
+    """
+    if score is None:
+        return "-"
+
+    # Format based on score (0-100 scale)
+    if score >= 80:
+        emoji = "⭐⭐⭐"  # Excellent
+    elif score >= 60:
+        emoji = "⭐⭐"  # Good
+    elif score >= 40:
+        emoji = "⭐"  # Fair
+    else:
+        emoji = "·"  # Below average
+
+    return f"{emoji} {score:.0f}"
+
+
 def apply_formatting(df_dict: dict) -> dict:
     """Apply emoji formatting to a benchmark result dictionary.
 
@@ -314,4 +339,8 @@ def apply_formatting(df_dict: dict) -> dict:
     if "likes" in formatted:
         formatted["likes"] = format_likes(formatted["likes"])
 
+    # Format first-timer score
+    if "first_timer_score" in formatted:
+        formatted["first_timer_score"] = format_first_timer_score(formatted["first_timer_score"])
+
     return formatted
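The star thresholds here match the legend added to app.py's help text. A quick usage sketch on plain floats (hypothetical values, assuming the function is importable from leaderboard.formatters):

from leaderboard.formatters import format_first_timer_score

print(format_first_timer_score(None))  # "-"
print(format_first_timer_score(86.4))  # "⭐⭐⭐ 86"
print(format_first_timer_score(61.0))  # "⭐⭐ 61"
print(format_first_timer_score(42.0))  # "⭐ 42"
print(format_first_timer_score(15.5))  # "· 16"

One caveat: pandas usually surfaces a missing score as NaN rather than None; NaN fails every >= comparison and falls into the "·" branch, rendering as "· nan". Guarding with pd.isna(score) instead of `score is None` would cover both cases.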