Spaces:

whitphx
/

transformersjs-bench-leaderboard

Sleeping

File size: 10,988 Bytes

"""
Transformers.js Benchmark Leaderboard

A Gradio app that displays benchmark results from a HuggingFace Dataset repository.
"""

import os
import logging
import pandas as pd
import gradio as gr
from dotenv import load_dotenv

from data_loader import (
    load_benchmark_data,
    get_unique_values,
    get_webgpu_beginner_friendly_models,
    format_recommended_models_as_markdown,
)
from formatters import apply_formatting

# Root logger setup: INFO level, timestamped "<time> - <logger> - <level> - <message>" records.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load environment variables (python-dotenv) before the settings below are read.
load_dotenv()

# Both settings are None when the corresponding variable is not set.
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO")
HF_TOKEN = os.environ.get("HF_TOKEN")


def load_data() -> pd.DataFrame:
    """Fetch the benchmark results as a DataFrame.

    Delegates to ``load_benchmark_data``, passing the module-level
    ``HF_DATASET_REPO`` and ``HF_TOKEN`` settings as the dataset repository
    and access token.
    """
    return load_benchmark_data(dataset_repo=HF_DATASET_REPO, token=HF_TOKEN)


def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Produce the display version of ``df``.

    Each row is converted to a dict, passed through ``apply_formatting``, and
    the returned mapping becomes the corresponding output row. An empty frame
    is handed back untouched (the same object).
    """
    if not df.empty:

        def _format_row(record):
            # One dict in, one Series out; ``apply`` stitches them into a frame.
            return pd.Series(apply_formatting(record.to_dict()))

        return df.apply(_format_row, axis=1)

    return df


def filter_data(
    df: pd.DataFrame,
    model_filter: str,
    task_filter: str,
    platform_filter: str,
    device_filter: str,
    mode_filter: str,
    dtype_filter: str,
    status_filter: str,
) -> pd.DataFrame:
    """Return the rows of ``df`` that satisfy every active filter.

    Args:
        df: Benchmark results. Only the columns of *active* filters are
            accessed (``modelId``, ``task``, ``platform``, ``device``,
            ``mode``, ``dtype``, ``status``).
        model_filter: Text searched for inside ``modelId`` as a
            case-insensitive **literal** substring. Empty/``None`` disables
            the filter. Rows whose ``modelId`` is missing never match.
        task_filter: Exact ``task`` value to keep.
        platform_filter: Exact ``platform`` value to keep.
        device_filter: Exact ``device`` value to keep.
        mode_filter: Exact ``mode`` value to keep.
        dtype_filter: Exact ``dtype`` value to keep.
        status_filter: Exact ``status`` value to keep.

        For the six exact-match filters, an empty/``None`` value or the
        sentinel ``"All"`` disables the filter.

    Returns:
        A new DataFrame holding the matching rows with their original index.
        An empty ``df`` is returned as-is.
    """
    if df.empty:
        return df

    # Accumulate one boolean mask and index once at the end; this avoids the
    # upfront copy and the intermediate frames of filtering step by step.
    keep = pd.Series(True, index=df.index)

    if model_filter:
        # regex=False: the text comes straight from a search box, so it must be
        # matched literally. As a regex, input such as "(" or "c++" raises
        # re.error and "." silently matches any character.
        keep = keep & df["modelId"].str.contains(
            model_filter, case=False, na=False, regex=False
        )

    exact_matches = (
        ("task", task_filter),
        ("platform", platform_filter),
        ("device", device_filter),
        ("mode", mode_filter),
        ("dtype", dtype_filter),
        ("status", status_filter),
    )
    for column, wanted in exact_matches:
        if wanted and wanted != "All":
            keep = keep & (df[column] == wanted)

    return df[keep]


def create_leaderboard_ui():
    """Build the Gradio Blocks app for the leaderboard.

    Benchmark data is loaded once while the UI is constructed and kept in a
    ``gr.State`` so that filter changes re-filter the cached frame instead of
    calling ``load_data`` again. The "Refresh Data" button reloads the data,
    rebuilds the recommended-models table and markdown, shows the full
    (unfiltered) results, and resets every filter control to its default so
    the visible filter values always agree with the table contents.

    Returns:
        The constructed ``gr.Blocks`` instance (not launched).
    """
    # Columns behind the dropdown filters, in the same order as the dropdowns
    # appear in the refresh outputs and in ``filter_data``'s parameters.
    dropdown_columns = ("task", "platform", "device", "mode", "dtype", "status")

    # Load initial data
    df = load_data()
    formatted_df = format_dataframe(df)

    with gr.Blocks(title="Transformers.js Benchmark Leaderboard") as demo:
        # Cache raw data in Gradio state to avoid reloading on every filter change
        raw_data_state = gr.State(df)
        gr.Markdown("# 🏆 Transformers.js Benchmark Leaderboard")
        gr.Markdown(
            "Compare benchmark results for different models, platforms, and configurations."
        )

        if not HF_DATASET_REPO:
            gr.Markdown(
                "⚠️ **HF_DATASET_REPO not configured.** "
                "Please set the environment variable to load benchmark data."
            )

        gr.Markdown(
            "💡 **Tip:** Use the recommended models section below to find popular models "
            "that are fast to load and quick to run - perfect for getting started!"
        )

        # Recommended models section
        gr.Markdown("## ⭐ Recommended WebGPU Models for Beginners")
        gr.Markdown(
            "These models are selected for being:\n"
            "- **WebGPU compatible** - Work in modern browsers with GPU acceleration\n"
            "- **Beginner-friendly** - Popular, fast to load, and quick to run\n"
            "- Sorted by task type, showing top 3-5 models per task"
        )

        # Get recommended models
        recommended_models = get_webgpu_beginner_friendly_models(df, limit_per_task=5)
        formatted_recommended = format_dataframe(recommended_models)
        markdown_output = format_recommended_models_as_markdown(recommended_models)

        recommended_table = gr.DataFrame(
            value=formatted_recommended,
            label="Top WebGPU-Compatible Models by Task",
            interactive=False,
            wrap=True,
        )

        gr.Markdown("### 📝 Markdown Output for llms.txt")
        gr.Markdown(
            "Copy the markdown below to embed in your llms.txt or documentation:"
        )

        markdown_textbox = gr.Textbox(
            value=markdown_output,
            label="Markdown for llms.txt",
            lines=20,
            max_lines=30,
            show_copy_button=True,
            interactive=False,
        )

        gr.Markdown("---")
        gr.Markdown("## 🔍 Full Benchmark Results")

        with gr.Row():
            refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")

        # NOTE(review): every dropdown defaults to "All"; this assumes
        # get_unique_values() includes that sentinel in its choices - confirm
        # in data_loader.
        with gr.Row():
            model_filter = gr.Textbox(
                label="Model Name",
                placeholder="Filter by model name (e.g., 'bert', 'gpt')",
            )
            task_filter = gr.Dropdown(
                label="Task",
                choices=get_unique_values(df, "task"),
                value="All",
            )

        with gr.Row():
            platform_filter = gr.Dropdown(
                label="Platform",
                choices=get_unique_values(df, "platform"),
                value="All",
            )
            device_filter = gr.Dropdown(
                label="Device",
                choices=get_unique_values(df, "device"),
                value="All",
            )

        with gr.Row():
            mode_filter = gr.Dropdown(
                label="Mode",
                choices=get_unique_values(df, "mode"),
                value="All",
            )
            dtype_filter = gr.Dropdown(
                label="DType",
                choices=get_unique_values(df, "dtype"),
                value="All",
            )
            status_filter = gr.Dropdown(
                label="Status",
                choices=get_unique_values(df, "status"),
                value="All",
            )

        results_table = gr.DataFrame(
            value=formatted_df,
            label="All Benchmark Results",
            interactive=False,
            wrap=True,
        )

        gr.Markdown("### 📊 Metrics")
        gr.Markdown(
            "**Benchmark Metrics:**\n"
            "- **load_ms**: Model loading time in milliseconds\n"
            "- **first_infer_ms**: First inference time in milliseconds\n"
            "- **subsequent_infer_ms**: Subsequent inference time in milliseconds\n"
            "- **p50/p90**: 50th and 90th percentile values\n\n"
            "**HuggingFace Metrics:**\n"
            "- **downloads**: Total downloads from HuggingFace Hub\n"
            "- **likes**: Number of likes on HuggingFace Hub\n\n"
            "**WebGPU Compatibility:**\n"
            "- Models in the recommended section are all WebGPU compatible\n"
            "- WebGPU enables GPU acceleration in modern browsers\n\n"
            "**⚠️ Important Note About Performance Metrics:**\n"
            "All metrics are measured in a controlled benchmark environment. "
            "They are useful for **comparing models against each other**, but may not reflect "
            "actual performance in your environment. Factors like hardware, browser, OS, and system load affect real-world performance. "
            "We recommend testing models in your own environment for accurate measurements."
        )

        def update_data():
            """Reload the data and return fresh values for every refreshed component.

            The returned tuple matches the ``outputs`` list of
            ``refresh_btn.click`` below, element for element.
            """
            new_df = load_data()
            formatted_new_df = format_dataframe(new_df)

            # Update recommended models
            new_recommended = get_webgpu_beginner_friendly_models(new_df, limit_per_task=5)
            formatted_new_recommended = format_dataframe(new_recommended)
            new_markdown = format_recommended_models_as_markdown(new_recommended)

            # Reset each dropdown to "All" along with its new choices. Updating
            # only the choices would leave stale selections on screen while the
            # table below shows unfiltered data.
            dropdown_resets = [
                gr.update(choices=get_unique_values(new_df, column), value="All")
                for column in dropdown_columns
            ]

            return (
                new_df,  # Update cached raw data
                formatted_new_recommended,  # Update recommended models
                new_markdown,  # Update markdown output
                formatted_new_df,  # Full, unfiltered results
                "",  # Clear the model-name search box
                *dropdown_resets,
            )

        def apply_filters(raw_df, model, task, platform, device, mode, dtype, status):
            """Filter the cached raw data and return it formatted for display."""
            # Use cached raw data instead of reloading
            filtered = filter_data(raw_df, model, task, platform, device, mode, dtype, status)
            return format_dataframe(filtered)

        # Refresh button updates data and resets filters
        refresh_btn.click(
            fn=update_data,
            outputs=[
                raw_data_state,
                recommended_table,
                markdown_textbox,
                results_table,
                model_filter,
                task_filter,
                platform_filter,
                device_filter,
                mode_filter,
                dtype_filter,
                status_filter,
            ],
        )

        # Filter inputs update the table (using cached raw data)
        filter_controls = [
            model_filter,
            task_filter,
            platform_filter,
            device_filter,
            mode_filter,
            dtype_filter,
            status_filter,
        ]
        filter_inputs = [raw_data_state, *filter_controls]

        # Every filter control triggers the same re-filter of the cached data.
        for control in filter_controls:
            control.change(
                fn=apply_filters,
                inputs=filter_inputs,
                outputs=results_table,
            )

    return demo


# Build the app when this module is executed/imported so that `demo` exists as
# a module-level name, then launch it.
demo = create_leaderboard_ui()
demo.launch()