|
|
""" |
|
|
Transformers.js Benchmark Leaderboard |
|
|
|
|
|
A Gradio app that displays benchmark results from a HuggingFace Dataset repository. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import logging |
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
from data_loader import ( |
|
|
load_benchmark_data, |
|
|
get_unique_values, |
|
|
get_webgpu_beginner_friendly_models, |
|
|
format_recommended_models_as_markdown, |
|
|
) |
|
|
from formatters import apply_formatting |
|
|
|
|
|
|
|
|
# Configure root logging once at import time: timestamped, INFO-level records.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
|
|
|
|
|
|
|
|
# Pull optional configuration from a local .env file (no-op if absent).
load_dotenv()

# Source dataset repo for benchmark results, plus the token used to read it.
# Both may be None; load_data() passes them through as-is.
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO")
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
|
|
|
|
def load_data() -> pd.DataFrame:
    """Fetch benchmark results from the configured HF Dataset repository.

    Returns:
        A DataFrame of benchmark rows, using the module-level
        HF_DATASET_REPO / HF_TOKEN configuration.
    """
    return load_benchmark_data(dataset_repo=HF_DATASET_REPO, token=HF_TOKEN)
|
|
|
|
|
|
|
|
def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a display-ready copy of *df*, formatting each row for the UI.

    Empty frames are returned untouched so downstream components still get
    a valid (if blank) table.
    """
    if df.empty:
        return df

    def _format_row(row: pd.Series) -> pd.Series:
        # apply_formatting operates on plain dicts, so round-trip through one.
        return pd.Series(apply_formatting(row.to_dict()))

    return df.apply(_format_row, axis=1)
|
|
|
|
|
|
|
|
def filter_data(
    df: pd.DataFrame,
    model_filter: str,
    task_filter: str,
    platform_filter: str,
    device_filter: str,
    mode_filter: str,
    dtype_filter: str,
    status_filter: str,
) -> pd.DataFrame:
    """Filter benchmark data based on user inputs.

    Args:
        df: Raw benchmark rows.
        model_filter: Case-insensitive substring to match against ``modelId``,
            treated as a literal (NOT a regex). Empty string disables it.
        task_filter, platform_filter, device_filter, mode_filter,
        dtype_filter, status_filter: Exact-match dropdown values; "All" (or
            an empty/None value) disables the corresponding filter.

    Returns:
        A filtered copy of ``df`` (``df`` itself when it is empty).
    """
    if df.empty:
        return df

    filtered = df.copy()

    if model_filter:
        # regex=False: treat the user's text as a literal substring so inputs
        # containing regex metacharacters (e.g. "c++", "(") cannot raise.
        filtered = filtered[
            filtered["modelId"].str.contains(
                model_filter, case=False, na=False, regex=False
            )
        ]

    # Exact-match dropdown filters; "All" means "no filtering on this column".
    exact_filters = {
        "task": task_filter,
        "platform": platform_filter,
        "device": device_filter,
        "mode": mode_filter,
        "dtype": dtype_filter,
        "status": status_filter,
    }
    for column, value in exact_filters.items():
        if value and value != "All":
            filtered = filtered[filtered[column] == value]

    return filtered
|
|
|
|
|
|
|
|
def create_leaderboard_ui():
    """Build and return the Gradio Blocks app for the leaderboard.

    Data is loaded once while the UI is constructed; the refresh button
    re-fetches everything, and each filter widget re-runs the filtering
    pipeline against the raw data held in ``raw_data_state``.
    """
    df = load_data()
    formatted_df = format_dataframe(df)

    with gr.Blocks(title="Transformers.js Benchmark Leaderboard") as demo:
        # Keep the unformatted data in state so filters operate on raw values,
        # not on the display-formatted strings.
        raw_data_state = gr.State(df)

        gr.Markdown("# π Transformers.js Benchmark Leaderboard")
        gr.Markdown(
            "Compare benchmark results for different models, platforms, and configurations."
        )

        if not HF_DATASET_REPO:
            gr.Markdown(
                "β οΈ **HF_DATASET_REPO not configured.** "
                "Please set the environment variable to load benchmark data."
            )

        gr.Markdown(
            "π‘ **Tip:** Use the recommended models section below to find popular models "
            "that are fast to load and quick to run - perfect for getting started!"
        )

        # --- Recommended models section ---------------------------------
        gr.Markdown("## β Recommended WebGPU Models for Beginners")
        gr.Markdown(
            "These models are selected for being:\n"
            "- **WebGPU compatible** - Work in modern browsers with GPU acceleration\n"
            "- **Beginner-friendly** - Popular, fast to load, and quick to run\n"
            "- Sorted by task type, showing top 3-5 models per task"
        )

        recommended_models = get_webgpu_beginner_friendly_models(df, limit_per_task=5)
        formatted_recommended = format_dataframe(recommended_models)
        markdown_output = format_recommended_models_as_markdown(recommended_models)

        recommended_table = gr.DataFrame(
            value=formatted_recommended,
            label="Top WebGPU-Compatible Models by Task",
            interactive=False,
            wrap=True,
        )

        gr.Markdown("### π Markdown Output for llms.txt")
        gr.Markdown(
            "Copy the markdown below to embed in your llms.txt or documentation:"
        )

        markdown_textbox = gr.Textbox(
            value=markdown_output,
            label="Markdown for llms.txt",
            lines=20,
            max_lines=30,
            show_copy_button=True,
            interactive=False,
        )

        # --- Full results section ----------------------------------------
        gr.Markdown("---")
        gr.Markdown("## π Full Benchmark Results")

        with gr.Row():
            refresh_btn = gr.Button("π Refresh Data", variant="primary")

        with gr.Row():
            model_filter = gr.Textbox(
                label="Model Name",
                placeholder="Filter by model name (e.g., 'bert', 'gpt')",
            )
            task_filter = gr.Dropdown(
                label="Task",
                choices=get_unique_values(df, "task"),
                value="All",
            )

        with gr.Row():
            platform_filter = gr.Dropdown(
                label="Platform",
                choices=get_unique_values(df, "platform"),
                value="All",
            )
            device_filter = gr.Dropdown(
                label="Device",
                choices=get_unique_values(df, "device"),
                value="All",
            )

        with gr.Row():
            mode_filter = gr.Dropdown(
                label="Mode",
                choices=get_unique_values(df, "mode"),
                value="All",
            )
            dtype_filter = gr.Dropdown(
                label="DType",
                choices=get_unique_values(df, "dtype"),
                value="All",
            )
            status_filter = gr.Dropdown(
                label="Status",
                choices=get_unique_values(df, "status"),
                value="All",
            )

        results_table = gr.DataFrame(
            value=formatted_df,
            label="All Benchmark Results",
            interactive=False,
            wrap=True,
        )

        gr.Markdown("### π Metrics")
        gr.Markdown(
            "**Benchmark Metrics:**\n"
            "- **load_ms**: Model loading time in milliseconds\n"
            "- **first_infer_ms**: First inference time in milliseconds\n"
            "- **subsequent_infer_ms**: Subsequent inference time in milliseconds\n"
            "- **p50/p90**: 50th and 90th percentile values\n\n"
            "**HuggingFace Metrics:**\n"
            "- **downloads**: Total downloads from HuggingFace Hub\n"
            "- **likes**: Number of likes on HuggingFace Hub\n\n"
            "**WebGPU Compatibility:**\n"
            "- Models in the recommended section are all WebGPU compatible\n"
            "- WebGPU enables GPU acceleration in modern browsers\n\n"
            "**β οΈ Important Note About Performance Metrics:**\n"
            "All metrics are measured in a controlled benchmark environment. "
            "They are useful for **comparing models against each other**, but may not reflect "
            "actual performance in your environment. Factors like hardware, browser, OS, and system load affect real-world performance. "
            "We recommend testing models in your own environment for accurate measurements."
        )

        def update_data():
            """Reload data from HuggingFace and recompute every component."""
            new_df = load_data()
            formatted_new_df = format_dataframe(new_df)

            new_recommended = get_webgpu_beginner_friendly_models(new_df, limit_per_task=5)
            formatted_new_recommended = format_dataframe(new_recommended)
            new_markdown = format_recommended_models_as_markdown(new_recommended)

            # Order must match the `outputs` list of refresh_btn.click below.
            return (
                new_df,
                formatted_new_recommended,
                new_markdown,
                formatted_new_df,
                gr.update(choices=get_unique_values(new_df, "task")),
                gr.update(choices=get_unique_values(new_df, "platform")),
                gr.update(choices=get_unique_values(new_df, "device")),
                gr.update(choices=get_unique_values(new_df, "mode")),
                gr.update(choices=get_unique_values(new_df, "dtype")),
                gr.update(choices=get_unique_values(new_df, "status")),
            )

        def apply_filters(raw_df, model, task, platform, device, mode, dtype, status):
            """Apply filters and return filtered DataFrame."""
            filtered = filter_data(raw_df, model, task, platform, device, mode, dtype, status)
            return format_dataframe(filtered)

        refresh_btn.click(
            fn=update_data,
            outputs=[
                raw_data_state,
                recommended_table,
                markdown_textbox,
                results_table,
                task_filter,
                platform_filter,
                device_filter,
                mode_filter,
                dtype_filter,
                status_filter,
            ],
        )

        # First element is the state holding the raw data; the rest are the
        # interactive filter widgets. Each widget re-filters on change.
        filter_inputs = [
            raw_data_state,
            model_filter,
            task_filter,
            platform_filter,
            device_filter,
            mode_filter,
            dtype_filter,
            status_filter,
        ]
        for filter_component in filter_inputs[1:]:
            filter_component.change(
                fn=apply_filters,
                inputs=filter_inputs,
                outputs=results_table,
            )

    return demo
|
|
|
|
|
|
|
|
# Build the app at import time so hosting platforms (e.g. HF Spaces) can
# discover the module-level `demo` object, but only start the server when
# the file is executed directly — importing the module stays side-effect
# free beyond UI construction.
demo = create_leaderboard_ui()

if __name__ == "__main__":
    demo.launch()
|
|
|