|
|
""" |
|
|
Transformers.js Benchmark Leaderboard |
|
|
|
|
|
A Gradio app that displays benchmark results from a HuggingFace Dataset repository. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import logging |
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
from data_loader import ( |
|
|
load_benchmark_data, |
|
|
get_unique_values, |
|
|
get_webgpu_beginner_friendly_models, |
|
|
format_recommended_models_as_markdown, |
|
|
) |
|
|
from formatters import apply_formatting |
|
|
|
|
|
|
|
|
# Configure root logging once at import time: timestamped, INFO-level records.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
|
|
|
|
|
|
|
|
# Pull optional configuration from a local .env file (no-op if absent).
load_dotenv()

# Source dataset repo for benchmark results, plus the token used to read it.
# Both may be None; load_data() passes them through as-is.
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO")
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
|
|
|
|
def load_data() -> pd.DataFrame:
    """Fetch benchmark results from the configured HF Dataset repository.

    Returns:
        A DataFrame of benchmark rows, using the module-level
        HF_DATASET_REPO / HF_TOKEN configuration.
    """
    return load_benchmark_data(dataset_repo=HF_DATASET_REPO, token=HF_TOKEN)
|
|
|
|
|
|
|
|
def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a display-ready copy of *df*, formatting each row for the UI.

    Empty frames are returned untouched so downstream components still get
    a valid (if blank) table.
    """
    if df.empty:
        return df

    def _format_row(row: pd.Series) -> pd.Series:
        # apply_formatting operates on plain dicts, so round-trip through one.
        return pd.Series(apply_formatting(row.to_dict()))

    return df.apply(_format_row, axis=1)
|
|
|
|
|
|
|
|
def filter_data(
    df: pd.DataFrame,
    model_filter: str,
    task_filter: str,
    platform_filter: str,
    device_filter: str,
    mode_filter: str,
    dtype_filter: str,
    status_filter: str,
) -> pd.DataFrame:
    """Filter benchmark data based on user inputs.

    Args:
        df: Raw benchmark rows.
        model_filter: Case-insensitive substring to match against ``modelId``,
            treated as a literal (NOT a regex). Empty string disables it.
        task_filter, platform_filter, device_filter, mode_filter,
        dtype_filter, status_filter: Exact-match dropdown values; "All" (or
            an empty/None value) disables the corresponding filter.

    Returns:
        A filtered copy of ``df`` (``df`` itself when it is empty).
    """
    if df.empty:
        return df

    filtered = df.copy()

    if model_filter:
        # regex=False: treat the user's text as a literal substring so inputs
        # containing regex metacharacters (e.g. "c++", "(") cannot raise.
        filtered = filtered[
            filtered["modelId"].str.contains(
                model_filter, case=False, na=False, regex=False
            )
        ]

    # Exact-match dropdown filters; "All" means "no filtering on this column".
    exact_filters = {
        "task": task_filter,
        "platform": platform_filter,
        "device": device_filter,
        "mode": mode_filter,
        "dtype": dtype_filter,
        "status": status_filter,
    }
    for column, value in exact_filters.items():
        if value and value != "All":
            filtered = filtered[filtered[column] == value]

    return filtered
|
|
|
|
|
|
|
|
def create_leaderboard_ui():
    """Build and return the Gradio Blocks app for the leaderboard.

    Data is loaded once while the UI is constructed; the refresh button
    re-fetches everything, and each filter widget re-runs the filtering
    pipeline against the raw data held in ``raw_data_state``.
    """
    df = load_data()
    formatted_df = format_dataframe(df)

    with gr.Blocks(title="Transformers.js Benchmark Leaderboard") as demo:
        # Keep the unformatted data in state so filters operate on raw values,
        # not on the display-formatted strings.
        raw_data_state = gr.State(df)

        gr.Markdown("# π Transformers.js Benchmark Leaderboard")
        gr.Markdown(
            "Compare benchmark results for different models, platforms, and configurations."
        )

        if not HF_DATASET_REPO:
            gr.Markdown(
                "β οΈ **HF_DATASET_REPO not configured.** "
                "Please set the environment variable to load benchmark data."
            )

        gr.Markdown(
            "π‘ **Tip:** Use the recommended models section below to find popular models "
            "that are fast to load and quick to run - perfect for getting started!"
        )

        # --- Recommended models section ---------------------------------
        gr.Markdown("## β Recommended WebGPU Models for Beginners")
        gr.Markdown(
            "These models are selected for being:\n"
            "- **WebGPU compatible** - Work in modern browsers with GPU acceleration\n"
            "- **Beginner-friendly** - Popular, fast to load, and quick to run\n"
            "- Sorted by task type, showing top 3-5 models per task"
        )

        recommended_models = get_webgpu_beginner_friendly_models(df, limit_per_task=5)
        formatted_recommended = format_dataframe(recommended_models)
        markdown_output = format_recommended_models_as_markdown(recommended_models)

        recommended_table = gr.DataFrame(
            value=formatted_recommended,
            label="Top WebGPU-Compatible Models by Task",
            interactive=False,
            wrap=True,
        )

        gr.Markdown("### π Markdown Output for llms.txt")
        gr.Markdown(
            "Copy the markdown below to embed in your llms.txt or documentation:"
        )

        markdown_textbox = gr.Textbox(
            value=markdown_output,
            label="Markdown for llms.txt",
            lines=20,
            max_lines=30,
            show_copy_button=True,
            interactive=False,
        )

        # --- Full results section ----------------------------------------
        gr.Markdown("---")
        gr.Markdown("## π Full Benchmark Results")

        with gr.Row():
            refresh_btn = gr.Button("π Refresh Data", variant="primary")

        with gr.Row():
            model_filter = gr.Textbox(
                label="Model Name",
                placeholder="Filter by model name (e.g., 'bert', 'gpt')",
            )
            task_filter = gr.Dropdown(
                label="Task",
                choices=get_unique_values(df, "task"),
                value="All",
            )

        with gr.Row():
            platform_filter = gr.Dropdown(
                label="Platform",
                choices=get_unique_values(df, "platform"),
                value="All",
            )
            device_filter = gr.Dropdown(
                label="Device",
                choices=get_unique_values(df, "device"),
                value="All",
            )

        with gr.Row():
            mode_filter = gr.Dropdown(
                label="Mode",
                choices=get_unique_values(df, "mode"),
                value="All",
            )
            dtype_filter = gr.Dropdown(
                label="DType",
                choices=get_unique_values(df, "dtype"),
                value="All",
            )
            status_filter = gr.Dropdown(
                label="Status",
                choices=get_unique_values(df, "status"),
                value="All",
            )

        results_table = gr.DataFrame(
            value=formatted_df,
            label="All Benchmark Results",
            interactive=False,
            wrap=True,
        )

        gr.Markdown("### π Metrics")
        gr.Markdown(
            "**Benchmark Metrics:**\n"
            "- **load_ms**: Model loading time in milliseconds\n"
            "- **first_infer_ms**: First inference time in milliseconds\n"
            "- **subsequent_infer_ms**: Subsequent inference time in milliseconds\n"
            "- **p50/p90**: 50th and 90th percentile values\n\n"
            "**HuggingFace Metrics:**\n"
            "- **downloads**: Total downloads from HuggingFace Hub\n"
            "- **likes**: Number of likes on HuggingFace Hub\n\n"
            "**WebGPU Compatibility:**\n"
            "- Models in the recommended section are all WebGPU compatible\n"
            "- WebGPU enables GPU acceleration in modern browsers\n\n"
            "**β οΈ Important Note About Performance Metrics:**\n"
            "All metrics are measured in a controlled benchmark environment. "
            "They are useful for **comparing models against each other**, but may not reflect "
            "actual performance in your environment. Factors like hardware, browser, OS, and system load affect real-world performance. "
            "We recommend testing models in your own environment for accurate measurements."
        )

        def update_data():
            """Reload data from HuggingFace and recompute every component."""
            new_df = load_data()
            formatted_new_df = format_dataframe(new_df)

            new_recommended = get_webgpu_beginner_friendly_models(new_df, limit_per_task=5)
            formatted_new_recommended = format_dataframe(new_recommended)
            new_markdown = format_recommended_models_as_markdown(new_recommended)

            # Order must match the `outputs` list of refresh_btn.click below.
            return (
                new_df,
                formatted_new_recommended,
                new_markdown,
                formatted_new_df,
                gr.update(choices=get_unique_values(new_df, "task")),
                gr.update(choices=get_unique_values(new_df, "platform")),
                gr.update(choices=get_unique_values(new_df, "device")),
                gr.update(choices=get_unique_values(new_df, "mode")),
                gr.update(choices=get_unique_values(new_df, "dtype")),
                gr.update(choices=get_unique_values(new_df, "status")),
            )

        def apply_filters(raw_df, model, task, platform, device, mode, dtype, status):
            """Apply filters and return filtered DataFrame."""
            filtered = filter_data(raw_df, model, task, platform, device, mode, dtype, status)
            return format_dataframe(filtered)

        refresh_btn.click(
            fn=update_data,
            outputs=[
                raw_data_state,
                recommended_table,
                markdown_textbox,
                results_table,
                task_filter,
                platform_filter,
                device_filter,
                mode_filter,
                dtype_filter,
                status_filter,
            ],
        )

        # First element is the state holding the raw data; the rest are the
        # interactive filter widgets. Each widget re-filters on change.
        filter_inputs = [
            raw_data_state,
            model_filter,
            task_filter,
            platform_filter,
            device_filter,
            mode_filter,
            dtype_filter,
            status_filter,
        ]
        for filter_component in filter_inputs[1:]:
            filter_component.change(
                fn=apply_filters,
                inputs=filter_inputs,
                outputs=results_table,
            )

    return demo
|
|
|
|
|
|
|
|
# Build the app at import time so hosting platforms (e.g. HF Spaces) can
# discover the module-level `demo` object, but only start the server when
# the file is executed directly — importing the module stays side-effect
# free beyond UI construction.
demo = create_leaderboard_ui()

if __name__ == "__main__":
    demo.launch()
|
|
|