whitphx (HF Staff) committed
Commit 576f5ef · 1 Parent(s): f76cf5a

Create leaderboard
leaderboard/.env.example ADDED
@@ -0,0 +1,6 @@
+ # HuggingFace Dataset Repository
+ # The dataset repository where benchmark results are stored
+ HF_DATASET_REPO=your-username/your-dataset-repo
+
+ # HuggingFace API Token (optional, for private datasets)
+ HF_TOKEN=your_token_here
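
For reference, a minimal sketch of how these variables are consumed (this mirrors what `app.py` below does; `load_dotenv` comes from the `python-dotenv` dependency):

```python
import os

from dotenv import load_dotenv

load_dotenv()  # reads the .env file from the current working directory

HF_DATASET_REPO = os.getenv("HF_DATASET_REPO")  # e.g. "your-username/your-dataset-repo"
HF_TOKEN = os.getenv("HF_TOKEN")  # optional; only needed for private datasets
```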
leaderboard/.gitignore ADDED
@@ -0,0 +1,41 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ .venv/
+ venv/
+ ENV/
+ env/
+
+ # Environment variables
+ .env
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
leaderboard/.python-version ADDED
@@ -0,0 +1 @@
+ 3.13
leaderboard/README.md ADDED
@@ -0,0 +1,80 @@
+ # Transformers.js Benchmark Leaderboard
+
+ A Gradio-based leaderboard that displays benchmark results from a HuggingFace Dataset repository.
+
+ ## Features
+
+ - 📊 Display benchmark results in a searchable/filterable table
+ - 🔍 Filter by model name, task, platform, device, mode, and dtype
+ - 🔄 Refresh data on demand from the HuggingFace Dataset
+ - 📈 View performance metrics (load time, inference time, p50/p90 percentiles)
+
+ ## Setup
+
+ 1. Install dependencies:
+ ```bash
+ uv sync
+ ```
+
+ 2. Configure environment variables:
+ ```bash
+ cp .env.example .env
+ ```
+
+ Edit `.env` and set:
+ - `HF_DATASET_REPO`: Your HuggingFace dataset repository (e.g., `username/transformersjs-benchmarks`)
+ - `HF_TOKEN`: Your HuggingFace API token (optional, for private datasets)
+
+ ## Usage
+
+ Run the leaderboard:
+
+ ```bash
+ uv run python -m leaderboard.app
+ ```
+
+ Or use the installed script:
+
+ ```bash
+ uv run leaderboard
+ ```
+
+ The leaderboard will be available at http://localhost:7861.
+
+ ## Data Format
+
+ The leaderboard reads JSON files from the HuggingFace Dataset repository. Each file should contain a single benchmark result object with the following structure:
+
+ ```json
+ {
+   "id": "benchmark-id",
+   "platform": "web",
+   "modelId": "Xenova/all-MiniLM-L6-v2",
+   "task": "feature-extraction",
+   "mode": "warm",
+   "repeats": 3,
+   "batchSize": 1,
+   "device": "wasm",
+   "browser": "chromium",
+   "dtype": "fp32",
+   "headed": false,
+   "status": "completed",
+   "timestamp": 1234567890,
+   "metrics": {
+     "load_ms": {"p50": 100, "p90": 120},
+     "first_infer_ms": {"p50": 10, "p90": 15},
+     "subsequent_infer_ms": {"p50": 8, "p90": 12}
+   },
+   "environment": {
+     "cpuCores": 10,
+     "memory": {"deviceMemory": 8}
+   }
+ }
+ ```
+
+ ## Development
+
+ The leaderboard is built with:
+ - **Gradio**: Web UI framework
+ - **Pandas**: Data manipulation
+ - **HuggingFace Hub**: Dataset loading
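
Results reach the dataset repository as individual JSON files. A minimal sketch of pushing one result with `huggingface_hub` (the repo ID, token, and `results/` path are illustrative placeholders; the loader accepts any `*.json` path in the repo):

```python
import json

from huggingface_hub import HfApi

# Illustrative result matching the structure documented above (values are made up).
result = {
    "id": "benchmark-id",
    "platform": "web",
    "modelId": "Xenova/all-MiniLM-L6-v2",
    "task": "feature-extraction",
    "mode": "warm",
    "device": "wasm",
    "dtype": "fp32",
    "status": "completed",
    "timestamp": 1234567890,
    "metrics": {"load_ms": {"p50": 100, "p90": 120}},
}

api = HfApi(token="hf_...")  # token only needed for private repos
api.upload_file(
    path_or_fileobj=json.dumps(result, indent=2).encode("utf-8"),
    path_in_repo=f"results/{result['id']}.json",  # hypothetical layout; any *.json path works
    repo_id="your-username/your-dataset-repo",    # must match HF_DATASET_REPO
    repo_type="dataset",
)
```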
leaderboard/main.py ADDED
@@ -0,0 +1,6 @@
+ def main():
+     print("Hello from leaderboard!")
+
+
+ if __name__ == "__main__":
+     main()
leaderboard/pyproject.toml ADDED
@@ -0,0 +1,24 @@
+ [project]
+ name = "leaderboard"
+ version = "0.1.0"
+ description = "Transformers.js Benchmark Leaderboard - Display benchmark results from HuggingFace Dataset"
+ requires-python = ">=3.13"
+ dependencies = [
+     "gradio>=5.49.1",
+     "huggingface-hub>=0.35.3",
+     "pandas>=2.3.3",
+     "python-dotenv>=1.1.1",
+ ]
+
+ [project.scripts]
+ leaderboard = "leaderboard.app:main"
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/leaderboard"]
+
+ [tool.uv]
+ package = true
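
The wheel build target points at a `src/` layout; from the paths added in this commit, the package tree looks like:

```
leaderboard/
├── pyproject.toml
├── main.py
├── README.md
├── .env.example
├── .gitignore
├── .python-version
├── uv.lock
└── src/
    └── leaderboard/
        ├── __init__.py
        ├── app.py
        └── data_loader.py
```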
leaderboard/src/leaderboard/__init__.py ADDED
@@ -0,0 +1,12 @@
+ """Transformers.js Benchmark Leaderboard"""
+
+ from .app import create_leaderboard_ui
+ from .data_loader import load_benchmark_data, get_unique_values, flatten_result
+
+ __version__ = "0.1.0"
+ __all__ = [
+     "create_leaderboard_ui",
+     "load_benchmark_data",
+     "get_unique_values",
+     "flatten_result",
+ ]
leaderboard/src/leaderboard/app.py ADDED
@@ -0,0 +1,235 @@
+ """
+ Transformers.js Benchmark Leaderboard
+
+ A Gradio app that displays benchmark results from a HuggingFace Dataset repository.
+ """
+
+ import os
+ import pandas as pd
+ import gradio as gr
+ from dotenv import load_dotenv
+
+ from leaderboard.data_loader import (
+     load_benchmark_data,
+     get_unique_values,
+ )
+
+ # Load environment variables
+ load_dotenv()
+
+ HF_DATASET_REPO = os.getenv("HF_DATASET_REPO")
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+
+ def load_data() -> pd.DataFrame:
+     """Load benchmark data from the configured HF Dataset repository."""
+     return load_benchmark_data(
+         dataset_repo=HF_DATASET_REPO,
+         token=HF_TOKEN,
+     )
+
+
+ def filter_data(
+     df: pd.DataFrame,
+     model_filter: str,
+     task_filter: str,
+     platform_filter: str,
+     device_filter: str,
+     mode_filter: str,
+     dtype_filter: str,
+ ) -> pd.DataFrame:
+     """Filter benchmark data based on user inputs."""
+     if df.empty:
+         return df
+
+     filtered = df.copy()
+
+     # Model name filter (case-insensitive substring match)
+     if model_filter:
+         filtered = filtered[
+             filtered["modelId"].str.contains(model_filter, case=False, na=False)
+         ]
+
+     # Task filter
+     if task_filter and task_filter != "All":
+         filtered = filtered[filtered["task"] == task_filter]
+
+     # Platform filter
+     if platform_filter and platform_filter != "All":
+         filtered = filtered[filtered["platform"] == platform_filter]
+
+     # Device filter
+     if device_filter and device_filter != "All":
+         filtered = filtered[filtered["device"] == device_filter]
+
+     # Mode filter
+     if mode_filter and mode_filter != "All":
+         filtered = filtered[filtered["mode"] == mode_filter]
+
+     # DType filter
+     if dtype_filter and dtype_filter != "All":
+         filtered = filtered[filtered["dtype"] == dtype_filter]
+
+     return filtered
+
+
+ def create_leaderboard_ui():
+     """Create the Gradio UI for the leaderboard."""
+
+     # Load initial data
+     df = load_data()
+
+     with gr.Blocks(title="Transformers.js Benchmark Leaderboard") as demo:
+         gr.Markdown("# 🏆 Transformers.js Benchmark Leaderboard")
+         gr.Markdown(
+             "Compare benchmark results for different models, platforms, and configurations."
+         )
+
+         if not HF_DATASET_REPO:
+             gr.Markdown(
+                 "⚠️ **HF_DATASET_REPO not configured.** "
+                 "Please set the environment variable to load benchmark data."
+             )
+
+         with gr.Row():
+             refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")
+
+         with gr.Row():
+             model_filter = gr.Textbox(
+                 label="Model Name",
+                 placeholder="Filter by model name (e.g., 'bert', 'gpt')",
+             )
+             task_filter = gr.Dropdown(
+                 label="Task",
+                 choices=get_unique_values(df, "task"),
+                 value="All",
+             )
+
+         with gr.Row():
+             platform_filter = gr.Dropdown(
+                 label="Platform",
+                 choices=get_unique_values(df, "platform"),
+                 value="All",
+             )
+             device_filter = gr.Dropdown(
+                 label="Device",
+                 choices=get_unique_values(df, "device"),
+                 value="All",
+             )
+
+         with gr.Row():
+             mode_filter = gr.Dropdown(
+                 label="Mode",
+                 choices=get_unique_values(df, "mode"),
+                 value="All",
+             )
+             dtype_filter = gr.Dropdown(
+                 label="DType",
+                 choices=get_unique_values(df, "dtype"),
+                 value="All",
+             )
+
+         # Keep the full, unfiltered dataset in state so that every filter
+         # change is applied to the complete data rather than to the
+         # already-filtered table contents.
+         full_df = gr.State(df)
+
+         results_table = gr.DataFrame(
+             value=df,
+             label="Benchmark Results",
+             interactive=False,
+             wrap=True,
+         )
+
+         gr.Markdown("### 📊 Metrics")
+         gr.Markdown(
+             "- **load_ms**: Model loading time in milliseconds\n"
+             "- **first_infer_ms**: First inference time in milliseconds\n"
+             "- **subsequent_infer_ms**: Subsequent inference time in milliseconds\n"
+             "- **p50/p90**: 50th and 90th percentile values"
+         )
+
+         def update_data():
+             """Reload data from HuggingFace."""
+             new_df = load_data()
+             return (
+                 new_df,
+                 new_df,
+                 gr.update(choices=get_unique_values(new_df, "task")),
+                 gr.update(choices=get_unique_values(new_df, "platform")),
+                 gr.update(choices=get_unique_values(new_df, "device")),
+                 gr.update(choices=get_unique_values(new_df, "mode")),
+                 gr.update(choices=get_unique_values(new_df, "dtype")),
+             )
+
+         def apply_filters(df, model, task, platform, device, mode, dtype):
+             """Apply filters and return the filtered DataFrame."""
+             return filter_data(df, model, task, platform, device, mode, dtype)
+
+         # Refresh button reloads data and updates the filter choices
+         refresh_btn.click(
+             fn=update_data,
+             outputs=[
+                 full_df,
+                 results_table,
+                 task_filter,
+                 platform_filter,
+                 device_filter,
+                 mode_filter,
+                 dtype_filter,
+             ],
+         )
+
+         # Filter inputs update the table
+         filter_inputs = [
+             full_df,
+             model_filter,
+             task_filter,
+             platform_filter,
+             device_filter,
+             mode_filter,
+             dtype_filter,
+         ]
+
+         model_filter.change(
+             fn=apply_filters,
+             inputs=filter_inputs,
+             outputs=results_table,
+         )
+         task_filter.change(
+             fn=apply_filters,
+             inputs=filter_inputs,
+             outputs=results_table,
+         )
+         platform_filter.change(
+             fn=apply_filters,
+             inputs=filter_inputs,
+             outputs=results_table,
+         )
+         device_filter.change(
+             fn=apply_filters,
+             inputs=filter_inputs,
+             outputs=results_table,
+         )
+         mode_filter.change(
+             fn=apply_filters,
+             inputs=filter_inputs,
+             outputs=results_table,
+         )
+         dtype_filter.change(
+             fn=apply_filters,
+             inputs=filter_inputs,
+             outputs=results_table,
+         )
+
+     return demo
+
+
+ def main():
+     """Launch the leaderboard app."""
+     demo = create_leaderboard_ui()
+     demo.launch(server_name="0.0.0.0", server_port=7861)
+
+
+ if __name__ == "__main__":
+     main()
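
A small, self-contained sketch exercising `filter_data` with made-up rows (the two records below are illustrative; the column names match the flattened results used by the table):

```python
import pandas as pd

from leaderboard.app import filter_data

# Two made-up rows shaped like the flattened benchmark records.
df = pd.DataFrame(
    [
        {"modelId": "Xenova/all-MiniLM-L6-v2", "task": "feature-extraction",
         "platform": "web", "device": "wasm", "mode": "warm", "dtype": "fp32"},
        {"modelId": "Xenova/bert-base-uncased", "task": "fill-mask",
         "platform": "web", "device": "webgpu", "mode": "cold", "dtype": "q8"},
    ]
)

# The model filter is a case-insensitive substring match; "All" disables a dropdown filter.
subset = filter_data(df, "minilm", "All", "All", "wasm", "All", "All")
print(subset["modelId"].tolist())  # -> ['Xenova/all-MiniLM-L6-v2']
```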
leaderboard/src/leaderboard/data_loader.py ADDED
@@ -0,0 +1,184 @@
+ """
+ Data loader module for loading benchmark results from HuggingFace Dataset.
+ """
+
+ import json
+ from typing import List, Dict, Any, Optional
+ import pandas as pd
+ from huggingface_hub import HfApi, hf_hub_download
+
+
+ def load_benchmark_data(
+     dataset_repo: str,
+     token: Optional[str] = None,
+ ) -> pd.DataFrame:
+     """Load benchmark data from HuggingFace Dataset repository.
+
+     Args:
+         dataset_repo: HuggingFace dataset repository ID (e.g., "username/dataset-name")
+         token: HuggingFace API token (optional, for private datasets)
+
+     Returns:
+         DataFrame containing all benchmark results
+     """
+     if not dataset_repo:
+         return pd.DataFrame()
+
+     try:
+         api = HfApi(token=token)
+
+         # List all files in the dataset repo
+         files = api.list_repo_files(
+             repo_id=dataset_repo,
+             repo_type="dataset",
+             token=token,
+         )
+
+         # Filter for .json files
+         json_files = [f for f in files if f.endswith(".json")]
+
+         if not json_files:
+             return pd.DataFrame()
+
+         # Load all benchmark results
+         all_results = []
+         for file_path in json_files:
+             try:
+                 result = load_single_benchmark_file(
+                     dataset_repo=dataset_repo,
+                     file_path=file_path,
+                     token=token,
+                 )
+                 if result:
+                     all_results.append(flatten_result(result))
+             except Exception as e:
+                 print(f"Error loading {file_path}: {e}")
+                 continue
+
+         if not all_results:
+             return pd.DataFrame()
+
+         # Convert to DataFrame
+         df = pd.DataFrame(all_results)
+
+         # Sort by model name and timestamp
+         if "modelId" in df.columns and "timestamp" in df.columns:
+             df = df.sort_values(["modelId", "timestamp"], ascending=[True, False])
+
+         return df
+
+     except Exception as e:
+         print(f"Error loading benchmark data: {e}")
+         return pd.DataFrame()
+
+
+ def load_single_benchmark_file(
+     dataset_repo: str,
+     file_path: str,
+     token: Optional[str] = None,
+ ) -> Optional[Dict[str, Any]]:
+     """Load a single benchmark result file from HuggingFace Dataset.
+
+     Args:
+         dataset_repo: HuggingFace dataset repository ID
+         file_path: Path to the JSON file within the dataset
+         token: HuggingFace API token (optional)
+
+     Returns:
+         Dictionary containing the benchmark result, or None if failed
+     """
+     try:
+         # Download the file
+         local_path = hf_hub_download(
+             repo_id=dataset_repo,
+             filename=file_path,
+             repo_type="dataset",
+             token=token,
+         )
+
+         # Read JSON file (single object per file)
+         with open(local_path, "r") as f:
+             return json.load(f)
+
+     except Exception as e:
+         print(f"Error loading file {file_path}: {e}")
+         return None
+
+
+ def flatten_result(result: Dict[str, Any]) -> Dict[str, Any]:
+     """Flatten nested benchmark result for display.
+
+     The HF Dataset format is already flattened by the bench service,
+     so we just need to extract the relevant fields.
+
+     Args:
+         result: Raw benchmark result dictionary
+
+     Returns:
+         Flattened dictionary with extracted fields
+     """
+     flat = {
+         "id": result.get("id", ""),
+         "platform": result.get("platform", ""),
+         "modelId": result.get("modelId", ""),
+         "task": result.get("task", ""),
+         "mode": result.get("mode", ""),
+         "repeats": result.get("repeats", 0),
+         "batchSize": result.get("batchSize", 0),
+         "device": result.get("device", ""),
+         "browser": result.get("browser", ""),
+         "dtype": result.get("dtype", ""),
+         "headed": result.get("headed", False),
+         "status": result.get("status", ""),
+         "timestamp": result.get("timestamp", 0),
+         "runtime": result.get("runtime", ""),
+     }
+
+     # Extract metrics if available (already at top level)
+     if "metrics" in result:
+         metrics = result["metrics"]
+
+         # Load time
+         if "load_ms" in metrics and "p50" in metrics["load_ms"]:
+             flat["load_ms_p50"] = metrics["load_ms"]["p50"]
+             flat["load_ms_p90"] = metrics["load_ms"]["p90"]
+
+         # First inference time
+         if "first_infer_ms" in metrics and "p50" in metrics["first_infer_ms"]:
+             flat["first_infer_ms_p50"] = metrics["first_infer_ms"]["p50"]
+             flat["first_infer_ms_p90"] = metrics["first_infer_ms"]["p90"]
+
+         # Subsequent inference time
+         if "subsequent_infer_ms" in metrics and "p50" in metrics["subsequent_infer_ms"]:
+             flat["subsequent_infer_ms_p50"] = metrics["subsequent_infer_ms"]["p50"]
+             flat["subsequent_infer_ms_p90"] = metrics["subsequent_infer_ms"]["p90"]
+
+     # Extract environment info (already at top level)
+     if "environment" in result:
+         env = result["environment"]
+         flat["cpuCores"] = env.get("cpuCores", 0)
+         if "memory" in env:
+             flat["memory_gb"] = env["memory"].get("deviceMemory", 0)
+
+     # Calculate duration
+     if "completedAt" in result and "startedAt" in result:
+         flat["duration_s"] = (result["completedAt"] - result["startedAt"]) / 1000
+
+     return flat
+
+
+ def get_unique_values(df: pd.DataFrame, column: str) -> List[str]:
+     """Get unique values from a column for dropdown choices.
+
+     Args:
+         df: DataFrame to extract values from
+         column: Column name
+
+     Returns:
+         List of unique values with "All" as first item
+     """
+     if df.empty or column not in df.columns:
+         return ["All"]
+
+     values = df[column].dropna().unique().tolist()
+     return ["All"] + sorted([str(v) for v in values])
leaderboard/uv.lock ADDED
The diff for this file is too large to render. See raw diff