fix leaderboard
leaderboard/src/leaderboard/app.py
CHANGED
@@ -100,6 +100,9 @@ def create_leaderboard_ui():
     df = load_data()
     formatted_df = format_dataframe(df)
 
+    # Cache raw data in Gradio state to avoid reloading on every filter change
+    raw_data_state = gr.State(df)
+
     with gr.Blocks(title="Transformers.js Benchmark Leaderboard") as demo:
         gr.Markdown("# π Transformers.js Benchmark Leaderboard")
         gr.Markdown(
@@ -197,6 +200,7 @@ def create_leaderboard_ui():
         formatted_new_first_timer = format_dataframe(new_first_timer)
 
         return (
+            new_df,  # Update cached raw data
             formatted_new_first_timer,
             formatted_new_df,
             gr.update(choices=get_unique_values(new_df, "task")),
@@ -206,10 +210,9 @@ def create_leaderboard_ui():
             gr.update(choices=get_unique_values(new_df, "dtype")),
         )
 
-    def apply_filters(model, task, platform, device, mode, dtype):
+    def apply_filters(raw_df, model, task, platform, device, mode, dtype):
         """Apply filters and return filtered DataFrame."""
-        #
-        raw_df = load_data()
+        # Use cached raw data instead of reloading
         filtered = filter_data(raw_df, model, task, platform, device, mode, dtype)
         return format_dataframe(filtered)
 
@@ -217,6 +220,7 @@ def create_leaderboard_ui():
     refresh_btn.click(
         fn=update_data,
         outputs=[
+            raw_data_state,
             first_timer_table,
             results_table,
             task_filter,
@@ -227,9 +231,9 @@ def create_leaderboard_ui():
         ],
     )
 
-    # Filter inputs update the table
+    # Filter inputs update the table (using cached raw data)
     filter_inputs = [
-
+        raw_data_state,
         model_filter,
         task_filter,
         platform_filter,
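The app.py change above threads a gr.State holding the raw DataFrame into the filter events, so apply_filters reuses the cached data instead of calling load_data() on every interaction, and update_data returns the fresh DataFrame as its first output so a refresh also overwrites the cache. Below is a minimal, self-contained sketch of that Gradio State-caching pattern; the component names and toy data are illustrative, not the Space's actual code:

import gradio as gr
import pandas as pd

def make_demo():
    df = pd.DataFrame({"model": ["bert", "gpt2"], "score": [0.9, 0.8]})

    with gr.Blocks() as demo:
        # State holds the raw DataFrame; handlers receive it as an input
        # and overwrite it by returning a new value for the State output.
        raw_state = gr.State(df)
        query = gr.Textbox(label="Model filter")
        table = gr.Dataframe(value=df)
        refresh = gr.Button("Refresh")

        def apply_filter(raw_df, text):
            # Filter the cached DataFrame; no reload here.
            return raw_df[raw_df["model"].str.contains(text, na=False)]

        def reload_data():
            # Stand-in for a real reload; returns the new cache and the table view.
            new_df = pd.DataFrame({"model": ["bert", "gpt2", "t5"], "score": [0.9, 0.8, 0.7]})
            return new_df, new_df

        query.change(apply_filter, inputs=[raw_state, query], outputs=table)
        refresh.click(reload_data, outputs=[raw_state, table])
    return demo

if __name__ == "__main__":
    make_demo().launch()

Listing the State first in both the click outputs and the handler's return tuple is what keeps the cached data and the rendered tables in sync.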
leaderboard/src/leaderboard/data_loader.py
CHANGED
@@ -4,10 +4,11 @@ Data loader module for loading benchmark results from HuggingFace Dataset.
 
 import json
 import logging
+from pathlib import Path
 from typing import List, Dict, Any, Optional
 from datetime import datetime
 import pandas as pd
-from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub import snapshot_download, list_models
 
 logger = logging.getLogger(__name__)
 
@@ -29,30 +30,32 @@ def load_benchmark_data(
         return pd.DataFrame()
 
     try:
-
-
-
-        files = api.list_repo_files(
+        # Download the entire repository snapshot
+        logger.info(f"Downloading dataset snapshot from {dataset_repo}...")
+        local_dir = snapshot_download(
             repo_id=dataset_repo,
             repo_type="dataset",
             token=token,
         )
+        logger.info(f"Dataset downloaded to {local_dir}")
 
-        #
-
+        # Find all JSON files in the downloaded directory
+        local_path = Path(local_dir)
+        json_files = list(local_path.rglob("*.json"))
 
         if not json_files:
+            logger.warning("No JSON files found in dataset")
            return pd.DataFrame()
 
+        logger.info(f"Found {len(json_files)} JSON files")
+
        # Load all benchmark results
        all_results = []
        for file_path in json_files:
            try:
-
-
-
-                    token=token,
-                )
+                with open(file_path, "r") as f:
+                    result = json.load(f)
+
                if result:
                    flattened = flatten_result(result)
                    all_results.append(flattened)
@@ -63,6 +66,8 @@ def load_benchmark_data(
        if not all_results:
            return pd.DataFrame()
 
+        logger.info(f"Loaded {len(all_results)} benchmark results")
+
        # Convert to DataFrame
        df = pd.DataFrame(all_results)
 
@@ -80,39 +85,6 @@ def load_benchmark_data(
        return pd.DataFrame()
 
 
-def load_single_benchmark_file(
-    dataset_repo: str,
-    file_path: str,
-    token: Optional[str] = None,
-) -> Optional[Dict[str, Any]]:
-    """Load a single benchmark result file from HuggingFace Dataset.
-
-    Args:
-        dataset_repo: HuggingFace dataset repository ID
-        file_path: Path to the JSON file within the dataset
-        token: HuggingFace API token (optional)
-
-    Returns:
-        Dictionary containing the benchmark result, or None if failed
-    """
-    try:
-        # Download the file
-        local_path = hf_hub_download(
-            repo_id=dataset_repo,
-            filename=file_path,
-            repo_type="dataset",
-            token=token,
-        )
-
-        # Read JSON file (single object per file)
-        with open(local_path, "r") as f:
-            return json.load(f)
-
-    except Exception as e:
-        logger.error(f"Error loading file {file_path}: {e}")
-        return None
-
-
 def flatten_result(result: Dict[str, Any]) -> Dict[str, Any]:
     """Flatten nested benchmark result for display.
 
@@ -305,7 +277,13 @@ def get_first_timer_friendly_models(df: pd.DataFrame, limit_per_task: int = 3)
         )
 
         # Group by model and take best score for each model within this task
-
+        # Filter out NaN scores before getting idxmax
+        idx_max_series = task_df.groupby("modelId")["first_timer_score"].idxmax()
+        # Drop NaN indices
+        valid_indices = idx_max_series.dropna()
+        if valid_indices.empty:
+            continue
+        best_per_model = task_df.loc[valid_indices]
 
        # Sort by first-timer score and take top N for this task
        top_for_task = best_per_model.sort_values("first_timer_score", ascending=False).head(limit_per_task)
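The data_loader.py change fetches the whole dataset once with snapshot_download and reads the JSON files from the local snapshot, replacing the per-file list_repo_files plus hf_hub_download round trips (and the now-deleted load_single_benchmark_file helper). A rough sketch of that pattern, assuming a dataset repo that stores one JSON result per file; the repo id and function name below are placeholders:

import json
from pathlib import Path
from typing import Any, Dict, List, Optional

from huggingface_hub import snapshot_download


def load_json_records(dataset_repo: str, token: Optional[str] = None) -> List[Dict[str, Any]]:
    # One call downloads (or reuses from the local cache) the entire dataset snapshot.
    local_dir = snapshot_download(repo_id=dataset_repo, repo_type="dataset", token=token)

    records = []
    # rglob walks the snapshot recursively, so nested result folders are included.
    for path in Path(local_dir).rglob("*.json"):
        try:
            with open(path, "r") as f:
                records.append(json.load(f))
        except (OSError, json.JSONDecodeError):
            # Skip unreadable or malformed files rather than failing the whole load.
            continue
    return records


# Example usage (placeholder repo id):
# results = load_json_records("user/benchmark-results")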
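The last hunk guards the groupby/idxmax step: if every first_timer_score in a model's group is NaN, idxmax can produce a NaN label (newer pandas versions warn or raise instead), and passing NaN labels to .loc fails. Dropping those entries first keeps only models with at least one real score. A small illustration with made-up data:

import numpy as np
import pandas as pd

task_df = pd.DataFrame({
    "modelId": ["a", "a", "b", "b"],
    "first_timer_score": [0.2, 0.9, np.nan, 0.5],
})

# Best-scoring row per model; dropna() removes any NaN labels that an
# all-NaN group would produce before they reach .loc.
idx = task_df.groupby("modelId")["first_timer_score"].idxmax()
best_per_model = task_df.loc[idx.dropna()]
print(best_per_model)  # one row per model with its highest score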