whitphx HF Staff commited on
Commit
9438810
·
1 Parent(s): 4d725bc
leaderboard/src/leaderboard/app.py CHANGED
@@ -56,6 +56,7 @@ def filter_data(
56
  device_filter: str,
57
  mode_filter: str,
58
  dtype_filter: str,
 
59
  ) -> pd.DataFrame:
60
  """Filter benchmark data based on user inputs."""
61
  if df.empty:
@@ -89,6 +90,10 @@ def filter_data(
89
  if dtype_filter and dtype_filter != "All":
90
  filtered = filtered[filtered["dtype"] == dtype_filter]
91
 
 
 
 
 
92
  return filtered
93
 
94
 
@@ -99,10 +104,9 @@ def create_leaderboard_ui():
99
  df = load_data()
100
  formatted_df = format_dataframe(df)
101
 
102
- # Cache raw data in Gradio state to avoid reloading on every filter change
103
- raw_data_state = gr.State(df)
104
-
105
  with gr.Blocks(title="Transformers.js Benchmark Leaderboard") as demo:
 
 
106
  gr.Markdown("# πŸ† Transformers.js Benchmark Leaderboard")
107
  gr.Markdown(
108
  "Compare benchmark results for different models, platforms, and configurations."
@@ -156,6 +160,11 @@ def create_leaderboard_ui():
156
  choices=get_unique_values(df, "dtype"),
157
  value="All",
158
  )
 
 
 
 
 
159
 
160
  results_table = gr.DataFrame(
161
  value=formatted_df,
@@ -193,12 +202,13 @@ def create_leaderboard_ui():
193
  gr.update(choices=get_unique_values(new_df, "device")),
194
  gr.update(choices=get_unique_values(new_df, "mode")),
195
  gr.update(choices=get_unique_values(new_df, "dtype")),
 
196
  )
197
 
198
- def apply_filters(raw_df, model, task, platform, device, mode, dtype):
199
  """Apply filters and return filtered DataFrame."""
200
  # Use cached raw data instead of reloading
201
- filtered = filter_data(raw_df, model, task, platform, device, mode, dtype)
202
  return format_dataframe(filtered)
203
 
204
  # Refresh button updates data and resets filters
@@ -212,6 +222,7 @@ def create_leaderboard_ui():
212
  device_filter,
213
  mode_filter,
214
  dtype_filter,
 
215
  ],
216
  )
217
 
@@ -224,6 +235,7 @@ def create_leaderboard_ui():
224
  device_filter,
225
  mode_filter,
226
  dtype_filter,
 
227
  ]
228
 
229
  model_filter.change(
@@ -256,6 +268,11 @@ def create_leaderboard_ui():
256
  inputs=filter_inputs,
257
  outputs=results_table,
258
  )
 
 
 
 
 
259
 
260
  return demo
261
 
 
56
  device_filter: str,
57
  mode_filter: str,
58
  dtype_filter: str,
59
+ status_filter: str,
60
  ) -> pd.DataFrame:
61
  """Filter benchmark data based on user inputs."""
62
  if df.empty:
 
90
  if dtype_filter and dtype_filter != "All":
91
  filtered = filtered[filtered["dtype"] == dtype_filter]
92
 
93
+ # Status filter
94
+ if status_filter and status_filter != "All":
95
+ filtered = filtered[filtered["status"] == status_filter]
96
+
97
  return filtered
98
 
99
 
 
104
  df = load_data()
105
  formatted_df = format_dataframe(df)
106
 
 
 
 
107
  with gr.Blocks(title="Transformers.js Benchmark Leaderboard") as demo:
108
+ # Cache raw data in Gradio state to avoid reloading on every filter change
109
+ raw_data_state = gr.State(df)
110
  gr.Markdown("# πŸ† Transformers.js Benchmark Leaderboard")
111
  gr.Markdown(
112
  "Compare benchmark results for different models, platforms, and configurations."
 
160
  choices=get_unique_values(df, "dtype"),
161
  value="All",
162
  )
163
+ status_filter = gr.Dropdown(
164
+ label="Status",
165
+ choices=get_unique_values(df, "status"),
166
+ value="All",
167
+ )
168
 
169
  results_table = gr.DataFrame(
170
  value=formatted_df,
 
202
  gr.update(choices=get_unique_values(new_df, "device")),
203
  gr.update(choices=get_unique_values(new_df, "mode")),
204
  gr.update(choices=get_unique_values(new_df, "dtype")),
205
+ gr.update(choices=get_unique_values(new_df, "status")),
206
  )
207
 
208
+ def apply_filters(raw_df, model, task, platform, device, mode, dtype, status):
209
  """Apply filters and return filtered DataFrame."""
210
  # Use cached raw data instead of reloading
211
+ filtered = filter_data(raw_df, model, task, platform, device, mode, dtype, status)
212
  return format_dataframe(filtered)
213
 
214
  # Refresh button updates data and resets filters
 
222
  device_filter,
223
  mode_filter,
224
  dtype_filter,
225
+ status_filter,
226
  ],
227
  )
228
 
 
235
  device_filter,
236
  mode_filter,
237
  dtype_filter,
238
+ status_filter,
239
  ]
240
 
241
  model_filter.change(
 
268
  inputs=filter_inputs,
269
  outputs=results_table,
270
  )
271
+ status_filter.change(
272
+ fn=apply_filters,
273
+ inputs=filter_inputs,
274
+ outputs=results_table,
275
+ )
276
 
277
  return demo
278
 
leaderboard/src/leaderboard/data_loader.py CHANGED
@@ -109,6 +109,11 @@ def flatten_result(result: Dict[str, Any]) -> Dict[str, Any]:
109
  except (ValueError, OSError):
110
  timestamp_dt = None
111
 
 
 
 
 
 
112
  flat = {
113
  "id": result.get("id", ""),
114
  "platform": result.get("platform", ""),
@@ -121,9 +126,16 @@ def flatten_result(result: Dict[str, Any]) -> Dict[str, Any]:
121
  "browser": result.get("browser", ""),
122
  "dtype": result.get("dtype", ""),
123
  "headed": result.get("headed", False),
124
- "status": result.get("status", ""),
125
  "timestamp": timestamp_dt,
126
  "runtime": result.get("runtime", ""),
 
 
 
 
 
 
 
127
  }
128
 
129
  # Extract metrics if available (already at top level)
 
109
  except (ValueError, OSError):
110
  timestamp_dt = None
111
 
112
+ # Determine actual status - if there's an error, it should be "failed"
113
+ status = result.get("status", "")
114
+ if "error" in result:
115
+ status = "failed"
116
+
117
  flat = {
118
  "id": result.get("id", ""),
119
  "platform": result.get("platform", ""),
 
126
  "browser": result.get("browser", ""),
127
  "dtype": result.get("dtype", ""),
128
  "headed": result.get("headed", False),
129
+ "status": status,
130
  "timestamp": timestamp_dt,
131
  "runtime": result.get("runtime", ""),
132
+ # Initialize metric fields with None (will be filled if metrics exist)
133
+ "load_ms_p50": None,
134
+ "load_ms_p90": None,
135
+ "first_infer_ms_p50": None,
136
+ "first_infer_ms_p90": None,
137
+ "subsequent_infer_ms_p50": None,
138
+ "subsequent_infer_ms_p90": None,
139
  }
140
 
141
  # Extract metrics if available (already at top level)