import gradio as gr
import pandas as pd
import plotly.express as px
import os
import json
import sys
from typing import Dict, List


RUNS_DIR = "runs"
DATA_DIR = "."
COST_COLUMN_SUMMARY = 'Costs (USD)'
NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)'
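
# Expected on-disk layout, inferred from the loaders below (an assumption, not
# a published spec):
#   runs/<run_id>/metadata.json   -> {"run_id", "title", "date", "description",
#                                     "blog_url", "model_count", "is_latest"}
#   runs/<run_id>/summary_data.csv, domain_ranks.csv, cost_data.csv,
#   runs/<run_id>/avg_latency.csv, p99_latency.csv, correlations.json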


def discover_available_runs() -> List[Dict]:
    """Scan the runs directory and return a date-sorted list of available runs with metadata."""
    runs = []

    if not os.path.exists(RUNS_DIR):
        # Fall back to the legacy single-run layout (a bare data/ directory).
        if os.path.exists("data"):
            return [{
                "run_id": "legacy",
                "title": "AutoBench Run 2 - April 2025",
                "date": "2025-04-25",
                "description": "Current run data",
                "blog_url": "https://huggingface.co/blog/PeterKruger/autobench-2nd-run",
                "model_count": 27,
                "is_latest": True,
                "path": "data"
            }]
        return []

    for run_dir in os.listdir(RUNS_DIR):
        run_path = os.path.join(RUNS_DIR, run_dir)
        if os.path.isdir(run_path):
            metadata_path = os.path.join(run_path, "metadata.json")
            if os.path.exists(metadata_path):
                try:
                    with open(metadata_path, 'r') as f:
                        metadata = json.load(f)
                    metadata["path"] = run_path
                    runs.append(metadata)
                except Exception as e:
                    print(f"Error loading metadata for {run_dir}: {e}")

    # Sort by date, newest first.
    runs.sort(key=lambda x: x.get("date", ""), reverse=True)
    return runs


def load_run_metadata(run_id: str) -> Dict:
    """Load metadata for a specific run."""
    runs = discover_available_runs()
    for run in runs:
        if run["run_id"] == run_id:
            return run
    return {}


def get_run_file_path(run_path: str, filename: str) -> str:
    """Get the full path to a data file for a specific run."""
    return os.path.join(run_path, filename)


def load_correlations(run_path: str) -> Dict:
    """Load correlation data for a specific run."""
    correlations_file = get_run_file_path(run_path, "correlations.json")
    if os.path.exists(correlations_file):
        try:
            with open(correlations_file, 'r') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading correlations from {correlations_file}: {e}")
    return {}
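
# Illustrative correlations.json shape consumed by format_correlations_text
# below (key names and values are examples only, taken from the Run 2 figures
# cited in the UI text, not a guaranteed schema):
# {"correlations": {"Chatbot Arena": 82.51, "AAI Index": 83.74, "MMLU": 71.51}}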


def format_correlations_text(correlations_data: Dict) -> str:
    """Format correlation data into a readable text string."""
    if not correlations_data or 'correlations' not in correlations_data:
        return ""

    correlations = correlations_data['correlations']
    if not correlations:
        return ""

    correlation_parts = []
    for benchmark, percentage in correlations.items():
        correlation_parts.append(f"{percentage}% with {benchmark}")

    if correlation_parts:
        return "**Benchmark Correlations:** AutoBench features " + ", ".join(correlation_parts) + "."
    return ""


def load_run_data(run_id: str) -> Dict[str, pd.DataFrame]:
    """Load all CSV data for a specific run."""
    runs = discover_available_runs()
    run_metadata = None

    for run in runs:
        if run["run_id"] == run_id:
            run_metadata = run
            break

    if not run_metadata:
        print(f"Run {run_id} not found")
        return {}

    run_path = run_metadata["path"]

    data = {}
    file_mapping = {
        "summary": "summary_data.csv",
        "domain": "domain_ranks.csv",
        "cost": "cost_data.csv",
        "avg_latency": "avg_latency.csv",
        "p99_latency": "p99_latency.csv"
    }

    for key, filename in file_mapping.items():
        filepath = get_run_file_path(run_path, filename)
        data[key] = load_data(filepath)

    data = process_run_data(data)

    correlations = load_correlations(run_path)
    data["correlations"] = correlations

    return data
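
# Minimal usage sketch (assuming at least one run directory exists):
#     data = load_run_data(discover_available_runs()[0]["run_id"])
#     data["summary_display"]    # processed leaderboard table
#     data["benchmark_display"]  # AutoBench vs. other benchmarks
#     data["correlations"]       # raw correlations dict (may be empty)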


def process_run_data(data: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
    """Process and clean the loaded data (cost conversion, sorting, etc.)."""
    df_summary = data.get("summary", pd.DataFrame())
    df_cost = data.get("cost", pd.DataFrame())

    # Convert summary costs from USD to cents and rename the column.
    if not df_summary.empty and COST_COLUMN_SUMMARY in df_summary.columns:
        df_summary[COST_COLUMN_SUMMARY] = (pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100).round(3)
        df_summary.rename(columns={COST_COLUMN_SUMMARY: NEW_COST_COLUMN_SUMMARY}, inplace=True)

    # Convert per-domain costs from USD to cents.
    if not df_cost.empty:
        model_col_name = 'model_name'
        cost_cols = [col for col in df_cost.columns if col != model_col_name]
        for col in cost_cols:
            df_cost[col] = (pd.to_numeric(df_cost[col], errors='coerce') * 100).round(3)

    try:
        df_summary = df_summary.rename(columns={'Model Name': 'Model'})
        # Keep the renamed frame in the dict so the plot helpers, which read
        # data["summary"] and expect a 'Model' column, see the rename too.
        data["summary"] = df_summary

        base_cols = ['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)']

        if 'Fail Rate %' in df_summary.columns:
            base_cols.append('Fail Rate %')
        if 'Iterations' in df_summary.columns:
            base_cols.append('Iterations')

        summary_cols_display = [col for col in base_cols if col in df_summary.columns]
        df_summary_display = df_summary[summary_cols_display].copy()

        # Build the benchmark-comparison view; column names vary between runs.
        benchmark_cols = ['Model', 'AutoBench']

        chatbot_col = None
        mmlu_col = None
        for col in df_summary.columns:
            if col in ['Chatbot Ar.', 'LMArena']:
                chatbot_col = col
            elif col in ['MMLU Index', 'MMLU-Pro']:
                mmlu_col = col

        if chatbot_col:
            benchmark_cols.append(chatbot_col)
        if 'AAI Index' in df_summary.columns:
            benchmark_cols.append('AAI Index')
        if mmlu_col:
            benchmark_cols.append(mmlu_col)

        benchmark_cols = [col for col in benchmark_cols if col in df_summary.columns]
        df_benchmark_display = df_summary[benchmark_cols].copy()

        # Sort both views by AutoBench score, best first.
        for df in [df_summary_display, df_benchmark_display]:
            if 'AutoBench' in df.columns:
                df['AutoBench'] = pd.to_numeric(df['AutoBench'], errors='coerce')
                df.sort_values(by='AutoBench', ascending=False, inplace=True)

        data["summary_display"] = df_summary_display
        data["benchmark_display"] = df_benchmark_display

    except Exception as e:
        print(f"Error processing display data: {e}")
        data["summary_display"] = df_summary.copy()
        data["benchmark_display"] = pd.DataFrame()

    return data


def load_data(filepath, separator=','):
    """Loads data, handling potential file-not-found errors."""
    if not os.path.exists(filepath):
        print(f"Warning: Data file not found at {filepath}")
        return pd.DataFrame()
    try:
        df = pd.read_csv(filepath, sep=separator)
        # Drop index columns pandas may have written out (e.g. "Unnamed: 0").
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        # Coerce numeric-looking columns; non-parsable cells become NaN.
        for col in df.columns:
            if col != 'Model Name' and col != 'model_name':
                if df[col].astype(str).str.contains(r'^[0-9.,eE-]+$').any():
                    df[col] = pd.to_numeric(df[col], errors='coerce')
        return df
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return pd.DataFrame()
print("Discovering available runs...") |
|
|
available_runs = discover_available_runs() |
|
|
if not available_runs: |
|
|
print("No runs found! Please check the runs/ directory structure.") |
|
|
exit(1) |
|
|
|
|
|
|
|
|
latest_run = available_runs[0] |
|
|
print(f"Found {len(available_runs)} run(s). Latest: {latest_run['title']}") |
|
|
|
|
|
|
|
|
print("Loading latest run data...") |
|
|
current_data = load_run_data(latest_run["run_id"]) |
|
|
print("Data loading complete.") |
|
|
|
|
|
|
|
|


def create_cost_scatter_plot(data: Dict[str, pd.DataFrame]) -> tuple:
    """Create the cost vs. rank scatter plot."""
    df_summary = data.get("summary", pd.DataFrame())

    if df_summary.empty or 'AutoBench' not in df_summary.columns or NEW_COST_COLUMN_SUMMARY not in df_summary.columns:
        return None, "_(Insufficient data for Rank vs Cost plot)_"

    plot_df = df_summary.dropna(subset=['AutoBench', NEW_COST_COLUMN_SUMMARY, 'Model']).copy()
    plot_df[NEW_COST_COLUMN_SUMMARY] = pd.to_numeric(plot_df[NEW_COST_COLUMN_SUMMARY], errors='coerce')
    plot_df = plot_df.dropna(subset=[NEW_COST_COLUMN_SUMMARY])

    if plot_df.empty:
        return None, "_(No valid data for Rank vs Cost plot)_"

    fig_cost = px.scatter(
        plot_df,
        x=NEW_COST_COLUMN_SUMMARY,
        y="AutoBench",
        text="Model",
        log_x=True,
        title="AutoBench Rank vs. Average Cost per Response ($ Cents - Log Scale)",
        labels={'AutoBench': 'AutoBench Rank', NEW_COST_COLUMN_SUMMARY: 'Avg Cost ($ Cents) - Log Scale'},
        hover_data=['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)']
    )
    fig_cost.update_traces(textposition='top center')
    fig_cost.update_layout(
        xaxis_title="Avg Cost ($ Cents) - Log Scale",
        yaxis_title="AutoBench Rank",
        width=1000,
        height=800,
        xaxis2=dict(
            overlaying='x',
            matches='x',
            side='top',
            showticklabels=True,
            showline=True,
            title=None
        )
    )
    return fig_cost, ""


def create_avg_latency_plot(data: Dict[str, pd.DataFrame]) -> tuple:
    """Create the average latency vs. rank scatter plot."""
    df_summary = data.get("summary", pd.DataFrame())

    if df_summary.empty or 'AutoBench' not in df_summary.columns or 'Avg Answer Duration (sec)' not in df_summary.columns:
        return None, "_(Insufficient data for Rank vs Avg Latency plot)_"

    plot_df = df_summary.dropna(subset=['AutoBench', 'Avg Answer Duration (sec)', 'Model']).copy()
    plot_df['Avg Answer Duration (sec)'] = pd.to_numeric(plot_df['Avg Answer Duration (sec)'], errors='coerce')
    plot_df = plot_df.dropna(subset=['Avg Answer Duration (sec)'])

    if plot_df.empty:
        return None, "_(No valid data for Rank vs Avg Latency plot)_"

    fig_latency = px.scatter(
        plot_df,
        x="Avg Answer Duration (sec)",
        y="AutoBench",
        text="Model",
        log_x=True,
        title="AutoBench Rank vs. Average Latency (Log Scale)",
        labels={'AutoBench': 'AutoBench Rank', 'Avg Answer Duration (sec)': 'Avg Latency (s) - Log Scale'},
        hover_data=['Model', 'AutoBench', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
    )
    fig_latency.update_traces(textposition='top center')
    fig_latency.update_layout(
        xaxis_title="Avg Latency (s) - Log Scale",
        yaxis_title="AutoBench Rank",
        width=1000,
        height=800
    )
    return fig_latency, ""


def create_p99_latency_plot(data: Dict[str, pd.DataFrame]) -> tuple:
    """Create the P99 latency vs. rank scatter plot."""
    df_summary = data.get("summary", pd.DataFrame())

    if df_summary.empty or 'AutoBench' not in df_summary.columns or 'P99 Answer Duration (sec)' not in df_summary.columns:
        return None, "_(Insufficient data for Rank vs P99 Latency plot)_"

    plot_df = df_summary.dropna(subset=['AutoBench', 'P99 Answer Duration (sec)', 'Model']).copy()
    plot_df['P99 Answer Duration (sec)'] = pd.to_numeric(plot_df['P99 Answer Duration (sec)'], errors='coerce')
    plot_df = plot_df.dropna(subset=['P99 Answer Duration (sec)'])

    if plot_df.empty:
        return None, "_(No valid data for Rank vs P99 Latency plot)_"

    fig_p99 = px.scatter(
        plot_df,
        x="P99 Answer Duration (sec)",
        y="AutoBench",
        text="Model",
        log_x=True,
        title="AutoBench Rank vs. P99 Latency (Log Scale)",
        labels={'AutoBench': 'AutoBench Rank', 'P99 Answer Duration (sec)': 'P99 Latency (s) - Log Scale'},
        hover_data=['Model', 'AutoBench', 'P99 Answer Duration (sec)', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
    )
    fig_p99.update_traces(textposition='top center')
    fig_p99.update_layout(
        xaxis_title="P99 Latency (s) - Log Scale",
        yaxis_title="AutoBench Rank",
        width=1000,
        height=800
    )
    return fig_p99, ""
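
# Each plot helper above returns a (figure, message) tuple: on success the
# message is an empty string; on missing or invalid data the figure is None
# and the message is a short Markdown note for the UI.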


def update_leaderboard_data(selected_run_id: str) -> tuple:
    """Update all leaderboard components when run selection changes."""
    if not selected_run_id:
        empty_df = pd.DataFrame()
        return (
            empty_df, empty_df, empty_df, empty_df, empty_df, empty_df,
            None, "", None, "", None, "",
            "No run selected", ""
        )

    data = load_run_data(selected_run_id)
    run_metadata = load_run_metadata(selected_run_id)

    if not data:
        empty_df = pd.DataFrame()
        return (
            empty_df, empty_df, empty_df, empty_df, empty_df, empty_df,
            None, "Error loading data", None, "Error loading data", None, "Error loading data",
            f"Error loading run: {selected_run_id}", ""
        )

    summary_display = data.get("summary_display", pd.DataFrame())
    benchmark_display = data.get("benchmark_display", pd.DataFrame())
    cost_df = data.get("cost", pd.DataFrame())
    avg_latency_df = data.get("avg_latency", pd.DataFrame())
    p99_latency_df = data.get("p99_latency", pd.DataFrame())
    domain_df = data.get("domain", pd.DataFrame())

    # The overall table shows the AutoBench score under the header "Rank".
    overall_rank_display = summary_display.copy()
    if 'AutoBench' in overall_rank_display.columns:
        overall_rank_display.rename(columns={'AutoBench': 'Rank'}, inplace=True)

    def prepare_table_display(df, model_col='model_name'):
        # Move the model-name column to the front of the table.
        if df.empty:
            return df
        if model_col in df.columns:
            cols = [model_col] + [col for col in df.columns if col != model_col]
            return df[cols]
        return df

    cost_display = prepare_table_display(cost_df)
    avg_latency_display = prepare_table_display(avg_latency_df)
    p99_latency_display = prepare_table_display(p99_latency_df)

    domain_display = domain_df.copy()
    if 'Model Name' in domain_display.columns:
        cols = ['Model Name'] + [col for col in domain_display.columns if col != 'Model Name']
        domain_display = domain_display[cols]

    cost_plot, cost_msg = create_cost_scatter_plot(data)
    avg_latency_plot, avg_latency_msg = create_avg_latency_plot(data)
    p99_latency_plot, p99_latency_msg = create_p99_latency_plot(data)

    info_msg = f"**Current Run:** {run_metadata.get('title', 'Unknown')} ({run_metadata.get('date', 'Unknown date')})"
    if 'model_count' in run_metadata:
        info_msg += f" - {run_metadata['model_count']} models"

    correlations_text = format_correlations_text(data.get("correlations", {}))

    return (
        overall_rank_display, benchmark_display, cost_display, avg_latency_display, p99_latency_display, domain_display,
        cost_plot, cost_msg, avg_latency_plot, avg_latency_msg, p99_latency_plot, p99_latency_msg,
        info_msg, correlations_text
    )
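
# NOTE: update_leaderboard_data returns its values in exactly the order of the
# `outputs` list wired to run_selector.change inside the Blocks UI below.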


with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# AutoBench LLM Leaderboard")
    gr.Markdown(
        "Interactive leaderboard for AutoBench, where LLMs rank LLMs' responses. "
        "Includes performance, cost, and latency metrics. "
        "Use the dropdown below to navigate between different benchmark runs."
    )

    with gr.Row():
        with gr.Column(scale=3):
            run_choices = [(f"{run['date']} - {run['title']}", run['run_id']) for run in available_runs]
            run_selector = gr.Dropdown(
                choices=run_choices,
                value=latest_run["run_id"],
                label="📊 Select AutoBench Run",
                info="Choose a benchmark run to view its results"
            )
        with gr.Column(scale=2):
            current_run_info = gr.Markdown(
                f"**Current Run:** {latest_run['title']} ({latest_run['date']})" +
                (f" - {latest_run['model_count']} models" if 'model_count' in latest_run else "")
            )

    gr.Markdown("---")

    with gr.Tab("Overall Ranking"):
        gr.Markdown("## Overall Model Performance")
        gr.Markdown("Models ranked by AutoBench score. Lower cost ($ Cents), latency (s), and fail rate (%) are better. Iterations shows the number of evaluations per model.")

        initial_correlations = format_correlations_text(current_data.get("correlations", {}))
        correlations_display = gr.Markdown(value=initial_correlations)

        initial_overall_display = current_data.get("summary_display", pd.DataFrame())
        if 'AutoBench' in initial_overall_display.columns:
            initial_overall_display = initial_overall_display.copy().rename(columns={'AutoBench': 'Rank'})
        overall_ranking_table = gr.DataFrame(
            initial_overall_display,
            interactive=True,
            label="Overall Rankings"
        )
with gr.Tab("Benchmark Comparison"): |
|
|
gr.Markdown("## Benchmark Comparison") |
|
|
gr.Markdown("Comparison of AutoBench scores with other popular benchmarks. AutoBench features 82.51% correlation with Chatbot Arena, 83.74% with Artificial Analysis Intelligence Index, and 71.51% with MMLU. Models sorted by AutoBench score.") |
|
|
|
|
|
benchmark_comparison_table = gr.DataFrame( |
|
|
current_data.get("benchmark_display", pd.DataFrame()), |
|
|
interactive=True, |
|
|
label="Benchmark Comparison" |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("Performance Plots"): |
|
|
gr.Markdown("## Performance Visualizations") |
|
|
gr.Markdown("Exploring relationships between AutoBench Rank, Latency, and Cost.") |
|
|
|
|
|
|
|
|
gr.Markdown("### Rank vs. Average Cost") |
|
|
initial_cost_plot, initial_cost_msg = create_cost_scatter_plot(current_data) |
|
|
cost_plot = gr.Plot(value=initial_cost_plot) |
|
|
cost_plot_msg = gr.Markdown(value=initial_cost_msg) |
|
|
|
|
|
|
|
|
gr.Markdown("### Rank vs. Average Latency") |
|
|
initial_avg_latency_plot, initial_avg_latency_msg = create_avg_latency_plot(current_data) |
|
|
avg_latency_plot = gr.Plot(value=initial_avg_latency_plot) |
|
|
avg_latency_plot_msg = gr.Markdown(value=initial_avg_latency_msg) |
|
|
|
|
|
|
|
|
gr.Markdown("### Rank vs. P99 Latency") |
|
|
initial_p99_latency_plot, initial_p99_latency_msg = create_p99_latency_plot(current_data) |
|
|
p99_latency_plot = gr.Plot(value=initial_p99_latency_plot) |
|
|
p99_latency_plot_msg = gr.Markdown(value=initial_p99_latency_msg) |
|
|
|
|
|
|
|
|
with gr.Tab("Cost & Latency Analysis"): |
|
|
gr.Markdown("## Performance vs. Cost/Latency Trade-offs") |
|
|
|
|
|
|
|
|
gr.Markdown("### Cost Breakdown per Domain ($ Cents/Response)") |
|
|
cost_df = current_data.get("cost", pd.DataFrame()) |
|
|
if not cost_df.empty and 'model_name' in cost_df.columns: |
|
|
cols = ['model_name'] + [col for col in cost_df.columns if col != 'model_name'] |
|
|
initial_cost_display = cost_df[cols] |
|
|
else: |
|
|
initial_cost_display = cost_df |
|
|
cost_breakdown_table = gr.DataFrame( |
|
|
value=initial_cost_display, |
|
|
interactive=True, |
|
|
label="Cost Breakdown" |
|
|
) |
|
|
|
|
|
|
|
|
gr.Markdown("### Average Latency Breakdown per Domain (Seconds)") |
|
|
avg_latency_df = current_data.get("avg_latency", pd.DataFrame()) |
|
|
if not avg_latency_df.empty and 'model_name' in avg_latency_df.columns: |
|
|
cols = ['model_name'] + [col for col in avg_latency_df.columns if col != 'model_name'] |
|
|
initial_avg_latency_display = avg_latency_df[cols] |
|
|
else: |
|
|
initial_avg_latency_display = avg_latency_df |
|
|
avg_latency_breakdown_table = gr.DataFrame( |
|
|
value=initial_avg_latency_display, |
|
|
interactive=True, |
|
|
label="Average Latency Breakdown" |
|
|
) |
|
|
|
|
|
gr.Markdown("### P99 Latency Breakdown per Domain (Seconds)") |
|
|
p99_latency_df = current_data.get("p99_latency", pd.DataFrame()) |
|
|
if not p99_latency_df.empty and 'model_name' in p99_latency_df.columns: |
|
|
cols = ['model_name'] + [col for col in p99_latency_df.columns if col != 'model_name'] |
|
|
initial_p99_latency_display = p99_latency_df[cols] |
|
|
else: |
|
|
initial_p99_latency_display = p99_latency_df |
|
|
p99_latency_breakdown_table = gr.DataFrame( |
|
|
value=initial_p99_latency_display, |
|
|
interactive=True, |
|
|
label="P99 Latency Breakdown" |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
with gr.Tab("Domain Performance"): |
|
|
gr.Markdown("## Performance Across Different Domains") |
|
|
gr.Markdown("Model ranks within specific knowledge or task areas. Higher is better.") |
|
|
|
|
|
domain_df = current_data.get("domain", pd.DataFrame()) |
|
|
if not domain_df.empty and 'Model Name' in domain_df.columns: |
|
|
cols = ['Model Name'] + [col for col in domain_df.columns if col != 'Model Name'] |
|
|
initial_domain_display = domain_df[cols] |
|
|
else: |
|
|
initial_domain_display = domain_df |
|
|
domain_performance_table = gr.DataFrame( |
|
|
value=initial_domain_display, |
|
|
interactive=True, |
|
|
label="Domain Performance" |
|
|
) |
|
|
|
|
|
|
|
|
with gr.Tab("About AutoBench"): |
|
|
gr.Markdown(""" |
|
|
## About AutoBench |
|
|
|
|
|
AutoBench is an LLM benchmark where Large Language Models (LLMs) evaluate and rank the responses generated by other LLMs. The questions themselves are also generated by LLMs across a diverse set of domains and ranked for quality. |
|
|
|
|
|
### Methodology |
|
|
1. **Question Generation:** High-quality questions across various domains (Coding, History, Science, etc.) are generated by selected LLMs. |
|
|
2. **Response Generation:** The models being benchmarked generate answers to these questions. |
|
|
3. **Ranking:** Ranking LLMs rank the responses from different models for each question, on a 1-5 scale. |
|
|
4. **Aggregation:** Scores are averaged across multiple questions and domains to produce the final AutoBench rank. |
|
|
|
|
|
### Metrics |
|
|
* **AutoBench Score (AB):** The average rank received by a model's responses across all questions/domains (higher is better). |
|
|
* **Avg Cost (USD Cents/response):** Estimated average cost to generate one response based on model provider pricing (input+output tokens). Lower is better. |
|
|
* **Avg Latency (s):** Average time taken by the model to generate a response. Lower is better. |
|
|
* **P99 Latency (s):** The 99th percentile of response time, indicating worst-case latency. Lower is better. |
|
|
* **Chatbot Arena / Artificial Analysis Intelligence Index / MMLU:** Scores from other well-known benchmarks for comparison (where available). |
|
|
|
|
|
### Data |
|
|
This leaderboard reflects a run completed on April 23, 2025. Models included recently released models such as o4-mini, Gpt-4.1-mini, Gemini 2.5 Pro Preview, Claude 3.7 Sonnet:thikning, etc.. |
|
|
|
|
|
### Links |
|
|
* [AutoBench Run 2 Results](https://huggingface.co/blog/PeterKruger/autobench-2nd-run) |
|
|
* [AutoBench Blog Post](https://huggingface.co/blog/PeterKruger/autobench) |
|
|
* [Autobench Repositories](https://huggingface.co/AutoBench) |
|
|
|
|
|
**Disclaimer:** Benchmark results provide one perspective on model capabilities. Performance can vary based on specific tasks, prompts, and API conditions. Costs are estimates and subject to change by providers. Latency depends on server load and geographic location. |
|
|
""") |
|
|
|
|
|
|
|
|
|
|
|

    # Wire the run selector to refresh every table and plot. Event listeners
    # must be registered inside the Blocks context.
    run_selector.change(
        fn=update_leaderboard_data,
        inputs=[run_selector],
        outputs=[
            overall_ranking_table,
            benchmark_comparison_table,
            cost_breakdown_table,
            avg_latency_breakdown_table,
            p99_latency_breakdown_table,
            domain_performance_table,
            cost_plot,
            cost_plot_msg,
            avg_latency_plot,
            avg_latency_plot_msg,
            p99_latency_plot,
            p99_latency_plot_msg,
            current_run_info,
            correlations_display
        ]
    )
print("Launching Gradio app...") |
|
|
app.launch( |
|
|
favicon_path="static/manifest.json" if os.path.exists("static/manifest.json") else None, |
|
|
show_error=True |
|
|
) |
|
|
print("Gradio app launched.") |