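"""Generate leaderboard, pairwise-comparison, and per-task pivot tables from CSV summaries.

Example invocation (the actual script filename is not shown in this file, so
``make_tables.py`` below is only a placeholder)::

    python make_tables.py --summaries-path path/to/summary_csvs

For each metric in AVAILABLE_METRICS the script writes leaderboard_<metric>.csv,
pairwise_<metric>.csv, pivot_<metric>.csv, and the two imputation-mask tables into
the local tables/ directory.
"""
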
import argparse
import os
import sys
from pathlib import Path

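# Make the directory containing this script importable so that `src.utils` resolves.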
sys.path.append(str(Path(__file__).parent))

import fev
import pandas as pd

from src.utils import format_leaderboard

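# BASELINE_MODEL is the reference model and the fallback for missing per-task results;
# LEAKAGE_IMPUTATION_MODEL replaces results on tasks a model was trained on.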
BASELINE_MODEL = "Seasonal Naive"
LEAKAGE_IMPUTATION_MODEL = "Chronos-Bolt"
SORT_COL = "win_rate"
N_RESAMPLES_FOR_CI = 1000
TOP_K_MODELS_TO_PLOT = 15
AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]


def load_summaries(path="."):
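    """Load every CSV file in `path` and concatenate them into a single DataFrame."""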
    csv_files = list(Path(path).glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {path}")
    dfs = [pd.read_csv(file) for file in csv_files]
    return pd.concat(dfs, ignore_index=True)


def compute_leaderboard(summaries: pd.DataFrame, metric_name: str) -> pd.DataFrame:
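    """Build the aggregate leaderboard for `metric_name` across all tasks."""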
    lb = fev.analysis.leaderboard(
        summaries=summaries,
        metric_column=metric_name,
        missing_strategy="impute",
        baseline_model=BASELINE_MODEL,
        leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
    )
    lb = lb.astype("float64").reset_index()

    # Express skill score and win rate as percentages, and failures as a percentage of tasks.
    lb["skill_score"] = lb["skill_score"] * 100
    lb["win_rate"] = lb["win_rate"] * 100
    lb["num_failures"] = lb["num_failures"] / summaries["task_name"].nunique() * 100
    return lb


def compute_pairwise(summaries: pd.DataFrame, metric_name: str, included_models: list[str]) -> pd.DataFrame:
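    """Compute pairwise comparisons of `included_models` against the baseline."""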
    # The baseline model must always be part of the comparison.
    if BASELINE_MODEL not in included_models:
        included_models = included_models + [BASELINE_MODEL]

    return (
        fev.analysis.pairwise_comparison(
            summaries,
            included_models=included_models,
            metric_column=metric_name,
            baseline_model=BASELINE_MODEL,
            missing_strategy="impute",
            n_resamples=N_RESAMPLES_FOR_CI,
            leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
        )
        .round(3)
        .reset_index()
    )


def compute_pivot_table(summaries: pd.DataFrame, metric_name: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
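    """Return the per-task error table plus boolean masks marking imputed entries."""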
    errors = fev.pivot_table(summaries=summaries, metric_column=metric_name, task_columns=["task_name"])
    train_overlap = (
        fev.pivot_table(summaries=summaries, metric_column="trained_on_this_dataset", task_columns=["task_name"])
        .fillna(False)
        .astype(bool)
    )

    # Record which entries will be imputed below.
    is_imputed_baseline = errors.isna()
    is_leakage_imputed = train_overlap

    # Replace leaked results with the leakage-imputation model's scores,
    # then fill any remaining missing results with the baseline's scores.
    errors = errors.mask(train_overlap, errors[LEAKAGE_IMPUTATION_MODEL], axis=0)
    for col in errors.columns:
        if col != BASELINE_MODEL:
            errors[col] = errors[col].fillna(errors[BASELINE_MODEL])

    # Order model columns by their average rank across tasks (best first).
    errors = errors[errors.rank(axis=1).mean().sort_values().index]
    is_imputed_baseline = is_imputed_baseline[errors.columns]
    is_leakage_imputed = is_leakage_imputed[errors.columns]

    errors.index.rename("Task name", inplace=True)
    is_imputed_baseline.index.rename("Task name", inplace=True)
    is_leakage_imputed.index.rename("Task name", inplace=True)

    return errors.reset_index(), is_imputed_baseline.reset_index(), is_leakage_imputed.reset_index()


def main():
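    """Generate and save all tables for every metric in AVAILABLE_METRICS."""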
    parser = argparse.ArgumentParser(description="Generate leaderboard tables from CSV summaries")
    parser.add_argument("-s", "--summaries-path", default=".", help="Path to directory containing CSV files")
    args = parser.parse_args()

    tables_dir = Path("tables")
    tables_dir.mkdir(exist_ok=True)

    print("Loading summaries...")
    summaries = load_summaries(args.summaries_path)

    for metric in AVAILABLE_METRICS:
        print(f"Processing {metric}...")

        # Aggregate leaderboard.
        leaderboard_df = compute_leaderboard(summaries, metric)
        leaderboard_df.to_csv(tables_dir / f"leaderboard_{metric}.csv", index=False)

        # Pairwise comparison restricted to the top-performing models.
        top_k_models = (
            leaderboard_df.sort_values(by=SORT_COL, ascending=False).head(TOP_K_MODELS_TO_PLOT)["model_name"].tolist()
        )
        pairwise_df = compute_pairwise(summaries, metric, top_k_models)
        pairwise_df.to_csv(tables_dir / f"pairwise_{metric}.csv", index=False)

        # Per-task pivot table with the corresponding imputation masks.
        pivot_df, baseline_imputed, leakage_imputed = compute_pivot_table(summaries, metric)
        pivot_df.to_csv(tables_dir / f"pivot_{metric}.csv", index=False)
        baseline_imputed.to_csv(tables_dir / f"pivot_{metric}_baseline_imputed.csv", index=False)
        leakage_imputed.to_csv(tables_dir / f"pivot_{metric}_leakage_imputed.csv", index=False)

        print(f" Saved: leaderboard_{metric}.csv, pairwise_{metric}.csv, pivot_{metric}.csv")

    print(f"All tables saved to {tables_dir}/")


if __name__ == "__main__":
    main()