fev-leaderboard / pages /fev_bench.py
shchuro's picture
Add tables
18a3564
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent))
import pandas as pd
import streamlit as st
from streamlit.elements.lib.column_types import ColumnConfig
from src.strings import (
CITATION_FEV,
CITATION_HEADER,
FEV_BENCHMARK_BASIC_INFO,
FEV_BENCHMARK_DETAILS,
PAIRWISE_BENCHMARK_DETAILS,
get_pivot_legend,
)
from src.utils import (
COLORS,
construct_pairwise_chart,
format_leaderboard,
format_metric_name,
get_metric_description,
)
# Global page setup. This is the first Streamlit call in the visible part of the file.
st.set_page_config(
    page_title="fev leaderboard",
    page_icon=":material/trophy:",
    layout="wide",
)

# Centered HTML heading for the page; rendered with st.markdown(..., unsafe_allow_html=True).
TITLE = "<h1 style='text-align: center; font-size: 350%;'>fev-bench</h1>"
# Leaderboard column used to order the rows (sorted descending where it is applied).
SORT_COL = "win_rate"
# Metric identifiers offered in the sidebar; each one is interpolated into the tables/*.csv file names.
AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]
@st.cache_data()
def get_leaderboard(metric_name: str) -> pd.DataFrame:
    """Load the aggregate leaderboard table for one evaluation metric.

    Args:
        metric_name: Metric identifier that is interpolated into the file name.

    Returns:
        The contents of ``tables/leaderboard_<metric_name>.csv`` as a DataFrame.
        The path is relative, so it is resolved against the process working
        directory. The function is wrapped in ``st.cache_data``.
    """
    csv_path = f"tables/leaderboard_{metric_name}.csv"
    return pd.read_csv(csv_path)
@st.cache_data()
def get_pairwise(metric_name: str) -> pd.DataFrame:
    """Load the pairwise model-comparison table for one evaluation metric.

    Args:
        metric_name: Metric identifier that is interpolated into the file name.

    Returns:
        The contents of ``tables/pairwise_<metric_name>.csv`` as a DataFrame.
        The path is relative, so it is resolved against the process working
        directory. The function is wrapped in ``st.cache_data``.
    """
    csv_path = f"tables/pairwise_{metric_name}.csv"
    return pd.read_csv(csv_path)
@st.cache_data()
def get_pivot_table(metric_name: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load the per-task pivot table and its two companion tables.

    Args:
        metric_name: Metric identifier that is interpolated into the file names.

    Returns:
        A 3-tuple ``(pivot_df, baseline_imputed, leakage_imputed)`` read, in
        that order, from ``tables/pivot_<metric_name>.csv``,
        ``tables/pivot_<metric_name>_baseline_imputed.csv`` and
        ``tables/pivot_<metric_name>_leakage_imputed.csv``.
    """
    # Order matters: callers unpack the result positionally.
    csv_paths = (
        f"tables/pivot_{metric_name}.csv",
        f"tables/pivot_{metric_name}_baseline_imputed.csv",
        f"tables/pivot_{metric_name}_leakage_imputed.csv",
    )
    return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)
# Sidebar: choose the evaluation metric that drives every table and chart on the page,
# and show its description underneath the selector.
with st.sidebar:
    selected_metric = st.selectbox(
        "Evaluation Metric",
        options=AVAILABLE_METRICS,
        format_func=format_metric_name,
    )
    st.caption(
        get_metric_description(selected_metric),
    )
# Page body. The page is split into three columns with relative widths 0.025 / 0.95 / 0.025 and
# the middle one (cols[1]) is entered as the content container.
# NOTE(review): this copy of the file has lost its indentation, so which statements belong to
# which `with` block cannot be read off here — confirm the nesting against the original file.
cols = st.columns(spec=[0.025, 0.95, 0.025])
with cols[1] as main_container:
st.markdown(TITLE, unsafe_allow_html=True)
# Load the tables for the metric chosen in the sidebar; leaderboard rows are ordered by
# SORT_COL, largest value first.
metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
pairwise_df = get_pairwise(selected_metric)
# --- Section 1: leaderboard table ---
st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
st.markdown(FEV_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
df_styled = format_leaderboard(metric_df)
# column_config maps leaderboard column names to their display label and, for numeric
# columns, a printf-style number format; "link" is shown as a link with fixed display text.
st.dataframe(
df_styled,
width="stretch",
hide_index=True,
column_config={
"model_name": ColumnConfig(label="Model Name", alignment="left"),
"win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
"skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
"median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
"training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
"num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
"zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
"org": ColumnConfig(label="Organization", alignment="left"),
"link": st.column_config.LinkColumn(label="Link", display_text="🔗"),
},
)
with st.expander("See details"):
st.markdown(FEV_BENCHMARK_DETAILS, unsafe_allow_html=True)
# --- Section 2: pairwise comparison ---
# Two charts built from the same pairwise table (one per column: "win_rate" and "skill_score"),
# placed in the outer columns of a 0.45 / 0.1 / 0.45 split; the middle column is left unused.
st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])
with chart_col_1:
st.altair_chart(
construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
use_container_width=True,
)
with chart_col_2:
st.altair_chart(
construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
use_container_width=True,
)
with st.expander("See details"):
st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)
# --- Section 3: per-task results ---
st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
with st.expander("Show detailed results"):
# NOTE(review): the two arguments are presumably the model names referenced by the legend text
# (baseline used for failure imputation / model used for leakage imputation) — confirm in src.strings.
st.markdown(get_pivot_legend("Seasonal Naive", "Chronos-Bolt"), unsafe_allow_html=True)
pivot_df, baseline_imputed, leakage_imputed = get_pivot_table(selected_metric)
# Index all three tables by "Task name" so the styling code below can address the same cell
# in each of them with identical (row label, column label) pairs.
pivot_df = pivot_df.set_index("Task name")
baseline_imputed = baseline_imputed.set_index("Task name")
leakage_imputed = leakage_imputed.set_index("Task name")
def style_pivot_table(errors, is_baseline_imputed, is_leakage_imputed):
    """Return a pandas Styler that highlights the per-task error table.

    Every cell receives a CSS declaration assembled from two parts:

    * Background: within each row the values are ranked ascending with
      ``method="min"`` (ties share the lower rank); ranks 1, 2 and 3 get the
      gold, silver and bronze background colors.
    * Text color: the leakage-imputed color if the cell is truthy in
      ``is_leakage_imputed``; otherwise the failure-imputed color if it is
      truthy in ``is_baseline_imputed``; otherwise the default text color,
      which is emitted only when the cell has no podium background.

    The CSS for the whole table is computed once and attached with a single
    table-wide ``Styler.apply(axis=None)``. Registering one styling call per
    cell makes the Styler replay rows x columns separate passes at render time.

    Args:
        errors: DataFrame of error values to display; ranking is done per row.
        is_baseline_imputed: DataFrame addressable by the same row and column
            labels as ``errors``; truthy where the failure-imputed text color
            applies.
        is_leakage_imputed: DataFrame addressable by the same row and column
            labels as ``errors``; truthy where the leakage-imputed text color
            applies (takes precedence over ``is_baseline_imputed``).

    Returns:
        ``errors.style`` with the per-cell CSS applied and values formatted
        with ``precision=3``.
    """
    rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}

    def cell_css(row_idx, col_idx, rank):
        """Build the CSS declaration string for a single cell."""
        css_parts = []
        if rank <= 3:
            # Ranks come back as floats; 1.0 hashes and compares equal to 1, so the lookup works.
            css_parts.append(f"background-color: {rank_colors[rank]}")
        # The baseline flag is looked up only when the leakage flag is falsy.
        if is_leakage_imputed.loc[row_idx, col_idx]:
            css_parts.append(f"color: {COLORS['leakage_impute']}")
        elif is_baseline_imputed.loc[row_idx, col_idx]:
            css_parts.append(f"color: {COLORS['failure_impute']}")
        elif not css_parts:
            css_parts.append(f"color: {COLORS['text_default']}")
        return "; ".join(css_parts)

    css_rows = []
    for row_idx in errors.index:
        row_ranks = errors.loc[row_idx].rank(method="min")
        css_rows.append([cell_css(row_idx, col_idx, row_ranks[col_idx]) for col_idx in errors.columns])
    # Same labels as `errors`, as required for the frame returned to Styler.apply(axis=None).
    css = pd.DataFrame(css_rows, index=errors.index, columns=errors.columns)

    return errors.style.apply(lambda _: css, axis=None).format(precision=3)
# Render the per-task table with the rank / imputation styling applied.
# NOTE(review): indentation is missing in this copy; whether the lines below sit inside the
# enclosing `with` blocks cannot be determined here — confirm against the original file.
st.dataframe(style_pivot_table(pivot_df, baseline_imputed, leakage_imputed))
# --- Footer: horizontal rule followed by the citation heading and text ---
st.divider()
st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
st.markdown(CITATION_HEADER)
st.markdown(CITATION_FEV)