File size: 5,963 Bytes
eb8267f 18a3564 eb8267f 18a3564 eb8267f 18a3564 eb8267f 18a3564 eb8267f 18a3564 eb8267f 18a3564 eb8267f 18a3564 eb8267f 18a3564 eb8267f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent))
import pandas as pd
import streamlit as st
from streamlit.elements.lib.column_types import ColumnConfig
from src.strings import (
CITATION_FEV,
CITATION_HEADER,
FEV_BENCHMARK_BASIC_INFO,
FEV_BENCHMARK_DETAILS,
PAIRWISE_BENCHMARK_DETAILS,
get_pivot_legend,
)
from src.utils import (
COLORS,
construct_pairwise_chart,
format_leaderboard,
format_metric_name,
get_metric_description,
)
# Configure the page before any other Streamlit rendering call.
st.set_page_config(layout="wide", page_title="fev leaderboard", page_icon=":material/trophy:")
# Page title, rendered as centered raw HTML via st.markdown(..., unsafe_allow_html=True).
TITLE = "<h1 style='text-align: center; font-size: 350%;'>fev-bench</h1>"
# Column the leaderboard is sorted by (descending) and the metrics offered in the sidebar.
SORT_COL = "win_rate"
AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]
@st.cache_data()
def get_leaderboard(metric_name: str) -> pd.DataFrame:
    """Load the precomputed leaderboard table for ``metric_name`` from disk.

    Cached by Streamlit so repeated reruns with the same metric do not re-read
    the CSV file.
    """
    csv_path = f"tables/leaderboard_{metric_name}.csv"
    return pd.read_csv(csv_path)
@st.cache_data()
def get_pairwise(metric_name: str) -> pd.DataFrame:
    """Load the precomputed pairwise-comparison table for ``metric_name``.

    Cached by Streamlit so repeated reruns with the same metric do not re-read
    the CSV file.
    """
    csv_path = f"tables/pairwise_{metric_name}.csv"
    return pd.read_csv(csv_path)
@st.cache_data()
def get_pivot_table(metric_name: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load the per-task pivot table and its two imputation-mask variants.

    Returns a 3-tuple ``(scores, baseline_imputed, leakage_imputed)`` where the
    last two frames mark which cells were imputed (from a baseline model or due
    to training-corpus leakage, respectively). Cached by Streamlit.
    """
    return (
        pd.read_csv(f"tables/pivot_{metric_name}.csv"),
        pd.read_csv(f"tables/pivot_{metric_name}_baseline_imputed.csv"),
        pd.read_csv(f"tables/pivot_{metric_name}_leakage_imputed.csv"),
    )
# Sidebar: single metric selector that drives every table and chart on the page.
with st.sidebar:
    selected_metric = st.selectbox("Evaluation Metric", options=AVAILABLE_METRICS, format_func=format_metric_name)
    # Short human-readable description of the currently selected metric.
    st.caption(get_metric_description(selected_metric))
# Three-column layout: the narrow outer columns act as page margins, and all
# content renders in the wide middle column.
cols = st.columns(spec=[0.025, 0.95, 0.025])
with cols[1]:
    st.markdown(TITLE, unsafe_allow_html=True)

    # Best model (highest win rate) first.
    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
    pairwise_df = get_pairwise(selected_metric)

    # --- Leaderboard -------------------------------------------------------
    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
    st.markdown(FEV_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
    df_styled = format_leaderboard(metric_df)
    st.dataframe(
        df_styled,
        width="stretch",
        hide_index=True,
        column_config={
            "model_name": ColumnConfig(label="Model Name", alignment="left"),
            "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
            "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
            "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
            "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
            "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
            "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
            "org": ColumnConfig(label="Organization", alignment="left"),
            "link": st.column_config.LinkColumn(label="Link", display_text="π"),
        },
    )
    with st.expander("See details"):
        st.markdown(FEV_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # --- Pairwise comparison charts (side by side, small gap between) ------
    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])
    with chart_col_1:
        # NOTE: width="stretch" replaces the deprecated use_container_width=True,
        # consistent with the st.dataframe call above.
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
            width="stretch",
        )
    with chart_col_2:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
            width="stretch",
        )
    with st.expander("See details"):
        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # --- Per-task results table --------------------------------------------
    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
    with st.expander("Show detailed results"):
        st.markdown(get_pivot_legend("Seasonal Naive", "Chronos-Bolt"), unsafe_allow_html=True)
        pivot_df, baseline_imputed, leakage_imputed = get_pivot_table(selected_metric)
        pivot_df = pivot_df.set_index("Task name")
        baseline_imputed = baseline_imputed.set_index("Task name")
        leakage_imputed = leakage_imputed.set_index("Task name")

        def style_pivot_table(
            errors: pd.DataFrame,
            is_baseline_imputed: pd.DataFrame,
            is_leakage_imputed: pd.DataFrame,
        ):
            """Return a pandas Styler over ``errors`` with per-cell highlighting.

            Per task (row): the three lowest scores get gold/silver/bronze
            backgrounds (ties share a rank via method="min"). Cells whose value
            was imputed get a distinguishing text color (leakage takes
            precedence over baseline imputation); all remaining unhighlighted
            cells fall back to the default text color. Values are formatted
            with 3 decimal places.
            """
            rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}

            def cell_style(row_idx, col_idx, rank) -> str:
                # Compose "background-color: ...; color: ..." for one cell.
                parts = []
                if rank <= 3:
                    parts.append(f"background-color: {rank_colors[rank]}")
                if is_leakage_imputed.loc[row_idx, col_idx]:
                    parts.append(f"color: {COLORS['leakage_impute']}")
                elif is_baseline_imputed.loc[row_idx, col_idx]:
                    parts.append(f"color: {COLORS['failure_impute']}")
                elif not parts:
                    parts.append(f"color: {COLORS['text_default']}")
                return "; ".join(parts)

            # Build the full style matrix once and hand it to a single
            # Styler.apply(axis=None) pass, instead of one Styler.map call per
            # cell (which the original did — O(rows*cols) Styler passes).
            row_ranks = errors.rank(axis=1, method="min")
            styles = pd.DataFrame(
                [
                    [cell_style(r, c, row_ranks.loc[r, c]) for c in errors.columns]
                    for r in errors.index
                ],
                index=errors.index,
                columns=errors.columns,
            )
            return errors.style.apply(lambda _: styles, axis=None).format(precision=3)

        st.dataframe(style_pivot_table(pivot_df, baseline_imputed, leakage_imputed))

    # --- Citation ----------------------------------------------------------
    st.divider()
    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
    st.markdown(CITATION_HEADER)
    st.markdown(CITATION_FEV)
|