fev-leaderboard

Running

App Files Files Community

fev-leaderboard / pages /fev_bench.py

shchuro

Add tables

18a3564 30 days ago

raw

history blame contribute delete

5.96 kB

	import sys
	from pathlib import Path

	sys.path.append(str(Path(__file__).parent))

	import pandas as pd
	import streamlit as st
	from streamlit.elements.lib.column_types import ColumnConfig

	from src.strings import (
	CITATION_FEV,
	CITATION_HEADER,
	FEV_BENCHMARK_BASIC_INFO,
	FEV_BENCHMARK_DETAILS,
	PAIRWISE_BENCHMARK_DETAILS,
	get_pivot_legend,
	)
	from src.utils import (
	COLORS,
	construct_pairwise_chart,
	format_leaderboard,
	format_metric_name,
	get_metric_description,
	)

	# Page-level Streamlit settings: wide layout, page title and a Material "trophy" icon.
	st.set_page_config(layout="wide", page_title="fev leaderboard", page_icon=":material/trophy:")

	# Centered HTML heading; it is rendered further down with st.markdown(..., unsafe_allow_html=True).
	TITLE = "<h1 style='text-align: center; font-size: 350%;'>fev-bench</h1>"
	# Leaderboard column used to order the rows (the table is sorted on it in descending order).
	SORT_COL = "win_rate"
	# Metric codes offered in the sidebar selector; the chosen code is interpolated into the CSV names under tables/.
	AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]


	@st.cache_data()
	def get_leaderboard(metric_name: str) -> pd.DataFrame:
	    """Read the leaderboard table for ``metric_name``.

	    The table is loaded from ``tables/leaderboard_<metric_name>.csv`` (a path
	    relative to the process working directory) and the call is wrapped in
	    ``st.cache_data``.
	    """
	    csv_path = f"tables/leaderboard_{metric_name}.csv"
	    leaderboard = pd.read_csv(csv_path)
	    return leaderboard


	@st.cache_data()
	def get_pairwise(metric_name: str) -> pd.DataFrame:
	    """Read the pairwise-comparison table for ``metric_name``.

	    The table is loaded from ``tables/pairwise_<metric_name>.csv`` (a path
	    relative to the process working directory) and the call is wrapped in
	    ``st.cache_data``.
	    """
	    csv_path = f"tables/pairwise_{metric_name}.csv"
	    pairwise = pd.read_csv(csv_path)
	    return pairwise


	@st.cache_data()
	def get_pivot_table(metric_name: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
	    """Read the three per-task pivot tables for ``metric_name``.

	    Returns:
	        A ``(pivot_df, baseline_imputed, leakage_imputed)`` tuple read, in that
	        order, from ``tables/pivot_<metric_name>.csv``,
	        ``tables/pivot_<metric_name>_baseline_imputed.csv`` and
	        ``tables/pivot_<metric_name>_leakage_imputed.csv``.
	    """
	    csv_paths = (
	        f"tables/pivot_{metric_name}.csv",
	        f"tables/pivot_{metric_name}_baseline_imputed.csv",
	        f"tables/pivot_{metric_name}_leakage_imputed.csv",
	    )
	    # map() preserves the order of csv_paths, so the unpacking below lines up with the return contract.
	    pivot_df, baseline_imputed, leakage_imputed = map(pd.read_csv, csv_paths)
	    return pivot_df, baseline_imputed, leakage_imputed


	# Sidebar: metric selector over AVAILABLE_METRICS (option labels come from format_metric_name),
	# followed by a caption with get_metric_description() for the selected metric.
	with st.sidebar:
	selected_metric = st.selectbox("Evaluation Metric", options=AVAILABLE_METRICS, format_func=format_metric_name)
	st.caption(get_metric_description(selected_metric))

	# Three columns with relative widths 0.025 / 0.95 / 0.025; only the middle one (cols[1]) is entered below.
	cols = st.columns(spec=[0.025, 0.95, 0.025])

	with cols[1] as main_container:
	st.markdown(TITLE, unsafe_allow_html=True)

	# Load the tables for the selected metric; leaderboard rows are ordered by SORT_COL, highest first.
	metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
	pairwise_df = get_pairwise(selected_metric)

	# --- Leaderboard section ---
	st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
	st.markdown(FEV_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
	# format_leaderboard is a project helper from src.utils; its result is handed to st.dataframe as-is.
	df_styled = format_leaderboard(metric_df)
	st.dataframe(
	df_styled,
	width="stretch",
	hide_index=True,
	# Display label and number format / alignment for each leaderboard column, keyed by column name.
	column_config={
	"model_name": ColumnConfig(label="Model Name", alignment="left"),
	"win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
	"skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
	"median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
	"training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
	"num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
	"zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
	"org": ColumnConfig(label="Organization", alignment="left"),
	"link": st.column_config.LinkColumn(label="Link", display_text="🔗"),
	},
	)

	with st.expander("See details"):
	st.markdown(FEV_BENCHMARK_DETAILS, unsafe_allow_html=True)

	# --- Pairwise comparison section ---
	# Two charts built from the same pairwise table, one per column ("win_rate" and "skill_score"),
	# placed in the outer columns of a 0.45 / 0.1 / 0.45 split; the middle column is discarded.
	st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
	chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])

	with chart_col_1:
	st.altair_chart(
	construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
	use_container_width=True,
	)

	with chart_col_2:
	st.altair_chart(
	construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
	use_container_width=True,
	)

	with st.expander("See details"):
	st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

	# --- Per-task results section ---
	st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
	with st.expander("Show detailed results"):
	st.markdown(get_pivot_legend("Seasonal Naive", "Chronos-Bolt"), unsafe_allow_html=True)
	pivot_df, baseline_imputed, leakage_imputed = get_pivot_table(selected_metric)
	# Index all three frames by "Task name" so they can be looked up with the same row labels.
	pivot_df = pivot_df.set_index("Task name")
	baseline_imputed = baseline_imputed.set_index("Task name")
	leakage_imputed = leakage_imputed.set_index("Task name")

	def style_pivot_table(errors, is_baseline_imputed, is_leakage_imputed):
	    """Return a Styler for ``errors`` with medal backgrounds and imputation text colors.

	    Within each row the values are ranked in ascending order (ties share the
	    lowest rank) and the cells ranked 1, 2 and 3 get the gold, silver and
	    bronze background from ``COLORS``. The text color marks imputed cells:
	    the leakage color takes precedence over the failure color, and a cell
	    that received no other style gets the default text color.

	    Args:
	        errors: DataFrame of values to display; every row is ranked on its own.
	        is_baseline_imputed: Frame looked up with the row/column labels of
	            ``errors``; a truthy cell selects ``COLORS["failure_impute"]``.
	        is_leakage_imputed: Frame looked up the same way; a truthy cell
	            selects ``COLORS["leakage_impute"]``.

	    Returns:
	        ``errors.style`` with the cell styles applied and values formatted
	        with ``precision=3``.
	    """
	    rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}

	    def cell_css(rank, row_idx, col_idx):
	        # Build the CSS declaration string for a single cell.
	        css = []
	        # A NaN rank (missing value) compares False here, so it never gets a medal.
	        if rank <= 3:
	            css.append(f"background-color: {rank_colors[int(rank)]}")
	        # The masks are consulted lazily and in this order: leakage wins over baseline imputation.
	        if is_leakage_imputed.loc[row_idx, col_idx]:
	            css.append(f"color: {COLORS['leakage_impute']}")
	        elif is_baseline_imputed.loc[row_idx, col_idx]:
	            css.append(f"color: {COLORS['failure_impute']}")
	        elif not css:
	            css.append(f"color: {COLORS['text_default']}")
	        return "; ".join(css)

	    def build_styles(data):
	        # Produce a frame of CSS strings with the same labels as ``data``.
	        rows = []
	        for row_idx in data.index:
	            row_ranks = data.loc[row_idx].rank(method="min")
	            rows.append([cell_css(row_ranks[col_idx], row_idx, col_idx) for col_idx in data.columns])
	        return pd.DataFrame(rows, index=data.index, columns=data.columns)

	    # One table-wide apply() replaces registering a separate Styler.map() call for every cell.
	    return errors.style.apply(build_styles, axis=None).format(precision=3)

	# Show the per-task table with the styling produced by style_pivot_table.
	st.dataframe(style_pivot_table(pivot_df, baseline_imputed, leakage_imputed))

	# Closing section: a divider, the citation heading, then the two citation strings imported from src.strings.
	st.divider()
	st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
	st.markdown(CITATION_HEADER)
	st.markdown(CITATION_FEV)