import sys
from pathlib import Path

# Make the local `src` package importable regardless of the working directory
sys.path.append(str(Path(__file__).parent))

import pandas as pd
import streamlit as st

# ColumnConfig comes from an internal Streamlit module (not the public st.column_config
# namespace), so this import may need updating across Streamlit versions.
from streamlit.elements.lib.column_types import ColumnConfig

from src.strings import (
    CITATION_FEV,
    CITATION_HEADER,
    FEV_BENCHMARK_BASIC_INFO,
    FEV_BENCHMARK_DETAILS,
    PAIRWISE_BENCHMARK_DETAILS,
    get_pivot_legend,
)
from src.utils import (
    COLORS,
    construct_pairwise_chart,
    format_leaderboard,
    format_metric_name,
    get_metric_description,
)

st.set_page_config(layout="wide", page_title="fev leaderboard", page_icon=":material/trophy:")

TITLE = "<h1 style='text-align: center; font-size: 350%;'>fev-bench</h1>"
# Leaderboard is sorted by this column by default
SORT_COL = "win_rate"
# Metrics for which precomputed result tables exist under tables/
AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]


def get_leaderboard(metric_name: str) -> pd.DataFrame:
    """Load the aggregated leaderboard for the selected metric."""
    return pd.read_csv(f"tables/leaderboard_{metric_name}.csv")


def get_pairwise(metric_name: str) -> pd.DataFrame:
    """Load the pairwise model comparison table for the selected metric."""
    return pd.read_csv(f"tables/pairwise_{metric_name}.csv")


def get_pivot_table(metric_name: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load per-task scores plus the masks marking baseline- and leakage-imputed entries."""
    pivot_df = pd.read_csv(f"tables/pivot_{metric_name}.csv")
    baseline_imputed = pd.read_csv(f"tables/pivot_{metric_name}_baseline_imputed.csv")
    leakage_imputed = pd.read_csv(f"tables/pivot_{metric_name}_leakage_imputed.csv")
    return pivot_df, baseline_imputed, leakage_imputed
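
# Note: the loaders above re-read the CSV files on every Streamlit rerun. If that ever becomes
# a bottleneck, they could be wrapped with the public @st.cache_data decorator, e.g.
# (optional sketch, not part of the original app):
#
#     @st.cache_data
#     def get_leaderboard(metric_name: str) -> pd.DataFrame:
#         return pd.read_csv(f"tables/leaderboard_{metric_name}.csv")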


with st.sidebar:
    selected_metric = st.selectbox("Evaluation Metric", options=AVAILABLE_METRICS, format_func=format_metric_name)
    st.caption(get_metric_description(selected_metric))

# Narrow gutters on both sides; all page content is rendered in the wide centre column
cols = st.columns(spec=[0.025, 0.95, 0.025])

with cols[1]:
    st.markdown(TITLE, unsafe_allow_html=True)

    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
    pairwise_df = get_pairwise(selected_metric)

    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
    st.markdown(FEV_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)

    df_styled = format_leaderboard(metric_df)
    st.dataframe(
        df_styled,
        width="stretch",
        hide_index=True,
        column_config={
            "model_name": ColumnConfig(label="Model Name", alignment="left"),
            "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
            "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
            "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
            "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
            "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
            "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
            "org": ColumnConfig(label="Organization", alignment="left"),
            "link": st.column_config.LinkColumn(label="Link", display_text="🔗"),
        },
    )
    with st.expander("See details"):
        st.markdown(FEV_BENCHMARK_DETAILS, unsafe_allow_html=True)

    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])
    with chart_col_1:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
            use_container_width=True,
        )
    with chart_col_2:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
            use_container_width=True,
        )
    with st.expander("See details"):
        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
    with st.expander("Show detailed results"):
        st.markdown(get_pivot_legend("Seasonal Naive", "Chronos-Bolt"), unsafe_allow_html=True)
        pivot_df, baseline_imputed, leakage_imputed = get_pivot_table(selected_metric)
        pivot_df = pivot_df.set_index("Task name")
        baseline_imputed = baseline_imputed.set_index("Task name")
        leakage_imputed = leakage_imputed.set_index("Task name")
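        # All three frames share the same "Task name" index and model columns: pivot_df holds
        # the scores, while the other two are boolean masks marking imputed cells.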

        def style_pivot_table(errors, is_baseline_imputed, is_leakage_imputed):
            """Highlight the top-3 models per task and colour-code imputed scores."""
            rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}

            def highlight_by_position(styler):
                for row_idx in errors.index:
                    # Rank models within each task; lower error means a better rank
                    row_ranks = errors.loc[row_idx].rank(method="min")
                    for col_idx in errors.columns:
                        rank = row_ranks[col_idx]
                        style_parts = []
                        if rank <= 3:
                            style_parts.append(f"background-color: {rank_colors[rank]}")
                        # Text colour indicates how the score was obtained: imputed because of
                        # training-corpus leakage, imputed from the baseline after a failure,
                        # or computed normally
                        if is_leakage_imputed.loc[row_idx, col_idx]:
                            style_parts.append(f"color: {COLORS['leakage_impute']}")
                        elif is_baseline_imputed.loc[row_idx, col_idx]:
                            style_parts.append(f"color: {COLORS['failure_impute']}")
                        elif not style_parts:
                            style_parts.append(f"color: {COLORS['text_default']}")
                        if style_parts:
                            styler = styler.map(
                                lambda x, s="; ".join(style_parts): s,
                                subset=pd.IndexSlice[row_idx:row_idx, col_idx:col_idx],
                            )
                return styler

            return highlight_by_position(errors.style).format(precision=3)

        st.dataframe(style_pivot_table(pivot_df, baseline_imputed, leakage_imputed))

    st.divider()
    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
    st.markdown(CITATION_HEADER)
    st.markdown(CITATION_FEV)

