# Hugging Face Space: autogluon/fev-leaderboard-mirror (status: Running)
# File size: 5,963 Bytes

import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent))

import pandas as pd
import streamlit as st
from streamlit.elements.lib.column_types import ColumnConfig

from src.strings import (
    CITATION_FEV,
    CITATION_HEADER,
    FEV_BENCHMARK_BASIC_INFO,
    FEV_BENCHMARK_DETAILS,
    PAIRWISE_BENCHMARK_DETAILS,
    get_pivot_legend,
)
from src.utils import (
    COLORS,
    construct_pairwise_chart,
    format_leaderboard,
    format_metric_name,
    get_metric_description,
)

# Page-level settings: wide layout, browser-tab title and icon.
st.set_page_config(
    page_title="fev leaderboard",
    page_icon=":material/trophy:",
    layout="wide",
)

# Centered HTML heading rendered at the top of the main column.
TITLE = "<h1 style='text-align: center; font-size: 350%;'>fev-bench</h1>"
# Leaderboard column the rows are ordered by.
SORT_COL = "win_rate"
# Metric identifiers offered in the sidebar; each one selects a set of CSV tables.
AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]


@st.cache_data()
def get_leaderboard(metric_name: str) -> pd.DataFrame:
    """Read the leaderboard CSV for ``metric_name``.

    The file is looked up under ``tables/`` relative to the current working
    directory. The function is wrapped in ``st.cache_data``.
    """
    csv_path = f"tables/leaderboard_{metric_name}.csv"
    return pd.read_csv(csv_path)


@st.cache_data()
def get_pairwise(metric_name: str) -> pd.DataFrame:
    """Read the pairwise-comparison CSV for ``metric_name``.

    The file is looked up under ``tables/`` relative to the current working
    directory. The function is wrapped in ``st.cache_data``.
    """
    csv_path = f"tables/pairwise_{metric_name}.csv"
    return pd.read_csv(csv_path)


@st.cache_data()
def get_pivot_table(metric_name: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the per-task pivot table and its two companion tables for ``metric_name``.

    Returns:
        A 3-tuple ``(pivot_df, baseline_imputed, leakage_imputed)`` read from
        ``tables/pivot_<metric>.csv``, ``tables/pivot_<metric>_baseline_imputed.csv``
        and ``tables/pivot_<metric>_leakage_imputed.csv`` respectively.
    """
    stem = f"tables/pivot_{metric_name}"
    # Same read order as the returned tuple: values first, then the two companion tables.
    pivot_df, baseline_imputed, leakage_imputed = (
        pd.read_csv(f"{stem}{suffix}.csv") for suffix in ("", "_baseline_imputed", "_leakage_imputed")
    )
    return pivot_df, baseline_imputed, leakage_imputed


# Sidebar: metric picker followed by a short description of the chosen metric.
selected_metric = st.sidebar.selectbox(
    "Evaluation Metric",
    options=AVAILABLE_METRICS,
    format_func=format_metric_name,
)
st.sidebar.caption(get_metric_description(selected_metric))

# Three columns; only the wide middle one is used, so the narrow outer ones stay empty.
cols = st.columns(spec=[0.025, 0.95, 0.025])

with cols[1] as main_container:
    st.markdown(TITLE, unsafe_allow_html=True)

    # Tables for the metric picked in the sidebar; leaderboard rows ordered by SORT_COL, descending.
    metric_df = get_leaderboard(selected_metric)
    metric_df = metric_df.sort_values(by=SORT_COL, ascending=False)
    pairwise_df = get_pairwise(selected_metric)

    # --- Leaderboard section ---
    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
    st.markdown(FEV_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
    df_styled = format_leaderboard(metric_df)
    number_column = st.column_config.NumberColumn
    # Display label / alignment / number format for each leaderboard column.
    leaderboard_columns = {
        "model_name": ColumnConfig(label="Model Name", alignment="left"),
        "win_rate": number_column(label="Avg. win rate (%)", format="%.1f"),
        "skill_score": number_column(label="Skill score (%)", format="%.1f"),
        "median_inference_time_s": number_column(label="Median runtime (s)", format="%.1f"),
        "training_corpus_overlap": number_column(label="Leakage (%)", format="%d"),
        "num_failures": number_column(label="Failed tasks (%)", format="%.0f"),
        "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
        "org": ColumnConfig(label="Organization", alignment="left"),
        "link": st.column_config.LinkColumn(label="Link", display_text="🔗"),
    }
    st.dataframe(df_styled, width="stretch", hide_index=True, column_config=leaderboard_columns)

    with st.expander("See details"):
        st.markdown(FEV_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # --- Pairwise comparison section: two charts side by side with a spacer column between them ---
    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])

    for chart_col, score_col in ((chart_col_1, "win_rate"), (chart_col_2, "skill_score")):
        with chart_col:
            st.altair_chart(
                construct_pairwise_chart(pairwise_df, col=score_col, metric_name=selected_metric),
                use_container_width=True,
            )

    with st.expander("See details"):
        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # --- Per-task results section ---
    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
    with st.expander("Show detailed results"):
        st.markdown(get_pivot_legend("Seasonal Naive", "Chronos-Bolt"), unsafe_allow_html=True)
        # Index all three tables by task name so they share the same row labels.
        pivot_df, baseline_imputed, leakage_imputed = (
            table.set_index("Task name") for table in get_pivot_table(selected_metric)
        )

        def style_pivot_table(errors, is_baseline_imputed, is_leakage_imputed):
            rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}

            def highlight_by_position(styler):
                for row_idx in errors.index:
                    row_ranks = errors.loc[row_idx].rank(method="min")
                    for col_idx in errors.columns:
                        rank = row_ranks[col_idx]
                        style_parts = []
                        if rank <= 3:
                            style_parts.append(f"background-color: {rank_colors[rank]}")
                        if is_leakage_imputed.loc[row_idx, col_idx]:
                            style_parts.append(f"color: {COLORS['leakage_impute']}")
                        elif is_baseline_imputed.loc[row_idx, col_idx]:
                            style_parts.append(f"color: {COLORS['failure_impute']}")
                        elif not style_parts:
                            style_parts.append(f"color: {COLORS['text_default']}")
                        if style_parts:
                            styler = styler.map(
                                lambda x, s="; ".join(style_parts): s,
                                subset=pd.IndexSlice[row_idx:row_idx, col_idx:col_idx],
                            )
                return styler

            return highlight_by_position(errors.style).format(precision=3)

        st.dataframe(style_pivot_table(pivot_df, baseline_imputed, leakage_imputed))

    st.divider()
    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
    st.markdown(CITATION_HEADER)
    st.markdown(CITATION_FEV)