File size: 5,963 Bytes
eb8267f
 
 
18a3564
eb8267f
 
 
 
 
 
 
 
 
 
 
 
 
 
18a3564
eb8267f
 
 
 
 
 
 
 
 
 
 
 
 
 
18a3564
 
eb8267f
 
 
18a3564
 
eb8267f
 
 
18a3564
 
 
 
 
eb8267f
 
 
 
 
 
 
 
 
 
 
 
18a3564
eb8267f
 
 
 
 
 
18a3564
eb8267f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18a3564
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb8267f
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent))

import pandas as pd
import streamlit as st
from streamlit.elements.lib.column_types import ColumnConfig

from src.strings import (
    CITATION_FEV,
    CITATION_HEADER,
    FEV_BENCHMARK_BASIC_INFO,
    FEV_BENCHMARK_DETAILS,
    PAIRWISE_BENCHMARK_DETAILS,
    get_pivot_legend,
)
from src.utils import (
    COLORS,
    construct_pairwise_chart,
    format_leaderboard,
    format_metric_name,
    get_metric_description,
)

# Page-level Streamlit settings: wide layout plus the browser-tab title and icon.
st.set_page_config(
    layout="wide",
    page_title="fev leaderboard",
    page_icon=":material/trophy:",
)

# Centered HTML heading rendered at the top of the main column.
TITLE = "<h1 style='text-align: center; font-size: 350%;'>fev-bench</h1>"
# Leaderboard column the rows are sorted by (in descending order).
SORT_COL = "win_rate"
# Metrics offered in the sidebar selector; the chosen value is interpolated
# into the CSV file names read from the tables/ directory.
AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]


@st.cache_data()
def get_leaderboard(metric_name: str) -> pd.DataFrame:
    """Read the leaderboard CSV for ``metric_name``.

    The file is ``tables/leaderboard_<metric_name>.csv``, resolved relative to
    the current working directory. The function is wrapped in
    ``st.cache_data``.

    Args:
        metric_name: Metric identifier used in the file name.

    Returns:
        The CSV contents as a DataFrame, exactly as parsed by ``pd.read_csv``.
    """
    csv_path = f"tables/leaderboard_{metric_name}.csv"
    return pd.read_csv(csv_path)


@st.cache_data()
def get_pairwise(metric_name: str) -> pd.DataFrame:
    """Read the pairwise-comparison CSV for ``metric_name``.

    The file is ``tables/pairwise_<metric_name>.csv``, resolved relative to
    the current working directory. The function is wrapped in
    ``st.cache_data``.

    Args:
        metric_name: Metric identifier used in the file name.

    Returns:
        The CSV contents as a DataFrame, exactly as parsed by ``pd.read_csv``.
    """
    csv_path = f"tables/pairwise_{metric_name}.csv"
    return pd.read_csv(csv_path)


@st.cache_data()
def get_pivot_table(metric_name: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the per-task pivot table and its two companion tables.

    Three CSV files under ``tables/`` (relative to the current working
    directory) are read in this order: the pivot table itself, the
    ``_baseline_imputed`` table, and the ``_leakage_imputed`` table.

    Args:
        metric_name: Metric identifier used in the file names.

    Returns:
        ``(pivot, baseline_imputed, leakage_imputed)`` as DataFrames.
        NOTE(review): the page treats the two companion tables as per-cell
        truthy flags; their exact schema is not visible here.
    """
    csv_paths = (
        f"tables/pivot_{metric_name}.csv",
        f"tables/pivot_{metric_name}_baseline_imputed.csv",
        f"tables/pivot_{metric_name}_leakage_imputed.csv",
    )
    pivot, baseline_flags, leakage_flags = (pd.read_csv(path) for path in csv_paths)
    return pivot, baseline_flags, leakage_flags


# Sidebar: metric selector followed by a caption describing the chosen metric.
with st.sidebar:
    selected_metric = st.selectbox(
        "Evaluation Metric",
        options=AVAILABLE_METRICS,
        format_func=format_metric_name,
    )
    metric_description = get_metric_description(selected_metric)
    st.caption(metric_description)

# Three columns with narrow outer ones; the page content goes in the middle.
cols = st.columns(spec=[0.025, 0.95, 0.025])

with cols[1] as main_container:
    st.markdown(TITLE, unsafe_allow_html=True)

    # Load the tables for the metric picked in the sidebar; leaderboard rows
    # are sorted by SORT_COL in descending order.
    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
    pairwise_df = get_pairwise(selected_metric)

    # --- Leaderboard section ---
    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
    st.markdown(FEV_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
    df_styled = format_leaderboard(metric_df)
    st.dataframe(
        df_styled,
        width="stretch",
        hide_index=True,
        # Display labels, number formats and alignment for each leaderboard column.
        column_config={
            "model_name": ColumnConfig(label="Model Name", alignment="left"),
            "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
            "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
            "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
            "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
            "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
            "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
            "org": ColumnConfig(label="Organization", alignment="left"),
            # Link emoji (U+1F517) written as an escape: the previous literal was the
            # emoji's UTF-8 bytes mis-decoded into four unrelated characters.
            "link": st.column_config.LinkColumn(label="Link", display_text="\U0001F517"),
        },
    )

    with st.expander("See details"):
        st.markdown(FEV_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # --- Pairwise comparison section: two charts side by side with a spacer ---
    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])

    with chart_col_1:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
            use_container_width=True,
        )

    with chart_col_2:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
            use_container_width=True,
        )

    with st.expander("See details"):
        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # --- Per-task results section ---
    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
    with st.expander("Show detailed results"):
        st.markdown(get_pivot_legend("Seasonal Naive", "Chronos-Bolt"), unsafe_allow_html=True)
        pivot_df, baseline_imputed, leakage_imputed = get_pivot_table(selected_metric)
        # Index all three tables by task name so that cells can be addressed
        # with the same (task, column) labels.
        pivot_df = pivot_df.set_index("Task name")
        baseline_imputed = baseline_imputed.set_index("Task name")
        leakage_imputed = leakage_imputed.set_index("Task name")

        def style_pivot_table(errors, is_baseline_imputed, is_leakage_imputed):
            """Return a Styler for ``errors`` that highlights top ranks and flagged cells.

            Values are ranked within each row in ascending order using
            ``method="min"`` (ties share the lowest rank). Ranks 1, 2 and 3 get
            the gold, silver and bronze background from ``COLORS``. The text
            color is chosen per cell: the leakage-impute color when the cell is
            truthy in ``is_leakage_imputed``, otherwise the failure-impute
            color when it is truthy in ``is_baseline_imputed``, otherwise the
            default text color, which is applied only when the cell has no
            background highlight.

            Args:
                errors: DataFrame of values to display and rank row by row.
                    Presumably one row per task and one column per model;
                    confirm against the CSV schema.
                is_baseline_imputed: DataFrame addressable with the same
                    row/column labels as ``errors``; truthy cells get the
                    failure-impute text color.
                is_leakage_imputed: DataFrame addressable with the same
                    row/column labels as ``errors``; truthy cells get the
                    leakage-impute text color and take precedence.

            Returns:
                A pandas ``Styler`` over ``errors`` with the CSS applied and
                values formatted with a precision of 3.
            """
            rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}

            # Collect one CSS string per cell so the Styler needs a single
            # table-wide function instead of one registered function per cell.
            css = pd.DataFrame("", index=errors.index, columns=errors.columns)
            for row_idx in errors.index:
                row_ranks = errors.loc[row_idx].rank(method="min")
                for col_idx in errors.columns:
                    rank = row_ranks[col_idx]
                    style_parts = []
                    # NaN ranks (missing values) compare False and get no background.
                    if rank <= 3:
                        style_parts.append(f"background-color: {rank_colors[int(rank)]}")
                    if is_leakage_imputed.loc[row_idx, col_idx]:
                        style_parts.append(f"color: {COLORS['leakage_impute']}")
                    elif is_baseline_imputed.loc[row_idx, col_idx]:
                        style_parts.append(f"color: {COLORS['failure_impute']}")
                    elif not style_parts:
                        style_parts.append(f"color: {COLORS['text_default']}")
                    css.at[row_idx, col_idx] = "; ".join(style_parts)

            # axis=None hands the whole table to the function, which returns the
            # label-aligned frame of CSS strings built above.
            return errors.style.apply(lambda _: css, axis=None).format(precision=3)

        st.dataframe(style_pivot_table(pivot_df, baseline_imputed, leakage_imputed))

    st.divider()
    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
    st.markdown(CITATION_HEADER)
    st.markdown(CITATION_FEV)