benchbench

Runtime error

App Files Community

Yotam-Perlitz committed on Sep 3, 2024

Commit

1e20a46

1 Parent(s): 3ce2cf9

update example file

Browse files

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

Files changed (2) hide show

app.py +84 -274
assets/{mybench.csv → mybench_240901.csv} +0 -0

app.py CHANGED Viewed

@@ -7,221 +7,15 @@ import streamlit as st
 from bat import Benchmark, Config, Reporter, Tester
-def get_nice_benchmark_name(bench_name):
-    prettified_names = {
-        "holmes": "Holmes",
-        "helm_lite_narrativeqa": "Helm Lite NarrativeQA",
-        "helm_lite_naturalquestionsopen": "Helm Lite NaturalQuestionsOpen",
-        "helm_lite_naturalquestionsclosed": "Helm Lite NaturalQuestionsClosed",
-        "helm_lite_openbookqa": "Helm Lite OpenBookQA",
-        "helm_lite_mmlu": "Helm Lite MMLU",
-        "helm_lite_math_equivalentcot": "Helm Lite MathEquivalentCOT",
-        "helm_lite_gsm8k": "Helm Lite GSM8K",
-        "helm_lite_legalbench": "Helm Lite LegalBench",
-        "helm_lite_medqa": "Helm Lite MedQA",
-        "helm_lite_wmt2014": "Helm Lite WMT2014",
-        "hfv2_bbh": "HFv2 BBH",
-        "hfv2_bbh_raw": "HFv2 BBH Raw",
-        "hfv2_gpqa": "HFv2 GPQA",
-        "hfv2_ifeval": "HFv2 IFEval",
-        "hfv2_math_lvl_5": "HFv2 Math Level 5",
-        "hfv2_mmlu_pro": "HFv2 MMLU Pro",
-        "hfv2_musr": "HFv2 MuSR",
-        "oc_mmlu": "OpenCompass MMLU",
-        "oc_mmlu_pro": "OpenCompass MMLU Pro",
-        "oc_cmmlu": "OpenCompass CMMLU",
-        "oc_bbh": "OpenCompass BBH",
-        "oc_gqpa_dimand": "OpenCompass GQPA-Dimand",
-        "oc_humaneval": "OpenCompass HumanEval",
-        "oc_ifeval": "OpenCompass IFEval",
-        "helm_mmlu": "Helm MMLU",
-        "helm_boolq": "Helm BoolQ",
-        "helm_narrativeqa": "Helm NarrativeQA",
-        "helm_naturalquestionsclosed": "Helm NaturalQuestionsClosed",
-        "helm_naturalquestionsopen": "Helm NaturalQuestionsOpen",
-        "helm_quac": "Helm QuAC",
-        "helm_openbookqa": "Helm OpenBookQA",
-        "helm_imdb": "Helm IMDB",
-        "helm_civilcomments": "Helm CivilComments",
-        "helm_raft": "Helm RAFT",
-        "mmlu_pro": "MMLU Pro",
-        "mixeval_triviaqa": "MixEval TriviaQA",
-        "mixeval_mmlu": "MixEval MMLU",
-        "mixeval_drop": "MixEval DROP",
-        "mixeval_hellaswag": "MixEval HellaSwag",
-        "mixeval_commonsenseqa": "MixEval CommonsenseQA",
-        "mixeval_triviaqa_hard": "MixEval TriviaQA Hard",
-        "mixeval_mmlu_hard": "MixEval MMLU Hard",
-        "mixeval_drop_hard": "MixEval DROP Hard",
-        "oc_language": "OpenCompass Language",
-        "oc_knowledge": "OpenCompass Knowledge",
-        "oc_reasoning": "OpenCompass Reasoning",
-        "oc_math": "OpenCompass Math",
-        "oc_code": "OpenCompass Code",
-        "oc_instruct": "OpenCompass Instruction",
-        "oc_agent": "OpenCompass Agent",
-        "oc_arena": "OpenCompass Arena",
-        "lb_reasoning": "LiveBench Reasoning",
-        "lb_coding": "LiveBench Coding",
-        "lb_mathematics": "LiveBench Mathematics",
-        "lb_data_analysis": "LiveBench Data Analysis",
-        "lb_language": "LiveBench Language",
-        "lb_if": "LiveBench Instruction Following",
-        "wb_info_seek": "WildBench Information Seeking",
-        "wb_creative": "WildBench Creative",
-        "wb_code_debug": "WildBench Code Debugging",
-        "wb_math_data": "WildBench Math & Data",
-        "wb_reason_plan": "WildBench Reasoning & Planning",
-        "wb_score": "WildBench Score",
-        "hfv1_arc": "HFv1 ARC",
-        "hfv1_gsm8k": "HFv1 GSM8K",
-        "hfv1_hellaswag": "HFv1 HellaSwag",
-        "hfv1_mmlu": "HFv1 MMLU",
-        "hfv1_truthfulqa": "HFv1 TruthfulQA",
-        "hfv1_winogrande": "HFv1 Winogrande",
-        "biggen_grounding": "BigBench Grounding",
-        "biggen_instruction_following": "BigBench Instruction Following",
-        "biggen_planning": "BigBench Planning",
-        "biggen_reasoning": "BigBench Reasoning",
-        "biggen_refinement": "BigBench Refinement",
-        "biggen_safety": "BigBench Safety",
-        "biggen_theory_of_mind": "BigBench Theory of Mind",
-        "biggen_tool_usage": "BigBench Tool Usage",
-        "biggen_multilingual": "BigBench Multilingual",
-        "lb_reasoning_average": "LiveBench Reasoning Average",
-        "lb_coding_average": "LiveBench Coding Average",
-        "lb_mathematics_average": "LiveBench Mathematics Average",
-        "lb_data_analysis_average": "LiveBench Data Analysis Average",
-        "lb_language_average": "LiveBench Language Average",
-        "lb_if_average": "LiveBench Instruction Following Average",
-        "helm_lite": "Helm Lite",
-        "hf_open_llm_v2": "HF OpenLLM v2",
-        "opencompass_academic": "OpenCompass Academic",
-        "arena_elo": "Arena Elo",
-        "helm_classic": "Helm Classic",
-        "mixeval": "MixEval",
-        "mixeval_hard": "MixEval Hard",
-        "opencompass": "OpenCompass",
-        "alphacaeval_v2lc": "AlphacaEval v2lc",
-        "livebench_240725": "LiveBench 240725",
-        "wb_elo_lc": "WildBench Elo LC",
-        "arena_hard": "Arena Hard",
-        "agentbench": "AgentBench",
-        "hf_open_llm_v1": "HF OpenLLM v1",
-        "biggen": "BigBench",
-        "livebench_240624": "LiveBench 240624",
-        "mt_bench": "MT-Bench",
-    }
-    if bench_name in prettified_names:
-        return prettified_names[bench_name]
-    else:
-        return bench_name
 holistic_scenarios = [
-    get_nice_benchmark_name(scen)
-    for scen in [
-        # "holmes",
-        "helm_lite",
-        # "narrativeqa",
-        # "naturalquestionsopen",
-        # "naturalquestionsclosed",
-        # "openbookqa",
-        # "mmlu",
-        # "math_equivalentcot",
-        # "gsm8k",
-        # "legalbench",
-        # "medqa",
-        # "wmt2014",
-        # "arc_c",
-        # "arc_e",
-        # "boolq",
-        # "csqa",
-        # "hellaswag",
-        # "piqa",
-        # "siqa",
-        # "winogrande",
-        # "olmes_average",
-        # "bbh",
-        # "bbh_raw",
-        # "gpqa",
-        "hf_open_llm_v2",
-        # "ifeval",
-        # "math_lvl_5",
-        # "mmlu_pro",
-        # "musr",
-        "opencompass_academic",
-        # "oc_mmlu",
-        # "oc_mmlu_pro",
-        # "oc_cmmlu",
-        # "oc_bbh",
-        # "oc_gqpa_dimand",
-        # "oc_math",
-        # "oc_humaneval",
-        # "oc_ifeval",
-        # "helm_mmlu",
-        "arena_elo",
-        "helm_classic",
-        # "quac",
-        # "truthfulqa",
-        # "ms_marcoregular",
-        # "ms_marcotrec",
-        # "cnn/dailymail",
-        # "xsum",
-        # "imdb",
-        # "civilcomments",
-        # "raft",
-        "mixeval_hard",
-        "mixeval",
-        # "arena_elo0527",
-        "opencompass",
-        # "oc_language",
-        # "oc_knowledge",
-        # "oc_reasoning",
-        # "oc_code",
-        # "oc_instruct",
-        # "oc_agent",
-        # "oc_arena",
-        "alphacaeval_v2lc",
-        "livebench_240725",
-        "livebench_240624",
-        # "lb_reasoning",
-        # "lb_coding",
-        # "lb_mathematics",
-        # "lb_data_analysis",
-        # "lb_language",
-        # "lb_if",
-        "wb_elo_lc",
-        # "wb_info_seek",
-        # "wb_creative",
-        # "wb_code_debug",
-        # "wb_math_data",
-        # "wb_reason_plan",
-        # "wb_score",
-        # "boolqmixed",
-        "arena_hard",
-        "agentbench",
-        # "arc",
-        "hf_open_llm_v1",
-        "biggen",
-        # "biggen_grounding",
-        # "biggen_instruction_following",
-        # "biggen_planning",
-        # "biggen_reasoning",
-        # "biggen_refinement",
-        # "biggen_safety",
-        # "biggen_theory_of_mind",
-        # "biggen_tool_usage",
-        # "biggen_multilingual",
-        # "lb_global_average",
-        # "lb_reasoning_average",
-        # "lb_coding_average",
-        # "lb_mathematics_average",
-        # "lb_data_analysis_average",
-        # "lb_language_average",
-        # "lb_if_average",
-    ]
 ]
@@ -245,30 +39,31 @@ all_scenarios_for_aggragate = (
 st.subheader("The Leaderboard", divider=True)
 # st.subheader("🏋️‍♂️ BenchBench Leaderboard 🏋", divider=True)
-leftcol, rightcol = st.columns([2, 1])
-with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
-    with st.form("my_form"):
-        all_scenarios_for_aggragate_with_all = [
-            get_nice_benchmark_name(scenario)
-            for scenario in all_scenarios_for_aggragate
-        ]
-        aggragate_scenarios = st.multiselect(
-            "Scenarios in Aggregate (defualts are the 'Holistic' benchmarks)",
-            all_scenarios_for_aggragate,
-            holistic_scenarios,
-        )
         corr_type = st.selectbox(
             label="Select Correlation type", options=["kendall", "pearson"], index=0
         )
-        aggragate_scenario_blacklist = [
-            scen
-            for scen in all_scenarios_for_aggragate
-            if scen not in aggragate_scenarios
-        ]
         model_select_strategy = st.selectbox(
             label="Select strategy",
@@ -289,23 +84,25 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
         submitted = st.form_submit_button(label="Run BAT")
-uploaded_file = st.file_uploader("add your benchmark as a CSV")
-st.download_button(
-    label="Download example CSV",
-    data=pd.read_csv("assets/mybench.csv").to_csv(index=False).encode("utf-8"),
-    file_name="mybench.csv",
-    mime="text/csv",
-)
-my_benchmark = Benchmark()
-if uploaded_file is not None:
-    df = pd.read_csv(uploaded_file)
-    my_benchmark.assign_df(df, data_source="Uploaded Benchmark")
 def run_load(
-    aggragate_scenario_blacklist=[],
     n_models_taken_list=[5],
     model_select_strategy_list=["random"],
     corr_types=["kendall"],
@@ -315,7 +112,7 @@ def run_load(
 ):
     # Create a hash of the inputs to generate a unique cache file for each set of inputs
     input_str = (
-        str(aggragate_scenario_blacklist)
         + str(n_models_taken_list)
         + str(model_select_strategy_list)
         + str(corr_types)
@@ -358,25 +155,30 @@ def run_load(
             n_exps=n_exps if n_models_taken_list != [0] else 1,
         )
-        holistic = Benchmark()
-        holistic.load_local_catalog()
-        holistic.df = holistic.df.query("scenario in @holistic_scenarios")
-        holistic.clear_repeated_scenarios()
-        holistic.add_aggragete(
             new_col_name="aggregate",
-            agg_source_name="holistic",
-            scenario_blacklist=aggragate_scenario_blacklist,
-            min_scenario_for_models_to_appear_in_agg=5,
         )
-        aggragate_scores = holistic.df.query('scenario=="aggregate"')[
             ["model", "score"]
         ].sort_values(by="score", ascending=False)
-        allbench = Benchmark()
-        allbench.load_local_catalog()
         # allbench.df = allbench.df[~allbench.df["source"].str.contains("livebench")]
         allbench.extend(my_benchmark)
@@ -384,8 +186,8 @@ def run_load(
         allbench.clear_repeated_scenarios()
         # removing and adding the holistic scenarios
-        allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
-        allbench = allbench.extend(holistic)
         tester = Tester(cfg=cfg)
@@ -403,7 +205,7 @@ def run_load(
 agreements, aggragare_score_df = run_load(
-    aggragate_scenario_blacklist=aggragate_scenario_blacklist,
     n_models_taken_list=n_models_taken_list,
     model_select_strategy_list=[model_select_strategy],
     corr_types=[corr_type],
@@ -422,12 +224,18 @@ z_scores["corr_with_agg"] = z_scores["corr_with_agg"].round(2)
 z_scores["p_value_of_corr_with_agg"] = z_scores["p_value_of_corr_with_agg"].round(2)
 # z_scores["n_models_of_corr_with_agg"] = z_scores["n_models_of_corr_with_agg"].round(1)
-z_scores["date"] = z_scores["source"].apply(lambda x: x.split(".csv")[0].split("_")[-1])
-# print(z_scores["scenario"].unique().tolist())
-z_scores["scenario"] = z_scores["scenario"].apply(lambda x: get_nice_benchmark_name(x))
 data = (
     z_scores.rename(
         columns={
@@ -468,24 +276,26 @@ styled_data = (
         vmax=1,
     )
     .format(subset=["Z Score", corr_name, "p-value of Corr."], formatter="{:.2}")
 )
-# print(data["Benchmark"].unique().tolist())
 st.dataframe(
     data=styled_data,
-    column_order=[
-        "Benchmark",
-        "Z Score",
-        corr_name,
-        "p-value of Corr.",
-        "Snapshot Date",
-    ],
     hide_index=True,
     use_container_width=True,
     height=500,
 )
 aggragare_score_df.rename(
     columns={
         "model": "Model",
@@ -787,7 +597,7 @@ benchmarks = data["Benchmark"].unique().tolist()
 plotted_scenario = st.selectbox(
     "Choose Benchmark to plot",
     benchmarks,
-    index=benchmarks.index("Arena Elo"),
 )

 from bat import Benchmark, Config, Reporter, Tester
 holistic_scenarios = [
+    "Helm Lite",
+    "HF OpenLLM v2",
+    "OpenCompass Academic",
+    "LMSys Arena",
+    "Helm Classic",
+    "AlphacaEval v2lc",
+    "LiveBench 240725",
+    "WildBench Elo LC",
 ]
 st.subheader("The Leaderboard", divider=True)
 # st.subheader("🏋️‍♂️ BenchBench Leaderboard 🏋", divider=True)
+with st.form("my_form_0"):
+    # leftcol, rightcol = st.columns([5, 1])
+    # with leftcol:
+    aggragate_scenarios = st.multiselect(
+        "Scenarios in Aggregate (defualts are the 'Holistic' benchmarks)",
+        all_scenarios_for_aggragate,
+        holistic_scenarios,
+    )
+    # with rightcol:
+    # st.markdown("###")
+    submitted = st.form_submit_button(label="\n\nRun BAT\n\n")
+with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
+    with st.form("my_form_1"):
         corr_type = st.selectbox(
             label="Select Correlation type", options=["kendall", "pearson"], index=0
         )
+        aggragate_scenario_whitelist = aggragate_scenarios
+        # [
+        #     scen
+        #     for scen in all_scenarios_for_aggragate
+        #     if scen not in aggragate_scenarios
+        # ]
         model_select_strategy = st.selectbox(
             label="Select strategy",
         submitted = st.form_submit_button(label="Run BAT")
+with st.expander("Add your benchmarks here!", icon="🔥"):
+    uploaded_file = st.file_uploader("Add your benchmark as a CSV")
+    st.download_button(
+        label="Download example CSV",
+        data=pd.read_csv("assets/mybench_240901.csv")
+        .to_csv(index=False)
+        .encode("utf-8"),
+        file_name="mybench_240901.csv",
+        mime="text/csv",
+    )
+    my_benchmark = Benchmark()
+    if uploaded_file is not None:
+        df = pd.read_csv(uploaded_file)
+        my_benchmark.assign_df(df, data_source="Uploaded Benchmark")
 def run_load(
+    aggregate_scenario_whitelist,
     n_models_taken_list=[5],
     model_select_strategy_list=["random"],
     corr_types=["kendall"],
 ):
     # Create a hash of the inputs to generate a unique cache file for each set of inputs
     input_str = (
+        str(aggregate_scenario_whitelist)
         + str(n_models_taken_list)
         + str(model_select_strategy_list)
         + str(corr_types)
             n_exps=n_exps if n_models_taken_list != [0] else 1,
         )
+        # holistic = Benchmark()
+        # holistic.load_local_catalog()
+        # holistic.df = holistic.df.query("scenario in @holistic_scenarios")
+        # holistic.clear_repeated_scenarios()
+        # aggragate_scores = holistic.df.query('scenario=="aggregate"')[
+        #     ["model", "score"]
+        # ].sort_values(by="score", ascending=False)
+        allbench = Benchmark()
+        allbench.load_local_catalog()
+        allbench.add_aggregate(
             new_col_name="aggregate",
+            agg_source_name="aggregate",
+            scenario_whitelist=aggregate_scenario_whitelist,
+            min_scenario_for_models_to_appear_in_agg=1,
         )
+        aggragate_scores = allbench.df.query('scenario=="aggregate"')[
             ["model", "score"]
         ].sort_values(by="score", ascending=False)
         # allbench.df = allbench.df[~allbench.df["source"].str.contains("livebench")]
         allbench.extend(my_benchmark)
         allbench.clear_repeated_scenarios()
         # removing and adding the holistic scenarios
+        # allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
+        # allbench = allbench.extend(holistic)
         tester = Tester(cfg=cfg)
 agreements, aggragare_score_df = run_load(
+    aggregate_scenario_whitelist=aggragate_scenario_whitelist,
     n_models_taken_list=n_models_taken_list,
     model_select_strategy_list=[model_select_strategy],
     corr_types=[corr_type],
 z_scores["p_value_of_corr_with_agg"] = z_scores["p_value_of_corr_with_agg"].round(2)
 # z_scores["n_models_of_corr_with_agg"] = z_scores["n_models_of_corr_with_agg"].round(1)
+z_scores["date"] = z_scores["source"].apply(
+    lambda x: x.split(".csv")[0].split("_")[-1]
+    if "frozen" not in x
+    else x.split(".csv")[0].split("_")[-2]
+)
+# print(z_scores["scenario"].unique().tolist())
+# z_scores["scenario"] = z_scores["scenario"].apply(lambda x: get_nice_benchmark_name(x))
+z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
+# , format="%y%m%d"
 data = (
     z_scores.rename(
         columns={
         vmax=1,
     )
     .format(subset=["Z Score", corr_name, "p-value of Corr."], formatter="{:.2}")
+    .set_properties(**{"text-align": "center"})
 )
+cols_used = [
+    "Benchmark",
+    "Z Score",
+    corr_name,
+    "p-value of Corr.",
+    "Snapshot Date",
+]
 st.dataframe(
     data=styled_data,
+    column_order=cols_used,
     hide_index=True,
     use_container_width=True,
     height=500,
+    column_config={col: {"alignment": "center"} for col in cols_used},
 )
 aggragare_score_df.rename(
     columns={
         "model": "Model",
 plotted_scenario = st.selectbox(
     "Choose Benchmark to plot",
     benchmarks,
+    index=benchmarks.index("LMSys Arena"),
 )

assets/{mybench.csv → mybench_240901.csv} RENAMED Viewed

File without changes