benchbench

Runtime error

Yotam-Perlitz committed on Sep 6, 2024

Commit

765f7ba

1 Parent(s): a50e6f5

revise text

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

Files changed (1) hide show

app.py CHANGED Viewed

@@ -26,11 +26,12 @@ st.markdown(
 )
 st.markdown(
-    "We are excited to share the BenchBench-Leaderboard, a crucial component of our comprehensive research work -- [Benchmark Agreement Testing Done Right: A Guide for LLM Benchmark Evaluation](https://arxiv.org/abs/2407.13696). "
-    "This leaderboard is a meta-benchmark that ranks benchmarks based on their agreement with the crowd harnessing many different references. "
 )
 all_scenarios_for_aggragate = Benchmark()
 all_scenarios_for_aggragate.load_local_catalog()
 all_scenarios_for_aggragate = (
@@ -128,8 +129,14 @@ with st.expander("Add your benchmarks here!", icon="🔥"):
         overlap_models = set(aggregate_models).intersection(uploaded_models)
         if len(overlap_models) < n_models_taken_list[0]:
             st.warning(
-                f"You have just {len(overlap_models)} models intersecting with the aggregate!"
-                f"Here are some models you should run your benchmark over:{aggregate_models}"
             )
@@ -191,7 +198,7 @@ def run_load(
             scenario_whitelist=aggregate_scenario_whitelist,
             min_scenario_for_models_to_appear_in_agg=1
             if len(aggregate_scenario_whitelist) == 1
-            else len(aggregate_scenario_whitelist) // 2,
         )
         allbench.extend(my_benchmark)

 )
 st.markdown(
+    """
+    This leaderboard, featured in our work -- [Benchmark Agreement Testing Done Right: A Guide for LLM Benchmark Evaluation](https://arxiv.org/abs/2407.13696),
+    serves as a meta-benchmark. It ranks individual benchmarks based on their agreement with an aggregated reference benchmark, which harnesses insights from numerous diverse benchmarks.
+    """
 )
 all_scenarios_for_aggragate = Benchmark()
 all_scenarios_for_aggragate.load_local_catalog()
 all_scenarios_for_aggragate = (
         overlap_models = set(aggregate_models).intersection(uploaded_models)
         if len(overlap_models) < n_models_taken_list[0]:
             st.warning(
+                f"You have just {len(overlap_models)} models intersecting with the aggregate!\n"
+            )
+            st.info(
+                f"Here are some models you could run your benchmark over:{[m for m in aggregate_models if m not in uploaded_models]}"
+            )
+            st.info(
+                f"Model that you have and the aggragate does not: {[m for m in uploaded_models if m not in aggregate_models]}"
             )
             scenario_whitelist=aggregate_scenario_whitelist,
             min_scenario_for_models_to_appear_in_agg=1
             if len(aggregate_scenario_whitelist) == 1
+            else len(aggregate_scenario_whitelist) // 3,
         )
         allbench.extend(my_benchmark)