Spaces:
Runtime error
Runtime error
Yotam-Perlitz
committed on
Commit
·
765f7ba
1
Parent(s):
a50e6f5
revise text
Browse files
Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>
app.py
CHANGED
|
@@ -26,11 +26,12 @@ st.markdown(
|
|
| 26 |
)
|
| 27 |
|
| 28 |
st.markdown(
|
| 29 |
-
"
|
| 30 |
-
|
|
|
|
|
|
|
| 31 |
)
|
| 32 |
|
| 33 |
-
|
| 34 |
all_scenarios_for_aggragate = Benchmark()
|
| 35 |
all_scenarios_for_aggragate.load_local_catalog()
|
| 36 |
all_scenarios_for_aggragate = (
|
|
@@ -128,8 +129,14 @@ with st.expander("Add your benchmarks here!", icon="🔥"):
|
|
| 128 |
overlap_models = set(aggregate_models).intersection(uploaded_models)
|
| 129 |
if len(overlap_models) < n_models_taken_list[0]:
|
| 130 |
st.warning(
|
| 131 |
-
f"You have just {len(overlap_models)} models intersecting with the aggregate
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
)
|
| 134 |
|
| 135 |
|
|
@@ -191,7 +198,7 @@ def run_load(
|
|
| 191 |
scenario_whitelist=aggregate_scenario_whitelist,
|
| 192 |
min_scenario_for_models_to_appear_in_agg=1
|
| 193 |
if len(aggregate_scenario_whitelist) == 1
|
| 194 |
-
else len(aggregate_scenario_whitelist) //
|
| 195 |
)
|
| 196 |
|
| 197 |
allbench.extend(my_benchmark)
|
|
|
|
| 26 |
)
|
| 27 |
|
| 28 |
st.markdown(
|
| 29 |
+
"""
|
| 30 |
+
This leaderboard, featured in our work -- [Benchmark Agreement Testing Done Right: A Guide for LLM Benchmark Evaluation](https://arxiv.org/abs/2407.13696),
|
| 31 |
+
serves as a meta-benchmark. It ranks individual benchmarks based on their agreement with an aggregated reference benchmark, which harnesses insights from numerous diverse benchmarks.
|
| 32 |
+
"""
|
| 33 |
)
|
| 34 |
|
|
|
|
| 35 |
all_scenarios_for_aggragate = Benchmark()
|
| 36 |
all_scenarios_for_aggragate.load_local_catalog()
|
| 37 |
all_scenarios_for_aggragate = (
|
|
|
|
| 129 |
overlap_models = set(aggregate_models).intersection(uploaded_models)
|
| 130 |
if len(overlap_models) < n_models_taken_list[0]:
|
| 131 |
st.warning(
|
| 132 |
+
f"You have just {len(overlap_models)} models intersecting with the aggregate!\n"
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
st.info(
|
| 136 |
+
f"Here are some models you could run your benchmark over:{[m for m in aggregate_models if m not in uploaded_models]}"
|
| 137 |
+
)
|
| 138 |
+
st.info(
|
| 139 |
+
f"Model that you have and the aggragate does not: {[m for m in uploaded_models if m not in aggregate_models]}"
|
| 140 |
)
|
| 141 |
|
| 142 |
|
|
|
|
| 198 |
scenario_whitelist=aggregate_scenario_whitelist,
|
| 199 |
min_scenario_for_models_to_appear_in_agg=1
|
| 200 |
if len(aggregate_scenario_whitelist) == 1
|
| 201 |
+
else len(aggregate_scenario_whitelist) // 3,
|
| 202 |
)
|
| 203 |
|
| 204 |
allbench.extend(my_benchmark)
|