Terry Zhuo committed
Commit · ae7a86d
1 Parent(s): 1e748fb

fix

Browse files:
- app.py: +27 -8
- src/utils.py: +1 -2
app.py CHANGED

@@ -109,7 +109,7 @@ def select_columns(df, columns):
    return filtered_df


-def filter_items(df, leaderboard_table, query):
+def filter_types(df, leaderboard_table, query):
    if query == "all":
        return df[leaderboard_table.columns]
    else:
@@ -118,6 +118,16 @@ def filter_items(df, leaderboard_table, query):
    return filtered_df[leaderboard_table.columns]


+def filter_direct_complete(df, leaderboard_table, query):
+    if query == "all":
+        return df[leaderboard_table.columns]
+
+    if query == "chat template":
+        return df[~df["direct_complete"]][leaderboard_table.columns]
+    else:
+        return df[df["direct_complete"]][leaderboard_table.columns]
+
+
def search_table(df, leaderboard_table, query):
    filtered_df = df[(df["model"].str.contains(query, case=False))]
    return filtered_df[leaderboard_table.columns]
@@ -174,13 +184,18 @@ with demo:
        show_label=False,
        elem_id="search-bar",
    )
-
+    filter_types_columns = gr.Radio(
        label="⏚ Filter model types",
-        choices=["all", "🟢 base", "🔶 instruction-tuned", "EXT external-evaluation"],
+        choices=["all", "🟢 base", "🔶 instruction-tuned"], #, "EXT external-evaluation"],
        value="all",
        elem_id="filter-columns",
    )
-
+    filter_prompting_columns = gr.Radio(
+        label="⏚ Filter prompting",
+        choices=["all", "chat template", "direct complete"],
+        value="all",
+        elem_id="filter-direct-complete",
+    )
    leaderboard_df = gr.components.Dataframe(
        value=df[
            [
@@ -210,9 +225,14 @@ with demo:
        [hidden_leaderboard_df, leaderboard_df, search_bar],
        leaderboard_df,
    )
-
-
-        [hidden_leaderboard_df, leaderboard_df,
+    filter_types_columns.change(
+        filter_types,
+        [hidden_leaderboard_df, leaderboard_df, filter_types_columns],
+        leaderboard_df,
+    )
+    filter_prompting_columns.change(
+        filter_direct_complete,
+        [hidden_leaderboard_df, leaderboard_df, filter_prompting_columns],
        leaderboard_df,
    )
    shown_columns.change(
@@ -229,7 +249,6 @@ with demo:
    - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
    - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is boostrapped 500 times.
    - `size` is the amount of activated model weight during inference.
-    - Some instruction-tuned models are marked with 🟢 symbol, as they miss the chat templates in their tokenizer configurations.
    - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
    - For more details check the 📝 About section.
    - Models with a 🔴 symbol represent external evaluation submission, this means that we didn't verify the results, you can find the author's submission under `Submission PR` field from `See All Columns` tab.
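The new `filter_prompting_columns` radio drives `filter_direct_complete`, which partitions the leaderboard on a boolean `direct_complete` column: rows evaluated through the model's chat template versus rows evaluated by direct code completion. When the radio fires, Gradio passes the current values of the three inputs (`hidden_leaderboard_df`, the full data; `leaderboard_df`, the displayed table; the radio's selection) as the function's arguments. Below is a minimal sketch of the filter outside Gradio, assuming `direct_complete` is a boolean column; the sample frame and its values are made up for illustration.

import pandas as pd

def filter_direct_complete(df, leaderboard_table, query):
    # "all" keeps every row of the full (hidden) frame.
    if query == "all":
        return df[leaderboard_table.columns]
    if query == "chat template":
        # Chat-template rows are those NOT evaluated via direct completion.
        return df[~df["direct_complete"]][leaderboard_table.columns]
    # The remaining choice, "direct complete", keeps only the masked rows.
    return df[df["direct_complete"]][leaderboard_table.columns]

# Hypothetical data exercising all three branches.
full = pd.DataFrame({
    "model": ["m1", "m2"],
    "complete": [51.2, 47.9],
    "direct_complete": [False, True],
})
shown = full[["model", "complete"]]  # stands in for the displayed table
print(filter_direct_complete(full, shown, "chat template"))    # keeps m1
print(filter_direct_complete(full, shown, "direct complete"))  # keeps m2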
src/utils.py CHANGED

@@ -24,12 +24,11 @@ def fields(raw_class):
class AutoEvalColumn: # Auto evals column
    model_type_symbol = ColumnContent("type", "str", True)
    model = ColumnContent("model", "markdown", True)
-    size = ColumnContent("size", "number", False)
    complete_score = ColumnContent("complete", "number", True)
    instruct_score = ColumnContent("instruct", "number", True)
    elo_mle = ColumnContent("elo_mle", "number", True)
    dummy = ColumnContent("model", "str", True)
-
+    size = ColumnContent("size", "number", False)


def model_hyperlink(link, model_name):
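The src/utils.py change only moves `size` from the middle of `AutoEvalColumn` to the end, after `dummy`. Given that the hunk's context is `def fields(raw_class)`, the plausible effect is column ordering: if `fields` walks class attributes in definition order, relocating the attribute relocates the column in the table. The sketch below illustrates that mechanism; the `ColumnContent` shape and this `fields` body are assumptions inferred from the visible call sites, not the repository's actual definitions.

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str                    # column key in the dataframe
    type: str                    # render type: "str", "markdown", "number", ...
    displayed_by_default: bool   # whether the column starts visible

def fields(raw_class):
    # A class's __dict__ preserves definition order (Python 3.7+), so
    # reordering an attribute in AutoEvalColumn reorders the columns.
    return [v for v in raw_class.__dict__.values() if isinstance(v, ColumnContent)]

class AutoEvalColumn:  # Auto evals column
    model_type_symbol = ColumnContent("type", "str", True)
    model = ColumnContent("model", "markdown", True)
    complete_score = ColumnContent("complete", "number", True)
    size = ColumnContent("size", "number", False)  # now listed last

print([c.name for c in fields(AutoEvalColumn)])
# -> ['type', 'model', 'complete', 'size']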