Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit · 727eb6f
1 Parent(s): f477fda
added Validated tab
Browse files
- app.py +24 -6
- dabstep_benchmark/leaderboard.py +5 -2
app.py
CHANGED
|
@@ -18,13 +18,31 @@ if __name__ == "__main__":
|
|
| 18 |
with demo:
|
| 19 |
gr.Markdown(TITLE)
|
| 20 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
)
|
| 29 |
# create a Gradio event listener that runs when the page is loaded to populate the dataframe
|
| 30 |
demo.load(lambda: generate_leaderboard_df(), None, leaderboard_table)
|
|
|
|
| 18 |
with demo:
|
| 19 |
gr.Markdown(TITLE)
|
| 20 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 21 |
+
|
| 22 |
+
# Generate leaderboard data once
|
| 23 |
+
leaderboard_df = generate_leaderboard_df()
|
| 24 |
|
| 25 |
+
# Filter validated and unvalidated
|
| 26 |
+
validated = leaderboard_df[leaderboard_df["validated"] == True].drop(columns=["validated"]).copy()
|
| 27 |
+
unvalidated = leaderboard_df[leaderboard_df["validated"] == False].drop(columns=["validated"]).copy()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
with gr.Tab("Validated"):
|
| 31 |
+
leaderboard_table = gr.components.Dataframe(
|
| 32 |
+
value=validated,
|
| 33 |
+
datatype=["markdown", "str", "str", "str", "markdown", "str", "str", "str"],
|
| 34 |
+
interactive=False,
|
| 35 |
+
column_widths=["20%"],
|
| 36 |
+
wrap=True,
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
with gr.Tab("Unvalidated"):
|
| 40 |
+
leaderboard_table = gr.components.Dataframe(
|
| 41 |
+
value=unvalidated,
|
| 42 |
+
datatype=["markdown", "str", "str", "str", "markdown", "str", "str", "str"],
|
| 43 |
+
interactive=False,
|
| 44 |
+
column_widths=["20%"],
|
| 45 |
+
wrap=True,
|
| 46 |
)
|
| 47 |
# create a Gradio event listener that runs when the page is loaded to populate the dataframe
|
| 48 |
demo.load(lambda: generate_leaderboard_df(), None, leaderboard_table)
|
dabstep_benchmark/leaderboard.py
CHANGED
|
@@ -154,6 +154,7 @@ def process_submission(
|
|
| 154 |
submission_df["organisation"] = f"{organisation} | user {profile.username}"
|
| 155 |
submission_df["repo_url"] = repo_url
|
| 156 |
submission_df["date"] = datetime.date.today().strftime("%d-%m-%Y")
|
|
|
|
| 157 |
|
| 158 |
# add empty reasoning trace if one is not provided to not break schema of datasets
|
| 159 |
if "reasoning_trace" not in submission_df.columns:
|
|
@@ -242,7 +243,8 @@ def generate_leaderboard_df() -> pd.DataFrame:
|
|
| 242 |
"model_family",
|
| 243 |
"organisation",
|
| 244 |
"repo_url",
|
| 245 |
-
"date"
|
|
|
|
| 246 |
]
|
| 247 |
]
|
| 248 |
)
|
|
@@ -288,7 +290,8 @@ def generate_leaderboard_df() -> pd.DataFrame:
|
|
| 288 |
"organisation": "Organization",
|
| 289 |
"repo_url": "Repo URL",
|
| 290 |
"model_family": "Model Family",
|
| 291 |
-
"date": "Date"
|
|
|
|
| 292 |
}
|
| 293 |
col_order = [new_col_name for new_col_name in col_map.values()]
|
| 294 |
leaderboard_df.rename(columns=col_map, inplace=True)
|
|
|
|
| 154 |
submission_df["organisation"] = f"{organisation} | user {profile.username}"
|
| 155 |
submission_df["repo_url"] = repo_url
|
| 156 |
submission_df["date"] = datetime.date.today().strftime("%d-%m-%Y")
|
| 157 |
+
submission_df["validated"] = False #unvalidated by default
|
| 158 |
|
| 159 |
# add empty reasoning trace if one is not provided to not break schema of datasets
|
| 160 |
if "reasoning_trace" not in submission_df.columns:
|
|
|
|
| 243 |
"model_family",
|
| 244 |
"organisation",
|
| 245 |
"repo_url",
|
| 246 |
+
"date",
|
| 247 |
+
"validated"
|
| 248 |
]
|
| 249 |
]
|
| 250 |
)
|
|
|
|
| 290 |
"organisation": "Organization",
|
| 291 |
"repo_url": "Repo URL",
|
| 292 |
"model_family": "Model Family",
|
| 293 |
+
"date": "Date",
|
| 294 |
+
"validated": "validated"
|
| 295 |
}
|
| 296 |
col_order = [new_col_name for new_col_name in col_map.values()]
|
| 297 |
leaderboard_df.rename(columns=col_map, inplace=True)
|