Terry Zhuo committed
Commit: 14a3287
Parent(s): 5fa61d0

update w/ hard only
app.py
CHANGED
@@ -150,26 +150,26 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
 def get_latest_data_leaderboard(
-    leaderboard_initial_df = None,
+    # leaderboard_initial_df = None,
     hard_leaderboard_initial_df = None,
-    elo_task_df = None,
-    elo_bench_df = None,
+    # elo_task_df = None,
+    # elo_bench_df = None,
     hard_elo_task_df = None,
     hard_elo_bench_df = None,
-    complete_solve_df = None,
-    instruct_solve_df = None,
+    # complete_solve_df = None,
+    # instruct_solve_df = None,
     hard_complete_solve_df = None,
     hard_instruct_solve_df = None
 ):
     global NEW_DATA_ON_LEADERBOARD
-    global LEADERBOARD_DF
+    # global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
+    # global ELO_TASK_DF
+    # global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
-    global COMPLETE_SOLVE_DF
-    global INSTRUCT_SOLVE_DF
+    # global COMPLETE_SOLVE_DF
+    # global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF
 
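Context for the hunk above: `get_latest_data_leaderboard` keeps module-level caches and, after this commit, only refreshes the hard-set frames. A minimal sketch of that global-cache pattern, with stub names rather than app.py's real data, assuming nothing beyond pandas:

```python
# Hedged sketch of the pattern, not code from this commit: the function
# mutates a module-level DataFrame (read elsewhere in the app) and also
# returns it so Gradio can push it into components.
import pandas as pd

HARD_LEADERBOARD_DF = None  # stands in for app.py's module-level global

def update_hard_cache(hard_leaderboard_initial_df=None):
    global HARD_LEADERBOARD_DF
    if hard_leaderboard_initial_df is not None:
        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df  # reuse what the UI already holds
    else:
        HARD_LEADERBOARD_DF = pd.DataFrame()               # placeholder for a fresh download
    return HARD_LEADERBOARD_DF
```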
@@ -183,10 +183,10 @@ def get_latest_data_leaderboard(
         download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
         verification_mode="no_checks"
     )
-    LEADERBOARD_DF = get_leaderboard_df(
-        leaderboard_dataset=leaderboard_dataset,
-        cols=COLS,
-    )
+    # LEADERBOARD_DF = get_leaderboard_df(
+    #     leaderboard_dataset=leaderboard_dataset,
+    #     cols=COLS,
+    # )
     hard_leaderboard_dataset = datasets.load_dataset(
         HARD_RESULT_REPO,
         "default",
@@ -201,24 +201,24 @@ def get_latest_data_leaderboard(
     )
     HARD_LEADERBOARD_DF = hard_leaderboard_df
 
-    elo_task_df = datasets.load_dataset(
-        ELO_REPO,
-        "default",
-        split="task_no_tie",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-        verification_mode="no_checks"
-    ).to_pandas()
-    elo_bench_df = datasets.load_dataset(
-        ELO_REPO,
-        "default",
-        split="benchmark_tie",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-        verification_mode="no_checks"
-    ).to_pandas()
-    ELO_TASK_DF = elo_task_df
-    ELO_BENCH_DF = elo_bench_df
+    # elo_task_df = datasets.load_dataset(
+    #     ELO_REPO,
+    #     "default",
+    #     split="task_no_tie",
+    #     cache_dir=HF_HOME,
+    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+    #     verification_mode="no_checks"
+    # ).to_pandas()
+    # elo_bench_df = datasets.load_dataset(
+    #     ELO_REPO,
+    #     "default",
+    #     split="benchmark_tie",
+    #     cache_dir=HF_HOME,
+    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+    #     verification_mode="no_checks"
+    # ).to_pandas()
+    # ELO_TASK_DF = elo_task_df
+    # ELO_BENCH_DF = elo_bench_df
 
     hard_elo_task_df = datasets.load_dataset(
         HARD_ELO_REPO,
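Every frame in this function is fetched with the same `datasets` idiom, shown standalone below. The repo id and split are placeholders; the real values come from constants such as `HARD_ELO_REPO` and `HF_HOME` defined elsewhere in app.py:

```python
# Standalone sketch of the loading idiom used throughout this hunk.
import datasets

df = datasets.load_dataset(
    "user/some-results-repo",  # placeholder; app.py passes e.g. HARD_ELO_REPO
    "default",                 # dataset config name
    split="task_no_tie",
    download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # reuse the local cache
    verification_mode="no_checks",                                # skip checksum verification
).to_pandas()                  # datasets.Dataset -> pandas.DataFrame
```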
@@ -239,24 +239,24 @@ def get_latest_data_leaderboard(
     HARD_ELO_TASK_DF = hard_elo_task_df
     HARD_ELO_BENCH_DF = hard_elo_bench_df
 
-    complete_solve_df = datasets.load_dataset(
-        SOLVE_REPO,
-        "default",
-        split="complete",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-        verification_mode="no_checks"
-    ).to_pandas()
-    instruct_solve_df = datasets.load_dataset(
-        SOLVE_REPO,
-        "default",
-        split="instruct",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-        verification_mode="no_checks"
-    ).to_pandas()
-    COMPLETE_SOLVE_DF = complete_solve_df
-    INSTRUCT_SOLVE_DF = instruct_solve_df
+    # complete_solve_df = datasets.load_dataset(
+    #     SOLVE_REPO,
+    #     "default",
+    #     split="complete",
+    #     cache_dir=HF_HOME,
+    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+    #     verification_mode="no_checks"
+    # ).to_pandas()
+    # instruct_solve_df = datasets.load_dataset(
+    #     SOLVE_REPO,
+    #     "default",
+    #     split="instruct",
+    #     cache_dir=HF_HOME,
+    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+    #     verification_mode="no_checks"
+    # ).to_pandas()
+    # COMPLETE_SOLVE_DF = complete_solve_df
+    # INSTRUCT_SOLVE_DF = instruct_solve_df
 
     hard_complete_solve_df = datasets.load_dataset(
         HARD_SOLVE_REPO,
@@ -280,41 +280,41 @@ def get_latest_data_leaderboard(
         NEW_DATA_ON_LEADERBOARD = False
 
     else:
-        LEADERBOARD_DF = leaderboard_initial_df
+        # LEADERBOARD_DF = leaderboard_initial_df
         HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
-        ELO_TASK_DF = elo_task_df
-        ELO_BENCH_DF = elo_bench_df
+        # ELO_TASK_DF = elo_task_df
+        # ELO_BENCH_DF = elo_bench_df
         HARD_ELO_TASK_DF = hard_elo_task_df
         HARD_ELO_BENCH_DF = hard_elo_bench_df
-        COMPLETE_SOLVE_DF = complete_solve_df
-        INSTRUCT_SOLVE_DF = instruct_solve_df
+        # COMPLETE_SOLVE_DF = complete_solve_df
+        # INSTRUCT_SOLVE_DF = instruct_solve_df
         HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
         HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
 
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 
 def init_space():
     """Initializes the application space, loading only necessary data."""
 
     # Always redownload the leaderboard DataFrame
-    global LEADERBOARD_DF
+    # global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
+    # global ELO_TASK_DF
+    # global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
-    global COMPLETE_SOLVE_DF
-    global INSTRUCT_SOLVE_DF
+    # global COMPLETE_SOLVE_DF
+    # global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF
 
-    LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    # LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
 
-    #
-
-
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 # Initialize VoteManager
 # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
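One consequence of the narrowed return worth noting: tuple unpacking is arity-checked at runtime, so any caller still written against the old 10-tuple fails immediately. A stub illustration (not app.py code; strings stand in for DataFrames):

```python
# Stub for the five hard-set frames returned after this commit.
def get_latest_data_leaderboard_stub():
    return ("hard_lb", "hard_elo_task", "hard_elo_bench",
            "hard_complete_solve", "hard_instruct_solve")

lb, elo_task, elo_bench, complete, instruct = get_latest_data_leaderboard_stub()

# The pre-commit 10-name unpacking would now raise:
# ValueError: not enough values to unpack (expected 10, got 5)
```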
@@ -329,11 +329,11 @@ def init_space():
 
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
-ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
-COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
-HARD_INSTRUCT_SOLVE_DF = init_space()
-
+# LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
+# ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
+# COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
+# HARD_INSTRUCT_SOLVE_DF = init_space()
+HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
 
 # Data processing for plots now only on demand in the respective Gradio tab
 # def load_and_create_plots():
@@ -378,107 +378,108 @@ def init_others(dataframe):
 main_block = gr.Blocks(css=custom_css)
 with main_block as demo:
     with gr.Row(elem_id="header-row"):
-        gr.HTML(TITLE + "<p>Total models: " + str(len( [line truncated in this rendering]
+        gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")
 
     # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.Tab("💎 Hard Set") as hard_tabs:
-            [Hard Set tab body (old lines 386-437) not preserved in this rendering]
-        with gr.Tab("🎯 Full Set") as full_tabs:
-            [Full Set tab body (old lines 439-482) not preserved in this rendering]
+        # with gr.Tab("💎 Hard Set") as hard_tabs:
+        with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
+            hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
+            gr.Markdown(
+                """
+                **Notes:**
+                - For the efficiency reasons, we only display the Hard Set leaderboard.
+                - _Hard Set_ vs _Full Set_:
+                    - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
+                    - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
+                - _Complete_ vs _Instruct_:
+                    - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
+                    - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
+                - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
+                - `Average` is the average of `Complete` and `Instruct` when both are available.
+                - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
+                - `#Act Params (B)` is the number of activated model parameters during inference.
+                - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                - For more details check the 📝 About section.
+                """,
+                elem_classes="markdown-text",
+            )
+
+        with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+            with gr.Column():
+                with gr.Group():
+                    gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                    hard_task_elo_map = gr.Plot()
+                    hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+                    demo.load(plot_elo_mle, [hard_elo_task_gr],
+                              hard_task_elo_map)
+                with gr.Group():
+                    gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                    hard_bench_elo_map = gr.Plot()
+                    hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+                    demo.load(plot_elo_mle, [hard_elo_bench_gr],
+                              hard_bench_elo_map)
 
+        with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
+            with gr.Column():
+                hard_complete_map = gr.Plot()
+                hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
+                demo.load(plot_solve_rate, [hard_complete_solve_gr,
+                                            gr.Textbox("Complete", visible=False),
+                                            gr.Number(10, visible=False),
+                                            gr.Number(16, visible=False),
+                                            ], hard_complete_map)
+                hard_instruct_map = gr.Plot()
+                hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
+                demo.load(plot_solve_rate, [hard_instruct_solve_gr,
+                                            gr.Textbox("Instruct", visible=False),
+                                            gr.Number(10, visible=False),
+                                            gr.Number(16, visible=False),
+                                            ], hard_instruct_map)
+        # with gr.Tab("🎯 Full Set") as full_tabs:
+        #     with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
+        #         leaderboard = init_leaderboard(LEADERBOARD_DF)
+        #         gr.Markdown(
+        #             """
+        #             **Notes:**
+        #             - _Complete_ vs _Instruct_:
+        #                 - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
+        #                 - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
+        #             - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
+        #             - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
+        #             - `size` is the amount of activated model weight during inference.
+        #             - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+        #             - For more details check the 📝 About section.
+        #             """,
+        #             elem_classes="markdown-text",
+        #         )
 
+        #     with gr.TabItem("📊 Elo Rating", id="full_elo"):
+        #         with gr.Column():
+        #             with gr.Group():
 
+        #                 gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+        #                 task_elo_map = gr.Plot()
+        #                 elo_task_gr = init_others(ELO_TASK_DF)
+        #                 demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+        #             with gr.Group():
+        #                 gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+        #                 bench_elo_map = gr.Plot()
+        #                 elo_bench_gr = init_others(ELO_BENCH_DF)
+        #                 demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
 
+        #     with gr.TabItem("🧩 Solve Rate", id="full_solve"):
+        #         with gr.Column():
+        #             complete_map = gr.Plot()
+        #             complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
+        #             demo.load(plot_solve_rate, [complete_solve_gr,
+        #                                         gr.Textbox("Complete", visible=False),
+        #                                         ], complete_map)
+        #             instruct_map = gr.Plot()
+        #             instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
+        #             demo.load(plot_solve_rate, [instruct_solve_gr,
+        #                                         gr.Textbox("Instruct", visible=False),
+        #                                         ], instruct_map)
         with gr.TabItem("📝 About", id=3):
             gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
         with gr.TabItem("🔎 Data Viewer", id="viewer"):
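The UI hunk above follows one Gradio pattern throughout: a `TabItem` holding a `Plot`, fed by a callback registered with `demo.load` so the figure renders when the page loads. A self-contained sketch under that assumption; `plot_ratings` and the sample frame are placeholders, not the Space's `plot_elo_mle`/`init_others`:

```python
import gradio as gr
import pandas as pd

def plot_ratings(df: pd.DataFrame):
    # gr.Plot accepts a matplotlib figure returned from the callback.
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    ax.bar(df["model"], df["rating"])
    ax.set_ylabel("Elo rating")
    return fig

ratings = pd.DataFrame({"model": ["m1", "m2"], "rating": [1010, 990]})

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("📊 Elo Rating"):
            plot = gr.Plot()
            source = gr.Dataframe(value=ratings, visible=False)
            demo.load(plot_ratings, [source], plot)  # runs once the page loads

# demo.launch()  # uncomment to serve locally
```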
@@ -521,7 +522,8 @@ with main_block as demo:
         show_copy_button=True,
     )
 
-    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    # main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
     # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
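The rewired `main_block.load` call works because Gradio matches components positionally: the callback receives one argument per entry in `inputs` and must return one value per entry in `outputs`, which is why both lists shrink together with the function's signature and return tuple. A hedged sketch of that contract with illustrative names:

```python
import gradio as gr

def refresh(lb, elo_task, elo_bench, complete, instruct):
    # One parameter per `inputs` component, one return value per `outputs` component.
    return lb, elo_task, elo_bench, complete, instruct

with gr.Blocks() as main_block:
    comps = [gr.Dataframe(visible=False) for _ in range(5)]
    main_block.load(fn=refresh, inputs=comps, outputs=comps)
```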