Terry Zhuo committed
Commit: 14a3287
Parent(s): 5fa61d0

update w/ hard only
app.py
CHANGED
@@ -150,26 +150,26 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
 def get_latest_data_leaderboard(
-    leaderboard_initial_df = None,
+    # leaderboard_initial_df = None,
     hard_leaderboard_initial_df = None,
-    elo_task_df = None,
-    elo_bench_df = None,
+    # elo_task_df = None,
+    # elo_bench_df = None,
     hard_elo_task_df = None,
     hard_elo_bench_df = None,
-    complete_solve_df = None,
-    instruct_solve_df = None,
+    # complete_solve_df = None,
+    # instruct_solve_df = None,
     hard_complete_solve_df = None,
     hard_instruct_solve_df = None
 ):
     global NEW_DATA_ON_LEADERBOARD
-    global LEADERBOARD_DF
+    # global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
+    # global ELO_TASK_DF
+    # global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
-    global COMPLETE_SOLVE_DF
-    global INSTRUCT_SOLVE_DF
+    # global COMPLETE_SOLVE_DF
+    # global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF
 
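Context for the hunk above: `get_latest_data_leaderboard` keeps module-level caches and, after this commit, only refreshes the hard-set frames. A minimal sketch of that global-cache pattern, with stub names rather than app.py's real data, assuming nothing beyond pandas:

```python
# Hedged sketch of the pattern, not code from this commit: the function
# mutates a module-level DataFrame (read elsewhere in the app) and also
# returns it so Gradio can push it into components.
import pandas as pd

HARD_LEADERBOARD_DF = None  # stands in for app.py's module-level global

def update_hard_cache(hard_leaderboard_initial_df=None):
    global HARD_LEADERBOARD_DF
    if hard_leaderboard_initial_df is not None:
        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df  # reuse what the UI already holds
    else:
        HARD_LEADERBOARD_DF = pd.DataFrame()               # placeholder for a fresh download
    return HARD_LEADERBOARD_DF
```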
@@ -183,10 +183,10 @@ def get_latest_data_leaderboard(
         download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
         verification_mode="no_checks"
     )
-    LEADERBOARD_DF = get_leaderboard_df(
-        leaderboard_dataset=leaderboard_dataset,
-        cols=COLS,
-    )
+    # LEADERBOARD_DF = get_leaderboard_df(
+    #     leaderboard_dataset=leaderboard_dataset,
+    #     cols=COLS,
+    # )
     hard_leaderboard_dataset = datasets.load_dataset(
         HARD_RESULT_REPO,
         "default",
@@ -201,24 +201,24 @@ def get_latest_data_leaderboard(
     )
     HARD_LEADERBOARD_DF = hard_leaderboard_df
 
-    elo_task_df = datasets.load_dataset(
-        ELO_REPO,
-        "default",
-        split="task_no_tie",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-        verification_mode="no_checks"
-    ).to_pandas()
-    elo_bench_df = datasets.load_dataset(
-        ELO_REPO,
-        "default",
-        split="benchmark_tie",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-        verification_mode="no_checks"
-    ).to_pandas()
-    ELO_TASK_DF = elo_task_df
-    ELO_BENCH_DF = elo_bench_df
+    # elo_task_df = datasets.load_dataset(
+    #     ELO_REPO,
+    #     "default",
+    #     split="task_no_tie",
+    #     cache_dir=HF_HOME,
+    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+    #     verification_mode="no_checks"
+    # ).to_pandas()
+    # elo_bench_df = datasets.load_dataset(
+    #     ELO_REPO,
+    #     "default",
+    #     split="benchmark_tie",
+    #     cache_dir=HF_HOME,
+    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+    #     verification_mode="no_checks"
+    # ).to_pandas()
+    # ELO_TASK_DF = elo_task_df
+    # ELO_BENCH_DF = elo_bench_df
 
     hard_elo_task_df = datasets.load_dataset(
         HARD_ELO_REPO,
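Every frame in this function is fetched with the same `datasets` idiom, shown standalone below. The repo id and split are placeholders; the real values come from constants such as `HARD_ELO_REPO` and `HF_HOME` defined elsewhere in app.py:

```python
# Standalone sketch of the loading idiom used throughout this hunk.
import datasets

df = datasets.load_dataset(
    "user/some-results-repo",  # placeholder; app.py passes e.g. HARD_ELO_REPO
    "default",                 # dataset config name
    split="task_no_tie",
    download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # reuse the local cache
    verification_mode="no_checks",                                # skip checksum verification
).to_pandas()                  # datasets.Dataset -> pandas.DataFrame
```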
@@ -239,24 +239,24 @@ def get_latest_data_leaderboard(
     HARD_ELO_TASK_DF = hard_elo_task_df
     HARD_ELO_BENCH_DF = hard_elo_bench_df
 
-    complete_solve_df = datasets.load_dataset(
-        SOLVE_REPO,
-        "default",
-        split="complete",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-        verification_mode="no_checks"
-    ).to_pandas()
-    instruct_solve_df = datasets.load_dataset(
-        SOLVE_REPO,
-        "default",
-        split="instruct",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-        verification_mode="no_checks"
-    ).to_pandas()
-    COMPLETE_SOLVE_DF = complete_solve_df
-    INSTRUCT_SOLVE_DF = instruct_solve_df
+    # complete_solve_df = datasets.load_dataset(
+    #     SOLVE_REPO,
+    #     "default",
+    #     split="complete",
+    #     cache_dir=HF_HOME,
+    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+    #     verification_mode="no_checks"
+    # ).to_pandas()
+    # instruct_solve_df = datasets.load_dataset(
+    #     SOLVE_REPO,
+    #     "default",
+    #     split="instruct",
+    #     cache_dir=HF_HOME,
+    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+    #     verification_mode="no_checks"
+    # ).to_pandas()
+    # COMPLETE_SOLVE_DF = complete_solve_df
+    # INSTRUCT_SOLVE_DF = instruct_solve_df
 
     hard_complete_solve_df = datasets.load_dataset(
         HARD_SOLVE_REPO,
@@ -280,41 +280,41 @@ def get_latest_data_leaderboard(
         NEW_DATA_ON_LEADERBOARD = False
 
     else:
-        LEADERBOARD_DF = leaderboard_initial_df
+        # LEADERBOARD_DF = leaderboard_initial_df
         HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
-        ELO_TASK_DF = elo_task_df
-        ELO_BENCH_DF = elo_bench_df
+        # ELO_TASK_DF = elo_task_df
+        # ELO_BENCH_DF = elo_bench_df
         HARD_ELO_TASK_DF = hard_elo_task_df
         HARD_ELO_BENCH_DF = hard_elo_bench_df
-        COMPLETE_SOLVE_DF = complete_solve_df
-        INSTRUCT_SOLVE_DF = instruct_solve_df
+        # COMPLETE_SOLVE_DF = complete_solve_df
+        # INSTRUCT_SOLVE_DF = instruct_solve_df
         HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
         HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
 
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 
 def init_space():
     """Initializes the application space, loading only necessary data."""
 
     # Always redownload the leaderboard DataFrame
-    global LEADERBOARD_DF
+    # global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
+    # global ELO_TASK_DF
+    # global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
-    global COMPLETE_SOLVE_DF
-    global INSTRUCT_SOLVE_DF
+    # global COMPLETE_SOLVE_DF
+    # global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF
 
-    LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    # LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
 
-    #
-
-
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 # Initialize VoteManager
 # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
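One consequence of the narrowed return worth noting: tuple unpacking is arity-checked at runtime, so any caller still written against the old 10-tuple fails immediately. A stub illustration (not app.py code; strings stand in for DataFrames):

```python
# Stub for the five hard-set frames returned after this commit.
def get_latest_data_leaderboard_stub():
    return ("hard_lb", "hard_elo_task", "hard_elo_bench",
            "hard_complete_solve", "hard_instruct_solve")

lb, elo_task, elo_bench, complete, instruct = get_latest_data_leaderboard_stub()

# The pre-commit 10-name unpacking would now raise:
# ValueError: not enough values to unpack (expected 10, got 5)
```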
@@ -329,11 +329,11 @@ def init_space():
 
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
-ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
-COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
-HARD_INSTRUCT_SOLVE_DF = init_space()
-
+# LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
+# ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
+# COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
+# HARD_INSTRUCT_SOLVE_DF = init_space()
+HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
 
 # Data processing for plots now only on demand in the respective Gradio tab
 # def load_and_create_plots():
@@ -378,107 +378,108 @@ def init_others(dataframe):
 main_block = gr.Blocks(css=custom_css)
 with main_block as demo:
     with gr.Row(elem_id="header-row"):
-        gr.HTML(TITLE + "<p>Total models: " + str(len( [line truncated in this rendering]
+        gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")
 
     # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.Tab("💎 Hard Set") as hard_tabs:
-            [Hard Set tab body (old lines 386-437) not preserved in this rendering]
-        with gr.Tab("🎯 Full Set") as full_tabs:
-            [Full Set tab body (old lines 439-482) not preserved in this rendering]
+        # with gr.Tab("💎 Hard Set") as hard_tabs:
+        with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
+            hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
+            gr.Markdown(
+                """
+                **Notes:**
+                - For the efficiency reasons, we only display the Hard Set leaderboard.
+                - _Hard Set_ vs _Full Set_:
+                    - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
+                    - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
+                - _Complete_ vs _Instruct_:
+                    - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
+                    - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
+                - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
+                - `Average` is the average of `Complete` and `Instruct` when both are available.
+                - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
+                - `#Act Params (B)` is the number of activated model parameters during inference.
+                - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                - For more details check the 📝 About section.
+                """,
+                elem_classes="markdown-text",
+            )
+
+        with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+            with gr.Column():
+                with gr.Group():
+                    gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                    hard_task_elo_map = gr.Plot()
+                    hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+                    demo.load(plot_elo_mle, [hard_elo_task_gr],
+                              hard_task_elo_map)
+                with gr.Group():
+                    gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                    hard_bench_elo_map = gr.Plot()
+                    hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+                    demo.load(plot_elo_mle, [hard_elo_bench_gr],
+                              hard_bench_elo_map)
 
+        with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
+            with gr.Column():
+                hard_complete_map = gr.Plot()
+                hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
+                demo.load(plot_solve_rate, [hard_complete_solve_gr,
+                                            gr.Textbox("Complete", visible=False),
+                                            gr.Number(10, visible=False),
+                                            gr.Number(16, visible=False),
+                                            ], hard_complete_map)
+                hard_instruct_map = gr.Plot()
+                hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
+                demo.load(plot_solve_rate, [hard_instruct_solve_gr,
+                                            gr.Textbox("Instruct", visible=False),
+                                            gr.Number(10, visible=False),
+                                            gr.Number(16, visible=False),
+                                            ], hard_instruct_map)
+        # with gr.Tab("🎯 Full Set") as full_tabs:
+        #     with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
+        #         leaderboard = init_leaderboard(LEADERBOARD_DF)
+        #         gr.Markdown(
+        #             """
+        #             **Notes:**
+        #             - _Complete_ vs _Instruct_:
+        #                 - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
+        #                 - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
+        #             - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
+        #             - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
+        #             - `size` is the amount of activated model weight during inference.
+        #             - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+        #             - For more details check the 📝 About section.
+        #             """,
+        #             elem_classes="markdown-text",
+        #         )
 
+        #     with gr.TabItem("📊 Elo Rating", id="full_elo"):
+        #         with gr.Column():
+        #             with gr.Group():
 
+        #                 gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+        #                 task_elo_map = gr.Plot()
+        #                 elo_task_gr = init_others(ELO_TASK_DF)
+        #                 demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+        #             with gr.Group():
+        #                 gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+        #                 bench_elo_map = gr.Plot()
+        #                 elo_bench_gr = init_others(ELO_BENCH_DF)
+        #                 demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
 
+        #     with gr.TabItem("🧩 Solve Rate", id="full_solve"):
+        #         with gr.Column():
+        #             complete_map = gr.Plot()
+        #             complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
+        #             demo.load(plot_solve_rate, [complete_solve_gr,
+        #                                         gr.Textbox("Complete", visible=False),
+        #                                         ], complete_map)
+        #             instruct_map = gr.Plot()
+        #             instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
+        #             demo.load(plot_solve_rate, [instruct_solve_gr,
+        #                                         gr.Textbox("Instruct", visible=False),
+        #                                         ], instruct_map)
         with gr.TabItem("📝 About", id=3):
             gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
         with gr.TabItem("🔎 Data Viewer", id="viewer"):
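The UI hunk above follows one Gradio pattern throughout: a `TabItem` holding a `Plot`, fed by a callback registered with `demo.load` so the figure renders when the page loads. A self-contained sketch under that assumption; `plot_ratings` and the sample frame are placeholders, not the Space's `plot_elo_mle`/`init_others`:

```python
import gradio as gr
import pandas as pd

def plot_ratings(df: pd.DataFrame):
    # gr.Plot accepts a matplotlib figure returned from the callback.
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    ax.bar(df["model"], df["rating"])
    ax.set_ylabel("Elo rating")
    return fig

ratings = pd.DataFrame({"model": ["m1", "m2"], "rating": [1010, 990]})

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("📊 Elo Rating"):
            plot = gr.Plot()
            source = gr.Dataframe(value=ratings, visible=False)
            demo.load(plot_ratings, [source], plot)  # runs once the page loads

# demo.launch()  # uncomment to serve locally
```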
@@ -521,7 +522,8 @@ with main_block as demo:
         show_copy_button=True,
     )
 
-    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    # main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
     # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
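The rewired `main_block.load` call works because Gradio matches components positionally: the callback receives one argument per entry in `inputs` and must return one value per entry in `outputs`, which is why both lists shrink together with the function's signature and return tuple. A hedged sketch of that contract with illustrative names:

```python
import gradio as gr

def refresh(lb, elo_task, elo_bench, complete, instruct):
    # One parameter per `inputs` component, one return value per `outputs` component.
    return lb, elo_task, elo_bench, complete, instruct

with gr.Blocks() as main_block:
    comps = [gr.Dataframe(visible=False) for _ in range(5)]
    main_block.load(fn=refresh, inputs=comps, outputs=comps)
```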