Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		loading_from_contents (#766)
Browse files
- init - cleaning the code base, plus adding the new system to load from contents (4fc38646dccf6d3719eaf48a8dfd05c4a032fad0)
- added collections back to main (8618a2a9da2186516ef4dec2dd87f14322de9719)
- rm (459932d6f2d58fe06ffd5392686f723a08c9b734)
- simplified env vars (23f614e2ea6cf7dcfb37912a464e2d8c24085b70)
- test 1 with webhooks (784d3edc7dc5f5f0439a082a4d0a1cf6376416f6)
- small modif (32ea1bc7cefef89f251e4de467b3d49579d60feb)
- trying with open link (0cb7d54ebfa0af3b1fb240a5cd2d043799379791)
- Update app.py (e3b01f36af4a62b3cc3ba1cd88e665ad496fb839)
- removing share true (3cc4e3e275d1561c7aaa647db593d33d90434f1f)
- Update app.py (52608b2305c0c499835dc0a9892e57b2fa4f61af)
- Update app.py (953dbe38df6163c16df1b40daa579c81c07f72db)
- the webhooks will download the model at each update, and demo.load will restart the viewer at each page refresh (388bfbdf61f906fb0574cf8477aaf19941548368)
- added plots back (294422eeb5e3bcfb489bdf41322bbc3c7cc1632c)
- fixed! (fa8d7663cb995885cb91746a89ce1a2b3ff7f7ca)
- replace HuggingFaceH4 by open-llm-leaderboard (2acf509d0df752206adf666c682823be1a99991f)
- rm dynamic file reference (b4f48ba26897f4c72d213355f91b21555be04da8)
Co-authored-by: Lucain Pouget <Wauplin@users.noreply.huggingface.co>
- README.md +1 -2
 - app.py +129 -79
 - requirements.txt +1 -1
 - src/display/about.py +2 -2
 - src/display/utils.py +1 -0
 - src/envs.py +4 -18
 - src/leaderboard/filter_models.py +122 -110
 - src/leaderboard/read_evals.py +0 -261
 - src/populate.py +8 -7
 - src/scripts/update_all_request_files.py +0 -129
 - src/submission/check_validity.py +1 -1
 - src/submission/submit.py +3 -30
 - src/tools/collections.py +0 -76
 - src/{scripts → tools}/create_request_file.py +0 -0
 - src/tools/model_backlinks.py +2 -2
 - src/tools/plots.py +7 -13
 
| 
         @@ -8,14 +8,13 @@ sdk_version: 4.9.0 
     | 
|
| 8 | 
         
             
            app_file: app.py
         
     | 
| 9 | 
         
             
            pinned: true
         
     | 
| 10 | 
         
             
            license: apache-2.0
         
     | 
| 11 | 
         
            -
            duplicated_from: HuggingFaceH4/open_llm_leaderboard
         
     | 
| 12 | 
         
             
            fullWidth: true
         
     | 
| 13 | 
         
             
            startup_duration_timeout: 1h
         
     | 
| 14 | 
         
             
            space_ci:
         
     | 
| 15 | 
         
             
              private: true
         
     | 
| 16 | 
         
             
              secrets:
         
     | 
| 17 | 
         
             
              - HF_TOKEN
         
     | 
| 18 | 
         
            -
              -  
     | 
| 19 | 
         
             
            tags:
         
     | 
| 20 | 
         
             
            - leaderboard
         
     | 
| 21 | 
         
             
            short_description: Track, rank and evaluate open LLMs and chatbots
         
     | 
| 
         | 
|
| 8 | 
         
             
            app_file: app.py
         
     | 
| 9 | 
         
             
            pinned: true
         
     | 
| 10 | 
         
             
            license: apache-2.0
         
     | 
| 
         | 
|
| 11 | 
         
             
            fullWidth: true
         
     | 
| 12 | 
         
             
            startup_duration_timeout: 1h
         
     | 
| 13 | 
         
             
            space_ci:
         
     | 
| 14 | 
         
             
              private: true
         
     | 
| 15 | 
         
             
              secrets:
         
     | 
| 16 | 
         
             
              - HF_TOKEN
         
     | 
| 17 | 
         
            +
              - WEBHOOK_SECRET
         
     | 
| 18 | 
         
             
            tags:
         
     | 
| 19 | 
         
             
            - leaderboard
         
     | 
| 20 | 
         
             
            short_description: Track, rank and evaluate open LLMs and chatbots
         
     | 
| 
         @@ -2,10 +2,9 @@ import os 
     | 
|
| 2 | 
         
             
            import logging
         
     | 
| 3 | 
         
             
            import time
         
     | 
| 4 | 
         
             
            import gradio as gr
         
     | 
| 5 | 
         
            -
             
     | 
| 6 | 
         
            -
            from huggingface_hub import snapshot_download
         
     | 
| 7 | 
         
             
            from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
         
     | 
| 8 | 
         
            -
            from gradio_space_ci import enable_space_ci
         
     | 
| 9 | 
         | 
| 10 | 
         
             
            from src.display.about import (
         
     | 
| 11 | 
         
             
                CITATION_BUTTON_LABEL,
         
     | 
| 
         @@ -30,32 +29,27 @@ from src.display.utils import ( 
     | 
|
| 30 | 
         
             
            )
         
     | 
| 31 | 
         
             
            from src.envs import (
         
     | 
| 32 | 
         
             
                API,
         
     | 
| 33 | 
         
            -
                DYNAMIC_INFO_FILE_PATH,
         
     | 
| 34 | 
         
            -
                DYNAMIC_INFO_PATH,
         
     | 
| 35 | 
         
            -
                DYNAMIC_INFO_REPO,
         
     | 
| 36 | 
         
             
                EVAL_REQUESTS_PATH,
         
     | 
| 37 | 
         
            -
                 
     | 
| 38 | 
         
            -
                 
     | 
| 39 | 
         
            -
                IS_PUBLIC,
         
     | 
| 40 | 
         
             
                QUEUE_REPO,
         
     | 
| 41 | 
         
             
                REPO_ID,
         
     | 
| 42 | 
         
            -
                 
     | 
| 43 | 
         
             
            )
         
     | 
| 44 | 
         
             
            from src.populate import get_evaluation_queue_df, get_leaderboard_df
         
     | 
| 45 | 
         
            -
            from src.scripts.update_all_request_files import update_dynamic_files
         
     | 
| 46 | 
         
             
            from src.submission.submit import add_new_eval
         
     | 
| 47 | 
         
            -
            from src.tools.collections import update_collections
         
     | 
| 48 | 
         
             
            from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
         
     | 
| 49 | 
         | 
| 50 | 
         
             
            # Configure logging
         
     | 
| 51 | 
         
             
            logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
         
     | 
| 52 | 
         | 
| 53 | 
         
            -
            # Start ephemeral Spaces on PRs (see config in README.md)
         
     | 
| 54 | 
         
            -
            enable_space_ci()
         
     | 
| 55 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 56 | 
         | 
| 57 | 
         
             
            def restart_space():
         
     | 
| 58 | 
         
            -
                API.restart_space(repo_id=REPO_ID, token= 
     | 
| 59 | 
         | 
| 60 | 
         | 
| 61 | 
         
             
            def time_diff_wrapper(func):
         
     | 
| 
         @@ -94,54 +88,90 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba 
     | 
|
| 94 | 
         
             
                        attempt += 1
         
     | 
| 95 | 
         
             
                raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
         
     | 
| 96 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 97 | 
         | 
| 98 | 
         
            -
            def init_space( 
     | 
| 99 | 
         
             
                """Initializes the application space, loading only necessary data."""
         
     | 
| 100 | 
         
            -
                if  
     | 
| 101 | 
         
             
                    # These downloads only occur on full initialization
         
     | 
| 102 | 
         
             
                    try:
         
     | 
| 103 | 
         
             
                        download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
         
     | 
| 104 | 
         
            -
                        download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
         
     | 
| 105 | 
         
            -
                        download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
         
     | 
| 106 | 
         
             
                    except Exception:
         
     | 
| 107 | 
         
             
                        restart_space()
         
     | 
| 108 | 
         | 
| 109 | 
         
            -
                # Always  
     | 
| 110 | 
         
            -
                 
     | 
| 111 | 
         
            -
                    results_path=EVAL_RESULTS_PATH,
         
     | 
| 112 | 
         
            -
                    requests_path=EVAL_REQUESTS_PATH,
         
     | 
| 113 | 
         
            -
                    dynamic_path=DYNAMIC_INFO_FILE_PATH,
         
     | 
| 114 | 
         
            -
                    cols=COLS,
         
     | 
| 115 | 
         
            -
                    benchmark_cols=BENCHMARK_COLS,
         
     | 
| 116 | 
         
            -
                )
         
     | 
| 117 | 
         
            -
             
     | 
| 118 | 
         
            -
                if full_init:
         
     | 
| 119 | 
         
            -
                    # Collection update only happens on full initialization
         
     | 
| 120 | 
         
            -
                    update_collections(original_df)
         
     | 
| 121 | 
         
            -
             
     | 
| 122 | 
         
            -
                leaderboard_df = original_df.copy()
         
     | 
| 123 | 
         | 
| 124 | 
         
             
                # Evaluation queue DataFrame retrieval is independent of initialization detail level
         
     | 
| 125 | 
         
            -
                eval_queue_dfs =  
     | 
| 126 | 
         | 
| 127 | 
         
            -
                return leaderboard_df,  
     | 
| 128 | 
         | 
| 129 | 
         | 
| 130 | 
         
            -
            # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
         
     | 
| 131 | 
         
            -
            # This controls whether a full initialization should be performed.
         
     | 
| 132 | 
         
            -
            do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
         
     | 
| 133 | 
         
            -
             
     | 
| 134 | 
         
             
            # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
         
     | 
| 135 | 
         
             
            # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
         
     | 
| 136 | 
         
            -
            leaderboard_df,  
     | 
| 137 | 
         
             
            finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
         
     | 
| 138 | 
         | 
| 139 | 
         | 
| 140 | 
         
             
            # Data processing for plots now only on demand in the respective Gradio tab
         
     | 
| 141 | 
         
             
            def load_and_create_plots():
         
     | 
| 142 | 
         
            -
                plot_df = create_plot_df(create_scores_df( 
     | 
| 143 | 
         
             
                return plot_df
         
     | 
| 144 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 145 | 
         | 
| 146 | 
         
             
            demo = gr.Blocks(css=custom_css)
         
     | 
| 147 | 
         
             
            with demo:
         
     | 
| 
         @@ -150,37 +180,7 @@ with demo: 
     | 
|
| 150 | 
         | 
| 151 | 
         
             
                with gr.Tabs(elem_classes="tab-buttons") as tabs:
         
     | 
| 152 | 
         
             
                    with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
         
     | 
| 153 | 
         
            -
                        leaderboard =  
     | 
| 154 | 
         
            -
                            value=leaderboard_df,
         
     | 
| 155 | 
         
            -
                            datatype=[c.type for c in fields(AutoEvalColumn)],
         
     | 
| 156 | 
         
            -
                            select_columns=SelectColumns(
         
     | 
| 157 | 
         
            -
                                default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
         
     | 
| 158 | 
         
            -
                                cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
         
     | 
| 159 | 
         
            -
                                label="Select Columns to Display:",
         
     | 
| 160 | 
         
            -
                            ),
         
     | 
| 161 | 
         
            -
                            search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
         
     | 
| 162 | 
         
            -
                            hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         
     | 
| 163 | 
         
            -
                            filter_columns=[
         
     | 
| 164 | 
         
            -
                                ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
         
     | 
| 165 | 
         
            -
                                ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
         
     | 
| 166 | 
         
            -
                                ColumnFilter(
         
     | 
| 167 | 
         
            -
                                    AutoEvalColumn.params.name,
         
     | 
| 168 | 
         
            -
                                    type="slider",
         
     | 
| 169 | 
         
            -
                                    min=0.01,
         
     | 
| 170 | 
         
            -
                                    max=150,
         
     | 
| 171 | 
         
            -
                                    label="Select the number of parameters (B)",
         
     | 
| 172 | 
         
            -
                                ),
         
     | 
| 173 | 
         
            -
                                ColumnFilter(
         
     | 
| 174 | 
         
            -
                                    AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
         
     | 
| 175 | 
         
            -
                                ),
         
     | 
| 176 | 
         
            -
                                ColumnFilter(
         
     | 
| 177 | 
         
            -
                                    AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
         
     | 
| 178 | 
         
            -
                                ),
         
     | 
| 179 | 
         
            -
                                ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
         
     | 
| 180 | 
         
            -
                                ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
         
     | 
| 181 | 
         
            -
                            ],
         
     | 
| 182 | 
         
            -
                            bool_checkboxgroup_label="Hide models",
         
     | 
| 183 | 
         
            -
                        )
         
     | 
| 184 | 
         | 
| 185 | 
         
             
                    with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
         
     | 
| 186 | 
         
             
                        with gr.Row():
         
     | 
| 
         @@ -219,7 +219,6 @@ with demo: 
     | 
|
| 219 | 
         
             
                            with gr.Column():
         
     | 
| 220 | 
         
             
                                model_name_textbox = gr.Textbox(label="Model name")
         
     | 
| 221 | 
         
             
                                revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
         
     | 
| 222 | 
         
            -
                                private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
         
     | 
| 223 | 
         
             
                                model_type = gr.Dropdown(
         
     | 
| 224 | 
         
             
                                    choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
         
     | 
| 225 | 
         
             
                                    label="Model type",
         
     | 
| 
         @@ -290,7 +289,6 @@ with demo: 
     | 
|
| 290 | 
         
             
                                base_model_name_textbox,
         
     | 
| 291 | 
         
             
                                revision_name_textbox,
         
     | 
| 292 | 
         
             
                                precision,
         
     | 
| 293 | 
         
            -
                                private,
         
     | 
| 294 | 
         
             
                                weight_type,
         
     | 
| 295 | 
         
             
                                model_type,
         
     | 
| 296 | 
         
             
                            ],
         
     | 
| 
         @@ -307,9 +305,61 @@ with demo: 
     | 
|
| 307 | 
         
             
                            show_copy_button=True,
         
     | 
| 308 | 
         
             
                        )
         
     | 
| 309 | 
         | 
| 310 | 
         
            -
             
     | 
| 311 | 
         
            -
             
     | 
| 312 | 
         
            -
             
     | 
| 313 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 314 | 
         | 
| 315 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 2 | 
         
             
            import logging
         
     | 
| 3 | 
         
             
            import time
         
     | 
| 4 | 
         
             
            import gradio as gr
         
     | 
| 5 | 
         
            +
            import datasets
         
     | 
| 6 | 
         
            +
            from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
         
     | 
| 7 | 
         
             
            from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
         
     | 
| 
         | 
|
| 8 | 
         | 
| 9 | 
         
             
            from src.display.about import (
         
     | 
| 10 | 
         
             
                CITATION_BUTTON_LABEL,
         
     | 
| 
         | 
|
| 29 | 
         
             
            )
         
     | 
| 30 | 
         
             
            from src.envs import (
         
     | 
| 31 | 
         
             
                API,
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 32 | 
         
             
                EVAL_REQUESTS_PATH,
         
     | 
| 33 | 
         
            +
                AGGREGATED_REPO,
         
     | 
| 34 | 
         
            +
                HF_TOKEN,
         
     | 
| 
         | 
|
| 35 | 
         
             
                QUEUE_REPO,
         
     | 
| 36 | 
         
             
                REPO_ID,
         
     | 
| 37 | 
         
            +
                HF_HOME,
         
     | 
| 38 | 
         
             
            )
         
     | 
| 39 | 
         
             
            from src.populate import get_evaluation_queue_df, get_leaderboard_df
         
     | 
| 
         | 
|
| 40 | 
         
             
            from src.submission.submit import add_new_eval
         
     | 
| 
         | 
|
| 41 | 
         
             
            from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
         
     | 
| 42 | 
         | 
| 43 | 
         
             
            # Configure logging
         
     | 
| 44 | 
         
             
            logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
         
     | 
| 45 | 
         | 
| 
         | 
|
| 
         | 
|
| 46 | 
         | 
| 47 | 
         
            +
            # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
         
     | 
| 48 | 
         
            +
            # This controls whether a full initialization should be performed.
         
     | 
| 49 | 
         
            +
            DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
         
     | 
| 50 | 
         | 
| 51 | 
         
             
            def restart_space():
         
     | 
| 52 | 
         
            +
                API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
         
     | 
| 53 | 
         | 
| 54 | 
         | 
| 55 | 
         
             
            def time_diff_wrapper(func):
         
     | 
| 
         | 
|
| 88 | 
         
             
                        attempt += 1
         
     | 
| 89 | 
         
             
                raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
         
     | 
| 90 | 
         | 
| 91 | 
         
            +
            def get_latest_data_leaderboard():
         
     | 
| 92 | 
         
            +
                leaderboard_dataset = datasets.load_dataset(
         
     | 
| 93 | 
         
            +
                    AGGREGATED_REPO, 
         
     | 
| 94 | 
         
            +
                    "default", 
         
     | 
| 95 | 
         
            +
                    split="train", 
         
     | 
| 96 | 
         
            +
                    cache_dir=HF_HOME, 
         
     | 
| 97 | 
         
            +
                    download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset 
         
     | 
| 98 | 
         
            +
                    verification_mode="no_checks"
         
     | 
| 99 | 
         
            +
                )
         
     | 
| 100 | 
         
            +
             
     | 
| 101 | 
         
            +
                leaderboard_df = get_leaderboard_df(
         
     | 
| 102 | 
         
            +
                    leaderboard_dataset=leaderboard_dataset, 
         
     | 
| 103 | 
         
            +
                    cols=COLS,
         
     | 
| 104 | 
         
            +
                    benchmark_cols=BENCHMARK_COLS,
         
     | 
| 105 | 
         
            +
                )
         
     | 
| 106 | 
         
            +
             
     | 
| 107 | 
         
            +
                return leaderboard_df
         
     | 
| 108 | 
         
            +
             
     | 
| 109 | 
         
            +
            def get_latest_data_queue():
         
     | 
| 110 | 
         
            +
                eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
         
     | 
| 111 | 
         
            +
                return eval_queue_dfs
         
     | 
| 112 | 
         | 
| 113 | 
         
            +
            def init_space():
         
     | 
| 114 | 
         
             
                """Initializes the application space, loading only necessary data."""
         
     | 
| 115 | 
         
            +
                if DO_FULL_INIT:
         
     | 
| 116 | 
         
             
                    # These downloads only occur on full initialization
         
     | 
| 117 | 
         
             
                    try:
         
     | 
| 118 | 
         
             
                        download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
         
     | 
| 
         | 
|
| 
         | 
|
| 119 | 
         
             
                    except Exception:
         
     | 
| 120 | 
         
             
                        restart_space()
         
     | 
| 121 | 
         | 
| 122 | 
         
            +
                # Always redownload the leaderboard DataFrame
         
     | 
| 123 | 
         
            +
                leaderboard_df = get_latest_data_leaderboard()
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 124 | 
         | 
| 125 | 
         
             
                # Evaluation queue DataFrame retrieval is independent of initialization detail level
         
     | 
| 126 | 
         
            +
                eval_queue_dfs = get_latest_data_queue()
         
     | 
| 127 | 
         | 
| 128 | 
         
            +
                return leaderboard_df, eval_queue_dfs
         
     | 
| 129 | 
         | 
| 130 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 131 | 
         
             
            # Calls init_space(), which takes no arguments; the module-level `DO_FULL_INIT` flag decides whether the request queue is downloaded first.
         
     | 
| 132 | 
         
             
            # This initializes the leaderboard and evaluation-queue DataFrames used throughout the application, with the level of initialization detail controlled by the `DO_FULL_INIT` flag.
         
     | 
| 133 | 
         
            +
            leaderboard_df, eval_queue_dfs = init_space()
         
     | 
| 134 | 
         
             
            finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
         
     | 
| 135 | 
         | 
| 136 | 
         | 
| 137 | 
         
             
            # Data processing for plots now only on demand in the respective Gradio tab
         
     | 
| 138 | 
         
             
            def load_and_create_plots():
         
     | 
| 139 | 
         
            +
                plot_df = create_plot_df(create_scores_df(leaderboard_df))
         
     | 
| 140 | 
         
             
                return plot_df
         
     | 
| 141 | 
         | 
| 142 | 
         
            +
            def init_leaderboard(dataframe):
         
     | 
| 143 | 
         
            +
                return Leaderboard(
         
     | 
| 144 | 
         
            +
                    value = dataframe,
         
     | 
| 145 | 
         
            +
                    datatype=[c.type for c in fields(AutoEvalColumn)],
         
     | 
| 146 | 
         
            +
                    select_columns=SelectColumns(
         
     | 
| 147 | 
         
            +
                        default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
         
     | 
| 148 | 
         
            +
                        cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
         
     | 
| 149 | 
         
            +
                        label="Select Columns to Display:",
         
     | 
| 150 | 
         
            +
                    ),
         
     | 
| 151 | 
         
            +
                    search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
         
     | 
| 152 | 
         
            +
                    hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         
     | 
| 153 | 
         
            +
                    filter_columns=[
         
     | 
| 154 | 
         
            +
                        ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
         
     | 
| 155 | 
         
            +
                        ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
         
     | 
| 156 | 
         
            +
                        ColumnFilter(
         
     | 
| 157 | 
         
            +
                            AutoEvalColumn.params.name,
         
     | 
| 158 | 
         
            +
                            type="slider",
         
     | 
| 159 | 
         
            +
                            min=0.01,
         
     | 
| 160 | 
         
            +
                            max=150,
         
     | 
| 161 | 
         
            +
                            label="Select the number of parameters (B)",
         
     | 
| 162 | 
         
            +
                        ),
         
     | 
| 163 | 
         
            +
                        ColumnFilter(
         
     | 
| 164 | 
         
            +
                            AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
         
     | 
| 165 | 
         
            +
                        ),
         
     | 
| 166 | 
         
            +
                        ColumnFilter(
         
     | 
| 167 | 
         
            +
                            AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
         
     | 
| 168 | 
         
            +
                        ),
         
     | 
| 169 | 
         
            +
                        ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
         
     | 
| 170 | 
         
            +
                        ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
         
     | 
| 171 | 
         
            +
                    ],
         
     | 
| 172 | 
         
            +
                    bool_checkboxgroup_label="Hide models",
         
     | 
| 173 | 
         
            +
                )
         
     | 
| 174 | 
         
            +
             
     | 
| 175 | 
         | 
| 176 | 
         
             
            demo = gr.Blocks(css=custom_css)
         
     | 
| 177 | 
         
             
            with demo:
         
     | 
| 
         | 
|
| 180 | 
         | 
| 181 | 
         
             
                with gr.Tabs(elem_classes="tab-buttons") as tabs:
         
     | 
| 182 | 
         
             
                    with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
         
     | 
| 183 | 
         
            +
                        leaderboard = init_leaderboard(leaderboard_df)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 184 | 
         | 
| 185 | 
         
             
                    with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
         
     | 
| 186 | 
         
             
                        with gr.Row():
         
     | 
| 
         | 
|
| 219 | 
         
             
                            with gr.Column():
         
     | 
| 220 | 
         
             
                                model_name_textbox = gr.Textbox(label="Model name")
         
     | 
| 221 | 
         
             
                                revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
         
     | 
| 
         | 
|
| 222 | 
         
             
                                model_type = gr.Dropdown(
         
     | 
| 223 | 
         
             
                                    choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
         
     | 
| 224 | 
         
             
                                    label="Model type",
         
     | 
| 
         | 
|
| 289 | 
         
             
                                base_model_name_textbox,
         
     | 
| 290 | 
         
             
                                revision_name_textbox,
         
     | 
| 291 | 
         
             
                                precision,
         
     | 
| 
         | 
|
| 292 | 
         
             
                                weight_type,
         
     | 
| 293 | 
         
             
                                model_type,
         
     | 
| 294 | 
         
             
                            ],
         
     | 
| 
         | 
|
| 305 | 
         
             
                            show_copy_button=True,
         
     | 
| 306 | 
         
             
                        )
         
     | 
| 307 | 
         | 
| 308 | 
         
            +
                demo.load(fn=get_latest_data_leaderboard, inputs=None, outputs=[leaderboard])
         
     | 
| 309 | 
         
            +
                demo.load(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
         
     | 
| 310 | 
         
            +
             
     | 
| 311 | 
         
            +
            demo.queue(default_concurrency_limit=40)
         
     | 
| 312 | 
         
            +
             
     | 
| 313 | 
         
            +
            # Start ephemeral Spaces on PRs (see config in README.md)
         
     | 
| 314 | 
         
            +
            from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
         
     | 
| 315 | 
         
            +
             
     | 
| 316 | 
         
            +
            def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
                """Return the webhooks server wrapping ``ui``, with Space CI enabled when possible.

                Space CI is skipped, and a plain ``WebhooksServer`` around ``ui`` is returned,
                when ``SPACE_ID`` is ``None`` or when ``IS_EPHEMERAL_SPACE`` is truthy.
                Otherwise the ``space_ci`` section of the Space's README card is read and
                forwarded to ``configure_space_ci``, whose result is returned.

                Args:
                    ui: The Gradio Blocks app the server should serve.

                Returns:
                    The server object on which additional webhooks can be registered.
                """
                # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
                # Compared to the original, this one does not monkeypatch Gradio, which allows us to define more webhooks.
                # ht to Lucain!
                if SPACE_ID is None:
                    print("Not in a Space: Space CI disabled.")
                    # Use the `ui` argument rather than the module-level `demo` so the
                    # function honours whatever Blocks instance the caller passed in.
                    return WebhooksServer(ui=ui)

                if IS_EPHEMERAL_SPACE:
                    print("In an ephemeral Space: Space CI disabled.")
                    return WebhooksServer(ui=ui)

                card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
                # A missing "space_ci" section falls back to an empty config.
                config = card.data.get("space_ci", {})
                print(f"Enabling Space CI with config from README: {config}")

                return configure_space_ci(
                    blocks=ui,
                    trusted_authors=config.get("trusted_authors"),
                    private=config.get("private", "auto"),
                    variables=config.get("variables", "auto"),
                    secrets=config.get("secrets"),
                    hardware=config.get("hardware"),
                    storage=config.get("storage"),
                )
         
     | 
| 341 | 
         | 
| 342 | 
         
            +
            # Create webhooks server (with CI url if in Space and not ephemeral)
         
     | 
| 343 | 
         
            +
            webhooks_server = enable_space_ci_and_return_server(ui=demo)
         
     | 
| 344 | 
         
            +
             
     | 
| 345 | 
         
            +
            # Add webhooks
         
     | 
| 346 | 
         
            +
            @webhooks_server.add_webhook
            async def update_leaderboard(payload: WebhookPayload) -> None:
                """Re-download the aggregated leaderboard dataset when a dataset update event arrives."""
                is_dataset_update = payload.repo.type == "dataset" and payload.event.action == "update"
                if not is_dataset_update:
                    # Any other repo type or event action is ignored.
                    return

                # The returned dataset is discarded: the call is made only for its
                # forced re-download into the cache directory.
                datasets.load_dataset(
                    AGGREGATED_REPO,
                    "default",
                    split="train",
                    cache_dir=HF_HOME,
                    download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
                    verification_mode="no_checks",
                )
         
     | 
| 358 | 
         
            +
             
     | 
| 359 | 
         
            +
            @webhooks_server.add_webhook
            async def update_queue(payload: WebhookPayload) -> None:
                """Re-download the evaluation queue dataset when a dataset update event arrives."""
                is_dataset_update = payload.repo.type == "dataset" and payload.event.action == "update"
                if not is_dataset_update:
                    # Any other repo type or event action is ignored.
                    return

                download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
         
     | 
| 364 | 
         
            +
             
     | 
| 365 | 
         
            +
            webhooks_server.launch()
         
     | 
| 
         @@ -15,4 +15,4 @@ transformers==4.41.1 
     | 
|
| 15 | 
         
             
            tokenizers>=0.15.0
         
     | 
| 16 | 
         
             
            gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
         
     | 
| 17 | 
         
             
            gradio==4.20.0
         
     | 
| 18 | 
         
            -
            gradio_leaderboard==0.0. 
     | 
| 
         | 
|
| 15 | 
         
             
            tokenizers>=0.15.0
         
     | 
| 16 | 
         
             
            gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
         
     | 
| 17 | 
         
             
            gradio==4.20.0
         
     | 
| 18 | 
         
            +
            gradio_leaderboard==0.0.9
         
     | 
| 
         @@ -81,7 +81,7 @@ To get more information about quantization, see: 
     | 
|
| 81 | 
         
             
            - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
         
     | 
| 82 | 
         | 
| 83 | 
         
             
            ### Useful links
         
     | 
| 84 | 
         
            -
            - [Community resources](https://huggingface.co/spaces/ 
     | 
| 85 | 
         
             
            - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
         
     | 
| 86 | 
         | 
| 87 | 
         
             
            ### Other cool leaderboards:
         
     | 
| 
         @@ -217,7 +217,7 @@ CITATION_BUTTON_TEXT = r""" 
     | 
|
| 217 | 
         
             
              title = {Open LLM Leaderboard},
         
     | 
| 218 | 
         
             
              year = {2023},
         
     | 
| 219 | 
         
             
              publisher = {Hugging Face},
         
     | 
| 220 | 
         
            -
              howpublished = "\url{https://huggingface.co/spaces/ 
     | 
| 221 | 
         
             
            }
         
     | 
| 222 | 
         
             
            @software{eval-harness,
         
     | 
| 223 | 
         
             
              author       = {Gao, Leo and
         
     | 
| 
         | 
|
| 81 | 
         
             
            - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
         
     | 
| 82 | 
         | 
| 83 | 
         
             
            ### Useful links
         
     | 
| 84 | 
         
            +
            - [Community resources](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/174)
         
     | 
| 85 | 
         
             
            - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
         
     | 
| 86 | 
         | 
| 87 | 
         
             
            ### Other cool leaderboards:
         
     | 
| 
         | 
|
| 217 | 
         
             
              title = {Open LLM Leaderboard},
         
     | 
| 218 | 
         
             
              year = {2023},
         
     | 
| 219 | 
         
             
              publisher = {Hugging Face},
         
     | 
| 220 | 
         
            +
              howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
         
     | 
| 221 | 
         
             
            }
         
     | 
| 222 | 
         
             
            @software{eval-harness,
         
     | 
| 223 | 
         
             
              author       = {Gao, Leo and
         
     | 
| 
         @@ -93,6 +93,7 @@ auto_eval_column_dict.append( 
     | 
|
| 93 | 
         
             
            auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
         
     | 
| 94 | 
         
             
            auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
         
     | 
| 95 | 
         
             
            auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
         
     | 
| 
         | 
|
| 96 | 
         
             
            # Dummy column for the search bar (hidden by the custom CSS)
         
     | 
| 97 | 
         
             
            auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
         
     | 
| 98 | 
         | 
| 
         | 
|
| 93 | 
         
             
            auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
         
     | 
| 94 | 
         
             
            auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
         
     | 
| 95 | 
         
             
            auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
         
     | 
| 96 | 
         
            +
            # Hidden "date" column.
            # NOTE(review): the declared type is "bool", same as the flag columns above — confirm this is intended for a date.
            auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
         
     | 
| 97 | 
         
             
            # Dummy column for the search bar (hidden by the custom CSS)
         
     | 
| 98 | 
         
             
            auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
         
     | 
| 99 | 
         | 
| 
         @@ -2,17 +2,11 @@ import os 
     | 
|
| 2 | 
         
             
            from huggingface_hub import HfApi
         
     | 
| 3 | 
         | 
| 4 | 
         
             
            # clone / pull the lmeh eval data
         
     | 
| 5 | 
         
            -
             
     | 
| 6 | 
         | 
| 7 | 
         
            -
            REPO_ID = " 
     | 
| 8 | 
         
             
            QUEUE_REPO = "open-llm-leaderboard/requests"
         
     | 
| 9 | 
         
            -
             
     | 
| 10 | 
         
            -
            RESULTS_REPO = "open-llm-leaderboard/results"
         
     | 
| 11 | 
         
            -
             
     | 
| 12 | 
         
            -
            PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
         
     | 
| 13 | 
         
            -
            PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
         
     | 
| 14 | 
         
            -
             
     | 
| 15 | 
         
            -
            IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
         
     | 
| 16 | 
         | 
| 17 | 
         
             
            HF_HOME = os.getenv("HF_HOME", ".")
         
     | 
| 18 | 
         | 
| 
         @@ -27,18 +21,10 @@ else: 
     | 
|
| 27 | 
         
             
                print("Write access confirmed for HF_HOME")
         
     | 
| 28 | 
         | 
| 29 | 
         
             
            EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
         
     | 
| 30 | 
         
            -
            EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
         
     | 
| 31 | 
         
            -
            DYNAMIC_INFO_PATH = os.path.join(HF_HOME, "dynamic-info")
         
     | 
| 32 | 
         
            -
            DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
         
     | 
| 33 | 
         
            -
             
     | 
| 34 | 
         
            -
            EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
         
     | 
| 35 | 
         
            -
            EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
         
     | 
| 36 | 
         
            -
             
     | 
| 37 | 
         
            -
            PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
         
     | 
| 38 | 
         | 
| 39 | 
         
             
            # Rate limit variables
         
     | 
| 40 | 
         
             
            RATE_LIMIT_PERIOD = 7
         
     | 
| 41 | 
         
             
            RATE_LIMIT_QUOTA = 5
         
     | 
| 42 | 
         
             
            HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
         
     | 
| 43 | 
         | 
| 44 | 
         
            -
            API = HfApi(token= 
     | 
| 
         | 
|
| 2 | 
         
             
            from huggingface_hub import HfApi
         
     | 
| 3 | 
         | 
| 4 | 
         
             
            # clone / pull the lmeh eval data
         
     | 
| 5 | 
         
            +
            HF_TOKEN = os.environ.get("HF_TOKEN", None)
         
     | 
| 6 | 
         | 
| 7 | 
         
            +
            REPO_ID = "open-llm-leaderboard/open_llm_leaderboard"
         
     | 
| 8 | 
         
             
            QUEUE_REPO = "open-llm-leaderboard/requests"
         
     | 
| 9 | 
         
            +
            AGGREGATED_REPO = "open-llm-leaderboard/contents"
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 10 | 
         | 
| 11 | 
         
             
            HF_HOME = os.getenv("HF_HOME", ".")
         
     | 
| 12 | 
         | 
| 
         | 
|
| 21 | 
         
             
                print("Write access confirmed for HF_HOME")
         
     | 
| 22 | 
         | 
| 23 | 
         
             
            EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 24 | 
         | 
| 25 | 
         
             
            # Rate limit variables
         
     | 
| 26 | 
         
             
            RATE_LIMIT_PERIOD = 7
         
     | 
| 27 | 
         
             
            RATE_LIMIT_QUOTA = 5
         
     | 
| 28 | 
         
             
            HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
         
     | 
| 29 | 
         | 
| 30 | 
         
            +
            API = HfApi(token=HF_TOKEN)
         
     | 
| 
         @@ -5,120 +5,120 @@ from src.display.utils import AutoEvalColumn 
     | 
|
| 5 | 
         
             
            # Models which have been flagged by users as being problematic for a reason or another
         
     | 
| 6 | 
         
             
            # (Model name to forum discussion link)
         
     | 
| 7 | 
         
             
            FLAGGED_MODELS = {
         
     | 
| 8 | 
         
            -
                "merged": "https://huggingface.co/spaces/ 
     | 
| 9 | 
         
            -
                "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/ 
     | 
| 10 | 
         
            -
                "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/ 
     | 
| 11 | 
         
            -
                "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/ 
     | 
| 12 | 
         
            -
                "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/ 
     | 
| 13 | 
         
            -
                "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/ 
     | 
| 14 | 
         
            -
                "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/ 
     | 
| 15 | 
         
            -
                "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/ 
     | 
| 16 | 
         
            -
                "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/ 
     | 
| 17 | 
         
            -
                "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/ 
     | 
| 18 | 
         
            -
                "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/ 
     | 
| 19 | 
         
            -
                "jan-hq/trinity-v1": "https://huggingface.co/spaces/ 
     | 
| 20 | 
         
            -
                "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/ 
     | 
| 21 | 
         
            -
                "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/ 
     | 
| 22 | 
         
            -
                "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/ 
     | 
| 23 | 
         
            -
                "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/ 
     | 
| 24 | 
         
            -
                "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/ 
     | 
| 25 | 
         
            -
                "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/ 
     | 
| 26 | 
         
            -
                "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/ 
     | 
| 27 | 
         
            -
                "janai-hq/trinity-v1": "https://huggingface.co/spaces/ 
     | 
| 28 | 
         
            -
                "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/ 
     | 
| 29 | 
         
            -
                "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/ 
     | 
| 30 | 
         
            -
                "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/ 
     | 
| 31 | 
         
            -
                "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/ 
     | 
| 32 | 
         
            -
                "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/ 
     | 
| 33 | 
         
            -
                "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/ 
     | 
| 34 | 
         
            -
                "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/ 
     | 
| 35 | 
         
            -
                "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/ 
     | 
| 36 | 
         
            -
                "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/ 
     | 
| 37 | 
         
            -
                "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/ 
     | 
| 38 | 
         
            -
                "cookinai/BruinHermes": "https://huggingface.co/spaces/ 
     | 
| 39 | 
         
            -
                "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/ 
     | 
| 40 | 
         
            -
                "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/ 
     | 
| 41 | 
         
            -
                "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/ 
     | 
| 42 | 
         
            -
                "rwitz2/pee": "https://huggingface.co/spaces/ 
     | 
| 43 | 
         
            -
                "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/ 
     | 
| 44 | 
         
            -
                "dillfrescott/trinity-medium": "https://huggingface.co/spaces/ 
     | 
| 45 | 
         
            -
                "udkai/Garrulus": "https://huggingface.co/spaces/ 
     | 
| 46 | 
         
             
                "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
         
     | 
| 47 | 
         
            -
                "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/ 
     | 
| 48 | 
         
            -
                "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/ 
     | 
| 49 | 
         
            -
                "alnrg2arg/test2_3": "https://huggingface.co/spaces/ 
     | 
| 50 | 
         
            -
                "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/ 
     | 
| 51 | 
         
            -
                "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/ 
     | 
| 52 | 
         
            -
                "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/ 
     | 
| 53 | 
         
             
                # Merges not indicated
         
     | 
| 54 | 
         
            -
                "gagan3012/MetaModelv2": "https://huggingface.co/spaces/ 
     | 
| 55 | 
         
            -
                "gagan3012/MetaModelv3": "https://huggingface.co/spaces/ 
     | 
| 56 | 
         
            -
                "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/ 
     | 
| 57 | 
         
            -
                "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/ 
     | 
| 58 | 
         
            -
                "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/ 
     | 
| 59 | 
         
            -
                "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/ 
     | 
| 60 | 
         
            -
                "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/ 
     | 
| 61 | 
         
            -
                "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/ 
     | 
| 62 | 
         
            -
                "rwitz/go-bruins-v2": "https://huggingface.co/spaces/ 
     | 
| 63 | 
         
            -
                "rwitz/go-bruins": "https://huggingface.co/spaces/ 
     | 
| 64 | 
         
            -
                "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/ 
     | 
| 65 | 
         
            -
                "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/ 
     | 
| 66 | 
         
            -
                "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/ 
     | 
| 67 | 
         
            -
                "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/ 
     | 
| 68 | 
         
            -
                "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/ 
     | 
| 69 | 
         
            -
                "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/ 
     | 
| 70 | 
         
            -
                "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/ 
     | 
| 71 | 
         
            -
                "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/ 
     | 
| 72 | 
         
            -
                "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/ 
     | 
| 73 | 
         
            -
                "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/ 
     | 
| 74 | 
         
            -
                "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/ 
     | 
| 75 | 
         
            -
                "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/ 
     | 
| 76 | 
         
            -
                "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/ 
     | 
| 77 | 
         
            -
                "elinas/chronos007-70b": "https://huggingface.co/spaces/ 
     | 
| 78 | 
         
            -
                "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/ 
     | 
| 79 | 
         
            -
                "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/ 
     | 
| 80 | 
         
            -
                "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/ 
     | 
| 81 | 
         
            -
                "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/ 
     | 
| 82 | 
         
            -
                "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/ 
     | 
| 83 | 
         
            -
                "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/ 
     | 
| 84 | 
         
            -
                "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/ 
     | 
| 85 | 
         
            -
                "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/ 
     | 
| 86 | 
         
            -
                "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/ 
     | 
| 87 | 
         
            -
                "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/ 
     | 
| 88 | 
         
            -
                "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/ 
     | 
| 89 | 
         
            -
                "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/ 
     | 
| 90 | 
         
            -
                "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/ 
     | 
| 91 | 
         
            -
                "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/ 
     | 
| 92 | 
         
            -
                "udkai/Turdus": "https://huggingface.co/spaces/ 
     | 
| 93 | 
         
            -
                "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/ 
     | 
| 94 | 
         
            -
                "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/ 
     | 
| 95 | 
         
            -
                "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/ 
     | 
| 96 | 
         
            -
                "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/ 
     | 
| 97 | 
         
            -
                "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/ 
     | 
| 98 | 
         
            -
                "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/ 
     | 
| 99 | 
         
            -
                "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/ 
     | 
| 100 | 
         
            -
                "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/ 
     | 
| 101 | 
         
            -
                "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/ 
     | 
| 102 | 
         
            -
                "cookinai/OpenCM-14": "https://huggingface.co/spaces/ 
     | 
| 103 | 
         
            -
                "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/ 
     | 
| 104 | 
         
            -
                "jan-hq/supermario-v2": "https://huggingface.co/spaces/ 
     | 
| 105 | 
         
             
                # MoErges
         
     | 
| 106 | 
         
            -
                "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/ 
     | 
| 107 | 
         
            -
                "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/ 
     | 
| 108 | 
         
            -
                "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/ 
     | 
| 109 | 
         
            -
                "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/ 
     | 
| 110 | 
         
            -
                "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/ 
     | 
| 111 | 
         
            -
                "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/ 
     | 
| 112 | 
         
            -
                "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/ 
     | 
| 113 | 
         
            -
                "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/ 
     | 
| 114 | 
         
            -
                "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/ 
     | 
| 115 | 
         
            -
                "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/ 
     | 
| 116 | 
         
            -
                "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/ 
     | 
| 117 | 
         
             
                # Other - contamination mostly
         
     | 
| 118 | 
         
            -
                "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/ 
     | 
| 119 | 
         
            -
                "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/ 
     | 
| 120 | 
         
            -
                "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/ 
     | 
| 121 | 
         
            -
                "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/ 
     | 
| 122 | 
         
             
            }
         
     | 
| 123 | 
         | 
| 124 | 
         
             
            # Models which have been requested by orgs to not be submitted on the leaderboard
         
     | 
| 
         @@ -167,6 +167,18 @@ def remove_forbidden_models(leaderboard_data: list[dict]): 
     | 
|
| 167 | 
         
             
                    leaderboard_data.pop(ix)
         
     | 
| 168 | 
         
             
                return leaderboard_data
         
     | 
| 169 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 170 | 
         | 
| 171 | 
         
             
            def filter_models_flags(leaderboard_data: list[dict]):
         
     | 
| 172 | 
         
             
                leaderboard_data = remove_forbidden_models(leaderboard_data)
         
     | 
| 
         | 
|
| 5 | 
         
             
            # Models which have been flagged by users as being problematic for a reason or another
         
     | 
| 6 | 
         
             
            # (Model name to forum discussion link)
         
     | 
| 7 | 
         
             
            FLAGGED_MODELS = {
         
     | 
| 8 | 
         
            +
                "merged": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 9 | 
         
            +
                "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/202",
         
     | 
| 10 | 
         
            +
                "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/207",
         
     | 
| 11 | 
         
            +
                "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/213",
         
     | 
| 12 | 
         
            +
                "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/236",
         
     | 
| 13 | 
         
            +
                "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/237",
         
     | 
| 14 | 
         
            +
                "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/215",
         
     | 
| 15 | 
         
            +
                "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
         
     | 
| 16 | 
         
            +
                "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
         
     | 
| 17 | 
         
            +
                "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
         
     | 
| 18 | 
         
            +
                "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/444",
         
     | 
| 19 | 
         
            +
                "jan-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 20 | 
         
            +
                "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 21 | 
         
            +
                "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 22 | 
         
            +
                "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 23 | 
         
            +
                "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 24 | 
         
            +
                "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 25 | 
         
            +
                "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 26 | 
         
            +
                "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 27 | 
         
            +
                "janai-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 28 | 
         
            +
                "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 29 | 
         
            +
                "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 30 | 
         
            +
                "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 31 | 
         
            +
                "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 32 | 
         
            +
                "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 33 | 
         
            +
                "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 34 | 
         
            +
                "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 35 | 
         
            +
                "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 36 | 
         
            +
                "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 37 | 
         
            +
                "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 38 | 
         
            +
                "cookinai/BruinHermes": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 39 | 
         
            +
                "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 40 | 
         
            +
                "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 41 | 
         
            +
                "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 42 | 
         
            +
                "rwitz2/pee": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 43 | 
         
            +
                "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/503",
         
     | 
| 44 | 
         
            +
                "dillfrescott/trinity-medium": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
         
     | 
| 45 | 
         
            +
                "udkai/Garrulus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/526",
         
     | 
| 46 | 
         
             
                "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
         
     | 
| 47 | 
         
            +
                "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
         
     | 
| 48 | 
         
            +
                "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
         
     | 
| 49 | 
         
            +
                "alnrg2arg/test2_3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
         
     | 
| 50 | 
         
            +
                "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
         
     | 
| 51 | 
         
            +
                "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
         
     | 
| 52 | 
         
            +
                "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
         
     | 
| 53 | 
         
             
                # Merges not indicated
         
     | 
| 54 | 
         
            +
                "gagan3012/MetaModelv2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 55 | 
         
            +
                "gagan3012/MetaModelv3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 56 | 
         
            +
                "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 57 | 
         
            +
                "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 58 | 
         
            +
                "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 59 | 
         
            +
                "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 60 | 
         
            +
                "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 61 | 
         
            +
                "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 62 | 
         
            +
                "rwitz/go-bruins-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 63 | 
         
            +
                "rwitz/go-bruins": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 64 | 
         
            +
                "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 65 | 
         
            +
                "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 66 | 
         
            +
                "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 67 | 
         
            +
                "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 68 | 
         
            +
                "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 69 | 
         
            +
                "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 70 | 
         
            +
                "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 71 | 
         
            +
                "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 72 | 
         
            +
                "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 73 | 
         
            +
                "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 74 | 
         
            +
                "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 75 | 
         
            +
                "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 76 | 
         
            +
                "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 77 | 
         
            +
                "elinas/chronos007-70b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 78 | 
         
            +
                "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 79 | 
         
            +
                "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 80 | 
         
            +
                "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 81 | 
         
            +
                "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 82 | 
         
            +
                "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 83 | 
         
            +
                "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 84 | 
         
            +
                "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 85 | 
         
            +
                "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 86 | 
         
            +
                "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
         
     | 
| 87 | 
         
            +
                "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
         
     | 
| 88 | 
         
            +
                "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
         
     | 
| 89 | 
         
            +
                "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
         
     | 
| 90 | 
         
            +
                "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
         
     | 
| 91 | 
         
            +
                "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
         
     | 
| 92 | 
         
            +
                "udkai/Turdus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 93 | 
         
            +
                "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 94 | 
         
            +
                "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 95 | 
         
            +
                "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 96 | 
         
            +
                "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 97 | 
         
            +
                "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 98 | 
         
            +
                "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 99 | 
         
            +
                "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 100 | 
         
            +
                "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 101 | 
         
            +
                "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 102 | 
         
            +
                "cookinai/OpenCM-14": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 103 | 
         
            +
                "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 104 | 
         
            +
                "jan-hq/supermario-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 105 | 
         
             
                # MoErges
         
     | 
| 106 | 
         
            +
                "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 107 | 
         
            +
                "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 108 | 
         
            +
                "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 109 | 
         
            +
                "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 110 | 
         
            +
                "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 111 | 
         
            +
                "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 112 | 
         
            +
                "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 113 | 
         
            +
                "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 114 | 
         
            +
                "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 115 | 
         
            +
                "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 116 | 
         
            +
                "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
         
     | 
| 117 | 
         
             
                # Other - contamination mostly
         
     | 
| 118 | 
         
            +
                "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/566",
         
     | 
| 119 | 
         
            +
                "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/556",
         
     | 
| 120 | 
         
            +
                "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
         
     | 
| 121 | 
         
            +
                "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
         
     | 
| 122 | 
         
             
            }
         
     | 
| 123 | 
         | 
| 124 | 
         
             
            # Models which have been requested by orgs to not be submitted on the leaderboard
         
     | 
| 
         | 
|
| 167 | 
         
             
                    leaderboard_data.pop(ix)
         
     | 
| 168 | 
         
             
                return leaderboard_data
         
     | 
| 169 | 
         | 
| 170 | 
         
            +
            """
         
     | 
| 171 | 
         
            +
            def remove_forbidden_models(leaderboard_data):
         
     | 
| 172 | 
         
            +
                #Removes models from the leaderboard based on the DO_NOT_SUBMIT list.
         
     | 
| 173 | 
         
            +
                indices_to_remove = []
         
     | 
| 174 | 
         
            +
                for ix, row in leaderboard_data.iterrows():
         
     | 
| 175 | 
         
            +
                    if row[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
         
     | 
| 176 | 
         
            +
                        indices_to_remove.append(ix)
         
     | 
| 177 | 
         
            +
             
     | 
| 178 | 
         
            +
                # Remove the models from the list
         
     | 
| 179 | 
         
            +
                return leaderboard_data.drop(indices_to_remove)
         
     | 
| 180 | 
         
            +
            """
         
     | 
| 181 | 
         
            +
             
     | 
| 182 | 
         | 
| 183 | 
         
             
            def filter_models_flags(leaderboard_data: list[dict]):
         
     | 
| 184 | 
         
             
                leaderboard_data = remove_forbidden_models(leaderboard_data)
         
     | 
| 
         @@ -1,261 +0,0 @@ 
     | 
|
| 1 | 
         
            -
            import json
         
     | 
| 2 | 
         
            -
            from pathlib import Path
         
     | 
| 3 | 
         
            -
            from json import JSONDecodeError
         
     | 
| 4 | 
         
            -
            import logging
         
     | 
| 5 | 
         
            -
            import math
         
     | 
| 6 | 
         
            -
             
     | 
| 7 | 
         
            -
            from dataclasses import dataclass, field
         
     | 
| 8 | 
         
            -
            from typing import Optional, Dict, List
         
     | 
| 9 | 
         
            -
             
     | 
| 10 | 
         
            -
            from tqdm import tqdm
         
     | 
| 11 | 
         
            -
            from tqdm.contrib.logging import logging_redirect_tqdm
         
     | 
| 12 | 
         
            -
             
     | 
| 13 | 
         
            -
            import numpy as np
         
     | 
| 14 | 
         
            -
             
     | 
| 15 | 
         
            -
            from src.display.formatting import make_clickable_model
         
     | 
| 16 | 
         
            -
            from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
         
     | 
| 17 | 
         
            -
             
     | 
| 18 | 
         
            -
            # Configure logging
         
     | 
| 19 | 
         
            -
            logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
         
     | 
| 20 | 
         
            -
             
     | 
| 21 | 
         
            -
             
     | 
| 22 | 
         
            -
            @dataclass
         
     | 
| 23 | 
         
            -
            class EvalResult:
         
     | 
| 24 | 
         
            -
                # Also see src.display.utils.AutoEvalColumn for what will be displayed.
         
     | 
| 25 | 
         
            -
                eval_name: str  # org_model_precision (uid)
         
     | 
| 26 | 
         
            -
                full_model: str  # org/model (path on hub)
         
     | 
| 27 | 
         
            -
                org: Optional[str]
         
     | 
| 28 | 
         
            -
                model: str
         
     | 
| 29 | 
         
            -
                revision: str  # commit hash, "" if main
         
     | 
| 30 | 
         
            -
                results: Dict[str, float]
         
     | 
| 31 | 
         
            -
                precision: Precision = Precision.Unknown
         
     | 
| 32 | 
         
            -
                model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
         
     | 
| 33 | 
         
            -
                weight_type: WeightType = WeightType.Original
         
     | 
| 34 | 
         
            -
                architecture: str = "Unknown"  # From config file
         
     | 
| 35 | 
         
            -
                license: str = "?"
         
     | 
| 36 | 
         
            -
                likes: int = 0
         
     | 
| 37 | 
         
            -
                num_params: int = 0
         
     | 
| 38 | 
         
            -
                date: str = ""  # submission date of request file
         
     | 
| 39 | 
         
            -
                still_on_hub: bool = True
         
     | 
| 40 | 
         
            -
                is_merge: bool = False
         
     | 
| 41 | 
         
            -
                not_flagged: bool = False
         
     | 
| 42 | 
         
            -
                status: str = "FINISHED"
         
     | 
| 43 | 
         
            -
                # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
         
     | 
| 44 | 
         
            -
                tags: List[str] = field(default_factory=list)
         
     | 
| 45 | 
         
            -
             
     | 
| 46 | 
         
            -
                @classmethod
                def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
                    """Build an EvalResult from a single results JSON file.

                    The file's "config_general" section supplies the model name
                    ("model_name"), the precision ("model_dtype") and the revision
                    ("model_sha"); benchmark scores come from `extract_results`.

                    Parameters:
                    - json_filepath (str): path of the results file to read.

                    Returns:
                    - EvalResult: instance whose `eval_name` is "{org}_{model}_{precision}",
                    or "{model}_{precision}" when the model name has no "org/" prefix.
                    """
                    with open(json_filepath, "r") as handle:
                        payload = json.load(handle)

                    general = payload.get("config_general", {})
                    precision = Precision.from_str(general.get("model_dtype", "unknown"))

                    # Split on the first "/" only: everything after it is the model name.
                    full_model = general.get("model_name", "")
                    org, separator, model = full_model.partition("/")
                    if separator:
                        result_key = f"{org}_{model}_{precision.value.name}"
                    else:
                        # No organisation in the name: the whole string is the model.
                        org, model = None, full_model
                        result_key = f"{model}_{precision.value.name}"

                    scores = cls.extract_results(payload)

                    return cls(
                        eval_name=result_key,
                        full_model=full_model,
                        org=org,
                        model=model,
                        results=scores,
                        precision=precision,
                        revision=general.get("model_sha", ""),
                    )
         
     | 
| 77 | 
         
            -
             
     | 
| 78 | 
         
            -
                @staticmethod
                def extract_results(data: Dict) -> Dict[str, float]:
                    """
                    Extract and process benchmark results from a given dict.

                    Parameters:
                    - data (Dict): A dictionary containing benchmark data. This dictionary must
                    include 'versions' and 'results' keys with respective sub-data.

                    Returns:
                    - Dict[str, float]: A dictionary where keys are benchmark names and values
                    are the processed average scores as percentages.

                    Notes:
                    - The "hendrycksTest" benchmark is skipped entirely when the file reports
                    version 0 for one of the known outdated MMLU keys.
                    - A benchmark with no matching entry, or with an entry lacking the task's
                    metric, is left out of the returned dict.
                    - If any matching score is NaN, the benchmark result is set to 0.0.
                    - Otherwise the scores of all matching entries are averaged and
                    multiplied by 100.
                    """
                    outdated_mmlu_keys = ("harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra")

                    results = {}
                    for task in Tasks:
                        task = task.value

                        # We skip old mmlu entries. The skip has to apply to the task itself,
                        # so the check is a single condition rather than a `continue` buried
                        # in an inner loop (which would only advance that inner loop).
                        if task.benchmark == "hendrycksTest" and any(
                            data["versions"].get(mmlu_k) == 0 for mmlu_k in outdated_mmlu_keys
                        ):
                            continue

                        # Result keys embed the benchmark name (e.g. harness|truthfulqa:mc|0
                        # for truthfulqa:mc), hence the substring match over the whole dict.
                        scores = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
                        if not scores or any(score is None for score in scores):
                            continue

                        # Some benchmark values are NaNs, mostly truthfulQA. Decide this before
                        # averaging, otherwise the NaN would propagate into the mean.
                        if any(math.isnan(float(score)) for score in scores):
                            results[task.benchmark] = 0.0
                            continue

                        # We average all scores of a given metric (mostly for mmlu)
                        results[task.benchmark] = np.mean(np.array(scores)) * 100.0

                    return results
         
     | 
| 123 | 
         
            -
             
     | 
| 124 | 
         
            -
                def update_with_request_file(self, requests_path):
                    """Finds the relevant request file for the current model and updates info with it.

                    On success, `model_type`, `weight_type`, `num_params`, `date`,
                    `architecture` and `status` are overwritten from the request JSON.
                    On any failure (no request file, unreadable/invalid JSON, unknown
                    weight type, ...) the error is logged and `status` is set to "FAILED";
                    no exception is propagated to the caller.

                    Parameters:
                    - requests_path: root folder passed to `get_request_file_for_model`.
                    """
                    # Bound before the `try` so the FileNotFoundError handler below can always
                    # format it, even when the lookup itself is what raised.
                    request_file = None
                    try:
                        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
                        if request_file is None:
                            logging.warning(f"No request file for {self.org}/{self.model}")
                            self.status = "FAILED"
                            return

                        with open(request_file, "r") as f:
                            request = json.load(f)

                        self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
                        # Enum lookup by name: an unknown weight type raises KeyError (handled below).
                        self.weight_type = WeightType[request.get("weight_type", "Original")]
                        # NOTE(review): int() truncates fractional values - confirm "params" is
                        # never stored as a fractional number in request files.
                        self.num_params = int(request.get("params", 0))
                        self.date = request.get("submitted_time", "")
                        self.architecture = request.get("architectures", "Unknown")
                        self.status = request.get("status", "FAILED")

                    except FileNotFoundError:
                        self.status = "FAILED"
                        logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
                    except JSONDecodeError:
                        self.status = "FAILED"
                        logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
                    except KeyError as e:
                        self.status = "FAILED"
                        logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
                    except Exception as e:  # Catch-all for any other unexpected exceptions
                        self.status = "FAILED"
                        logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
         
     | 
| 155 | 
         
            -
             
     | 
| 156 | 
         
            -
                def update_with_dynamic_file_dict(self, file_dict):
                    """Update license, likes, hub presence, tags and flag state from a dict.

                    Parameters:
                    - file_dict: mapping that may contain "license", "likes",
                    "still_on_hub" and "tags". Missing keys fall back to "?", 0, False
                    and an empty list respectively; "likes" and "tags" also fall back
                    when the key is present with a None value.
                    """
                    self.license = file_dict.get("license", "?")

                    # A None value would make int() raise, so treat it like a missing key.
                    likes = file_dict.get("likes")
                    self.likes = 0 if likes is None else int(likes)

                    self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing

                    # Same for tags: None is normalised to an empty list so it can be iterated.
                    tags = file_dict.get("tags")
                    self.tags = [] if tags is None else tags

                    # The model counts as flagged as soon as one tag contains "flagged".
                    self.not_flagged = not any("flagged" in tag for tag in self.tags)
         
     | 
| 166 | 
         
            -
             
     | 
| 167 | 
         
            -
                def to_dict(self):
                    """Return this result as a flat dict of display columns.

                    The average is the sum of the non-None scores divided by the number
                    of entries in `Tasks` (not by the number of scores present).

                    Raises:
                    - KeyError: when `self.results` has no entry for one of the
                    benchmarks listed in `Tasks`.
                    """
                    present_scores = (score for score in self.results.values() if score is not None)
                    average = sum(present_scores) / len(Tasks)

                    # Both columns are stored negated: True means the marker is absent.
                    no_merge_tag = not (self.tags and "merge" in self.tags)
                    no_moe_marker = not (self.tags and "moe" in self.tags) and "moe" not in self.full_model.lower()

                    data_dict = {
                        "eval_name": self.eval_name,  # not a column, just a save name,
                        AutoEvalColumn.precision.name: self.precision.value.name,
                        AutoEvalColumn.model_type.name: self.model_type.value.name,
                        AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
                        AutoEvalColumn.weight_type.name: self.weight_type.value.name,
                        AutoEvalColumn.architecture.name: self.architecture,
                        AutoEvalColumn.model.name: make_clickable_model(self.full_model),
                        AutoEvalColumn.fullname.name: self.full_model,
                        AutoEvalColumn.revision.name: self.revision,
                        AutoEvalColumn.average.name: average,
                        AutoEvalColumn.license.name: self.license,
                        AutoEvalColumn.likes.name: self.likes,
                        AutoEvalColumn.params.name: self.num_params,
                        AutoEvalColumn.still_on_hub.name: self.still_on_hub,
                        AutoEvalColumn.merged.name: no_merge_tag,
                        AutoEvalColumn.moe.name: no_moe_marker,
                        AutoEvalColumn.not_flagged.name: self.not_flagged,
                    }

                    # One column per benchmark; plain indexing so a missing score raises.
                    data_dict.update((task.value.col_name, self.results[task.value.benchmark]) for task in Tasks)

                    return data_dict
         
     | 
| 196 | 
         
            -
             
     | 
| 197 | 
         
            -
             
     | 
| 198 | 
         
            -
            def get_request_file_for_model(requests_path, model_name, precision):
                """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED.

                Parameters:
                - requests_path: folder searched for "{model_name}_eval_request_*.json".
                - model_name: model identifier used to build the glob pattern.
                - precision: precision name; only the part after the last "." is compared
                with the "precision" field of each request file.

                Returns:
                - The path (str) of a request file whose status is "FINISHED" and whose
                precision matches, or None when no file qualifies.

                Raises:
                - KeyError: when a candidate file lacks "status" or "precision".
                """
                requests_path = Path(requests_path)
                pattern = f"{model_name}_eval_request_*.json"
                wanted_precision = precision.split(".")[-1]

                # The result lives in its own variable: reusing the loop variable would
                # return the last file visited even when it does not match.
                selected = None
                # Candidates are visited in descending name order.
                for candidate in sorted(requests_path.glob(pattern), reverse=True):
                    with candidate.open("r") as f:
                        req_content = json.load(f)
                    if req_content["status"] == "FINISHED" and req_content["precision"] == wanted_precision:
                        # NOTE(review): later matches overwrite earlier ones, so the match
                        # with the smallest name wins - confirm this is the intended pick.
                        selected = str(candidate)

                return selected
         
     | 
| 219 | 
         
            -
             
     | 
| 220 | 
         
            -
             
     | 
| 221 | 
         
            -
            def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
                """Collect the complete, FINISHED evaluation results found under a results folder.

                Every "results_*.json" file below `results_path` is turned into an
                EvalResult, enriched from its request file and from the dynamic-info JSON
                at `dynamic_path`. Results sharing the same `eval_name` are merged into the
                first one seen. Only entries whose status is "FINISHED" and whose
                `to_dict()` succeeds are returned.
                """
                with open(dynamic_path) as dynamic_file:
                    dynamic_data = json.load(dynamic_file)

                # Files are ordered by the timestamp that follows the "results_" prefix.
                result_files = sorted(
                    Path(results_path).rglob("results_*.json"),
                    key=lambda file: parse_datetime(file.stem.removeprefix("results_")),
                )

                gated_orgs = ["meta-llama/", "google/", "tiiuae/"]
                merged_by_name = {}
                for result_file in tqdm(result_files, desc="Processing model files"):
                    current = EvalResult.init_from_json_file(result_file)
                    with logging_redirect_tqdm():
                        current.update_with_request_file(requests_path)

                    if current.full_model in dynamic_data:
                        current.update_with_dynamic_file_dict(dynamic_data[current.full_model])
                        # Hardcoding because of gating problem
                        if any(org in current.full_model for org in gated_orgs):
                            current.still_on_hub = True

                    # Results of the same eval are stored together: the first instance is
                    # kept and only the non-None scores of later ones are folded into it.
                    name = current.eval_name
                    if name not in merged_by_name:
                        merged_by_name[name] = current
                    else:
                        fresh_scores = {k: v for k, v in current.results.items() if v is not None}
                        merged_by_name[name].results.update(fresh_scores)

                complete = []
                for name, candidate in merged_by_name.items():
                    if candidate.status != "FINISHED":
                        continue
                    try:
                        candidate.to_dict()  # we test if the dict version is complete
                    except KeyError as e:
                        # not all eval values present
                        logging.error(f"Error while checking model {name} {candidate.date} json, no key: {e}")
                    else:
                        complete.append(candidate)

                return complete
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         @@ -1,9 +1,9 @@ 
     | 
|
| 1 | 
         
             
            import pathlib
         
     | 
| 2 | 
         
             
            import pandas as pd
         
     | 
| 
         | 
|
| 3 | 
         
             
            from src.display.formatting import has_no_nan_values, make_clickable_model
         
     | 
| 4 | 
         
             
            from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
         
     | 
| 5 | 
         
             
            from src.leaderboard.filter_models import filter_models_flags
         
     | 
| 6 | 
         
            -
            from src.leaderboard.read_evals import get_raw_eval_results
         
     | 
| 7 | 
         
             
            from src.display.utils import load_json_data
         
     | 
| 8 | 
         | 
| 9 | 
         | 
| 
         @@ -39,14 +39,15 @@ def get_evaluation_queue_df(save_path, cols): 
     | 
|
| 39 | 
         
             
                return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
         
     | 
| 40 | 
         | 
| 41 | 
         | 
| 42 | 
         
            -
            def get_leaderboard_df( 
     | 
| 43 | 
         
             
                """Retrieve and process leaderboard data."""
         
     | 
| 44 | 
         
            -
                 
     | 
| 45 | 
         
            -
                 
     | 
| 46 | 
         
            -
                 
     | 
| 
         | 
|
| 47 | 
         | 
| 48 | 
         
            -
                df = pd.DataFrame.from_records( 
     | 
| 49 | 
         
             
                df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
         
     | 
| 50 | 
         
             
                df = df[cols].round(decimals=2)
         
     | 
| 51 | 
         
             
                df = df[has_no_nan_values(df, benchmark_cols)]
         
     | 
| 52 | 
         
            -
                return  
     | 
| 
         | 
|
| 1 | 
         
             
            import pathlib
         
     | 
| 2 | 
         
             
            import pandas as pd
         
     | 
| 3 | 
         
            +
            from datasets import Dataset
         
     | 
| 4 | 
         
             
            from src.display.formatting import has_no_nan_values, make_clickable_model
         
     | 
| 5 | 
         
             
            from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
         
     | 
| 6 | 
         
             
            from src.leaderboard.filter_models import filter_models_flags
         
     | 
| 
         | 
|
| 7 | 
         
             
            from src.display.utils import load_json_data
         
     | 
| 8 | 
         | 
| 9 | 
         | 
| 
         | 
|
| 39 | 
         
             
                return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
         
     | 
| 40 | 
         | 
| 41 | 
         | 
| 42 | 
         
            +
            def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols: list):
                """Turn the leaderboard dataset into the sorted, filtered display dataframe.

                The dataset is converted to one dict per row and passed to
                `filter_models_flags` (its return value is not used, so it presumably
                edits the records in place - verify against its definition). The rows are
                then sorted by average score in descending order, restricted to `cols`,
                rounded to two decimals and filtered with the mask computed by
                `has_no_nan_values` over `benchmark_cols`.
                """
                columns = leaderboard_dataset.to_dict()
                # Transpose the column-oriented mapping into row-oriented records.
                records = [
                    {name: values[row] for name, values in columns.items()}
                    for row in range(leaderboard_dataset.num_rows)
                ]
                filter_models_flags(records)

                leaderboard = pd.DataFrame.from_records(records)
                leaderboard = leaderboard.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
                leaderboard = leaderboard[cols].round(decimals=2)
                return leaderboard[has_no_nan_values(leaderboard, benchmark_cols)]
         
     | 
| 
         @@ -1,129 +0,0 @@ 
     | 
|
| 1 | 
         
            -
            import json
         
     | 
| 2 | 
         
            -
            import os
         
     | 
| 3 | 
         
            -
            import time
         
     | 
| 4 | 
         
            -
             
     | 
| 5 | 
         
            -
            from huggingface_hub import snapshot_download
         
     | 
| 6 | 
         
            -
             
     | 
| 7 | 
         
            -
            from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
         
     | 
| 8 | 
         
            -
            from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
         
     | 
| 9 | 
         
            -
             
     | 
| 10 | 
         
            -
             
     | 
| 11 | 
         
            -
            def update_one_model(model_id, data, models_on_the_hub):
                """Refresh the hub metadata stored in ``data`` for ``model_id``.

                ``models_on_the_hub`` maps model ids to their hub listing entries.
                ``data`` is updated in place and also returned.
                """
                if model_id not in models_on_the_hub:
                    # Model no longer on the hub at all: reset its dynamic fields.
                    data.update(still_on_hub=False, likes=0, downloads=0, created_at="", tags=[])
                    return data

                # Grabbing model parameters
                hub_entry = models_on_the_hub[model_id]
                data["likes"] = hub_entry.likes
                data["downloads"] = hub_entry.downloads
                data["created_at"] = str(hub_entry.created_at)
                card_data = hub_entry.card_data
                data["license"] = "" if card_data is None else card_data.license

                # Grabbing model details
                lookup_name = model_id
                if card_data is not None and isinstance(card_data.base_model, str):
                    # for adapters, we look at the parent model
                    lookup_name = card_data.base_model

                on_hub, _, _ = is_model_on_hub(
                    model_name=lookup_name,
                    revision=data.get("revision"),
                    trust_remote_code=True,
                    test_tokenizer=False,
                    token=H4_TOKEN,
                )

                # If the model doesn't have a model card or a license, we consider it's deleted
                if on_hub:
                    try:
                        status, _, model_card = check_model_card(model_id)
                    except Exception:
                        model_card = None
                        on_hub = False
                    else:
                        on_hub = status is not False
                data["still_on_hub"] = on_hub

                # Tags are only looked up for models that are still available.
                data["tags"] = get_model_tags(model_card, model_id) if on_hub else []
                return data
         
     | 
| 55 | 
         
            -
             
     | 
| 56 | 
         
            -
             
     | 
| 57 | 
         
            -
            def update_models(file_path, models_on_the_hub):
                """Refresh the metadata of every model recorded in the JSON file at ``file_path``.

                The file is expected to hold a dict mapping model ids to their info
                dicts. Each entry is passed through ``update_one_model``. Models that
                have an eval request file under ``EVAL_REQUESTS_PATH`` but no entry in
                the file yet are added with freshly built info. The result is written
                back to ``file_path``.

                Args:
                    file_path: path of the JSON file to read and overwrite.
                    models_on_the_hub: mapping of model id to hub listing entry,
                        forwarded to ``update_one_model``.
                """
                with open(file_path, "r") as f:
                    model_infos = json.load(f)

                # A set keeps the membership checks below O(1).
                seen_models = set(model_infos)
                for model_id in seen_models:
                    model_infos[model_id] = update_one_model(
                        model_id=model_id, data=model_infos[model_id], models_on_the_hub=models_on_the_hub
                    )

                # If new requests files have been created since we started all this
                # we grab them
                all_models = []
                try:
                    for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
                        if ix == 0:
                            # Skip the top-level folder: request files live in per-user subfolders.
                            continue
                        for file in files:
                            if "eval_request" in file:
                                path = os.path.basename(root) + "/" + file.split("_eval_request")[0]
                                all_models.append(path)
                except Exception as e:
                    # Best effort: a failure while scanning must not block the update.
                    print(e)

                for model_id in all_models:
                    if model_id not in seen_models:
                        # Several request files can exist for the same model; only
                        # process it once.
                        seen_models.add(model_id)
                        model_infos[model_id] = update_one_model(model_id=model_id, data={}, models_on_the_hub=models_on_the_hub)

                with open(file_path, "w") as f:
                    json.dump(model_infos, f, indent=2)
         
     | 
| 92 | 
         
            -
             
     | 
| 93 | 
         
            -
             
     | 
| 94 | 
         
            -
            def update_dynamic_files():
                """Download the dynamic info dataset, refresh its model metadata, and push it back.

                Steps: snapshot the ``DYNAMIC_INFO_REPO`` dataset locally, list the
                models on the hub, run ``update_models`` on the dynamic info file, then
                upload that file to the same dataset repo. Progress is printed with an
                ``UPDATE_DYNAMIC`` prefix.
                """
                snapshot_download(
                    repo_id=DYNAMIC_INFO_REPO,
                    local_dir=DYNAMIC_INFO_PATH,
                    repo_type="dataset",
                    tqdm_class=None,
                    etag_timeout=30,
                )
                print("UPDATE_DYNAMIC: Loaded snapshot")

                # Get models (no task filter is applied to the listing).
                listing_started = time.time()
                hub_models = API.list_models(full=False, cardData=True, fetch_config=True)
                id_to_model = {hub_model.id: hub_model for hub_model in hub_models}
                print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - listing_started:.2f} seconds")

                update_started = time.time()
                update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
                print(f"UPDATE_DYNAMIC: updated in {time.time() - update_started:.2f} seconds")

                # The file is uploaded under its bare file name at the repo root.
                file_name = DYNAMIC_INFO_FILE_PATH.split("/")[-1]
                API.upload_file(
                    path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
                    path_in_repo=file_name,
                    repo_id=DYNAMIC_INFO_REPO,
                    repo_type="dataset",
                    commit_message="Daily request file update.",
                )
                print("UPDATE_DYNAMIC: pushed to hub")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         @@ -13,7 +13,7 @@ from src.envs import HAS_HIGHER_RATE_LIMIT 
     | 
|
| 13 | 
         | 
| 14 | 
         | 
| 15 | 
         
             
            # ht to @Wauplin, thank you for the snippet!
         
     | 
| 16 | 
         
            -
            # See https://huggingface.co/spaces/ 
     | 
| 17 | 
         
             
            def check_model_card(repo_id: str) -> tuple[bool, str]:
         
     | 
| 18 | 
         
             
                # Returns operation status, and error message
         
     | 
| 19 | 
         
             
                try:
         
     | 
| 
         | 
|
| 13 | 
         | 
| 14 | 
         | 
| 15 | 
         
             
            # ht to @Wauplin, thank you for the snippet!
         
     | 
| 16 | 
         
            +
            # See https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/317
         
     | 
| 17 | 
         
             
            def check_model_card(repo_id: str) -> tuple[bool, str]:
         
     | 
| 18 | 
         
             
                # Returns operation status, and error message
         
     | 
| 19 | 
         
             
                try:
         
     | 
| 
         @@ -2,16 +2,11 @@ import json 
     | 
|
| 2 | 
         
             
            import os
         
     | 
| 3 | 
         
             
            from datetime import datetime, timezone
         
     | 
| 4 | 
         | 
| 5 | 
         
            -
            from huggingface_hub import snapshot_download
         
     | 
| 6 | 
         
            -
             
     | 
| 7 | 
         
             
            from src.display.formatting import styled_error, styled_message, styled_warning
         
     | 
| 8 | 
         
             
            from src.envs import (
         
     | 
| 9 | 
         
             
                API,
         
     | 
| 10 | 
         
            -
                DYNAMIC_INFO_FILE_PATH,
         
     | 
| 11 | 
         
            -
                DYNAMIC_INFO_PATH,
         
     | 
| 12 | 
         
            -
                DYNAMIC_INFO_REPO,
         
     | 
| 13 | 
         
             
                EVAL_REQUESTS_PATH,
         
     | 
| 14 | 
         
            -
                 
     | 
| 15 | 
         
             
                QUEUE_REPO,
         
     | 
| 16 | 
         
             
                RATE_LIMIT_PERIOD,
         
     | 
| 17 | 
         
             
                RATE_LIMIT_QUOTA,
         
     | 
| 
         @@ -35,7 +30,6 @@ def add_new_eval( 
     | 
|
| 35 | 
         
             
                base_model: str,
         
     | 
| 36 | 
         
             
                revision: str,
         
     | 
| 37 | 
         
             
                precision: str,
         
     | 
| 38 | 
         
            -
                private: bool,
         
     | 
| 39 | 
         
             
                weight_type: str,
         
     | 
| 40 | 
         
             
                model_type: str,
         
     | 
| 41 | 
         
             
            ):
         
     | 
| 
         @@ -80,7 +74,7 @@ def add_new_eval( 
     | 
|
| 80 | 
         
             
                # Is the model on the hub?
         
     | 
| 81 | 
         
             
                if weight_type in ["Delta", "Adapter"]:
         
     | 
| 82 | 
         
             
                    base_model_on_hub, error, _ = is_model_on_hub(
         
     | 
| 83 | 
         
            -
                        model_name=base_model, revision=revision, token= 
     | 
| 84 | 
         
             
                    )
         
     | 
| 85 | 
         
             
                    if not base_model_on_hub:
         
     | 
| 86 | 
         
             
                        return styled_error(f'Base model "{base_model}" {error}')
         
     | 
| 
         @@ -126,7 +120,6 @@ def add_new_eval( 
     | 
|
| 126 | 
         
             
                    "model": model,
         
     | 
| 127 | 
         
             
                    "base_model": base_model,
         
     | 
| 128 | 
         
             
                    "revision": model_info.sha, # force to use the exact model commit 
         
     | 
| 129 | 
         
            -
                    "private": private,
         
     | 
| 130 | 
         
             
                    "precision": precision,
         
     | 
| 131 | 
         
             
                    "params": model_size,
         
     | 
| 132 | 
         
             
                    "architectures": architecture,
         
     | 
| 
         @@ -154,7 +147,7 @@ def add_new_eval( 
     | 
|
| 154 | 
         
             
                print("Creating eval file")
         
     | 
| 155 | 
         
             
                OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
         
     | 
| 156 | 
         
             
                os.makedirs(OUT_DIR, exist_ok=True)
         
     | 
| 157 | 
         
            -
                out_path = f"{OUT_DIR}/{model_path} 
     | 
| 158 | 
         | 
| 159 | 
         
             
                with open(out_path, "w") as f:
         
     | 
| 160 | 
         
             
                    f.write(json.dumps(eval_entry))
         
     | 
| 
         @@ -168,26 +161,6 @@ def add_new_eval( 
     | 
|
| 168 | 
         
             
                    commit_message=f"Add {model} to eval queue",
         
     | 
| 169 | 
         
             
                )
         
     | 
| 170 | 
         | 
| 171 | 
         
            -
                # We want to grab the latest version of the submission file to not accidentally overwrite it
         
     | 
| 172 | 
         
            -
                snapshot_download(
         
     | 
| 173 | 
         
            -
                    repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
         
     | 
| 174 | 
         
            -
                )
         
     | 
| 175 | 
         
            -
             
     | 
| 176 | 
         
            -
                with open(DYNAMIC_INFO_FILE_PATH) as f:
         
     | 
| 177 | 
         
            -
                    all_supplementary_info = json.load(f)
         
     | 
| 178 | 
         
            -
             
     | 
| 179 | 
         
            -
                all_supplementary_info[model] = supplementary_info
         
     | 
| 180 | 
         
            -
                with open(DYNAMIC_INFO_FILE_PATH, "w") as f:
         
     | 
| 181 | 
         
            -
                    json.dump(all_supplementary_info, f, indent=2)
         
     | 
| 182 | 
         
            -
             
     | 
| 183 | 
         
            -
                API.upload_file(
         
     | 
| 184 | 
         
            -
                    path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
         
     | 
| 185 | 
         
            -
                    path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
         
     | 
| 186 | 
         
            -
                    repo_id=DYNAMIC_INFO_REPO,
         
     | 
| 187 | 
         
            -
                    repo_type="dataset",
         
     | 
| 188 | 
         
            -
                    commit_message=f"Add {model} to dynamic info queue",
         
     | 
| 189 | 
         
            -
                )
         
     | 
| 190 | 
         
            -
             
     | 
| 191 | 
         
             
                # Remove the local file
         
     | 
| 192 | 
         
             
                os.remove(out_path)
         
     | 
| 193 | 
         | 
| 
         | 
|
| 2 | 
         
             
            import os
         
     | 
| 3 | 
         
             
            from datetime import datetime, timezone
         
     | 
| 4 | 
         | 
| 
         | 
|
| 
         | 
|
| 5 | 
         
             
            from src.display.formatting import styled_error, styled_message, styled_warning
         
     | 
| 6 | 
         
             
            from src.envs import (
         
     | 
| 7 | 
         
             
                API,
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 8 | 
         
             
                EVAL_REQUESTS_PATH,
         
     | 
| 9 | 
         
            +
                HF_TOKEN,
         
     | 
| 10 | 
         
             
                QUEUE_REPO,
         
     | 
| 11 | 
         
             
                RATE_LIMIT_PERIOD,
         
     | 
| 12 | 
         
             
                RATE_LIMIT_QUOTA,
         
     | 
| 
         | 
|
| 30 | 
         
             
                base_model: str,
         
     | 
| 31 | 
         
             
                revision: str,
         
     | 
| 32 | 
         
             
                precision: str,
         
     | 
| 
         | 
|
| 33 | 
         
             
                weight_type: str,
         
     | 
| 34 | 
         
             
                model_type: str,
         
     | 
| 35 | 
         
             
            ):
         
     | 
| 
         | 
|
| 74 | 
         
             
                # Is the model on the hub?
         
     | 
| 75 | 
         
             
                if weight_type in ["Delta", "Adapter"]:
         
     | 
| 76 | 
         
             
                    base_model_on_hub, error, _ = is_model_on_hub(
         
     | 
| 77 | 
         
            +
                        model_name=base_model, revision=revision, token=HF_TOKEN, test_tokenizer=True
         
     | 
| 78 | 
         
             
                    )
         
     | 
| 79 | 
         
             
                    if not base_model_on_hub:
         
     | 
| 80 | 
         
             
                        return styled_error(f'Base model "{base_model}" {error}')
         
     | 
| 
         | 
|
| 120 | 
         
             
                    "model": model,
         
     | 
| 121 | 
         
             
                    "base_model": base_model,
         
     | 
| 122 | 
         
             
                    "revision": model_info.sha, # force to use the exact model commit 
         
     | 
| 
         | 
|
| 123 | 
         
             
                    "precision": precision,
         
     | 
| 124 | 
         
             
                    "params": model_size,
         
     | 
| 125 | 
         
             
                    "architectures": architecture,
         
     | 
| 
         | 
|
| 147 | 
         
             
                print("Creating eval file")
         
     | 
| 148 | 
         
             
                OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
         
     | 
| 149 | 
         
             
                os.makedirs(OUT_DIR, exist_ok=True)
         
     | 
| 150 | 
         
            +
                out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
         
     | 
| 151 | 
         | 
| 152 | 
         
             
                with open(out_path, "w") as f:
         
     | 
| 153 | 
         
             
                    f.write(json.dumps(eval_entry))
         
     | 
| 
         | 
|
| 161 | 
         
             
                    commit_message=f"Add {model} to eval queue",
         
     | 
| 162 | 
         
             
                )
         
     | 
| 163 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 164 | 
         
             
                # Remove the local file
         
     | 
| 165 | 
         
             
                os.remove(out_path)
         
     | 
| 166 | 
         | 
| 
         @@ -1,76 +0,0 @@ 
     | 
|
| 1 | 
         
            -
            import pandas as pd
         
     | 
| 2 | 
         
            -
            from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
         
     | 
| 3 | 
         
            -
            from huggingface_hub.utils._errors import HfHubHTTPError
         
     | 
| 4 | 
         
            -
            from pandas import DataFrame
         
     | 
| 5 | 
         
            -
             
     | 
| 6 | 
         
            -
            from src.display.utils import AutoEvalColumn, ModelType
         
     | 
| 7 | 
         
            -
            from src.envs import H4_TOKEN, PATH_TO_COLLECTION
         
     | 
| 8 | 
         
            -
             
     | 
| 9 | 
         
            -
            # Specific intervals for the collections
         
     | 
| 10 | 
         
            -
            intervals = {
                # Only the smallest bucket includes its upper bound; all others are open on both sides.
                "1B": pd.Interval(left=0, right=1.5, closed="right"),
                "3B": pd.Interval(left=2.5, right=3.5, closed="neither"),
                "7B": pd.Interval(left=6, right=8, closed="neither"),
                "13B": pd.Interval(left=10, right=14, closed="neither"),
                "30B": pd.Interval(left=25, right=35, closed="neither"),
                "65B": pd.Interval(left=60, right=70, closed="neither"),
            }
         
     | 
| 18 | 
         
            -
             
     | 
| 19 | 
         
            -
             
     | 
| 20 | 
         
            -
            def _filter_by_type_and_size(df, model_type, size_interval):
                """Return the rows of ``df`` matching ``model_type`` whose parameter count is in ``size_interval``."""
                # Only the first character of the type symbol is compared.
                symbol = model_type.value.symbol[0]
                same_type_rows = df[df[AutoEvalColumn.model_type_symbol.name] == symbol]
                # Values that cannot be parsed as numbers are coerced to NaN.
                sizes = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
                # NOTE(review): the mask is built over the full ``df`` and applied to the
                # type-filtered frame, so ``.loc`` has to match it up by index label.
                in_interval = sizes.apply(lambda value: value in size_interval)
                return same_type_rows.loc[in_interval]
         
     | 
| 27 | 
         
            -
             
     | 
| 28 | 
         
            -
             
     | 
| 29 | 
         
            -
            def _add_models_to_collection(collection, models, model_type, size):
                """Add the best available model to the collection and fix its position.

                Candidates in ``models`` are tried in order. The loop stops after the
                first candidate whose hub calls succeed; on an ``HfHubHTTPError`` it
                moves on to the next candidate.

                Args:
                    collection: current collection object; only its ``items`` are read.
                    models: candidate model ids, best first.
                    model_type: model type, used to build the item note.
                    size: size label (e.g. ``"7B"``), used to build the item note.
                """
                cur_len_collection = len(collection.items)
                for ix, model in enumerate(models, start=1):
                    try:
                        collection = add_collection_item(
                            PATH_TO_COLLECTION,
                            item_id=model,
                            item_type="model",
                            exists_ok=True,
                            note=f"Best {model_type.to_str(' ')} model of around {size} on the leaderboard today!",
                            token=H4_TOKEN,
                        )
                        # Ensure position is correct if item was added
                        if len(collection.items) > cur_len_collection:
                            item_object_id = collection.items[-1].item_object_id
                            # Authenticate with the same token as the add call: an
                            # auth failure here would be caught below and make the loop
                            # add the next candidate too.
                            update_collection_item(
                                collection_slug=PATH_TO_COLLECTION,
                                item_object_id=item_object_id,
                                position=ix,
                                token=H4_TOKEN,
                            )
                            cur_len_collection = len(collection.items)
                        break  # assuming we only add the top model
                    except HfHubHTTPError:
                        continue
         
     | 
| 50 | 
         
            -
             
     | 
| 51 | 
         
            -
             
     | 
| 52 | 
         
            -
            def update_collections(df: DataFrame):
         
     | 
| 53 | 
         
            -
                """Update collections by filtering and adding the best models."""
         
     | 
| 54 | 
         
            -
                collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
         
     | 
| 55 | 
         
            -
                cur_best_models = []
         
     | 
| 56 | 
         
            -
             
     | 
| 57 | 
         
            -
                for model_type in ModelType:
         
     | 
| 58 | 
         
            -
                    if not model_type.value.name:
         
     | 
| 59 | 
         
            -
                        continue
         
     | 
| 60 | 
         
            -
                    for size, interval in intervals.items():
         
     | 
| 61 | 
         
            -
                        filtered_df = _filter_by_type_and_size(df, model_type, interval)
         
     | 
| 62 | 
         
            -
                        best_models = list(
         
     | 
| 63 | 
         
            -
                            filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.fullname.name][:10]
         
     | 
| 64 | 
         
            -
                        )
         
     | 
| 65 | 
         
            -
                        print(model_type.value.symbol, size, best_models)
         
     | 
| 66 | 
         
            -
                        _add_models_to_collection(collection, best_models, model_type, size)
         
     | 
| 67 | 
         
            -
                        cur_best_models.extend(best_models)
         
     | 
| 68 | 
         
            -
             
     | 
| 69 | 
         
            -
                # Cleanup
         
     | 
| 70 | 
         
            -
                existing_models = {item.item_id for item in collection.items}
         
     | 
| 71 | 
         
            -
                to_remove = existing_models - set(cur_best_models)
         
     | 
| 72 | 
         
            -
                for item_id in to_remove:
         
     | 
| 73 | 
         
            -
                    try:
         
     | 
| 74 | 
         
            -
                        delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
         
     | 
| 75 | 
         
            -
                    except HfHubHTTPError:
         
     | 
| 76 | 
         
            -
                        continue
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         
            File without changes
         
     | 
| 
         @@ -630,7 +630,7 @@ models = [ 
     | 
|
| 630 | 
         
             
                "WizardLM/WizardMath-7B-V1.0",
         
     | 
| 631 | 
         
             
                "Norquinal/llama-2-7b-claude-chat",
         
     | 
| 632 | 
         
             
                "TheTravellingEngineer/llama2-7b-chat-hf-dpo",
         
     | 
| 633 | 
         
            -
                "HuggingFaceH4/starchat-beta",
     | 
| 634 | 
         
             
                "joehuangx/spatial-vicuna-7b-v1.5-LoRA",
         
     | 
| 635 | 
         
             
                "conceptofmind/LLongMA-2-13b-16k",
         
     | 
| 636 | 
         
             
                "tianyil1/denas-llama2",
         
     | 
| 
         @@ -1039,7 +1039,7 @@ models = [ 
     | 
|
| 1039 | 
         
             
                "bhenrym14/airoboros-33b-gpt4-1.4.1-PI-8192-fp16",
         
     | 
| 1040 | 
         
             
                "EleutherAI/gpt-neo-2.7B",
         
     | 
| 1041 | 
         
             
                "danielhanchen/open_llama_3b_600bt_preview",
         
     | 
| 1042 | 
         
            -
                "HuggingFaceH4/starchat-alpha",
     | 
| 1043 | 
         
             
                "pythainlp/wangchanglm-7.5B-sft-en-sharded",
         
     | 
| 1044 | 
         
             
                "beaugogh/pythia-1.4b-deduped-sharegpt",
         
     | 
| 1045 | 
         
             
                "HWERI/pythia-1.4b-deduped-sharegpt",
         
     | 
| 
         | 
|
| 630 | 
         
             
                "WizardLM/WizardMath-7B-V1.0",
         
     | 
| 631 | 
         
             
                "Norquinal/llama-2-7b-claude-chat",
         
     | 
| 632 | 
         
             
                "TheTravellingEngineer/llama2-7b-chat-hf-dpo",
         
     | 
| 633 | 
         
            +
                "open-llm-leaderboard/starchat-beta",
         
     | 
| 634 | 
         
             
                "joehuangx/spatial-vicuna-7b-v1.5-LoRA",
         
     | 
| 635 | 
         
             
                "conceptofmind/LLongMA-2-13b-16k",
         
     | 
| 636 | 
         
             
                "tianyil1/denas-llama2",
         
     | 
| 
         | 
|
| 1039 | 
         
             
                "bhenrym14/airoboros-33b-gpt4-1.4.1-PI-8192-fp16",
         
     | 
| 1040 | 
         
             
                "EleutherAI/gpt-neo-2.7B",
         
     | 
| 1041 | 
         
             
                "danielhanchen/open_llama_3b_600bt_preview",
         
     | 
| 1042 | 
         
            +
                "open-llm-leaderboard/starchat-alpha",
         
     | 
| 1043 | 
         
             
                "pythainlp/wangchanglm-7.5B-sft-en-sharded",
         
     | 
| 1044 | 
         
             
                "beaugogh/pythia-1.4b-deduped-sharegpt",
         
     | 
| 1045 | 
         
             
                "HWERI/pythia-1.4b-deduped-sharegpt",
         
     | 
| 
         @@ -6,10 +6,9 @@ from plotly.graph_objs import Figure 
     | 
|
| 6 | 
         
             
            from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
         
     | 
| 7 | 
         
             
            from src.display.utils import human_baseline_row as HUMAN_BASELINE
         
     | 
| 8 | 
         
             
            from src.leaderboard.filter_models import FLAGGED_MODELS
         
     | 
| 9 | 
         
            -
            from src.leaderboard.read_evals import EvalResult
         
     | 
| 10 | 
         | 
| 11 | 
         | 
| 12 | 
         
            -
            def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     | 
| 13 | 
         
             
                """
         
     | 
| 14 | 
         
             
                Generates a DataFrame containing the maximum scores until each date.
         
     | 
| 15 | 
         | 
| 
         @@ -17,8 +16,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame: 
     | 
|
| 17 | 
         
             
                :return: A new DataFrame containing the maximum scores until each date for every metric.
         
     | 
| 18 | 
         
             
                """
         
     | 
| 19 | 
         
             
                # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
         
     | 
| 20 | 
         
            -
                results_df = pd. 
     | 
| 21 | 
         
            -
                # results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
         
     | 
| 22 | 
         
             
                results_df.sort_values(by="date", inplace=True)
         
     | 
| 23 | 
         | 
| 24 | 
         
             
                # Step 2: Initialize the scores dictionary
         
     | 
| 
         @@ -30,22 +28,18 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame: 
     | 
|
| 30 | 
         
             
                    last_date = ""
         
     | 
| 31 | 
         
             
                    column = task.col_name
         
     | 
| 32 | 
         
             
                    for _, row in results_df.iterrows():
         
     | 
| 33 | 
         
            -
                        current_model = row[ 
     | 
| 34 | 
         
             
                        # We ignore models that are flagged/no longer on the hub/not finished
         
     | 
| 35 | 
         
             
                        to_ignore = (
         
     | 
| 36 | 
         
            -
                            not row[ 
     | 
| 37 | 
         
            -
                            or not row[ 
     | 
| 38 | 
         
             
                            or current_model in FLAGGED_MODELS
         
     | 
| 39 | 
         
            -
                            or row["status"] != "FINISHED"
         
     | 
| 40 | 
         
             
                        )
         
     | 
| 41 | 
         
             
                        if to_ignore:
         
     | 
| 42 | 
         
             
                            continue
         
     | 
| 43 | 
         | 
| 44 | 
         
            -
                        current_date = row[ 
     | 
| 45 | 
         
            -
                         
     | 
| 46 | 
         
            -
                            current_score = np.mean(list(row["results"].values()))
         
     | 
| 47 | 
         
            -
                        else:
         
     | 
| 48 | 
         
            -
                            current_score = row["results"][task.benchmark]
         
     | 
| 49 | 
         | 
| 50 | 
         
             
                        if current_score > current_max:
         
     | 
| 51 | 
         
             
                            if current_date == last_date and len(scores[column]) > 0:
         
     | 
| 
         | 
|
| 6 | 
         
             
            from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
         
     | 
| 7 | 
         
             
            from src.display.utils import human_baseline_row as HUMAN_BASELINE
         
     | 
| 8 | 
         
             
            from src.leaderboard.filter_models import FLAGGED_MODELS
         
     | 
| 
         | 
|
| 9 | 
         | 
| 10 | 
         | 
| 11 | 
         
            +
            def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
         
     | 
| 12 | 
         
             
                """
         
     | 
| 13 | 
         
             
                Generates a DataFrame containing the maximum scores until each date.
         
     | 
| 14 | 
         | 
| 
         | 
|
| 16 | 
         
             
                :return: A new DataFrame containing the maximum scores until each date for every metric.
         
     | 
| 17 | 
         
             
                """
         
     | 
| 18 | 
         
             
                # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
         
     | 
| 19 | 
         
            +
                results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
         
     | 
| 
         | 
|
| 20 | 
         
             
                results_df.sort_values(by="date", inplace=True)
         
     | 
| 21 | 
         | 
| 22 | 
         
             
                # Step 2: Initialize the scores dictionary
         
     | 
| 
         | 
|
| 28 | 
         
             
                    last_date = ""
         
     | 
| 29 | 
         
             
                    column = task.col_name
         
     | 
| 30 | 
         
             
                    for _, row in results_df.iterrows():
         
     | 
| 31 | 
         
            +
                        current_model = row[AutoEvalColumn.fullname.name]
         
     | 
| 32 | 
         
             
                        # We ignore models that are flagged/no longer on the hub/not finished
         
     | 
| 33 | 
         
             
                        to_ignore = (
         
     | 
| 34 | 
         
            +
                            not row[AutoEvalColumn.still_on_hub.name]
         
     | 
| 35 | 
         
            +
                            or not row[AutoEvalColumn.not_flagged.name]
         
     | 
| 36 | 
         
             
                            or current_model in FLAGGED_MODELS
         
     | 
| 
         | 
|
| 37 | 
         
             
                        )
         
     | 
| 38 | 
         
             
                        if to_ignore:
         
     | 
| 39 | 
         
             
                            continue
         
     | 
| 40 | 
         | 
| 41 | 
         
            +
                        current_date = row[AutoEvalColumn.date.name]
         
     | 
| 42 | 
         
            +
                        current_score = row[task.col_name]
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 43 | 
         | 
| 44 | 
         
             
                        if current_score > current_max:
         
     | 
| 45 | 
         
             
                            if current_date == last_date and len(scores[column]) > 0:
         
     |