Spaces:

open-llm-leaderboard
/

open_llm_leaderboard

Running on CPU Upgrade

chriscanal committed on Oct 17, 2023

Commit

36bf409

1 Parent(s): 8e47868

Updated plotted models to exclude flagged models

Files changed (1) hide show

src/display_models/plot_results.py CHANGED Viewed

@@ -4,6 +4,7 @@ from plotly.graph_objs import Figure
 import pickle
 from datetime import datetime, timezone
 from typing import List, Dict, Tuple, Any
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -42,6 +43,9 @@ def join_model_info_with_results(results_df: pd.DataFrame) -> pd.DataFrame:
     # copy dataframe to avoid modifying the original
     df = results_df.copy(deep=True)
     # load cache from disk
     try:
         with open("model_info_cache.pkl", "rb") as f:
@@ -216,4 +220,4 @@ def create_metric_plot_obj(
 # Example Usage:
 # human_baselines dictionary is defined.
-# chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")

 import pickle
 from datetime import datetime, timezone
 from typing import List, Dict, Tuple, Any
+from src.display_models.model_metadata_flags import FLAGGED_MODELS
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
     # copy dataframe to avoid modifying the original
     df = results_df.copy(deep=True)
+    # Filter out FLAGGED_MODELS to ensure graph is not skewed by mistakes
+    df = df[~df["model_name_for_query"].isin(FLAGGED_MODELS.keys())].reset_index(drop=True)
     # load cache from disk
     try:
         with open("model_info_cache.pkl", "rb") as f:
 # Example Usage:
 # human_baselines dictionary is defined.
+# chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")