Spaces:

andrewrreed
/

closed-vs-open-arena-elo

Runtime error

App Files Community

andrewrreed committed on May 2, 2024

Commit

167137b

1 Parent(s): 311dc3a

Add filters

Browse files

Files changed (4) hide show

app.py +157 -101
release_date_mapping.json +1 -1
requirements.txt +2 -1
utils.py +60 -0

app.py CHANGED Viewed

@@ -2,113 +2,169 @@ import pickle
 import pandas as pd
 import gradio as gr
-from huggingface_hub import HfFileSystem, hf_hub_download
-if gr.NO_RELOAD:
-    ###################
-    ### Load Data
-    ###################
-    key_to_category_name = {
-        "full": "Overall",
-        "coding": "Coding",
-        "long_user": "Longer Query",
-        "english": "English",
-        "chinese": "Chinese",
-        "french": "French",
-        "no_tie": "Exclude Ties",
-        "no_short": "Exclude Short Query (< 5 tokens)",
-        "no_refusal": "Exclude Refusal",
-    }
-    cat_name_to_explanation = {
-        "Overall": "Overall Questions",
-        "Coding": "Coding: whether conversation contains code snippets",
-        "Longer Query": "Longer Query (>= 500 tokens)",
-        "English": "English Prompts",
-        "Chinese": "Chinese Prompts",
-        "French": "French Prompts",
-        "Exclude Ties": "Exclude Ties and Bothbad",
-        "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
-        "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
-    }
-    fs = HfFileSystem()
-    def extract_date(filename):
-        return filename.split("/")[-1].split(".")[0].split("_")[-1]
-    # gather ELO data
-    ELO_DATA_FILES = "spaces/lmsys/chatbot-arena-leaderboard/*.pkl"
-    elo_files = fs.glob(ELO_DATA_FILES)
-    latest_elo_file = sorted(elo_files, key=extract_date, reverse=True)[0]
-    latest_elo_file_local = hf_hub_download(
-        repo_id="lmsys/chatbot-arena-leaderboard",
-        filename=latest_elo_file.split("/")[-1],
-        repo_type="space",
     )
-    with open(latest_elo_file_local, "rb") as fin:
-        elo_results = pickle.load(fin)
-    arena_dfs = {}
-    for k in key_to_category_name.keys():
-        if k not in elo_results:
-            continue
-        arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
-    # gather open llm leaderboard data
-    LEADERBOARD_DATA_FILES = "spaces/lmsys/chatbot-arena-leaderboard/*.csv"
-    leaderboard_files = fs.glob(LEADERBOARD_DATA_FILES)
-    latest_leaderboard_file = sorted(leaderboard_files, key=extract_date, reverse=True)[
-        0
-    ]
-    latest_leaderboard_file_local = hf_hub_download(
-        repo_id="lmsys/chatbot-arena-leaderboard",
-        filename=latest_leaderboard_file.split("/")[-1],
-        repo_type="space",
     )
-    leaderboard_df = pd.read_csv(latest_leaderboard_file_local)
-    ###################
-    ### Prepare Data
-    ###################
-    # merge leaderboard data with ELO data
-    merged_dfs = {}
-    for k, v in arena_dfs.items():
-        merged_dfs[k] = (
-            pd.merge(arena_dfs[k], leaderboard_df, left_index=True, right_on="key")
-            .sort_values("rating", ascending=False)
-            .reset_index(drop=True)
-        )
-    # add release dates into the merged data
-    release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
-    for k, v in merged_dfs.items():
-        merged_dfs[k] = pd.merge(
-            merged_dfs[k], release_date_mapping[["key", "Release Date"]], on="key"
-        )
 df = merged_dfs["Overall"]
-y_min = df["rating"].min()
-y_max = df["rating"].max()
-y_buffer = (y_max - y_min) * 0.1
-with gr.Blocks() as demo:
-    gr.Markdown("# Chatbot Arena Leaderboard")
     with gr.Row():
-        gr.ScatterPlot(
-            df,
-            title="hello",
-            x="Release Date",
-            y="rating",
-            tooltip=["Model", "rating", "num_battles", "Organization", "License"],
-            width=1000,
-            height=700,
-            x_label_angle=-45,
-            y_lim=[y_min - y_buffer, y_max + y_buffer],
         )
-if __name__ == "__main__":
-    demo.launch()

 import pandas as pd
 import gradio as gr
+import plotly.express as px
+from utils import (
+    KEY_TO_CATEGORY_NAME,
+    PROPRIETARY_LICENSES,
+    download_latest_data_from_space,
+)
+# with gr.NO_RELOAD:
+###################
+### Load Data
+###################
+# gather ELO data
+latest_elo_file_local = download_latest_data_from_space(
+    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
+)
+with open(latest_elo_file_local, "rb") as fin:
+    elo_results = pickle.load(fin)
+arena_dfs = {}
+for k in KEY_TO_CATEGORY_NAME.keys():
+    if k not in elo_results:
+        continue
+    arena_dfs[KEY_TO_CATEGORY_NAME[k]] = elo_results[k]["leaderboard_table_df"]
+# gather open llm leaderboard data
+latest_leaderboard_file_local = download_latest_data_from_space(
+    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
+)
+leaderboard_df = pd.read_csv(latest_leaderboard_file_local)
+###################
+### Prepare Data
+###################
+# merge leaderboard data with ELO data
+merged_dfs = {}
+for k, v in arena_dfs.items():
+    merged_dfs[k] = (
+        pd.merge(arena_dfs[k], leaderboard_df, left_index=True, right_on="key")
+        .sort_values("rating", ascending=False)
+        .reset_index(drop=True)
     )
+# add release dates into the merged data
+release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
+for k, v in merged_dfs.items():
+    merged_dfs[k] = pd.merge(
+        merged_dfs[k], release_date_mapping[["key", "Release Date"]], on="key"
     )
 df = merged_dfs["Overall"]
+df["License"] = df["License"].apply(
+    lambda x: "Proprietary LLM" if x in PROPRIETARY_LICENSES else "Open LLM"
+)
+df["Release Date"] = pd.to_datetime(df["Release Date"])
+df["Month-Year"] = df["Release Date"].dt.to_period("M")
+df["rating"] = df["rating"].round()
+###################
+### Plot Data
+###################
+date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
+min_elo_score = df["rating"].min().round()
+max_elo_score = df["rating"].max().round()
+upper_models_per_month = int(
+    df.groupby(["Month-Year", "License"])["rating"].apply(lambda x: x.count()).max()
+)
+def build_plot(min_score, max_models_per_month, toggle_annotations):
+    filtered_df = df[(df["rating"] >= min_score)]
+    filtered_df = (
+        filtered_df.groupby(["Month-Year", "License"])
+        .apply(lambda x: x.nlargest(max_models_per_month, "rating"))
+        .reset_index(drop=True)
+    )
+    fig = px.scatter(
+        filtered_df,
+        x="Release Date",
+        y="rating",
+        color="License",
+        hover_name="Model",
+        hover_data=["Organization", "License"],
+        trendline="ols",
+        title=f"Proprietary vs Open LLMs (LMSYS Arena ELO as of {date_updated})",
+        labels={"rating": "Arena ELO", "Release Date": "Release Date"},
+        height=700,
+        template="seaborn",
+    )
+    fig.update_traces(marker=dict(size=10, opacity=0.6))
+    if toggle_annotations:
+        # get the points to annotate (only the highest rated model per month per license)
+        idx_to_annotate = filtered_df.groupby(["Month-Year", "License"])[
+            "rating"
+        ].idxmax()
+        points_to_annotate_df = filtered_df.loc[idx_to_annotate]
+        for i, row in points_to_annotate_df.iterrows():
+            fig.add_annotation(
+                x=row["Release Date"],
+                y=row["rating"],
+                text=row["Model"],
+                showarrow=True,
+                arrowhead=0,
+            )
+    return fig
+demo = gr.Blocks()
+with demo:
+    gr.Markdown("# Proprietary vs Open LLMs (LMSYS Arena ELO)")
     with gr.Row():
+        min_score = gr.Slider(
+            minimum=min_elo_score,
+            maximum=max_elo_score,
+            value=800,
+            step=50,
+            label="Minimum ELO Score",
+        )
+        max_models_per_month = gr.Slider(
+            value=upper_models_per_month,
+            minimum=1,
+            maximum=upper_models_per_month,
+            step=1,
+            label="Max Models per Month (per License)",
         )
+        toggle_annotations = gr.Radio(
+            choices=[True, False], label="Overlay Best Model Name", value=False
+        )
+    # Show plot
+    plot = gr.Plot()
+    demo.load(
+        fn=build_plot,
+        inputs=[min_score, max_models_per_month, toggle_annotations],
+        outputs=plot,
+    )
+    min_score.change(
+        fn=build_plot,
+        inputs=[min_score, max_models_per_month, toggle_annotations],
+        outputs=plot,
+    )
+    max_models_per_month.change(
+        fn=build_plot,
+        inputs=[min_score, max_models_per_month, toggle_annotations],
+        outputs=plot,
+    )
+    toggle_annotations.change(
+        fn=build_plot,
+        inputs=[min_score, max_models_per_month, toggle_annotations],
+        outputs=plot,
+    )
+demo.launch()
+# if __name__ == "__main__":

release_date_mapping.json CHANGED Viewed

@@ -7,7 +7,7 @@
     {
         "key": "gpt-4-1106-preview",
         "Model": "GPT-4-1106-preview",
-        "Release Date": "2024-11-06"
     },
     {
         "key": "claude-3-opus-20240229",

     {
         "key": "gpt-4-1106-preview",
         "Model": "GPT-4-1106-preview",
+        "Release Date": "2023-11-06"
     },
     {
         "key": "claude-3-opus-20240229",

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 huggingface_hub
 pandas
 plotly
-gradio

 huggingface_hub
 pandas
 plotly
+gradio
+statsmodels

utils.py ADDED Viewed

@@ -0,0 +1,60 @@

+from typing import Literal
+from huggingface_hub import HfFileSystem, hf_hub_download
+KEY_TO_CATEGORY_NAME = {
+    "full": "Overall",
+    "coding": "Coding",
+    "long_user": "Longer Query",
+    "english": "English",
+    "chinese": "Chinese",
+    "french": "French",
+    "no_tie": "Exclude Ties",
+    "no_short": "Exclude Short Query (< 5 tokens)",
+    "no_refusal": "Exclude Refusal",
+}
+CAT_NAME_TO_EXPLANATION = {
+    "Overall": "Overall Questions",
+    "Coding": "Coding: whether conversation contains code snippets",
+    "Longer Query": "Longer Query (>= 500 tokens)",
+    "English": "English Prompts",
+    "Chinese": "Chinese Prompts",
+    "French": "French Prompts",
+    "Exclude Ties": "Exclude Ties and Bothbad",
+    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
+    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
+}
+PROPRIETARY_LICENSES = [
+    "Proprietary",
+]
+def download_latest_data_from_space(
+    repo_id: str, file_type: Literal["pkl", "csv"]
+) -> str:
+    """
+    Downloads the latest data file of the specified file type from the given repository space.
+    Args:
+        repo_id (str): The ID of the repository space.
+        file_type (Literal["pkl", "csv"]): The type of the data file to download. Must be either "pkl" or "csv".
+    Returns:
+        str: The local file path of the downloaded data file.
+    """
+    def extract_date(filename):
+        return filename.split("/")[-1].split(".")[0].split("_")[-1]
+    fs = HfFileSystem()
+    data_file_path = f"spaces/{repo_id}/*.{file_type}"
+    files = fs.glob(data_file_path)
+    latest_file = sorted(files, key=extract_date, reverse=True)[0]
+    latest_filepath_local = hf_hub_download(
+        repo_id=repo_id,
+        filename=latest_file.split("/")[-1],
+        repo_type="space",
+    )
+    return latest_filepath_local