Spaces:
Runtime error
Runtime error
Commit
·
167137b
1
Parent(s):
311dc3a
Add filters
Browse files
- app.py +157 -101
- release_date_mapping.json +1 -1
- requirements.txt +2 -1
- utils.py +60 -0
app.py
CHANGED
|
@@ -2,113 +2,169 @@ import pickle
|
|
| 2 |
|
| 3 |
import pandas as pd
|
| 4 |
import gradio as gr
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
)
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
if k not in elo_results:
|
| 57 |
-
continue
|
| 58 |
-
arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
|
| 59 |
-
|
| 60 |
-
# gather open llm leaderboard data
|
| 61 |
-
LEADERBOARD_DATA_FILES = "spaces/lmsys/chatbot-arena-leaderboard/*.csv"
|
| 62 |
-
leaderboard_files = fs.glob(LEADERBOARD_DATA_FILES)
|
| 63 |
-
latest_leaderboard_file = sorted(leaderboard_files, key=extract_date, reverse=True)[
|
| 64 |
-
0
|
| 65 |
-
]
|
| 66 |
-
|
| 67 |
-
latest_leaderboard_file_local = hf_hub_download(
|
| 68 |
-
repo_id="lmsys/chatbot-arena-leaderboard",
|
| 69 |
-
filename=latest_leaderboard_file.split("/")[-1],
|
| 70 |
-
repo_type="space",
|
| 71 |
)
|
| 72 |
-
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)
|
| 73 |
-
|
| 74 |
-
###################
|
| 75 |
-
### Prepare Data
|
| 76 |
-
###################
|
| 77 |
-
|
| 78 |
-
# merge leaderboard data with ELO data
|
| 79 |
-
merged_dfs = {}
|
| 80 |
-
for k, v in arena_dfs.items():
|
| 81 |
-
merged_dfs[k] = (
|
| 82 |
-
pd.merge(arena_dfs[k], leaderboard_df, left_index=True, right_on="key")
|
| 83 |
-
.sort_values("rating", ascending=False)
|
| 84 |
-
.reset_index(drop=True)
|
| 85 |
-
)
|
| 86 |
|
| 87 |
-
# add release dates into the merged data
|
| 88 |
-
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
|
| 89 |
-
for k, v in merged_dfs.items():
|
| 90 |
-
merged_dfs[k] = pd.merge(
|
| 91 |
-
merged_dfs[k], release_date_mapping[["key", "Release Date"]], on="key"
|
| 92 |
-
)
|
| 93 |
df = merged_dfs["Overall"]
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
with gr.Row():
|
| 101 |
-
gr.
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
|
|
|
| 2 |
|
| 3 |
import pandas as pd
|
| 4 |
import gradio as gr
|
| 5 |
+
import plotly.express as px
|
| 6 |
+
|
| 7 |
+
from utils import (
|
| 8 |
+
KEY_TO_CATEGORY_NAME,
|
| 9 |
+
PROPRIETARY_LICENSES,
|
| 10 |
+
download_latest_data_from_space,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
# with gr.NO_RELOAD:
|
| 14 |
+
###################
|
| 15 |
+
### Load Data
|
| 16 |
+
###################
|
| 17 |
+
|
| 18 |
+
# gather ELO data
latest_elo_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)

# SECURITY NOTE(review): pickle.load can execute arbitrary code embedded in the
# file. This file is downloaded from a remote Space, so it is only as
# trustworthy as that repository -- confirm this is acceptable.
with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

# Per-category leaderboard tables keyed by display name; categories that are
# missing from this ELO dump are skipped.
arena_dfs = {
    category_name: elo_results[key]["leaderboard_table_df"]
    for key, category_name in KEY_TO_CATEGORY_NAME.items()
    if key in elo_results
}

# gather open llm leaderboard data
latest_leaderboard_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)
|
| 37 |
+
|
| 38 |
+
###################
|
| 39 |
+
### Prepare Data
|
| 40 |
+
###################
|
| 41 |
+
|
| 42 |
+
# Join each category's ELO table (indexed by model key) with the leaderboard
# metadata, ordered from highest to lowest rating.
merged_dfs = {
    category: (
        pd.merge(arena_df, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    for category, arena_df in arena_dfs.items()
}

# Attach the release date of every model to each merged table.
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
release_dates = release_date_mapping[["key", "Release Date"]]
merged_dfs = {
    category: pd.merge(merged_df, release_dates, on="key")
    for category, merged_df in merged_dfs.items()
}


def _license_group(license_name):
    # Collapse the raw license value into the two groups shown in the plot.
    if license_name in PROPRIETARY_LICENSES:
        return "Proprietary LLM"
    return "Open LLM"


# The plot only uses the "Overall" category.
df = merged_dfs["Overall"]
df["License"] = df["License"].apply(_license_group)
df["Release Date"] = pd.to_datetime(df["Release Date"])
# Monthly period used to group models released in the same month.
df["Month-Year"] = df["Release Date"].dt.to_period("M")
df["rating"] = df["rating"].round()
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
###################
|
| 68 |
+
### Plot Data
|
| 69 |
+
###################
|
| 70 |
+
|
| 71 |
+
# Date part (text before the first space) of the ELO dump's last-updated stamp.
date_updated = elo_results["full"]["last_updated_datetime"].partition(" ")[0]

# Bounds for the minimum-score slider.
ratings = df["rating"]
min_elo_score = ratings.min().round()
max_elo_score = ratings.max().round()

# Largest number of rated models in any (month, license) group; upper bound
# for the models-per-month slider.
models_per_group = df.groupby(["Month-Year", "License"])["rating"].count()
upper_models_per_month = int(models_per_group.max())
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def build_plot(min_score, max_models_per_month, toggle_annotations):
    """Build the release-date vs. Arena ELO scatter plot for the given filters.

    Reads the module-level ``df`` and ``date_updated``.

    Args:
        min_score: Rows whose ``rating`` is below this value are dropped.
        max_models_per_month: Maximum number of top-rated models kept per
            (``Month-Year``, ``License``) group. Coerced to ``int`` because
            ``nlargest`` needs an integer count and a UI slider may hand
            back a float.
        toggle_annotations: When truthy, label the highest-rated model of
            each (``Month-Year``, ``License``) group on the figure.

    Returns:
        The figure created by ``px.scatter``.
    """
    top_n = int(max_models_per_month)
    group_keys = ["Month-Year", "License"]

    filtered_df = df[df["rating"] >= min_score]
    # Keep only the ``top_n`` best-rated models of every month/license group.
    filtered_df = (
        filtered_df.groupby(group_keys)
        .apply(lambda group: group.nlargest(top_n, "rating"))
        .reset_index(drop=True)
    )

    # trendline="ols" is why statsmodels is listed in requirements.txt.
    fig = px.scatter(
        filtered_df,
        x="Release Date",
        y="rating",
        color="License",
        hover_name="Model",
        hover_data=["Organization", "License"],
        trendline="ols",
        title=f"Proprietary vs Open LLMs (LMSYS Arena ELO as of {date_updated})",
        labels={"rating": "Arena ELO", "Release Date": "Release Date"},
        height=700,
        template="seaborn",
    )

    fig.update_traces(marker=dict(size=10, opacity=0.6))

    if toggle_annotations:
        # get the points to annotate (only the highest rated model per month per license)
        idx_to_annotate = filtered_df.groupby(group_keys)["rating"].idxmax()
        points_to_annotate_df = filtered_df.loc[idx_to_annotate]

        for _, row in points_to_annotate_df.iterrows():
            fig.add_annotation(
                x=row["Release Date"],
                y=row["rating"],
                text=row["Model"],
                showarrow=True,
                arrowhead=0,
            )

    return fig
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
demo = gr.Blocks()

with demo:
    gr.Markdown("# Proprietary vs Open LLMs (LMSYS Arena ELO)")
    # NOTE(review): the nesting below is reconstructed from a view that lost
    # indentation -- confirm all three controls belong inside the row.
    with gr.Row():
        min_score = gr.Slider(
            minimum=min_elo_score,
            maximum=max_elo_score,
            value=800,
            step=50,
            label="Minimum ELO Score",
        )
        max_models_per_month = gr.Slider(
            value=upper_models_per_month,
            minimum=1,
            maximum=upper_models_per_month,
            step=1,
            label="Max Models per Month (per License)",
        )
        toggle_annotations = gr.Radio(
            choices=[True, False], label="Overlay Best Model Name", value=False
        )

    # Show plot
    plot = gr.Plot()

    # Rebuild the figure on page load and whenever any filter changes; every
    # trigger uses the same callback, inputs and output.
    filter_inputs = [min_score, max_models_per_month, toggle_annotations]
    for register in (
        demo.load,
        min_score.change,
        max_models_per_month.change,
        toggle_annotations.change,
    ):
        register(fn=build_plot, inputs=filter_inputs, outputs=plot)

demo.launch()
|
| 170 |
+
# if __name__ == "__main__":
|
release_date_mapping.json
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
{
|
| 8 |
"key": "gpt-4-1106-preview",
|
| 9 |
"Model": "GPT-4-1106-preview",
|
| 10 |
-
"Release Date": "
|
| 11 |
},
|
| 12 |
{
|
| 13 |
"key": "claude-3-opus-20240229",
|
|
|
|
| 7 |
{
|
| 8 |
"key": "gpt-4-1106-preview",
|
| 9 |
"Model": "GPT-4-1106-preview",
|
| 10 |
+
"Release Date": "2023-11-06"
|
| 11 |
},
|
| 12 |
{
|
| 13 |
"key": "claude-3-opus-20240229",
|
requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
huggingface_hub
|
| 2 |
pandas
|
| 3 |
plotly
|
| 4 |
-
gradio
|
|
|
|
|
|
| 1 |
huggingface_hub
|
| 2 |
pandas
|
| 3 |
plotly
|
| 4 |
+
gradio
|
| 5 |
+
statsmodels
|
utils.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
|
| 3 |
+
from huggingface_hub import HfFileSystem, hf_hub_download
|
| 4 |
+
|
| 5 |
+
# Maps a category key to the display name of that category. app.py looks each
# key up in the unpickled ELO results and stores the table under the display name.
KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}
# Maps a category display name (the values of KEY_TO_CATEGORY_NAME) to a longer
# human-readable explanation of what the category covers.
CAT_NAME_TO_EXPLANATION = {
    "Overall": "Overall Questions",
    "Coding": "Coding: whether conversation contains code snippets",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
}

# License values that app.py labels "Proprietary LLM"; every other license
# value is labelled "Open LLM".
PROPRIETARY_LICENSES = [
    "Proprietary",
]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Downloads the latest data file of the specified file type from the given repository space.

    "Latest" is the file whose date token compares highest as a string. The
    token is the text after the last underscore of the file name, taken
    before the first dot (presumably a sortable date such as YYYYMMDD --
    TODO confirm against the Space's file naming).

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): The type of the data file to download. Must be either "pkl" or "csv".

    Returns:
        str: The local file path of the downloaded data file.

    Raises:
        FileNotFoundError: If the space contains no top-level file with the
            requested extension.
    """

    def extract_date(filename: str) -> str:
        # "spaces/<repo>/name_<date>.<ext>" -> "<date>"
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    data_file_path = f"spaces/{repo_id}/*.{file_type}"
    files = fs.glob(data_file_path)
    if not files:
        # Previously this surfaced as a bare IndexError from ``sorted(...)[0]``.
        raise FileNotFoundError(
            f"No '*.{file_type}' files found in space '{repo_id}'"
        )

    # max() picks the same file as sorting in descending order and taking the
    # first element, without sorting the whole listing.
    latest_file = max(files, key=extract_date)

    latest_filepath_local = hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )
    return latest_filepath_local
|