# NOTE(review): extraction residue from the Hugging Face Spaces page header
# ("Spaces: Runtime error") — preserved here as a comment, not program text.
| import os | |
| import pickle | |
| import pandas as pd | |
| import gradio as gr | |
| import plotly.express as px | |
| from datetime import datetime | |
| from huggingface_hub import HfApi | |
| from apscheduler.schedulers.background import BackgroundScheduler | |
| from utils import ( | |
| KEY_TO_CATEGORY_NAME, | |
| CAT_NAME_TO_EXPLANATION, | |
| download_latest_data_from_space, | |
| get_constants, | |
| update_release_date_mapping, | |
| format_data, | |
| get_trendlines, | |
| find_crossover_point, | |
| ) | |
| ################### | |
| ### Initialize scheduler | |
| ################### | |
def restart_space():
    """Restart this Hugging Face Space so it reloads fresh leaderboard data.

    Uses the ``HF_TOKEN`` environment variable (may be ``None``) to
    authenticate against the Hub, then logs the restart time.
    """
    api = HfApi(token=os.getenv("HF_TOKEN", None))
    api.restart_space(repo_id="andrewrreed/closed-vs-open-arena-elo")
    print(f"Space restarted on {datetime.now()}")
# Restart the Space once per day at 07:00 via a cron-style background job.
# NOTE(review): the original comment said "9am" but the cron fires at hour=7 —
# presumably a timezone offset (e.g. UTC vs. local); confirm the intended hour.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "cron", day_of_week="mon-sun", hour=7, minute=0)
scheduler.start()
| ################### | |
| ### Load Data | |
| ################### | |
# --- gather ELO data -------------------------------------------------------
# Download the newest pickled ELO results from the LMSYS leaderboard Space.
latest_elo_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)
with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

# TO-DO: need to also include vision
elo_results = elo_results["text"]

# One leaderboard dataframe per category we know how to display.
arena_dfs = {
    KEY_TO_CATEGORY_NAME[key]: elo_results[key]["leaderboard_table_df"]
    for key in KEY_TO_CATEGORY_NAME
    if key in elo_results
}

# --- gather open llm leaderboard data --------------------------------------
latest_leaderboard_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

# --- load release date mapping data -----------------------------------------
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
| ################### | |
| ### Prepare Data | |
| ################### | |
# --- update release date mapping with new models ----------------------------
# Any model present in the ELO data but missing from the mapping file gets a
# release date looked up / added via the helper.
known_release_keys = release_date_mapping["key"].to_list()
new_model_keys_to_add = [
    model
    for model in arena_dfs["Overall"].index.to_list()
    if model not in known_release_keys
]
if new_model_keys_to_add:
    release_date_mapping = update_release_date_mapping(
        new_model_keys_to_add, leaderboard_df, release_date_mapping
    )

# --- merge leaderboard data with ELO data -----------------------------------
merged_dfs = {
    category: (
        pd.merge(arena_df, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
    for category, arena_df in arena_dfs.items()
}

# --- add release dates into the merged data ---------------------------------
merged_dfs = {
    category: pd.merge(df, release_date_mapping[["key", "Release Date"]], on="key")
    for category, df in merged_dfs.items()
}

# --- format dataframes -------------------------------------------------------
merged_dfs = {category: format_data(df) for category, df in merged_dfs.items()}

# --- constants used by the UI ------------------------------------------------
min_elo_score, max_elo_score, upper_models_per_month = get_constants(merged_dfs)
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
orgs = merged_dfs["Overall"].Organization.unique().tolist()
| ################### | |
| ### Build and Plot Data | |
| ################### | |
def get_data_split(dfs, set_name):
    """Return an independent, freshly re-indexed copy of ``dfs[set_name]``.

    A deep copy is taken so later filtering never mutates the module-level
    dataframes.
    """
    selected = dfs[set_name].copy(deep=True)
    selected.reset_index(drop=True, inplace=True)
    return selected
def clean_df_for_display(df):
    """Project *df* down to the user-facing columns for the "Raw Data" tab.

    Renames score columns to display names, stringifies the release date,
    and returns rows sorted by descending ELO score with a fresh index.
    """
    display_columns = [
        "Model",
        "rating",
        "MMLU",
        "MT-bench (score)",
        "Release Date",
        "Organization",
        "License",
        "Link",
    ]
    out = df.loc[:, display_columns].rename(
        columns={"rating": "ELO Score", "MT-bench (score)": "MT-Bench"}
    )
    # Dates render more cleanly as plain strings in the Gradio table.
    out["Release Date"] = out["Release Date"].astype(str)
    return out.sort_values("ELO Score", ascending=False).reset_index(drop=True)
def filter_df(min_score, max_models_per_month, set_selector, org_selector):
    """Apply the UI filters and return the reduced dataframe.

    Keeps rows with rating >= *min_score* whose organization is selected,
    then caps each (Month-Year, License) group to its *max_models_per_month*
    highest-rated models to reduce plot clutter.
    """
    data = get_data_split(merged_dfs, set_name=set_selector)
    mask = (data["rating"] >= min_score) & data["Organization"].isin(org_selector)
    subset = data[mask]
    capped = subset.groupby(["Month-Year", "License"], group_keys=False).apply(
        lambda grp: grp.nlargest(max_models_per_month, "rating")
    )
    return capped.reset_index(drop=True)
def build_plot(toggle_annotations, filtered_df):
    """Build the scatter + OLS-trendline figure and the display dataframe.

    Returns a ``(figure, dataframe)`` tuple: the Plotly figure annotated with
    the model count and projected open/proprietary crossover, and the cleaned
    dataframe for the "Raw Data" tab.
    """
    license_colors = {"Open": "#ff7f0e", "Proprietary": "#1f77b4"}
    fig = px.scatter(
        filtered_df,
        x="Release Date",
        y="rating",
        color="License",
        hover_name="Model",
        hover_data=["Organization", "License", "Link"],
        trendline="ols",
        title=f"Open vs Proprietary LLMs by LMSYS Arena ELO Score<br>(as of {date_updated})",
        labels={"rating": "Arena ELO", "Release Date": "Release Date"},
        height=700,
        template="plotly_dark",
        color_discrete_map=license_colors,
    )
    # Transparent backgrounds so the figure blends into the dark page theme.
    fig.update_layout(
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        title={"x": 0.5},
    )
    fig.update_traces(marker=dict(size=10, opacity=0.6))

    # Project when the two OLS trendlines intersect (in days from today).
    # NOTE(review): assumes exactly two trendlines (one per license type).
    trend1, trend2 = get_trendlines(fig)
    crossover = find_crossover_point(
        b1=trend1[0], m1=trend1[1], b2=trend2[0], m2=trend2[1]
    )
    crossover_ts = pd.to_datetime(crossover, unit="s")
    days_til_crossover = (crossover_ts - pd.Timestamp.today()).days

    # Summary box pinned in paper coordinates above the top-left of the plot.
    fig.add_annotation(
        xref="paper",
        yref="paper",
        x=-0.05,
        y=1.13,
        text=f"Number of models: {len(filtered_df)}<br>Days til crossover: {days_til_crossover}",
        showarrow=False,
        font=dict(size=14, color="white"),
        bgcolor="rgba(0,0,0,0.5)",
    )

    if toggle_annotations:
        # Label only the highest-rated model per month per license.
        best_idx = filtered_df.groupby(["Month-Year", "License"])["rating"].idxmax()
        for _, best in filtered_df.loc[best_idx].iterrows():
            fig.add_annotation(
                x=best["Release Date"],
                y=best["rating"],
                text=best["Model"],
                showarrow=True,
                arrowhead=0,
            )

    return fig, clean_df_for_display(filtered_df)
# JavaScript injected via gr.Blocks(js=...); runs on page load and forces the
# Gradio dark theme by adding `__theme=dark` to the URL and reloading once.
set_dark_mode = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""
# ---------------------------------------------------------------------------
# UI: every filter control re-runs filter_df (into gr.State) and then
# build_plot, so the plot and raw-data table always reflect the filters.
# ---------------------------------------------------------------------------
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.sky,
        secondary_hue=gr.themes.colors.green,
        text_size=gr.themes.sizes.text_sm,
        font=[
            gr.themes.GoogleFont("Open Sans"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        ],
    ),
    js=set_dark_mode,
) as demo:
    # Header / explanation. (Fixed: the inner list's closing tag was `<ul>`.)
    gr.Markdown(
        """
    <div style="text-align: center; max-width: 650px; margin: auto;">
        <h1 style="font-weight: 900; margin-top: 5px;">🔬 Progress Tracker: Open vs. Proprietary LLMs 🔬</h1>
        <p style="text-align: left; margin-top: 30px; margin-bottom: 30px; line-height: 20px;">
        This app visualizes the progress of proprietary and open-source LLMs over time as scored by the <a href="https://leaderboard.lmsys.org/">LMSYS Chatbot Arena</a>.
        The idea is inspired by <a href="https://www.linkedin.com/posts/maxime-labonne_arena-elo-graph-updated-with-new-models-activity-7187062633735368705-u2jB">this great work</a>
        from <a href="https://huggingface.co/mlabonne/">Maxime Labonne</a>, and is intended to stay up-to-date as new models are released and evaluated.
        <div style="text-align: left;">
        <strong>Plot info:</strong>
        <br>
        <ul style="padding-left: 20px;">
            <li> The ELO score (y-axis) is a measure of the relative strength of a model based on its performance against other models in the arena. </li>
            <li> The Release Date (x-axis) corresponds to when the model was first publicly released or when its ELO results were first reported (for ease of automated updates). </li>
            <li> Trend lines are based on Ordinary Least Squares (OLS) regression and adjust based on the filter criteria. </li>
        </ul>
        </div>
        </p>
    </div>
    """
    )
    # --- filter controls ---------------------------------------------------
    with gr.Group():
        with gr.Row(variant="compact"):
            set_selector = gr.Dropdown(
                choices=list(CAT_NAME_TO_EXPLANATION.keys()),
                label="Select Category",
                value="Overall",
                info="Select the category to visualize",
            )
            min_score = gr.Slider(
                minimum=min_elo_score,
                maximum=max_elo_score,
                value=(max_elo_score - min_elo_score) * 0.3 + min_elo_score,
                step=50,
                label="Minimum ELO Score",
                info="Filter out low scoring models",
            )
            max_models_per_month = gr.Slider(
                value=upper_models_per_month - 2,
                minimum=1,
                maximum=upper_models_per_month,
                step=1,
                label="Max Models per Month (per License)",
                info="Limit to N best models per month per license to reduce clutter",
            )
            toggle_annotations = gr.Radio(
                choices=[True, False],
                label="Overlay Best Model Name",
                value=True,
                info="Toggle to overlay the name of the best model per month per license",
            )
        with gr.Row(variant="compact"):
            with gr.Accordion("More options", open=False):
                org_selector = gr.Dropdown(
                    choices=sorted(orgs),
                    label="Filter by Organization",
                    value=sorted(orgs),
                    multiselect=True,
                    info="Limit organizations included in plot",
                )

    # --- outputs -------------------------------------------------------------
    # Intermediate filtered dataframe shared between the two pipeline steps.
    filtered_df = gr.State()
    with gr.Group():
        with gr.Tab("Plot"):
            plot = gr.Plot(show_label=False)
        with gr.Tab("Raw Data"):
            display_df = gr.DataFrame()

    # --- event wiring ----------------------------------------------------------
    # All triggers share the same two-step pipeline: filter, then (re)plot.
    filter_inputs = [min_score, max_models_per_month, set_selector, org_selector]

    def _wire(event):
        # Run filter_df into the shared State, then rebuild plot + table.
        event(fn=filter_df, inputs=filter_inputs, outputs=filtered_df).then(
            fn=build_plot,
            inputs=[toggle_annotations, filtered_df],
            outputs=[plot, display_df],
        )

    _wire(demo.load)
    for control in (
        min_score,
        max_models_per_month,
        toggle_annotations,
        set_selector,
        org_selector,
    ):
        _wire(control.change)

    gr.Markdown(
        """
    <div style="text-align: center; max-width: 650px; margin: auto;">
        <p style="margin-top: 40px;"> If you have any questions, feel free to open a discussion or <a href="https://twitter.com/andrewrreed">reach out to me on social</a>. </p>
        </p>
    </div>
    """
    )

demo.launch()