import os

import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import BytesIO
from PIL import Image
from datasets.exceptions import DatasetNotFoundError

from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
from src.similarity import load_data_and_compute_similarities

# Set matplotlib backend for non-GUI environments
plt.switch_backend('Agg')
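# Note: `src.dataloading` and `src.similarity` are local modules of this Space;
# they load Open LLM Leaderboard results and compute the similarity metrics
# (CAPA, CAPA (det.), Error Consistency) offered below.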


def create_heatmap(selected_models, selected_dataset, selected_metric):
    if not selected_models or not selected_dataset:
        return None

    # Compute the pairwise similarity matrix for the selected models
    similarities = load_data_and_compute_similarities(selected_models, selected_dataset, selected_metric)

    # Check for all-NaN rows in the similarity matrix (models whose data failed to load)
    failed_models = []
    for i in range(len(similarities)):
        if np.isnan(similarities[i]).all():
            failed_models.append(selected_models[i])
    if failed_models:
        gr.Warning(f"Failed to load data for models: {', '.join(failed_models)}")

    # Create figure and heatmap using seaborn
    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(
        similarities,
        annot=True,
        fmt=".2f",
        cmap="viridis",
        vmin=0,
        vmax=1,
        xticklabels=selected_models,
        yticklabels=selected_models
    )

    # Customize plot
    plt.title(f"{selected_metric} for {selected_dataset}", fontsize=16)
    plt.xlabel("Models", fontsize=14)
    plt.ylabel("Models", fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Save to buffer
    buf = BytesIO()
    plt.savefig(buf, format="png", dpi=100, bbox_inches="tight")
    plt.close()

    # Convert to PIL Image
    buf.seek(0)
    img = Image.open(buf).convert("RGB")
    return img
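# Example usage (a sketch; assumes the leaderboard data for both models is accessible):
#   img = create_heatmap(["microsoft/phi-4", "Qwen/Qwen2.5-32B-Instruct"], "mmlu_pro", "CAPA")
#   img.save("capa_heatmap.png")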


def validate_inputs(selected_models, selected_dataset):
    if not selected_models:
        raise gr.Error("Please select at least one model!")
    if not selected_dataset:
        raise gr.Error("Please select a dataset!")


def update_datasets_based_on_models(selected_models, current_dataset):
    try:
        available_datasets = get_leaderboard_datasets(selected_models) if selected_models else []
        if current_dataset in available_datasets:
            valid_dataset = current_dataset
        elif "mmlu_pro" in available_datasets:
            valid_dataset = "mmlu_pro"
        else:
            valid_dataset = None
        return gr.update(
            choices=available_datasets,
            value=valid_dataset
        )
    except DatasetNotFoundError as e:
        # Extract the model name from the error message,
        # e.g. "org__model_details" becomes "org/model"
        model_name = e.args[0].split("'")[1]
        model_name = model_name.split("/")[-1].replace("__", "/").replace("_details", "")
        # Display a shorter warning
        gr.Warning(f"Data for '{model_name}' is gated or unavailable.")
        return gr.update(choices=[], value=None)
| links_markdown = """ | |
| [📄 Paper](https://arxiv.org/abs/2502.04313) | | |
| [☯ Homepage](https://model-similarity.github.io/) | | |
| [🐱 Code](https://github.com/model-similarity/lm-similarity) | | |
| [🐍 pip install lm-sim](https://pypi.org/project/lm-sim/) | | |
| [🤗 Data](https://huggingface.co/datasets/bethgelab/lm-similarity) | |
| """ | |
| model_init = ["HuggingFaceTB/SmolLM2-1.7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct", "microsoft/phi-4", "google/gemma-2-27b-it", "Qwen/Qwen2.5-32B-Instruct", "meta-llama/Llama-3.3-70B-Instruct"] | |
| dataset_init = "mmlu_pro" | |
| metric_init = "CAPA" | |
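# Defaults shown when the Space loads; the initial heatmap below is rendered from them.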


# Create Gradio interface
with gr.Blocks(title="LLM Similarity Analyzer") as demo:
    gr.Markdown("# Model Similarity Comparison Tool")
    gr.Markdown(links_markdown)
    gr.Markdown('Demo for the recent publication ["Great Models Think Alike and this Undermines AI Oversight"](https://huggingface.co/papers/2502.04313).')

    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=get_leaderboard_datasets(model_init),
            label="Select Dataset",
            value=dataset_init,
            filterable=True,
            interactive=True,
            allow_custom_value=False,
            info="Open LLM Leaderboard v2 benchmark datasets"
        )
        metric_dropdown = gr.Dropdown(
            choices=["CAPA", "CAPA (det.)", "Error Consistency"],
            label="Select Metric",
            value=metric_init,
            info="Select a similarity metric to compute"
        )

    model_dropdown = gr.Dropdown(
        choices=get_leaderboard_models_cached(),
        label="Select Models",
        value=model_init,
        multiselect=True,
        filterable=True,
        allow_custom_value=False,
        info="Search and select multiple models"
    )
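    # Keep the dataset choices consistent with the currently selected models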
    model_dropdown.change(
        fn=update_datasets_based_on_models,
        inputs=[model_dropdown, dataset_dropdown],
        outputs=dataset_dropdown
    )

    generate_btn = gr.Button("Generate Heatmap", variant="primary")
    heatmap = gr.Image(
        value=create_heatmap(model_init, dataset_init, metric_init),
        label="Similarity Heatmap",
        visible=True
    )
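    # Validate the selections first, then recompute and display the heatmap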
    generate_btn.click(
        fn=validate_inputs,
        inputs=[model_dropdown, dataset_dropdown],
        queue=False
    ).then(
        fn=create_heatmap,
        inputs=[model_dropdown, dataset_dropdown, metric_dropdown],
        outputs=heatmap
    )
| gr.Markdown("\* Self-similarity is only 1.0 for the probabilistic Kappa_p metric if the model predicts a single option with 100% confidence for each question.") | |
| clear_btn = gr.Button("Clear Selection") | |
| clear_btn.click( | |
| lambda: [[], None, None], | |
| outputs=[model_dropdown, dataset_dropdown, heatmap] | |
| ) | |
| gr.Markdown("## Information") | |
| gr.Markdown("""We propose Chance Adjusted Probabilistic Agreement ($\operatorname\{CAPA\}$, or $\kappa_p$), a novel metric \ | |
| for model similarity which adjusts for chance agreement due to accuracy. Using CAPA, we find: (1) LLM-as-a-judge scores are \ | |
| biased towards more similar models controlling for the model's capability. (2) Gain from training strong models on annotations \ | |
| of weak supervisors (weak-to-strong generalization) is higher when the two models are more different. (3) Concerningly, model \ | |
| errors are getting more correlated as capabilities increase.""") | |
| image_path = "data/table_capa.png" | |
| gr.Image(value=image_path, label="Comparison of different similarity metrics for multiple-choice questions", interactive=False) | |
| gr.Markdown(""" | |
| - **Datasets**: [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/) benchmark datasets \n | |
| - Some datasets are not multiple-choice - for these, the metrics are not applicable. \n | |
| - **Models**: Open LLM Leaderboard models \n | |
| - Every model evaluation is gated on Hugging Face and access has to be requested. \n | |
| - We requested access for the most popular models, but some may be missing. \n | |
| - Notably, loading data is not possible for many meta-llama and gemma models. | |
| - **Metrics**: CAPA (probabilistic), CAPA (deterministic), Error Consistency""") | |


if __name__ == "__main__":
    demo.launch(ssr_mode=False)