Spaces:
Running
Running
Joschka Strueber
committed on
Commit
·
35404bc
1
Parent(s):
3eeaa4c
[Add, Fix] better warnings for missing models, better description
Browse files
app.py
CHANGED
|
@@ -3,13 +3,10 @@ import gradio as gr
|
|
| 3 |
import numpy as np
|
| 4 |
import matplotlib.pyplot as plt
|
| 5 |
import seaborn as sns
|
| 6 |
-
import re
|
| 7 |
from io import BytesIO
|
| 8 |
from PIL import Image
|
| 9 |
from datasets.exceptions import DatasetNotFoundError
|
| 10 |
|
| 11 |
-
print(gr.__version__)
|
| 12 |
-
|
| 13 |
from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
|
| 14 |
from src.similarity import load_data_and_compute_similarities
|
| 15 |
|
|
@@ -82,15 +79,15 @@ def update_datasets_based_on_models(selected_models, current_dataset):
|
|
| 82 |
)
|
| 83 |
except DatasetNotFoundError as e:
|
| 84 |
# Extract model name from error message
|
| 85 |
-
|
| 86 |
-
model_name =
|
| 87 |
|
| 88 |
# Display a shorter warning
|
| 89 |
gr.Warning(f"Data for '{model_name}' is gated or unavailable.")
|
| 90 |
return gr.update(choices=[], value=None)
|
| 91 |
|
| 92 |
with gr.Blocks(title="LLM Similarity Analyzer") as demo:
|
| 93 |
-
gr.Markdown("## Model Similarity Comparison Tool
|
| 94 |
|
| 95 |
with gr.Row():
|
| 96 |
dataset_dropdown = gr.Dropdown(
|
|
@@ -116,8 +113,6 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
|
|
| 116 |
info="Search and select multiple models"
|
| 117 |
)
|
| 118 |
|
| 119 |
-
gr.Markdown("* For the probabilistic Kappa_p metric self-similarity is only 1, if the model predicts a single option with 100% confidence.")
|
| 120 |
-
|
| 121 |
model_dropdown.change(
|
| 122 |
fn=update_datasets_based_on_models,
|
| 123 |
inputs=[model_dropdown, dataset_dropdown],
|
|
@@ -137,11 +132,21 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
|
|
| 137 |
outputs=heatmap
|
| 138 |
)
|
| 139 |
|
|
|
|
|
|
|
| 140 |
clear_btn = gr.Button("Clear Selection")
|
| 141 |
clear_btn.click(
|
| 142 |
lambda: [[], None, None],
|
| 143 |
outputs=[model_dropdown, dataset_dropdown, heatmap]
|
| 144 |
)
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
if __name__ == "__main__":
|
| 147 |
demo.launch(ssr_mode=False)
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import matplotlib.pyplot as plt
|
| 5 |
import seaborn as sns
|
|
|
|
| 6 |
from io import BytesIO
|
| 7 |
from PIL import Image
|
| 8 |
from datasets.exceptions import DatasetNotFoundError
|
| 9 |
|
|
|
|
|
|
|
| 10 |
from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
|
| 11 |
from src.similarity import load_data_and_compute_similarities
|
| 12 |
|
|
|
|
| 79 |
)
|
| 80 |
except DatasetNotFoundError as e:
|
| 81 |
# Extract model name from error message
|
| 82 |
+
model_name = e.args[0].split("'")[1]
|
| 83 |
+
model_name = model_name.split("/")[-1].replace("__", "/").replace("_details", "")
|
| 84 |
|
| 85 |
# Display a shorter warning
|
| 86 |
gr.Warning(f"Data for '{model_name}' is gated or unavailable.")
|
| 87 |
return gr.update(choices=[], value=None)
|
| 88 |
|
| 89 |
with gr.Blocks(title="LLM Similarity Analyzer") as demo:
|
| 90 |
+
gr.Markdown("## Model Similarity Comparison Tool")
|
| 91 |
|
| 92 |
with gr.Row():
|
| 93 |
dataset_dropdown = gr.Dropdown(
|
|
|
|
| 113 |
info="Search and select multiple models"
|
| 114 |
)
|
| 115 |
|
|
|
|
|
|
|
| 116 |
model_dropdown.change(
|
| 117 |
fn=update_datasets_based_on_models,
|
| 118 |
inputs=[model_dropdown, dataset_dropdown],
|
|
|
|
| 132 |
outputs=heatmap
|
| 133 |
)
|
| 134 |
|
| 135 |
+
gr.Markdown("\* Self-similarity is only 1.0 for the probabilistic Kappa_p metric if the model predicts a single option with 100% confidence for each question.")
|
| 136 |
+
|
| 137 |
clear_btn = gr.Button("Clear Selection")
|
| 138 |
clear_btn.click(
|
| 139 |
lambda: [[], None, None],
|
| 140 |
outputs=[model_dropdown, dataset_dropdown, heatmap]
|
| 141 |
)
|
| 142 |
|
| 143 |
+
gr.Markdown("""### Information \n
|
| 144 |
+
- **Datasets**: [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/) benchmark datasets \n
|
| 145 |
+
- Some datasets are not multiple-choice - for these, the metrics are not applicable. \n
|
| 146 |
+
- **Models**: Open LLM Leaderboard models \n
|
| 147 |
+
- Every model is gated on Hugging Face and access has to be requested. \n
|
| 148 |
+
- We requested access to the most popular models, but some may be missing. \n
|
| 149 |
+
- **Metrics**: Kappa_p (probabilistic), Kappa_p (deterministic), Error Consistency""")
|
| 150 |
+
|
| 151 |
if __name__ == "__main__":
|
| 152 |
demo.launch(ssr_mode=False)
|