Spaces:

bethgelab
/

lm-similarity

Running

App Files Files Community

Joschka Strueber committed on Feb 7

Commit

b90e0d3

1 Parent(s): c24946e

[Add, Ref] Add more info and table on metric, move model list to data/

Browse files

Files changed (4) hide show

app.py +12 -3
{src → data}/models.txt +0 -0
data/table_capa.png +0 -0
src/dataloading.py +3 -3

app.py CHANGED Viewed

@@ -91,7 +91,7 @@ def update_datasets_based_on_models(selected_models, current_dataset):
         return gr.update(choices=[], value=None)
 links_markdown = """
-[📄 Paper](https://arxiv.org/abs/6181841) &nbsp;&nbsp;|&nbsp;&nbsp;
 [☯ Homepage](https://model-similarity.github.io/) &nbsp;&nbsp;|&nbsp;&nbsp;
 [🐱 Code](https://github.com/model-similarity/lm-similarity) &nbsp;&nbsp;|&nbsp;&nbsp;
 [🐍 pip install lm-sim](https://pypi.org/project/lm-sim/) &nbsp;&nbsp;|&nbsp;&nbsp;
@@ -105,8 +105,9 @@ metric_init = "CAPA"
 # Create Gradio interface
 with gr.Blocks(title="LLM Similarity Analyzer") as demo:
-    gr.Markdown("## Model Similarity Comparison Tool")
     gr.Markdown(links_markdown)
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
@@ -162,7 +163,15 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
         outputs=[model_dropdown, dataset_dropdown, heatmap]
     )
-    gr.Markdown("""### Information \n
 - **Datasets**: [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/) benchmark datasets \n
     - Some datasets are not multiple-choice - for these, the metrics are not applicable. \n
 - **Models**: Open LLM Leaderboard models \n

         return gr.update(choices=[], value=None)
 links_markdown = """
+[📄 Paper](https://arxiv.org/abs/2502.04313) &nbsp;&nbsp;|&nbsp;&nbsp;
 [☯ Homepage](https://model-similarity.github.io/) &nbsp;&nbsp;|&nbsp;&nbsp;
 [🐱 Code](https://github.com/model-similarity/lm-similarity) &nbsp;&nbsp;|&nbsp;&nbsp;
 [🐍 pip install lm-sim](https://pypi.org/project/lm-sim/) &nbsp;&nbsp;|&nbsp;&nbsp;
 # Create Gradio interface
 with gr.Blocks(title="LLM Similarity Analyzer") as demo:
+    gr.Markdown("# Model Similarity Comparison Tool")
     gr.Markdown(links_markdown)
+    gr.Markdown('Demo for the recent publication ["Great Models Think Alike and this Undermines AI Oversight"](https://huggingface.co/papers/2502.04313).')
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
         outputs=[model_dropdown, dataset_dropdown, heatmap]
     )
+    gr.Markdown("## Information")
+    gr.Markdown("""We propose Chance Adjusted Probabilistic Agreement ($\operatorname{CAPA}$, or $\kappa_p$), a novel metric \
+for model similarity which adjusts for chance agreement due to accuracy. Using CAPA, we find: (1) LLM-as-a-judge scores are \
+biased towards more similar models controlling for the model's capability. (2) Gain from training strong models on annotations \
+of weak supervisors (weak-to-strong generalization) is higher when the two models are more different. (3) Concerningly, model \
+errors are getting more correlated as capabilities increase.""")
+    image_path = "data/table_capa.png"
+    gr.Image(value=image_path, label="Comparison of different similarity metrics for multiple-choice questions", interactive=False)
+    gr.Markdown("""
 - **Datasets**: [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/) benchmark datasets \n
     - Some datasets are not multiple-choice - for these, the metrics are not applicable. \n
 - **Models**: Open LLM Leaderboard models \n

{src → data}/models.txt RENAMED Viewed

File without changes

data/table_capa.png ADDED Viewed

src/dataloading.py CHANGED Viewed

@@ -10,7 +10,7 @@ def get_leaderboard_models_reload():
     # Load prechecked models
     try:
-        ungated_models = set(line.strip() for line in open("src/models.txt"))
     except FileNotFoundError:
         ungated_models = set()
@@ -47,7 +47,7 @@ def get_leaderboard_models_reload():
     print(f"Number of models: {len(models)}")
     # Save model list as txt file
-    with open("src/models.txt", "w") as f:
         for model in models:
             f.write(model + "\n")
@@ -56,7 +56,7 @@ def get_leaderboard_models_reload():
 def get_leaderboard_models():
     # Load prechecked (ungated) models
-    with open("src/models.txt", "r") as f:
         ungated_models = [line.strip() for line in f]
     return sorted(ungated_models)

     # Load prechecked models
     try:
+        ungated_models = set(line.strip() for line in open("data/models.txt"))
     except FileNotFoundError:
         ungated_models = set()
     print(f"Number of models: {len(models)}")
     # Save model list as txt file
+    with open("data/models.txt", "w") as f:
         for model in models:
             f.write(model + "\n")
 def get_leaderboard_models():
     # Load prechecked (ungated) models
+    with open("data/models.txt", "r") as f:
         ungated_models = [line.strip() for line in f]
     return sorted(ungated_models)