EvalAnything-LeaderBoard

Running

App Files Files Community

htlou committed on Dec 2, 2024

Commit

0474b44

1 Parent(s): 0b07026

wip

Browse files

Files changed (2) hide show

app.py +117 -224
src/about.py +12 -11

app.py CHANGED Viewed

@@ -8,7 +8,6 @@ from pathlib import Path
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
@@ -19,268 +18,162 @@ from src.about import (
     ABOUT_TEXT
 )
 from src.display.css_html_js import custom_css
-# from src.display.utils import (
-#     BENCHMARK_COLS,
-#     COLS,
-#     EVAL_COLS,
-#     EVAL_TYPES,
-#     NUMERIC_INTERVALS,
-#     TYPES,
-#     AutoEvalColumn,
-#     ModelType,
-#     fields,
-#     WeightType,
-#     Precision
-# )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    pass
-    # restart_space()
-SUBSET_COUNTS = {
-    "Alignment-Object": 250,
-    "Alignment-Attribute": 229,
-    "Alignment-Action": 115,
-    "Alignment-Count": 55,
-    "Alignment-Location": 75,
-    "Safety-Toxicity-Crime": 29,
-    "Safety-Toxicity-Shocking": 31,
-    "Safety-Toxicity-Disgust": 42,
-    "Safety-Nsfw-Evident": 197,
-    "Safety-Nsfw-Evasive": 177,
-    "Safety-Nsfw-Subtle": 98,
-    "Quality-Distortion-Human_face": 169,
-    "Quality-Distortion-Human_limb": 152,
-    "Quality-Distortion-Object": 100,
-    "Quality-Blurry-Defocused": 350,
-    "Quality-Blurry-Motion": 350,
-    "Bias-Age": 80,
-    "Bias-Gender": 140,
-    "Bias-Race": 140,
-    "Bias-Nationality": 120,
-    "Bias-Religion": 60,
 }
-PERSPECTIVE_COUNTS= {
-    "Alignment": 724,
-    "Safety": 574,
-    "Quality": 1121,
-    "Bias": 540
 }
-META_DATA = ['Model']
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-# color_map = {
-#     "Score Model": "#7497db",
-#     "Opensource VLM": "#E8ECF2",
-#     "Closesource VLM": "#ffcd75",
-#     "Others": "#75809c",
-#     # #7497db #E8ECF2 #ffcd75 #75809c
-# }
-# def color_model_type_column(df, color_map):
-#     """
-#     Apply color to the 'Modality' column of the DataFrame based on a given color mapping.
-#     Parameters:
-#     df (pd.DataFrame): The DataFrame containing the 'Modality' column.
-#     color_map (dict): A dictionary mapping model types to colors.
-#     Returns:
-#     pd.Styler: The styled DataFrame.
-#     """
-#     # Function to apply color based on the model type
-#     def apply_color(val):
-#         color = color_map.get(val, "default")  # Default color if not specified in color_map
-#         return f'background-color: {color}'
-#     # Format for different columns
-#     format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
-#     format_dict['Overall Score'] = "{:.2f}"
-#     format_dict[''] = "{:d}"
-#     return df.style.applymap(apply_color, subset=['Modality']).format(format_dict, na_rep='')
-def regex_table(dataframe, regex, filter_button, style=True):
-    """
-    Takes a model name as a regex, then returns only the rows that has that in it.
-    """
-    # Split regex statement by comma and trim whitespace around regexes
-    regex_list = [x.strip() for x in regex.split(",")]
-    # Join the list into a single regex pattern with '|' acting as OR
-    combined_regex = '|'.join(regex_list)
-    # if filter_button, remove all rows with "ai2" in the model name
-    update_scores = False
-    if isinstance(filter_button, list) or isinstance(filter_button, str):
-        if "Image-Text-to-Text" not in filter_button:
-            dataframe = dataframe[~dataframe["Modality"].str.contains("Image-Text-to-Text", case=False, na=False)]
-        if "Video-Text-to-Text" not in filter_button:
-            dataframe = dataframe[~dataframe["Modality"].str.contains("Video-Text-to-Text", case=False, na=False)]
-    # Filter the dataframe such that 'model' contains any of the regex patterns
-    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
-    data.reset_index(drop=True, inplace=True)
-    # replace column '' with count/rank
-    data.insert(0, '', range(1, 1 + len(data)))
-    # if style:
-    #     # apply color
-    #     data = color_model_type_column(data, color_map)
-    return data
-def get_leaderboard_results(results_path):
-    data_dir = Path(results_path)
-    files = [d for d in os.listdir(data_dir)] # TODO check if "Path(data_dir) / d" is a dir
-    df = pd.DataFrame()
-    for file in files:
-        if not file.endswith(".json"):
-            continue
-        with open(results_path / file) as rf:
-            result = json.load(rf)
-            result = pd.DataFrame(result)
-            df = pd.concat([result, df])
     df.reset_index(drop=True, inplace=True)
-    return df
-def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, perspective_counts=PERSPECTIVE_COUNTS):
-    new_df = orig_df[meta_data + columns_name]
-    new_perspective_counts = {col: perspective_counts[col] for col in columns_name}
-    total_count = sum(perspective_counts.values())
-    weights = {perspective: count / total_count for perspective, count in perspective_counts.items()}
-    def calculate_weighted_avg(row):
-        weighted_sum = sum(row[col] * weights[col] for col in columns_name)
-        return weighted_sum
-    new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1)
-    cols = meta_data + ["Overall Score"]  + columns_name
-    new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
-    return new_df
-data = {
-    "Model": [
-        "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B",
-        "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B",
-        "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B",
-        "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B",
-        "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B", "Beaver-Vision-11B",
-    ],
-    "Modality":[
-        "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text",
-        "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text",
-        "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text",
-        "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text",
-        "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text", "Image-Text-to-Text",
-    ],
-    "Correctness of Information": [
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-    ],
-    "Detail Orientation": [
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-    ],
-    "Safety": [
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-    ],
-    "AVG": [
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-        100.00, 100.00, 100.00, 100.00,
-    ]
-}
-df = pd.DataFrame(data)
-total_models = len(df)
 with gr.Blocks(css=custom_css) as app:
     with gr.Row():
         with gr.Column(scale=6):
             gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))
-        with gr.Column(scale=4):
-            gr.Markdown("![](https://huggingface.co/spaces/Align-Anything/Leaderboard/blob/main/src/overview.jpeg)")
-            # gr.HTML(BGB_LOGO, elem_classes="logo")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏆 Align-Anything Leaderboard"):
             with gr.Row():
                 search_overall = gr.Textbox(
                     label="Model Search (delimit with , )",
-                    placeholder="🔍 Search model (separate multiple queries with ``) and press ENTER...",
                     show_label=False
                 )
-                model_type_overall = gr.CheckboxGroup(
-                    choices=["Image-Text-to-Text", "Video-Text-to-Text"],
-                    value=["Image-Text-to-Text", "Video-Text-to-Text"],
-                    label="Modality",
-                    show_label=False,
-                    interactive=True,
                 )
             with gr.Row():
-                Align_Anything_table_overall_hidden = gr.Dataframe(
                     df,
                     headers=df.columns.tolist(),
-                    elem_id="Align_Anything_leadboard_overall_hidden",
                     wrap=True,
                     visible=False,
                 )
-                Align_Anything_table_overall = gr.Dataframe(
-                    regex_table(
-                        df.copy(),
-                        "",
-                        ["Video-Text-to-Text", "Image-Text-to-Text"]
-                     ),
                     headers=df.columns.tolist(),
-                    elem_id="Align_Anything_leadboard_overall",
                     wrap=True,
                 )
         with gr.TabItem("About"):
             with gr.Row():
                 gr.Markdown(ABOUT_TEXT)
     with gr.Accordion("📚 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                lines=7,
-                label="Copy the following to cite these results.",
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-    search_overall.change(regex_table, inputs=[Align_Anything_table_overall_hidden, search_overall, model_type_overall], outputs=Align_Anything_table_overall)
-    model_type_overall.change(regex_table, inputs=[Align_Anything_table_overall_hidden, search_overall, model_type_overall], outputs=Align_Anything_table_overall)
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=18000) # restarted every 3h
 scheduler.start()
-# app.queue(default_concurrency_limit=40).launch()
-app.launch(allowed_paths=['./', "./src", "./evals"])

 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     ABOUT_TEXT
 )
 from src.display.css_html_js import custom_css
+from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
+# Define model performance data and links
+model_links = {
+    "LLaVA-v1.5-7B†": "https://huggingface.co/liuhaotian/llava-v1.5-7b",
+    "Qwen2-VL-7B-Instruct†": "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct",
+    "Qwen2-Audio-7B-Instruct†": "https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct",
+    "Chameleon-7B†": "https://huggingface.co/facebook/chameleon-7b",
+    "Llama3.1-8B-Instruct†": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
+    "Gemini-1.5-Pro†": "https://deepmind.google/technologies/gemini/pro/",
+    "GPT-4o†": "https://openai.com/index/hello-gpt-4o/"
 }
+data = {
+    "Model": list(model_links.keys()),
+    "Perception": [2.66, 2.76, 3.58, 1.44, 1.05, 5.36, 2.66],
+    "Reasoning": [2.67, 3.07, 4.53, 2.97, 1.20, 5.67, 3.48],
+    "IF": [2.50, 2.40, 3.40, 2.80, 1.20, 6.70, 4.20],
+    "Safety": [2.90, 4.05, 2.65, 2.45, 1.35, 6.70, 5.15],
+    "AMU Score": [2.68, 3.07, 3.54, 2.41, 1.20, 6.11, 3.87],
+    "Modality Selection": [0.182, 0.177, 0.190, 0.156, 0.231, 0.227, 0.266],
+    "Instruction Following": [6.61, 7.01, 6.69, 6.09, 7.47, 8.62, 8.62],
+    "Modality Synergy": [0.43, 0.58, 0.51, 0.54, 0.60, 0.52, 0.58],
+    "AMG Score": [1.56, 2.16, 1.97, 1.57, 3.08, 3.05, 3.96],
+    "Overall": [2.12, 2.62, 2.73, 1.99, 2.14, 4.58, 3.92]
 }
+df = pd.DataFrame(data).sort_values(by='Overall', ascending=False)
+total_models = len(df)
+# Define column groups
+COLUMN_GROUPS = {
+    "ALL": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score",
+            "Modality Selection", "Instruction Following", "Modality Synergy",
+            "AMG Score", "Overall"],
+    "AMU": ["Model", "Perception", "Reasoning", "IF", "Safety", "AMU Score"],
+    "AMG": ["Model", "Modality Selection", "Instruction Following", "Modality Synergy", "AMG Score"]
+}
+def format_table(df):
+    """Format the dataframe for display"""
+    # Set the display format of the columns
+    float_cols = df.select_dtypes(include=['float64']).columns
+    for col in float_cols:
+        df[col] = df[col].apply(lambda x: f"{x:.2f}")  # changed to keep 2 decimal places
+    bold_columns = ['AMU Score', 'AMG Score', 'Overall']
+    for col in bold_columns:
+        if col in df.columns:
+            df[col] = df[col].apply(lambda x: f'**{x}**')
+    # Add model links
+    # df['Model'] = df['Model'].apply(lambda x: f'<a href="{model_links[x]}" target="_blank">{x}</a>')
+    df['Model'] = df['Model'].apply(lambda x: f'[{x}]({model_links[x]})')
+    # df['Model'] = df.apply(lambda x: model_hyperlink(model_links[x['Model']], x['Model']), axis=1)
+    return df
+def regex_table(dataframe, regex, filter_button, column_group="ALL"):
+    """Take a comma-separated list of model-name regexes and return only the rows whose model name matches any of them."""
+    # Deep copy to ensure the original data is not modified
+    df = dataframe.copy()
+    # Select the columns to display
+    columns_to_show = COLUMN_GROUPS.get(column_group, COLUMN_GROUPS["ALL"])
+    df = df[columns_to_show]
+    # Split regex statement by comma and trim whitespace around regexes
+    if regex:
+        regex_list = [x.strip() for x in regex.split(",")]
+        # Join the list into a single regex pattern with '|' acting as OR
+        combined_regex = '|'.join(regex_list)
+        # Filter based on model name regex
+        df = df[df["Model"].str.contains(combined_regex, case=False, na=False)]
+    df = df.sort_values(by='Overall' if 'Overall' in columns_to_show else columns_to_show[-1], ascending=False)
     df.reset_index(drop=True, inplace=True)
+    # Format numbers and add links
+    df = format_table(df)
+    # Add index column
+    df.insert(0, '', range(1, 1 + len(df)))
+    return df
 with gr.Blocks(css=custom_css) as app:
+    gr.HTML(TITLE)
     with gr.Row():
         with gr.Column(scale=6):
             gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏆 Model Performance Leaderboard"):
             with gr.Row():
                 search_overall = gr.Textbox(
                     label="Model Search (delimit with , )",
+                    placeholder="🔍 Search model (separate multiple queries with ,) and press ENTER...",
                     show_label=False
                 )
+                column_group = gr.Radio(
+                    choices=list(COLUMN_GROUPS.keys()),
+                    value="ALL",
+                    label="Select columns to show"
                 )
             with gr.Row():
+                performance_table_hidden = gr.Dataframe(
                     df,
                     headers=df.columns.tolist(),
+                    elem_id="performance_table_hidden",
                     wrap=True,
                     visible=False,
+                    datatype='markdown',
                 )
+                performance_table = gr.Dataframe(
+                    regex_table(df.copy(), "", []),
                     headers=df.columns.tolist(),
+                    elem_id="performance_table",
                     wrap=True,
+                    show_label=False,
+                    datatype='markdown',
                 )
         with gr.TabItem("About"):
             with gr.Row():
                 gr.Markdown(ABOUT_TEXT)
     with gr.Accordion("📚 Citation", open=False):
+        citation_button = gr.Textbox(
+            value=CITATION_BUTTON_TEXT,
+            lines=7,
+            label="Copy the following to cite these results.",
+            elem_id="citation-button",
+            show_copy_button=True,
+        )
+    # Set up event handlers
+    def update_table(search_text, selected_group):
+        return regex_table(df, search_text, [], selected_group)
+    search_overall.change(
+        update_table,
+        inputs=[search_overall, column_group],
+        outputs=performance_table
+    )
+    column_group.change(
+        update_table,
+        inputs=[search_overall, column_group],
+        outputs=performance_table
+    )
+# Set up scheduler
 scheduler = BackgroundScheduler()
+scheduler.add_job(lambda: None, "interval", seconds=18000)  # every 5 hours
 scheduler.start()
+# Launch the app
+app.launch(share=True)

src/about.py CHANGED Viewed

@@ -21,15 +21,15 @@ NUM_FEWSHOT = 0 # Change with your few shot
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Align-Anything</h1>"""
 # MJB_LOGO = '<img src="" alt="Logo" style="width: 100%; display: block; margin: auto;">'
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-# Align-Anything
-Align-Anything aims to align any modality large models (any-to-any models), including LLMs, VLMs, and others, with human intentions and values.
-More details about the definition and milestones of alignment for Large Models can be found in AI Alignment.
 """
 # Which evaluations are you running? how can people reproduce what you have?
@@ -41,16 +41,17 @@ EVALUATION_QUEUE_TEXT = """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = """
-@article{Align-Anything,
-  title={ALIGN ANYTHING: TRAINING ALL MODALITY MODELS TO FOLLOW INSTRUCTIONS WITH UNIFIED LANGUAGE FEEDBACK},
-  author={Xuyao Wang and Jiayi Zhou and Jiaming Ji and Yaodong Yang},
-  journal={arXiv preprint arXiv:2411.20343},
-  eprint={2411.20343},
-  eprinttype = {arXiv},
-  year={2024}
 }
 """
 ABOUT_TEXT = """
 """

 # Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">Eval-Anything Leaderboard</h1>"""
 # MJB_LOGO = '<img src="" alt="Logo" style="width: 100%; display: block; margin: auto;">'
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
+Eval-anything is a framework designed specifically for evaluating all-modality models, and it is a part of the [Align-Anything](https://github.com/PKU-Alignment/align-anything) framework. It consists of two main tasks: All-Modality Understanding (AMU) and All-Modality Generation (AMG). AMU assesses a model's ability to simultaneously process and integrate information from all modalities, including text, images, audio, and video. On the other hand, AMG evaluates a model's capability to autonomously select output modalities based on user instructions and synergistically utilize different modalities to generate output. Eval-anything aims to comprehensively assess the ability of all-modality models to handle heterogeneous data from multiple sources, providing a reliable evaluation tool for this field.
+**Note:** Since most current open-source models lack support for all-modality output, (†) indicates that models are used as agents to invoke [AudioLDM2-Large](https://huggingface.co/cvssp/audioldm2-large) and [FLUX.1-schnell](https://huggingface.co/black-forest-labs/FLUX.1-schnell) for audio and image generation.
 """
 # Which evaluations are you running? how can people reproduce what you have?
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = """
+@misc{align_anything,
+  author = {PKU-Alignment Team},
+  title = {Align Anything: training all modality models to follow instructions with unified language feedback},
+  year = {2024},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\\url{https://github.com/PKU-Alignment/align-anything}},
 }
 """
 ABOUT_TEXT = """
+We will provide methods to upload more model evaluation results in the future.
 """