Upload 5 files

- Makefile +13 -0
- README.md +41 -7
- app.py +129 -0
- pyproject.toml +13 -0
- requirements.txt +19 -0
Makefile
ADDED

```make
# Note: the original declared ".PHONY: style format", but the targets defined
# below are style and quality, so the .PHONY list is corrected accordingly.
.PHONY: style quality


style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .


quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
```
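Running `make style` applies black, isort, and ruff's auto-fixes in place; `make quality` runs the same tools in check-only mode, so it can gate CI without modifying files.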
README.md
CHANGED

---
title: PROBE
emoji: 🥇
colorFrom: green
colorTo: indigo
sdk: gradio
app_file: app.py
pinned: false
license: gpl
python_version: 3.8.1
---

# Start the configuration

Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).

Results files should have the following format and be stored as JSON files:

```json
{
    "config": {
        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
        "model_name": "path of the model on the hub: org/model",
        "model_sha": "revision on the hub"
    },
    "results": {
        "task_name": {
            "metric_name": score
        },
        "task_name2": {
            "metric_name": score
        }
    }
}
```

Request files are created automatically by this tool.

If you encounter a problem on the space, don't hesitate to restart it to remove the created `eval-queue`, `eval-queue-bk`, `eval-results`, and `eval-results-bk` folders.

# Code logic for more complex edits

You'll find:
- the main table's column names and properties in `src/display/utils.py`
- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
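As a quick illustration, here is a minimal sketch of producing a results file in the schema above; the model name, revision, task, and score are placeholder values, not real results:

```python
import json

# Hypothetical metadata and scores -- replace with your model's actual values.
results = {
    "config": {
        "model_dtype": "torch.float16",
        "model_name": "my-org/my-model",  # path of the model on the hub
        "model_sha": "main",              # revision on the hub
    },
    "results": {
        "task_name": {"metric_name": 0.42},
    },
}

# Store one JSON file per evaluated model revision.
with open("results_my-org_my-model.json", "w") as f:
    json.dump(results, f, indent=2)
```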
app.py
ADDED

```python
__all__ = ["block"]  # only the Blocks app is defined in this module

import io

import gradio as gr
import pandas as pd

from src.about import *


def get_baseline_df():
    # checkbox_group is created inside the Blocks context below; this function
    # is only called after the UI has been built, so the reference is safe.
    df = pd.read_csv(CSV_RESULT_PATH)
    present_columns = ["Method"] + checkbox_group.value
    df = df[present_columns]
    return df


def add_new_eval(
    human_file,
    skempi_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    benchmark_type: list,
):
    representation_name = model_name_textbox if revision_name_textbox == '' else revision_name_textbox
    print(representation_name)
    # Save the human and SKEMPI representation files under
    # ./src/data/representation_vectors using pandas. The uploads arrive as raw
    # bytes (type='binary' below), so wrap them in BytesIO before parsing.
    if human_file is not None:
        human_df = pd.read_csv(io.BytesIO(human_file))
        human_df.to_csv(f"./src/data/representation_vectors/{representation_name}_human.csv", index=False)
    if skempi_file is not None:
        skempi_df = pd.read_csv(io.BytesIO(skempi_file))
        skempi_df.to_csv(f"./src/data/representation_vectors/{representation_name}_skempi.csv", index=False)

    return None


block = gr.Blocks()

with block:
    gr.Markdown(
        LEADERBOARD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 PROBE Benchmark", elem_id="probe-benchmark-tab-table", id=1):
            # Column selection: the user picks which evaluation dimensions to show.
            # All tasks are selected by default so checkbox_group.value is a
            # non-empty list when the baseline table is first built.
            checkbox_group = gr.CheckboxGroup(
                choices=TASK_INFO,
                value=TASK_INFO,
                label="Benchmark Type",
                interactive=True,
            )

            baseline_value = get_baseline_df()
            baseline_header = ["Method"] + checkbox_group.value
            baseline_datatype = ['markdown'] + ['number'] * len(checkbox_group.value)

            data_component = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

        with gr.TabItem("📝 About", elem_id="probe-benchmark-tab-table", id=2):
            with gr.Row():
                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here! ", elem_id="probe-benchmark-tab-table", id=3):
            with gr.Row():
                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model's representation files here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="Model name",
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name",
                    )
                    # Benchmark types (similarity, family, function, affinity)
                    # used to evaluate the submitted representations.
                    benchmark_type = gr.CheckboxGroup(
                        choices=TASK_INFO,
                        label="Benchmark Type",
                        interactive=True,
                    )

                with gr.Column():
                    human_file = gr.components.File(label="Click to Upload the representation file (csv) for Human dataset", file_count="single", type='binary')
                    skempi_file = gr.components.File(label="Click to Upload the representation file (csv) for SKEMPI dataset", file_count="single", type='binary')

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                inputs=[
                    human_file,
                    skempi_file,
                    model_name_textbox,
                    revision_name_textbox,
                    benchmark_type,
                ],
            )

    def refresh_data():
        return get_baseline_df()

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            refresh_data, outputs=[data_component]
        )

    with gr.Accordion("Citation", open=False):
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            elem_id="citation-button",
            show_copy_button=True,
        )

block.launch()
```
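app.py above takes every constant it uses (`CSV_RESULT_PATH`, `TASK_INFO`, and the various text strings) from `src.about` via a star import, but that module is not part of this commit. A minimal sketch of the names it would need to define, with placeholder values rather than the actual PROBE content:

```python
# src/about.py -- hypothetical sketch; only the names matter, every value here
# is a placeholder, not the real PROBE configuration.
CSV_RESULT_PATH = "./src/data/benchmark_results.csv"  # table backing the leaderboard

# Benchmark dimensions offered in the checkbox groups.
TASK_INFO = ["similarity", "family", "function", "affinity"]

LEADERBOARD_INTRODUCTION = "# PROBE Benchmark Leaderboard"
LLM_BENCHMARKS_TEXT = "Details about the benchmark tasks go here."
EVALUATION_QUEUE_TEXT = "Upload your representation files below."
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = "@article{...}"  # BibTeX entry for the benchmark
```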
pyproject.toml
ADDED

```toml
[tool.ruff]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F"]
ignore = ["E501"] # line too long (black is taking care of this)
line-length = 119
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]

[tool.isort]
profile = "black"
line_length = 119

[tool.black]
line-length = 119
```
requirements.txt
ADDED

```text
APScheduler
black
datasets
gradio
gradio[oauth]
gradio_leaderboard==0.0.9
gradio_client
huggingface-hub>=0.18.0
python-dateutil
tqdm
transformers
tokenizers>=0.15.0
sentencepiece
matplotlib
numpy
pandas==1.1.4
pyyaml==5.1
scikit-learn==0.22
scikit-multilearn==0.2.0
```