Commit
·
069fb2c
1
Parent(s):
0fdb208
Added some changes to filtering / dedup submissions
Browse files
- app.py +8 -2
- constants.py +1 -0
- notebooks/pIgGen_example.ipynb +29 -3
- utils.py +17 -5
app.py
CHANGED
|
@@ -21,6 +21,9 @@ from utils import fetch_hf_results, show_output_box
|
|
| 21 |
|
| 22 |
|
| 23 |
def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
|
|
|
|
|
|
|
|
|
|
| 24 |
df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
|
| 25 |
if assay is not None:
|
| 26 |
df = df[df["assay"] == assay]
|
|
@@ -48,9 +51,9 @@ def get_leaderboard_object(assay: str | None = None):
|
|
| 48 |
# Note(Lood): Would be nice to make it clear that the Search Column is searching on model name
|
| 49 |
lb = Leaderboard(
|
| 50 |
value=format_leaderboard_table(df_results=current_dataframe, assay=assay),
|
| 51 |
-
datatype=["str", "str", "str", "number"],
|
| 52 |
select_columns=LEADERBOARD_COLUMNS_RENAME_LIST(
|
| 53 |
-
["model", "property", "spearman", "dataset"]
|
| 54 |
),
|
| 55 |
search_columns=["Model Name"],
|
| 56 |
filter_columns=LEADERBOARD_COLUMNS_RENAME_LIST(filter_columns),
|
|
@@ -139,6 +142,9 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
|
|
| 139 |
"""
|
| 140 |
# Overall Leaderboard (filter below by property)
|
| 141 |
Each property has its own prize, and participants can submit models for any combination of properties.
|
|
|
|
|
|
|
|
|
|
| 142 |
"""
|
| 143 |
)
|
| 144 |
lb = get_leaderboard_object()
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
|
| 24 |
+
"""
|
| 25 |
+
Format the dataframe for display on the leaderboard. The dataframe comes from utils.fetch_hf_results().
|
| 26 |
+
"""
|
| 27 |
df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
|
| 28 |
if assay is not None:
|
| 29 |
df = df[df["assay"] == assay]
|
|
|
|
| 51 |
# Note(Lood): Would be nice to make it clear that the Search Column is searching on model name
|
| 52 |
lb = Leaderboard(
|
| 53 |
value=format_leaderboard_table(df_results=current_dataframe, assay=assay),
|
| 54 |
+
datatype=["str", "str", "str", "number", "str"],
|
| 55 |
select_columns=LEADERBOARD_COLUMNS_RENAME_LIST(
|
| 56 |
+
["model", "property", "spearman", "dataset", "user"]
|
| 57 |
),
|
| 58 |
search_columns=["Model Name"],
|
| 59 |
filter_columns=LEADERBOARD_COLUMNS_RENAME_LIST(filter_columns),
|
|
|
|
| 142 |
"""
|
| 143 |
# Overall Leaderboard (filter below by property)
|
| 144 |
Each property has its own prize, and participants can submit models for any combination of properties.
|
| 145 |
+
|
| 146 |
+
**Note**: It is trivial to overfit the public GDPa1 dataset, which results in very high Spearman correlations.
|
| 147 |
+
We would suggest training using cross-validation a limited number of times to give a better indication of the model's performance on the eventual private test set.
|
| 148 |
"""
|
| 149 |
)
|
| 150 |
lb = get_leaderboard_object()
|
constants.py
CHANGED
|
@@ -95,6 +95,7 @@ LEADERBOARD_COLUMNS_RENAME = {
|
|
| 95 |
"model": "Model Name",
|
| 96 |
"property": "Property",
|
| 97 |
}
|
|
|
|
| 98 |
|
| 99 |
|
| 100 |
def LEADERBOARD_COLUMNS_RENAME_LIST(columns: list[str]) -> list[str]:
|
|
|
|
| 95 |
"model": "Model Name",
|
| 96 |
"property": "Property",
|
| 97 |
}
|
| 98 |
+
BASELINE_USERNAMES = ["loodvanniekerkginkgo"]
|
| 99 |
|
| 100 |
|
| 101 |
def LEADERBOARD_COLUMNS_RENAME_LIST(columns: list[str]) -> list[str]:
|
notebooks/pIgGen_example.ipynb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"id": "7c6c914c",
|
| 7 |
"metadata": {},
|
| 8 |
"outputs": [],
|
|
@@ -21,10 +21,32 @@
|
|
| 21 |
},
|
| 22 |
{
|
| 23 |
"cell_type": "code",
|
| 24 |
-
"execution_count":
|
| 25 |
"id": "00cfd012",
|
| 26 |
"metadata": {},
|
| 27 |
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
{
|
| 29 |
"data": {
|
| 30 |
"text/html": [
|
|
@@ -276,7 +298,7 @@
|
|
| 276 |
"[5 rows x 30 columns]"
|
| 277 |
]
|
| 278 |
},
|
| 279 |
-
"execution_count":
|
| 280 |
"metadata": {},
|
| 281 |
"output_type": "execute_result"
|
| 282 |
}
|
|
@@ -285,6 +307,10 @@
|
|
| 285 |
"model_name = \"ollieturnbull/p-IgGen\"\n",
|
| 286 |
"df = load_dataset(\"ginkgo-datapoints/GDPa1\")[\"train\"].to_pandas()\n",
|
| 287 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
"# Example: Just predict HIC, so we'll drop NaN rows for that\n",
|
| 289 |
"df = df.dropna(subset=[\"HIC\"])\n",
|
| 290 |
"df.head()"
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
"id": "7c6c914c",
|
| 7 |
"metadata": {},
|
| 8 |
"outputs": [],
|
|
|
|
| 21 |
},
|
| 22 |
{
|
| 23 |
"cell_type": "code",
|
| 24 |
+
"execution_count": 10,
|
| 25 |
"id": "00cfd012",
|
| 26 |
"metadata": {},
|
| 27 |
"outputs": [
|
| 28 |
+
{
|
| 29 |
+
"name": "stdout",
|
| 30 |
+
"output_type": "stream",
|
| 31 |
+
"text": [
|
| 32 |
+
"Index(['antibody_id', 'antibody_name', 'Titer', 'Purity', 'SEC %Monomer',\n",
|
| 33 |
+
" 'SMAC', 'HIC', 'HAC', 'PR_CHO', 'PR_Ova', 'AC-SINS_pH6.0',\n",
|
| 34 |
+
" 'AC-SINS_pH7.4', 'Tonset', 'Tm1', 'Tm2', 'hc_subtype', 'lc_subtype',\n",
|
| 35 |
+
" 'highest_clinical_trial_asof_feb2025', 'est_status_asof_feb2025',\n",
|
| 36 |
+
" 'vh_protein_sequence', 'hc_protein_sequence', 'hc_dna_sequence',\n",
|
| 37 |
+
" 'vl_protein_sequence', 'lc_protein_sequence', 'lc_dna_sequence',\n",
|
| 38 |
+
" 'hierarchical_cluster_fold', 'random_fold',\n",
|
| 39 |
+
" 'hierarchical_cluster_IgG_isotype_stratified_fold', 'light_aligned_aho',\n",
|
| 40 |
+
" 'heavy_aligned_aho'],\n",
|
| 41 |
+
" dtype='object')\n",
|
| 42 |
+
"Titer 7\n",
|
| 43 |
+
"HIC 4\n",
|
| 44 |
+
"PR_CHO 49\n",
|
| 45 |
+
"Tm2 53\n",
|
| 46 |
+
"AC-SINS_pH7.4 4\n",
|
| 47 |
+
"dtype: int64\n"
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
{
|
| 51 |
"data": {
|
| 52 |
"text/html": [
|
|
|
|
| 298 |
"[5 rows x 30 columns]"
|
| 299 |
]
|
| 300 |
},
|
| 301 |
+
"execution_count": 10,
|
| 302 |
"metadata": {},
|
| 303 |
"output_type": "execute_result"
|
| 304 |
}
|
|
|
|
| 307 |
"model_name = \"ollieturnbull/p-IgGen\"\n",
|
| 308 |
"df = load_dataset(\"ginkgo-datapoints/GDPa1\")[\"train\"].to_pandas()\n",
|
| 309 |
"\n",
|
| 310 |
+
"print(df.columns)\n",
|
| 311 |
+
"# Show number of NaNs per assay\n",
|
| 312 |
+
"print(df[[\"Titer\", \"HIC\", \"PR_CHO\", \"Tm2\", 'AC-SINS_pH7.4']].isna().sum())\n",
|
| 313 |
+
"\n",
|
| 314 |
"# Example: Just predict HIC, so we'll drop NaN rows for that\n",
|
| 315 |
"df = df.dropna(subset=[\"HIC\"])\n",
|
| 316 |
"df.head()"
|
utils.py
CHANGED
|
@@ -3,7 +3,7 @@ from datasets import load_dataset
|
|
| 3 |
import gradio as gr
|
| 4 |
import hashlib
|
| 5 |
from typing import Iterable, Union
|
| 6 |
-
from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS
|
| 7 |
|
| 8 |
pd.set_option("display.max_columns", None)
|
| 9 |
|
|
@@ -26,13 +26,25 @@ def fetch_hf_results():
|
|
| 26 |
assert all(
|
| 27 |
col in df.columns for col in LEADERBOARD_RESULTS_COLUMNS
|
| 28 |
), f"Expected columns {LEADERBOARD_RESULTS_COLUMNS} not found in {df.columns}. Missing columns: {set(LEADERBOARD_RESULTS_COLUMNS) - set(df.columns)}"
|
|
|
|
|
|
|
|
|
|
| 29 |
# Show latest submission only
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
)
|
|
|
|
| 33 |
df["property"] = df["assay"].map(ASSAY_RENAME)
|
| 34 |
-
|
| 35 |
-
#
|
|
|
|
|
|
|
|
|
|
| 36 |
df.loc[df["anonymous"] != False, "user"] = "anon-" + df.loc[df["anonymous"] != False, "user"].apply(readable_hash)
|
| 37 |
|
| 38 |
return df
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
import hashlib
|
| 5 |
from typing import Iterable, Union
|
| 6 |
+
from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS, BASELINE_USERNAMES
|
| 7 |
|
| 8 |
pd.set_option("display.max_columns", None)
|
| 9 |
|
|
|
|
| 26 |
assert all(
|
| 27 |
col in df.columns for col in LEADERBOARD_RESULTS_COLUMNS
|
| 28 |
), f"Expected columns {LEADERBOARD_RESULTS_COLUMNS} not found in {df.columns}. Missing columns: {set(LEADERBOARD_RESULTS_COLUMNS) - set(df.columns)}"
|
| 29 |
+
|
| 30 |
+
df_baseline = df[df["user"].isin(BASELINE_USERNAMES)]
|
| 31 |
+
df_non_baseline = df[~df["user"].isin(BASELINE_USERNAMES)]
|
| 32 |
# Show latest submission only
|
| 33 |
+
# For baselines: Keep unique model names
|
| 34 |
+
df_baseline = df_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
|
| 35 |
+
subset=["model", "assay", "dataset", "user"], keep="first"
|
| 36 |
+
)
|
| 37 |
+
# For users: Just show latest submission
|
| 38 |
+
df_non_baseline = df_non_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
|
| 39 |
+
subset=["assay", "dataset", "user"], keep="first"
|
| 40 |
)
|
| 41 |
+
df = pd.concat([df_baseline, df_non_baseline], ignore_index=True)
|
| 42 |
df["property"] = df["assay"].map(ASSAY_RENAME)
|
| 43 |
+
|
| 44 |
+
# Rename baseline username to just "Baseline"
|
| 45 |
+
df.loc[df["user"].isin(BASELINE_USERNAMES), "user"] = "Baseline"
|
| 46 |
+
# Note: If things get crowded, we could optionally add an "is_baseline" column to the dataframe to indicate whether the model is a baseline model or not.
|
| 47 |
+
# Anonymize the user column at this point, i.e. after deduplication. Note: users can submit both anonymously and non-anonymously, and we'll show only their latest submission regardless.
|
| 48 |
df.loc[df["anonymous"] != False, "user"] = "anon-" + df.loc[df["anonymous"] != False, "user"].apply(readable_hash)
|
| 49 |
|
| 50 |
return df
|