Maria Castellanos committed on
Commit 9233e8b · 1 Parent(s): b2070e0

Add bootstrap

Files changed (4)
  1. about.py +4 -12
  2. app.py +17 -7
  3. evaluate.py +34 -27
  4. utils.py +119 -7
about.py CHANGED
@@ -11,18 +11,10 @@ ENDPOINTS = ["LogD",
              "MBPB",
              "MGMB"]
 
-LB_COLS0 = ["endpoint",
-            "user",
-            "MAE",
-            "RAE",
-            "R2",
-            "Spearman R",
-            "Kendall's Tau",
-            "data coverage (%)",
-            "submission_time",
-            "model_report"]
-LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details",
-           "data coverage (%)"]
+STANDARD_COLS = ["endpoint", "user", "submission_time", "model_report"]
+METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
+# Final columns
+LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
 LB_AVG = ["user", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"] # Delete some columns for overall LB?
 LB_DTYPES = ['markdown', 'number', 'number', 'number', 'number', 'str', 'markdown', 'number']
 
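For orientation, a quick sketch (not part of the commit) of the results-dataset schema these constants imply — the STANDARD_COLS plus a mean_/std_ column per metric, which is what utils.fetch_dataset_df asserts further down:

    STANDARD_COLS = ["endpoint", "user", "submission_time", "model_report"]
    METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]

    expected_cols = (
        STANDARD_COLS
        + [f"mean_{m}" for m in METRICS]
        + [f"std_{m}" for m in METRICS]
    )
    # ['endpoint', 'user', 'submission_time', 'model_report',
    #  'mean_MAE', ..., 'std_MAE', ...]
    print(expected_cols)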
app.py CHANGED
@@ -4,17 +4,23 @@ from gradio.themes.utils import sizes
 import pandas as pd
 
 from evaluate import submit_data, evaluate_data
-from utils import make_tag_clickable, make_user_clickable, fetch_dataset_df
+from utils import (
+    make_tag_clickable,
+    make_user_clickable,
+    fetch_dataset_df,
+    map_metric_to_stats,
+)
 
 from about import ENDPOINTS, LB_COLS, LB_AVG, LB_DTYPES
 
 
 ALL_EPS = ['Average'] + ENDPOINTS
 
-def build_leaderboard(df_results):
+def build_leaderboard(df_results0):
+    df_results = df_results0.rename(columns={"endpoint": "Endpoint"})
     per_ep = {}
     for ep in ALL_EPS:
-        df = df_results[df_results["endpoint"] == ep].copy()
+        df = df_results[df_results["Endpoint"] == ep].copy()
         if df is None:
             print(f"[refresh] {ep} returned None; using empty DF")
         if df.empty:
@@ -28,11 +34,15 @@ def build_leaderboard(df_results):
         df['model details'] = df['model_report'].apply(lambda x: make_tag_clickable(x)).astype(str)
 
         if ep == "Average":
-            df["MA-RAE"] = df["RAE"] # The average of the RAE per endpoint
-            sorted_df = df.sort_values(by='MA-RAE', ascending=True, kind="stable")
+            # MA-RAE is the average of the RAE per endpoint
+            df = df.rename(columns={"mean_RAE": "mean_MA-RAE",
+                                    "std_RAE": "std_MA-RAE"})
+            sorted_df = df.sort_values(by='mean_MA-RAE', ascending=True, kind="stable")
+            sorted_df = map_metric_to_stats(sorted_df, average=True)
             per_ep[ep] = sorted_df[LB_AVG]
         else:
-            sorted_df = df.sort_values(by="MAE", ascending=True, kind="stable")
+            sorted_df = df.sort_values(by="mean_MAE", ascending=True, kind="stable")
+            sorted_df = map_metric_to_stats(sorted_df)
             per_ep[ep] = sorted_df[LB_COLS]
 
     return per_ep
@@ -65,7 +75,7 @@ def gradio_interface():
             Go to the **Leaderboard** to check out how the challenge is going.
             To participate, head out to the **Submit** tab and upload your results as a `CSV` file.
 
-            **The challenge is not yet open for submissions. Plase stay tuned for the official launch date!**
+            **The challenge is not yet open for submissions. Please stay tuned for the official launch date!**
             """
             )
         with gr.Column(scale=2): # smaller side column for logo
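To illustrate the new leaderboard flow, here is a standalone sketch (not the app code; toy numbers, and the display formatting is inlined rather than calling utils.map_metric_to_stats):

    import pandas as pd

    # Toy per-endpoint results in the new mean_/std_ layout consumed by build_leaderboard.
    df = pd.DataFrame({
        "Endpoint": ["Average", "Average"],
        "user": ["alice", "bob"],
        "mean_RAE": [0.62, 0.55],
        "std_RAE": [0.04, 0.05],
    })

    # The "Average" branch renames the RAE columns to MA-RAE and sorts ascending by the mean.
    df = df.rename(columns={"mean_RAE": "mean_MA-RAE", "std_RAE": "std_MA-RAE"})
    df = df.sort_values(by="mean_MA-RAE", ascending=True, kind="stable")

    # map_metric_to_stats then collapses each mean/std pair into one display string.
    df["MA-RAE"] = df.apply(
        lambda row: f"{row['mean_MA-RAE']:.2f} +/- {row['std_MA-RAE']:.2f}", axis=1
    )
    print(df[["user", "MA-RAE"]])  # bob ranks first with "0.55 +/- 0.05"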
evaluate.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Optional
 from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo
-from utils import metrics_per_ep
+from utils import bootstrap_metrics
 from huggingface_hub import hf_hub_download
 import datetime
 import io
@@ -157,7 +157,7 @@ def submit_data(predictions_file: str,
         path_in_repo=destination_csv,
         repo_id=submissions_repo,
         repo_type="dataset",
-        commit_message=f"Add submission for {safe_user} at {ts}"
+        commit_message=f"Add submission for user at {ts}"
     )
     # Upload the metadata JSON file
     meta_bytes = io.BytesIO(json.dumps(meta.model_dump(), indent=2).encode("utf-8"))
@@ -166,7 +166,7 @@
         path_in_repo=destination_json,
         repo_id=submissions_repo,
         repo_type="dataset",
-        commit_message=f"Add metadata for {user_state} submission at {ts}"
+        commit_message=f"Add metadata for user submission at {ts}"
     )
 
     return "✅ Your submission has been received! Your scores will appear on the leaderboard shortly.", destination_csv
@@ -265,7 +265,9 @@ def calculate_metrics(
         raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
     # TODO: What to do when a molecule is duplicated in the Predictions file?
 
-    df_results = pd.DataFrame(columns=["endpoint", "MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"])
+    # Compute leaderboard DataFrame
+    final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
+    all_endpoint_results = []
     for i, measurement in enumerate(ENDPOINTS):
         df_pred = results_dataframe[['Name', measurement]].copy()
         # Only use data with operator "="
@@ -295,32 +297,37 @@
         coverage = (n_pairs / n_total * 100.0) if n_total else 0.0
         merged = merged.sort_values("Name", kind="stable")
 
-        # validate pairs
-        if n_pairs < 10:
-            mae = rae = r2 = spearman = ktau = np.nan
-        else:
-            y_pred = merged[f"{measurement}_pred"].to_numpy()
-            y_true = merged[f"{measurement}_true"].to_numpy()
-            # Force log scale for all endpoints except LogD (for outliers)
-            if measurement != "LogD":
-                y_pred = np.log10(y_pred)
-                y_true = np.log10(y_true)
-            mae, rae, r2, spearman, ktau = metrics_per_ep(y_pred, y_true)
-
+        y_pred = merged[f"{measurement}_pred"].to_numpy()
+        y_true = merged[f"{measurement}_true"].to_numpy()
+        # Force log scale for all endpoints except LogD (for outliers)
+        if measurement != "LogD":
+            y_pred = np.log10(y_pred)
+            y_true = np.log10(y_true)
 
-        df_results.loc[i, 'endpoint'] = measurement
-        df_results.loc[i, 'MAE'] = mae
-        df_results.loc[i, 'RAE'] = rae
-        df_results.loc[i, 'R2'] = r2
-        df_results.loc[i, 'Spearman R'] = spearman
-        df_results.loc[i, "Kendall's Tau"] = ktau
-        df_results.loc[i, 'data coverage (%)'] = coverage
+        # Calculate dataframe with the metrics for 1000 bootstraps
+        bootstrap_df = bootstrap_metrics(y_pred, y_true, measurement, n_bootstrap_samples=1000)
+        df_endpoint = bootstrap_df.pivot_table(
+            index=["Endpoint"],
+            columns=final_cols,
+            values="Value",
+            aggfunc=["mean", "std"]
+        ).reset_index()
+        # Get a df with columns 'mean_MAE', 'std_MAE', ...
+        df_endpoint.columns = [
+            f'{j}_{i}' if i != '' else j for i, j in df_endpoint.columns
+        ]
+        df_endpoint.rename(columns={'_Endpoint': 'Endpoint'}, inplace=True)
+        all_endpoint_results.append(df_endpoint)
 
+    df_results = pd.concat(all_endpoint_results, ignore_index=True)
+    mean_cols = [f'{m}_mean' for m in final_cols]
+    std_cols = [f'{m}_std' for m in final_cols]
     # Average results
-    num_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau", "data coverage (%)"]
-    df_results[num_cols] = df_results[num_cols].apply(pd.to_numeric, errors="coerce")
-    means = df_results[num_cols].mean()
-    avg_row = {"endpoint": "Average", **means.to_dict()}
+    macro_means = df_results[mean_cols].mean()
+    macro_stds = df_results[std_cols].mean()
+    avg_row = {"endpoint": "Average"}
+    avg_row.update(macro_means.to_dict())
+    avg_row.update(macro_stds.to_dict())
     df_with_average = pd.concat([df_results, pd.DataFrame([avg_row])], ignore_index=True)
 
     return df_with_average
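For reference, a self-contained sketch of the aggregation step above: the long-format bootstrap frame (Sample, Endpoint, Metric, Value) is pivoted so each endpoint ends up with one row of per-metric means and standard deviations. This is a simplified reading of the hunk (pivoting on the Metric column and flattening the resulting MultiIndex); toy numbers only:

    import pandas as pd

    # Two bootstrap replicates, one endpoint, two metrics, in the long format produced by bootstrap_metrics.
    bootstrap_df = pd.DataFrame({
        "Sample":   [0, 0, 1, 1],
        "Endpoint": ["LogD"] * 4,
        "Metric":   ["MAE", "R2", "MAE", "R2"],
        "Value":    [0.50, 0.80, 0.60, 0.70],
    })

    wide = bootstrap_df.pivot_table(
        index="Endpoint", columns="Metric", values="Value", aggfunc=["mean", "std"]
    ).reset_index()

    # Flatten the (aggfunc, metric) MultiIndex into single column names.
    wide.columns = ["_".join(c).strip("_") for c in wide.columns]
    print(wide)  # columns: Endpoint, mean_MAE, mean_R2, std_MAE, std_R2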
utils.py CHANGED
@@ -1,22 +1,26 @@
 
 import pandas as pd
 import numpy as np
+from typing import Tuple
 from datasets import load_dataset
 from about import results_repo
-from about import LB_COLS0
+from about import METRICS, STANDARD_COLS
 
-def make_user_clickable(name):
+def make_user_clickable(name: str):
     link =f'https://huggingface.co/{name}'
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{name}</a>'
-def make_tag_clickable(tag):
+def make_tag_clickable(tag: str):
     return f'<a target="_blank" href="{tag}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
 
 def fetch_dataset_df():
     dset = load_dataset(results_repo, split='train', download_mode="force_redownload")
     full_df = dset.to_pandas()
+    expected_mean_cols = [f"mean_{col}" for col in METRICS]
+    expected_std_cols = [f"std_{col}" for col in METRICS]
+    expected_all_cols = STANDARD_COLS + expected_mean_cols + expected_std_cols
     assert all(
-        col in full_df.columns for col in LB_COLS0
-    ), f"Expected columns {LB_COLS0} not found in {full_df.columns}. Missing columns: {set(LB_COLS0) - set(full_df.columns)}"
+        col in full_df.columns for col in expected_all_cols
+    ), f"Expected columns not found in {full_df.columns}. Missing columns: {set(expected_all_cols) - set(full_df.columns)}"
 
     df = full_df.copy()
     df = df[df["user"] != "test"].copy()
@@ -33,7 +37,43 @@ def fetch_dataset_df():
     latest.rename(columns={"submission_time": "submission time"}, inplace=True)
     return latest
 
-def metrics_per_ep(pred, true):
+def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
+    """
+    Generate bootstrap samples for a given size and number of samples.
+
+    Parameters
+    ----------
+    size : int
+        The size of the data.
+    n_samples : int
+        The number of samples to generate.
+
+    Returns
+    -------
+    np.ndarray
+        Returns a numpy array of the bootstrap samples.
+    """
+    rng = np.random.default_rng(0)
+    return rng.choice(size, size=(n_samples, size), replace=True)
+
+
+def metrics_per_ep(pred: np.ndarray,
+                   true: np.ndarray
+                   )->Tuple[float, float, float, float]:
+    """Predict evaluation metrics for a single sample
+
+    Parameters
+    ----------
+    pred : np.ndarray
+        Array with predictions
+    true : np.ndarray
+        Array with actual values
+
+    Returns
+    -------
+    Tuple[float, float, float, float]
+        Resulting metrics: (MAE, RAE, R2, Spearman R, Kendall's Tau)
+    """
     from scipy.stats import spearmanr, kendalltau
     from sklearn.metrics import mean_absolute_error, r2_score
     mae = mean_absolute_error(true, pred)
@@ -45,4 +85,76 @@ def metrics_per_ep(pred, true):
     spr, _ = spearmanr(true, pred)
     ktau, _ = kendalltau(true, pred)
 
-    return mae, rae, r2, spr, ktau
+    return mae, rae, r2, spr, ktau
+
+def bootstrap_metrics(pred: np.ndarray,
+                      true: np.ndarray,
+                      endpoint: str,
+                      n_bootstrap_samples=1000
+                      )->pd.DataFrame:
+    """Calculate bootstrap metrics given predicted and true values
+
+    Parameters
+    ----------
+    pred : np.ndarray
+        Predicted endpoints
+    true : np.ndarray
+        Actual endpoint values
+    endpoint : str
+        String with endpoint
+    n_bootstrap_samples : int, optional
+        Size of bootstrapsample, by default 1000
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with estimated metric per bootstrap sample for the given endpoint
+    """
+    cols = ["Sample", "Endpoint", "Metric", "Value"]
+    bootstrap_results = pd.DataFrame(columns=cols, dtype=[int, str, str, float])
+    for i, indx in enumerate(
+        bootstrap_sampling(true.shape[0], n_bootstrap_samples)
+    ):
+        mae, rae, r2, spr, ktau = metrics_per_ep(pred[indx], true[indx])
+        scores = pd.DataFrame(
+            [
+                [i, endpoint, "MAE", mae],
+                [i, endpoint, "RAE", rae],
+                [i, endpoint, "R2", r2],
+                [i, endpoint, "Spearman R", spr],
+                [i, endpoint, "Kendall's Tau", ktau]
+            ],
+            columns=cols
+        )
+        bootstrap_results = pd.concat([bootstrap_results, scores])
+    return bootstrap_results
+
+def map_metric_to_stats(df: pd.DataFrame, average=False) -> pd.DataFrame:
+    """Map mean and std to 'mean +/- std' string for each metric
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe to modify
+    average : bool, optional
+        Whether the dataframe contains average info, by default False
+
+    Returns
+    -------
+    pd.DataFrame
+        Modified dataframe
+    """
+    metric_cols = METRICS[:]
+    if average:
+        metric_cols[1] = "MA-RAE"
+    cols_drop = []
+    for col in metric_cols:
+        mean_col = f"mean_{col}"
+        std_col = f"std_{col}"
+        df[col] = df.apply(
+            lambda row: f"{row[mean_col]:.2f} +/- {row[std_col]:.2f}",
+            axis=1
+        )
+        cols_drop.extend([mean_col, std_col])
+    df = df.drop(columns=cols_drop)
+    return df
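Taken together, the new helpers form a simple pipeline: resample indices, recompute the metrics on each replicate, then report each metric as "mean +/- std". A minimal end-to-end sketch with synthetic data (only MAE shown; illustration, not the repo code):

    import numpy as np

    # Synthetic predictions for one endpoint (illustration only).
    rng = np.random.default_rng(0)
    y_true = rng.normal(size=50)
    y_pred = y_true + rng.normal(scale=0.3, size=50)

    # Same resampling scheme as bootstrap_sampling: one row of indices per bootstrap replicate.
    idx = rng.choice(y_true.shape[0], size=(1000, y_true.shape[0]), replace=True)

    # MAE per replicate (metrics_per_ep additionally returns RAE, R2, Spearman R and Kendall's Tau).
    maes = np.abs(y_true[idx] - y_pred[idx]).mean(axis=1)

    # The leaderboard cell is then rendered as "mean +/- std", as map_metric_to_stats does.
    print(f"MAE: {maes.mean():.2f} +/- {maes.std():.2f}")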