Maria Castellanos committed on
Commit 9233e8b · 1 Parent(s): b2070e0

Add bootstrap

Files changed (4)
  1. about.py +4 -12
  2. app.py +17 -7
  3. evaluate.py +34 -27
  4. utils.py +119 -7
about.py CHANGED
@@ -11,18 +11,10 @@ ENDPOINTS = ["LogD",
              "MBPB",
              "MGMB"]
 
-LB_COLS0 = ["endpoint",
-            "user",
-            "MAE",
-            "RAE",
-            "R2",
-            "Spearman R",
-            "Kendall's Tau",
-            "data coverage (%)",
-            "submission_time",
-            "model_report"]
-LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details",
-           "data coverage (%)"]
+STANDARD_COLS = ["endpoint", "user", "submission_time", "model_report"]
+METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
+# Final columns
+LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
 LB_AVG = ["user", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"] # Delete some columns for overall LB?
 LB_DTYPES = ['markdown', 'number', 'number', 'number', 'number', 'str', 'markdown', 'number']
 
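For orientation, a quick sketch (not part of the commit) of the results-dataset schema these constants imply — the STANDARD_COLS plus a mean_/std_ column per metric, which is what utils.fetch_dataset_df asserts further down:

    STANDARD_COLS = ["endpoint", "user", "submission_time", "model_report"]
    METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]

    expected_cols = (
        STANDARD_COLS
        + [f"mean_{m}" for m in METRICS]
        + [f"std_{m}" for m in METRICS]
    )
    # ['endpoint', 'user', 'submission_time', 'model_report',
    #  'mean_MAE', ..., 'std_MAE', ...]
    print(expected_cols)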
app.py CHANGED
@@ -4,17 +4,23 @@ from gradio.themes.utils import sizes
 import pandas as pd
 
 from evaluate import submit_data, evaluate_data
-from utils import make_tag_clickable, make_user_clickable, fetch_dataset_df
+from utils import (
+    make_tag_clickable,
+    make_user_clickable,
+    fetch_dataset_df,
+    map_metric_to_stats,
+)
 
 from about import ENDPOINTS, LB_COLS, LB_AVG, LB_DTYPES
 
 
 ALL_EPS = ['Average'] + ENDPOINTS
 
-def build_leaderboard(df_results):
+def build_leaderboard(df_results0):
+    df_results = df_results0.rename(columns={"endpoint": "Endpoint"})
     per_ep = {}
     for ep in ALL_EPS:
-        df = df_results[df_results["endpoint"] == ep].copy()
+        df = df_results[df_results["Endpoint"] == ep].copy()
         if df is None:
             print(f"[refresh] {ep} returned None; using empty DF")
         if df.empty:
@@ -28,11 +34,15 @@ def build_leaderboard(df_results):
         df['model details'] = df['model_report'].apply(lambda x: make_tag_clickable(x)).astype(str)
 
         if ep == "Average":
-            df["MA-RAE"] = df["RAE"] # The average of the RAE per endpoint
-            sorted_df = df.sort_values(by='MA-RAE', ascending=True, kind="stable")
+            # MA-RAE is the average of the RAE per endpoint
+            df = df.rename(columns={"mean_RAE": "mean_MA-RAE",
+                                    "std_RAE": "std_MA-RAE"})
+            sorted_df = df.sort_values(by='mean_MA-RAE', ascending=True, kind="stable")
+            sorted_df = map_metric_to_stats(sorted_df, average=True)
             per_ep[ep] = sorted_df[LB_AVG]
         else:
-            sorted_df = df.sort_values(by="MAE", ascending=True, kind="stable")
+            sorted_df = df.sort_values(by="mean_MAE", ascending=True, kind="stable")
+            sorted_df = map_metric_to_stats(sorted_df)
             per_ep[ep] = sorted_df[LB_COLS]
 
     return per_ep
@@ -65,7 +75,7 @@ def gradio_interface():
             Go to the **Leaderboard** to check out how the challenge is going.
             To participate, head out to the **Submit** tab and upload your results as a `CSV` file.
 
-            **The challenge is not yet open for submissions. Plase stay tuned for the official launch date!**
+            **The challenge is not yet open for submissions. Please stay tuned for the official launch date!**
             """
             )
         with gr.Column(scale=2): # smaller side column for logo
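To illustrate the new leaderboard flow, here is a standalone sketch (not the app code; toy numbers, and the display formatting is inlined rather than calling utils.map_metric_to_stats):

    import pandas as pd

    # Toy per-endpoint results in the new mean_/std_ layout consumed by build_leaderboard.
    df = pd.DataFrame({
        "Endpoint": ["Average", "Average"],
        "user": ["alice", "bob"],
        "mean_RAE": [0.62, 0.55],
        "std_RAE": [0.04, 0.05],
    })

    # The "Average" branch renames the RAE columns to MA-RAE and sorts ascending by the mean.
    df = df.rename(columns={"mean_RAE": "mean_MA-RAE", "std_RAE": "std_MA-RAE"})
    df = df.sort_values(by="mean_MA-RAE", ascending=True, kind="stable")

    # map_metric_to_stats then collapses each mean/std pair into one display string.
    df["MA-RAE"] = df.apply(
        lambda row: f"{row['mean_MA-RAE']:.2f} +/- {row['std_MA-RAE']:.2f}", axis=1
    )
    print(df[["user", "MA-RAE"]])  # bob ranks first with "0.55 +/- 0.05"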
evaluate.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Optional
 from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo
-from utils import metrics_per_ep
+from utils import bootstrap_metrics
 from huggingface_hub import hf_hub_download
 import datetime
 import io
@@ -157,7 +157,7 @@ def submit_data(predictions_file: str,
         path_in_repo=destination_csv,
         repo_id=submissions_repo,
         repo_type="dataset",
-        commit_message=f"Add submission for {safe_user} at {ts}"
+        commit_message=f"Add submission for user at {ts}"
     )
     # Upload the metadata JSON file
     meta_bytes = io.BytesIO(json.dumps(meta.model_dump(), indent=2).encode("utf-8"))
@@ -166,7 +166,7 @@
         path_in_repo=destination_json,
         repo_id=submissions_repo,
         repo_type="dataset",
-        commit_message=f"Add metadata for {user_state} submission at {ts}"
+        commit_message=f"Add metadata for user submission at {ts}"
     )
 
     return "✅ Your submission has been received! Your scores will appear on the leaderboard shortly.", destination_csv
@@ -265,7 +265,9 @@ def calculate_metrics(
         raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
     # TODO: What to do when a molecule is duplicated in the Predictions file?
 
-    df_results = pd.DataFrame(columns=["endpoint", "MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"])
+    # Compute leaderboard DataFrame
+    final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
+    all_endpoint_results = []
     for i, measurement in enumerate(ENDPOINTS):
         df_pred = results_dataframe[['Name', measurement]].copy()
         # Only use data with operator "="
@@ -295,32 +297,37 @@
         coverage = (n_pairs / n_total * 100.0) if n_total else 0.0
         merged = merged.sort_values("Name", kind="stable")
 
-        # validate pairs
-        if n_pairs < 10:
-            mae = rae = r2 = spearman = ktau = np.nan
-        else:
-            y_pred = merged[f"{measurement}_pred"].to_numpy()
-            y_true = merged[f"{measurement}_true"].to_numpy()
-            # Force log scale for all endpoints except LogD (for outliers)
-            if measurement != "LogD":
-                y_pred = np.log10(y_pred)
-                y_true = np.log10(y_true)
-            mae, rae, r2, spearman, ktau = metrics_per_ep(y_pred, y_true)
-
+        y_pred = merged[f"{measurement}_pred"].to_numpy()
+        y_true = merged[f"{measurement}_true"].to_numpy()
+        # Force log scale for all endpoints except LogD (for outliers)
+        if measurement != "LogD":
+            y_pred = np.log10(y_pred)
+            y_true = np.log10(y_true)
 
-        df_results.loc[i, 'endpoint'] = measurement
-        df_results.loc[i, 'MAE'] = mae
-        df_results.loc[i, 'RAE'] = rae
-        df_results.loc[i, 'R2'] = r2
-        df_results.loc[i, 'Spearman R'] = spearman
-        df_results.loc[i, "Kendall's Tau"] = ktau
-        df_results.loc[i, 'data coverage (%)'] = coverage
+        # Calculate dataframe with the metrics for 1000 bootstraps
+        bootstrap_df = bootstrap_metrics(y_pred, y_true, measurement, n_bootstrap_samples=1000)
+        df_endpoint = bootstrap_df.pivot_table(
+            index=["Endpoint"],
+            columns=final_cols,
+            values="Value",
+            aggfunc=["mean", "std"]
+        ).reset_index()
+        # Get a df with columns 'mean_MAE', 'std_MAE', ...
+        df_endpoint.columns = [
+            f'{j}_{i}' if i != '' else j for i, j in df_endpoint.columns
+        ]
+        df_endpoint.rename(columns={'_Endpoint': 'Endpoint'}, inplace=True)
+        all_endpoint_results.append(df_endpoint)
 
+    df_results = pd.concat(all_endpoint_results, ignore_index=True)
+    mean_cols = [f'{m}_mean' for m in final_cols]
+    std_cols = [f'{m}_std' for m in final_cols]
     # Average results
-    num_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau", "data coverage (%)"]
-    df_results[num_cols] = df_results[num_cols].apply(pd.to_numeric, errors="coerce")
-    means = df_results[num_cols].mean()
-    avg_row = {"endpoint": "Average", **means.to_dict()}
+    macro_means = df_results[mean_cols].mean()
+    macro_stds = df_results[std_cols].mean()
+    avg_row = {"endpoint": "Average"}
+    avg_row.update(macro_means.to_dict())
+    avg_row.update(macro_stds.to_dict())
     df_with_average = pd.concat([df_results, pd.DataFrame([avg_row])], ignore_index=True)
 
     return df_with_average
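For reference, a self-contained sketch of the aggregation step above: the long-format bootstrap frame (Sample, Endpoint, Metric, Value) is pivoted so each endpoint ends up with one row of per-metric means and standard deviations. This is a simplified reading of the hunk (pivoting on the Metric column and flattening the resulting MultiIndex); toy numbers only:

    import pandas as pd

    # Two bootstrap replicates, one endpoint, two metrics, in the long format produced by bootstrap_metrics.
    bootstrap_df = pd.DataFrame({
        "Sample":   [0, 0, 1, 1],
        "Endpoint": ["LogD"] * 4,
        "Metric":   ["MAE", "R2", "MAE", "R2"],
        "Value":    [0.50, 0.80, 0.60, 0.70],
    })

    wide = bootstrap_df.pivot_table(
        index="Endpoint", columns="Metric", values="Value", aggfunc=["mean", "std"]
    ).reset_index()

    # Flatten the (aggfunc, metric) MultiIndex into single column names.
    wide.columns = ["_".join(c).strip("_") for c in wide.columns]
    print(wide)  # columns: Endpoint, mean_MAE, mean_R2, std_MAE, std_R2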
utils.py CHANGED
@@ -1,22 +1,26 @@
 
 import pandas as pd
 import numpy as np
+from typing import Tuple
 from datasets import load_dataset
 from about import results_repo
-from about import LB_COLS0
+from about import METRICS, STANDARD_COLS
 
-def make_user_clickable(name):
+def make_user_clickable(name: str):
     link =f'https://huggingface.co/{name}'
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{name}</a>'
-def make_tag_clickable(tag):
+def make_tag_clickable(tag: str):
     return f'<a target="_blank" href="{tag}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
 
 def fetch_dataset_df():
     dset = load_dataset(results_repo, split='train', download_mode="force_redownload")
     full_df = dset.to_pandas()
+    expected_mean_cols = [f"mean_{col}" for col in METRICS]
+    expected_std_cols = [f"std_{col}" for col in METRICS]
+    expected_all_cols = STANDARD_COLS + expected_mean_cols + expected_std_cols
     assert all(
-        col in full_df.columns for col in LB_COLS0
-    ), f"Expected columns {LB_COLS0} not found in {full_df.columns}. Missing columns: {set(LB_COLS0) - set(full_df.columns)}"
+        col in full_df.columns for col in expected_all_cols
+    ), f"Expected columns not found in {full_df.columns}. Missing columns: {set(expected_all_cols) - set(full_df.columns)}"
 
     df = full_df.copy()
     df = df[df["user"] != "test"].copy()
@@ -33,7 +37,43 @@ def fetch_dataset_df():
     latest.rename(columns={"submission_time": "submission time"}, inplace=True)
     return latest
 
-def metrics_per_ep(pred, true):
+def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
+    """
+    Generate bootstrap samples for a given size and number of samples.
+
+    Parameters
+    ----------
+    size : int
+        The size of the data.
+    n_samples : int
+        The number of samples to generate.
+
+    Returns
+    -------
+    np.ndarray
+        Returns a numpy array of the bootstrap samples.
+    """
+    rng = np.random.default_rng(0)
+    return rng.choice(size, size=(n_samples, size), replace=True)
+
+
+def metrics_per_ep(pred: np.ndarray,
+                   true: np.ndarray
+                   )->Tuple[float, float, float, float]:
+    """Predict evaluation metrics for a single sample
+
+    Parameters
+    ----------
+    pred : np.ndarray
+        Array with predictions
+    true : np.ndarray
+        Array with actual values
+
+    Returns
+    -------
+    Tuple[float, float, float, float]
+        Resulting metrics: (MAE, RAE, R2, Spearman R, Kendall's Tau)
+    """
     from scipy.stats import spearmanr, kendalltau
     from sklearn.metrics import mean_absolute_error, r2_score
     mae = mean_absolute_error(true, pred)
@@ -45,4 +85,76 @@ def metrics_per_ep(pred, true):
     spr, _ = spearmanr(true, pred)
     ktau, _ = kendalltau(true, pred)
 
-    return mae, rae, r2, spr, ktau
+    return mae, rae, r2, spr, ktau
+
+def bootstrap_metrics(pred: np.ndarray,
+                      true: np.ndarray,
+                      endpoint: str,
+                      n_bootstrap_samples=1000
+                      )->pd.DataFrame:
+    """Calculate bootstrap metrics given predicted and true values
+
+    Parameters
+    ----------
+    pred : np.ndarray
+        Predicted endpoints
+    true : np.ndarray
+        Actual endpoint values
+    endpoint : str
+        String with endpoint
+    n_bootstrap_samples : int, optional
+        Size of bootstrapsample, by default 1000
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with estimated metric per bootstrap sample for the given endpoint
+    """
+    cols = ["Sample", "Endpoint", "Metric", "Value"]
+    bootstrap_results = pd.DataFrame(columns=cols, dtype=[int, str, str, float])
+    for i, indx in enumerate(
+        bootstrap_sampling(true.shape[0], n_bootstrap_samples)
+    ):
+        mae, rae, r2, spr, ktau = metrics_per_ep(pred[indx], true[indx])
+        scores = pd.DataFrame(
+            [
+                [i, endpoint, "MAE", mae],
+                [i, endpoint, "RAE", rae],
+                [i, endpoint, "R2", r2],
+                [i, endpoint, "Spearman R", spr],
+                [i, endpoint, "Kendall's Tau", ktau]
+            ],
+            columns=cols
+        )
+        bootstrap_results = pd.concat([bootstrap_results, scores])
+    return bootstrap_results
+
+def map_metric_to_stats(df: pd.DataFrame, average=False) -> pd.DataFrame:
+    """Map mean and std to 'mean +/- std' string for each metric
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe to modify
+    average : bool, optional
+        Whether the dataframe contains average info, by default False
+
+    Returns
+    -------
+    pd.DataFrame
+        Modified dataframe
+    """
+    metric_cols = METRICS[:]
+    if average:
+        metric_cols[1] = "MA-RAE"
+    cols_drop = []
+    for col in metric_cols:
+        mean_col = f"mean_{col}"
+        std_col = f"std_{col}"
+        df[col] = df.apply(
+            lambda row: f"{row[mean_col]:.2f} +/- {row[std_col]:.2f}",
+            axis=1
+        )
+        cols_drop.extend([mean_col, std_col])
+    df = df.drop(columns=cols_drop)
+    return df
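Taken together, the new helpers form a simple pipeline: resample indices, recompute the metrics on each replicate, then report each metric as "mean +/- std". A minimal end-to-end sketch with synthetic data (only MAE shown; illustration, not the repo code):

    import numpy as np

    # Synthetic predictions for one endpoint (illustration only).
    rng = np.random.default_rng(0)
    y_true = rng.normal(size=50)
    y_pred = y_true + rng.normal(scale=0.3, size=50)

    # Same resampling scheme as bootstrap_sampling: one row of indices per bootstrap replicate.
    idx = rng.choice(y_true.shape[0], size=(1000, y_true.shape[0]), replace=True)

    # MAE per replicate (metrics_per_ep additionally returns RAE, R2, Spearman R and Kendall's Tau).
    maes = np.abs(y_true[idx] - y_pred[idx]).mean(axis=1)

    # The leaderboard cell is then rendered as "mean +/- std", as map_metric_to_stats does.
    print(f"MAE: {maes.mean():.2f} +/- {maes.std():.2f}")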