"""Submission and evaluation helpers for the leaderboard Gradio app.

Uploads participant prediction CSVs (plus a sidecar metadata JSON) to the
submissions dataset, scores them against the held-out test set, and writes
per-endpoint metrics to the results dataset.
"""

import datetime
import io
import json
import tempfile
from pathlib import Path

import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics import mean_absolute_error, r2_score

from about import API, ENDPOINTS, results_repo, submissions_repo, test_repo


def _compact_dict(d: dict) -> dict:
    """Drop None/empty-string values; strip whitespace for strings."""
    out = {}
    for k, v in d.items():
        if isinstance(v, str):
            v = v.strip()
        if v not in (None, "", []):
            out[k] = v
    return out


def submit_data(predictions_file: str,
                user_state,
                *,
                participant_name: str = "",
                discord_username: str = "",
                email: str = "",
                affiliation: str = ""):
    """Validate a predictions CSV and upload it plus a metadata JSON.

    Parameters
    ----------
    predictions_file:
        Path to the uploaded CSV. Must contain a prediction column for every
        endpoint in ``ENDPOINTS``.
    user_state:
        Username/alias of the submitter; required.
    participant_name, discord_username, email, affiliation:
        Optional contact fields; empty values are dropped from the metadata.

    Returns
    -------
    tuple[str, str]
        A success message and the in-repo path of the uploaded CSV.

    Raises
    ------
    gr.Error
        On any validation or read failure. (The original returned a plain
        "❌ ..." string on CSV read errors, which was inconsistent with every
        other failure path and broke callers expecting a 2-tuple.)
    """
    if user_state is None:
        raise gr.Error("Username or alias is required for submission.")

    file_path = Path(predictions_file).resolve()
    if not file_path.exists():
        raise gr.Error("Uploaded file object does not have a valid file path.")

    # Read results file; raise gr.Error for consistency with the checks above.
    try:
        results_df = pd.read_csv(file_path)
    except Exception as e:
        raise gr.Error(f"Error reading results file: {e}")

    if results_df.empty:
        raise gr.Error("The uploaded file is empty.")

    missing = set(ENDPOINTS) - set(results_df.columns)
    if missing:
        raise gr.Error(
            "The uploaded file must contain all endpoint predictions; "
            f"missing columns: {sorted(missing)}"
        )

    # Build the destination filename. Colons are not valid in repo paths, so
    # replace them in the ISO timestamp; sanitize the username the same way.
    ts = (datetime.datetime.now(datetime.timezone.utc)
          .isoformat(timespec="seconds")
          .replace(":", "-"))
    # str() before strip(): user_state may not already be a str.
    safe_user = str(user_state).strip().replace("/", "_").replace(" ", "_")
    destination_csv = f"submissions/{safe_user}_{ts}.csv"
    destination_json = destination_csv.replace(".csv", ".json")

    # Upload the raw predictions CSV.
    API.upload_file(
        path_or_fileobj=str(file_path),
        path_in_repo=destination_csv,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add submission for {safe_user} at {ts}",
    )

    # Optional participant record (empty fields dropped).
    participant_fields = _compact_dict({
        "participant_name": participant_name,
        "discord_username": discord_username,
        "email": email,
        "affiliation": affiliation,
    })

    # Sidecar metadata JSON; ``evaluated`` is presumably flipped later by the
    # evaluation job (not visible here).
    meta = {
        "submission_time_utc": ts,
        "user": user_state,
        "original_filename": file_path.name,
        "evaluated": False,
        **participant_fields,  # merged here
    }
    meta_bytes = io.BytesIO(json.dumps(meta, indent=2).encode("utf-8"))
    API.upload_file(
        path_or_fileobj=meta_bytes,
        path_in_repo=destination_json,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add metadata for {user_state} submission at {ts}",
    )

    return ("✅ Your submission has been received! Your scores will appear on the leaderboard shortly.",
            destination_csv)


def evaluate_data(filename: str) -> None:
    """Score a submission against the test set and upload the results CSV.

    Downloads the submission CSV and its sibling metadata JSON from the
    submissions dataset, computes per-endpoint metrics, tags the result rows
    with the submitting user, and uploads them to the results dataset.

    Raises
    ------
    gr.Error
        If any download, the evaluation, or the metadata load fails; in that
        case nothing is written to the results dataset.
    """
    # Load the submission csv
    try:
        local_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=filename,
        )
    except Exception as e:
        raise gr.Error(f"Failed to download submission file: {e}")

    # Load the test set
    try:
        test_path = hf_hub_download(
            repo_id=test_repo,
            repo_type="dataset",
            filename="data/test_dataset.csv",
        )
    except Exception as e:
        raise gr.Error(f"Failed to download test file: {e}")

    data_df = pd.read_csv(local_path)
    test_df = pd.read_csv(test_path)

    try:
        results_df = calculate_metrics(data_df, test_df)
        if not isinstance(results_df, pd.DataFrame) or results_df.empty:
            raise gr.Error("Evaluation produced no results.")
    except Exception as e:
        raise gr.Error(f'Evaluation failed: {e}. No results written to results dataset.')

    # Load the metadata file to recover the submitting user and timestamp.
    meta_filename = filename.replace(".csv", ".json")
    try:
        meta_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=meta_filename,
        )
        with open(meta_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
        username = meta.get("user")
        timestamp = meta.get("submission_time_utc")
    except Exception as e:
        raise gr.Error(f"Failed to load metadata file: {e}. No results written to results dataset.")

    # Write results to results dataset
    results_df['user'] = username
    safe_user = str(username).replace("/", "_").replace(" ", "_")
    destination_path = f"results/{safe_user}_{timestamp}_results.csv"

    # Remove the temp file even if the upload raises (the original leaked it
    # on upload failure because unlink was not in a finally block).
    tmp_name = None
    try:
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
            results_df.to_csv(tmp, index=False)
            tmp.flush()
            tmp_name = tmp.name
        API.upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo=destination_path,
            repo_id=results_repo,
            repo_type="dataset",
            commit_message=f"Add result data for {username}",
        )
    finally:
        if tmp_name is not None:
            Path(tmp_name).unlink(missing_ok=True)


def calculate_metrics(
        results_dataframe: pd.DataFrame,
        test_dataframe: pd.DataFrame
):
    """Compute per-endpoint regression metrics for a submission.

    For each endpoint in ``ENDPOINTS``, predictions and ground truth are
    aligned by an inner merge on 'Molecule Name'. (The original dropped NaNs
    from each frame independently and sorted both by name, which misaligns or
    fails outright whenever the two frames cover different molecule sets.)

    Returns a DataFrame with one row per endpoint and columns
    ["endpoint", "MAE", "R2", "Spearman R", "Kendall's Tau"].
    """
    def _metrics_per_ep(pred, true):
        # MAE/R2 from sklearn; rank correlations from scipy (p-values unused).
        mae = mean_absolute_error(true, pred)
        r2 = r2_score(true, pred)
        spr, _ = spearmanr(true, pred)
        ktau, _ = kendalltau(true, pred)
        return mae, r2, spr, ktau

    rows = []
    for measurement in ENDPOINTS:
        df_pred = results_dataframe[['Molecule Name', measurement]].dropna()
        df_true = test_dataframe[['Molecule Name', measurement]].dropna()
        # Inner merge guarantees pred/true refer to the same molecules in the
        # same order, regardless of row order or coverage of either input.
        merged = df_pred.merge(df_true, on='Molecule Name',
                               suffixes=("_pred", "_true"))
        pred = merged[f"{measurement}_pred"]
        true = merged[f"{measurement}_true"]
        mae, r2, spearman, ktau = _metrics_per_ep(pred, true)
        rows.append({
            "endpoint": measurement,
            "MAE": mae,
            "R2": r2,
            "Spearman R": spearman,
            "Kendall's Tau": ktau,
        })
    return pd.DataFrame(rows, columns=["endpoint", "MAE", "R2",
                                       "Spearman R", "Kendall's Tau"])