import json
import os
import pickle
from typing import Dict

import numpy as np
import pandas as pd
from scipy.stats import sem

from utils.constants import (NORM_BASE_SUBMISSION, DATASETS, DIGITS_FOR_VALUES,
                             DIGITS_FOR_ERRORS, DIMENSIONS, COLUMN_ORDER,
                             MODEL_INFO_FILE, RESULTS_DIR)
from utils import compute_tools


def load_results(folder: str = RESULTS_DIR,
                 items_to_ignore: tuple = ("__pycache__", "compiled.pkl", ".DS_Store")):
    """Load per-submission result CSVs and annotate them with model metadata.

    Args:
        folder: folder containing one sub-folder per submission
        items_to_ignore: entries in the results folder that are not submissions

    Returns:
        (all_submission_results, all_model_names, all_submissions) where
        all_submission_results maps submission name -> {"results": DataFrame,
        "all_backbones": list of backbone ids}.
    """
    # read model info (sizes and display names keyed by backbone id)
    with open(MODEL_INFO_FILE) as f:
        model_info = json.load(f)
    model_size = model_info["MODEL_SIZE"]
    backbone_names = model_info["BACKBONE_NAMES"]

    # read submission info, skipping non-submission entries
    all_submissions = [s for s in os.listdir(folder) if s not in items_to_ignore]

    all_submission_results = {}
    # TBD: add some info to json files and read here also
    all_model_names = []
    for submission in all_submissions:
        combined_results = pd.read_csv(f"{folder}/{submission}/results_and_parameters.csv")
        # FIX: drop the leftover "index" *column* (the original dropped a row
        # label, which silently did nothing because of errors='ignore')
        combined_results = combined_results.drop(columns=["index"], errors='ignore')
        combined_results["# params"] = combined_results.apply(
            lambda row: model_size[row.backbone], axis=1)
        combined_results["Model"] = combined_results.apply(
            lambda row: backbone_names[row.backbone], axis=1)
        combined_results["Config Settings"] = combined_results.apply(
            lambda row: get_config_setting_string(row), axis=1)
        # TBD: read json info
        all_backbones = list(set(combined_results["backbone"].tolist()))
        all_submission_results[submission] = {
            "results": combined_results,
            "all_backbones": all_backbones,
            # "json_info": json_info,
        }
        all_model_names.extend(all_backbones)
    all_model_names = list(set(all_model_names))
    return all_submission_results, all_model_names, all_submissions


def _aggregate(df: pd.DataFrame, group_keys, value_col: str, agg_fn,
               new_name: str = None) -> pd.DataFrame:
    """Group `df` by `group_keys`, aggregate `value_col` with `agg_fn`, and
    return a flat frame with the "partition name" key dropped and the value
    column optionally renamed to `new_name`."""
    series = df.groupby(list(group_keys))[value_col].apply(agg_fn)
    out = series.to_frame().reset_index()
    out = out.drop(columns=["partition name"], errors='ignore')
    if new_name is not None:
        out = out.rename(columns={value_col: new_name})
    return out


def _pivot_per_dataset(per_dataset: pd.DataFrame, backbones: list,
                       value_col: str) -> pd.DataFrame:
    """Reshape a long (backbone, dataset, value) frame into one row per
    backbone with one column per dataset."""
    rows = []
    for backbone in backbones:
        data = per_dataset.loc[per_dataset["backbone"] == backbone]
        data = data.drop(columns=["backbone"]).rename(
            columns={value_col: backbone, "dataset": "backbone"})
        rows.append(data.set_index(['backbone']).T.reset_index())
    # seed the concat with an empty frame so the dataset columns come out in
    # canonical DATASETS order
    wide = pd.concat([pd.DataFrame(columns=["backbone"] + DATASETS)] + rows)
    wide = wide.drop(columns=["backbone"]).rename(columns={"index": "backbone"})
    return wide.reset_index(drop=True)


def _merge_dimension_stats(overall: pd.DataFrame, df: pd.DataFrame, group_keys,
                           value_col: str, agg_fn) -> pd.DataFrame:
    """Merge one aggregated column per DIMENSIONS entry into `overall`
    (left-join on backbone)."""
    for dimension in DIMENSIONS:
        dimension_data = df.loc[df["dataset"].isin(DIMENSIONS[dimension])].copy()
        dimension_stats = _aggregate(dimension_data, group_keys, value_col,
                                     agg_fn, new_name=dimension)
        overall = overall.merge(dimension_stats, how="left", on="backbone")
    return overall


def compute_all_iqms(
        all_submission_results: dict,
        benchmark_name: str,
        dataset_group_keys: list = ("backbone", "dataset", "partition name"),
        overall_group_keys: list = ("backbone", "partition name"),
        metric: str = "test metric",
) -> Dict:
    """
    - reads combined results from repeated seeds for multiple models
    - computes the raw and normalized IQM by dataset for each model by task type
    - computes the raw and normalized overall IQM across multiple datasets in
      each task type

    Args:
        all_submission_results: output of load_results
        benchmark_name: name of normalizer file to be used
        dataset_group_keys: grouping for computing dataset IQM
        overall_group_keys: grouping for computing overall IQM
        metric: the column containing scores/values in the combined results tables
    """
    output = {}
    for submission in all_submission_results:
        submission_backbones = all_submission_results[submission]["all_backbones"]
        partition_name = "0.10x train" if "data_10_perc" in submission else "1.00x train"
        submission_results = all_submission_results[submission]["results"]
        # the original guarded this with an "if column missing" check and then
        # assigned unconditionally anyway; a single assignment is equivalent
        submission_results["partition name"] = partition_name

        # raw values / standard errors, per dataset (wide: one row per backbone)
        raw_per_dataset = _aggregate(submission_results, dataset_group_keys, metric, np.mean)
        raw_per_dataset_err = _aggregate(submission_results, dataset_group_keys, metric, sem)
        raw_per_dataset_final = _pivot_per_dataset(raw_per_dataset, submission_backbones, metric)
        raw_per_dataset_final_err = _pivot_per_dataset(raw_per_dataset_err, submission_backbones, metric)

        # overall raw values / errors, plus one column per dimension
        # (bootstrap-based errors were disabled in the original; sem is used instead)
        raw_overall = _aggregate(submission_results, overall_group_keys, metric,
                                 np.mean, "Overall Mean")
        raw_overall_std_err = _aggregate(submission_results, overall_group_keys, metric,
                                         sem, "Overall Mean")
        raw_overall = _merge_dimension_stats(raw_overall, submission_results,
                                             overall_group_keys, metric, np.mean)
        raw_overall_std_err = _merge_dimension_stats(raw_overall_std_err, submission_results,
                                                     overall_group_keys, metric, sem)

        # normalize results; normalize_data_frame adds a new column and
        # returns its name
        normalizer = compute_tools.load_normalizer(benchmark_name=benchmark_name)
        new_metric = normalizer.normalize_data_frame(df=submission_results, metric=metric)

        # normalized IQM / trimmed sem, per dataset.  FIX: the original
        # filtered the error frame with a boolean mask built from the *values*
        # frame (normalized_per_dataset), which is only correct by accidental
        # index alignment; the helper masks each frame on its own column.
        normalized_per_dataset = _aggregate(submission_results, dataset_group_keys,
                                            new_metric, compute_tools.iqm)
        normalized_per_dataset_err = _aggregate(submission_results, dataset_group_keys,
                                                new_metric, compute_tools.trimmed_sem)
        normalized_per_dataset_final = _pivot_per_dataset(
            normalized_per_dataset, submission_backbones, new_metric)
        normalized_per_dataset_final_err = _pivot_per_dataset(
            normalized_per_dataset_err, submission_backbones, new_metric)

        # overall normalized IQM / errors, plus one column per dimension
        # (FIX: the original dropped "partition name" here without
        # errors='ignore', unlike every other drop in the file)
        normalized_overall = _aggregate(submission_results, overall_group_keys,
                                        new_metric, compute_tools.iqm, "Overall IQM")
        normalized_overall_std_err = _aggregate(submission_results, overall_group_keys,
                                                new_metric, compute_tools.trimmed_sem,
                                                "Overall IQM")
        normalized_overall = _merge_dimension_stats(
            normalized_overall, submission_results, overall_group_keys,
            new_metric, compute_tools.iqm)
        normalized_overall_std_err = _merge_dimension_stats(
            normalized_overall_std_err, submission_results, overall_group_keys,
            new_metric, compute_tools.trimmed_sem)

        output[submission] = {
            "raw_per_dataset": raw_per_dataset_final,
            "raw_overall": raw_overall,
            "normalized_per_dataset": normalized_per_dataset_final,
            "normalized_overall": normalized_overall,
            "raw_per_dataset_err": raw_per_dataset_final_err,
            "raw_overall_err": raw_overall_std_err,
            "normalized_per_dataset_err": normalized_per_dataset_final_err,
            "normalized_overall_err": normalized_overall_std_err,
        }
    return output


def format_values(x):
    """Format a fractional score as a percentage string with one decimal."""
    return "{:.1f}".format(x * 100)


def format_errors(x):
    """Format a fractional error as a percentage string with one decimal."""
    return "{:.1f}".format(x * 100)


def get_config_setting_string(row) -> str:
    """Build the human-readable "Config Settings" string for one results row."""
    config_settings = f"""
    Early Stop Patience: {row.early_stop_patience} /
    Decoder: {row.decoder} /
    # trials: {row.n_trials} /
    Data : {row.data_percentages}% /
    Batch Size Selection: {row.batch_size_selection}
    """
    # shorten internal config tokens to their display form
    config_settings = config_settings.replace("early_stopping_50", "50") \
        .replace("n_trials_16", "16").replace("data_100_perc", "100")
    return config_settings


def _attach_metadata(table: pd.DataFrame, parameters: pd.DataFrame,
                     backbones: list) -> pd.DataFrame:
    """Fill the "Config Settings", "Model" and "# params" columns of `table`
    from the first matching row of `parameters` for each backbone."""
    table["Config Settings"] = "-"
    table["Model"] = "-"
    table["# params"] = "-"
    for backbone in backbones:
        mask = table["backbone"] == backbone
        params = parameters.loc[parameters["backbone"] == backbone]
        table.loc[mask, "Config Settings"] = params["Config Settings"].tolist()[0]
        table.loc[mask, "Model"] = params["Model"].tolist()[0]
        table.loc[mask, "# params"] = params["# params"].tolist()[0]
    return table


def get_overall_performance_table(all_submission_results: dict,
                                  all_iqms: dict) -> Dict:
    """Create the table for the main 'overall performance' page (one entry for
    normalized and one for raw values, each with an `_err` companion)."""
    output = {}
    for value in ("normalized", "raw"):
        all_tables = []
        all_tables_err = []
        for submission in all_submission_results:
            parameters = all_submission_results[submission]["results"]
            backbones = all_submission_results[submission]["all_backbones"]

            submission_data = all_iqms[submission][f"{value}_overall"].copy()
            submission_data["submission"] = submission
            submission_data = _attach_metadata(submission_data, parameters, backbones)

            submission_data_err = all_iqms[submission][f"{value}_overall_err"].copy()
            submission_data_err["submission"] = submission
            submission_data_err = _attach_metadata(submission_data_err, parameters, backbones)

            all_tables.append(submission_data)
            all_tables_err.append(submission_data_err)

        all_tables = pd.concat(all_tables)
        all_tables_err = pd.concat(all_tables_err)
        value_cols = COLUMN_ORDER[value]["overall_table"]
        all_tables.loc[:, value_cols] = all_tables[value_cols].round(
            DIGITS_FOR_VALUES).apply(lambda series: series.apply(format_values))
        all_tables_err.loc[:, value_cols] = all_tables_err[value_cols].round(
            DIGITS_FOR_ERRORS).apply(lambda series: series.apply(format_errors))
        output[value] = all_tables[COLUMN_ORDER["all_tables"] + value_cols]
        output[f"{value}_err"] = all_tables_err[COLUMN_ORDER["all_tables"] + value_cols]
    return output


def get_performance_by_dimension_table(all_submission_results: dict,
                                       all_iqms: dict) -> Dict:
    """Create the tables for the main 'performance by dimension' page, one
    table (and `_err` companion) per dimension in DIMENSIONS."""
    output = {}
    for value in ("normalized", "raw"):
        all_tables = {}
        all_tables_err = {}
        for dimension in DIMENSIONS:
            dimension_tables = []
            dimension_tables_err = []
            for submission in all_submission_results:
                parameters = all_submission_results[submission]["results"]
                backbones = all_submission_results[submission]["all_backbones"]

                # per-dataset columns for this dimension, plus the dimension's
                # aggregate column taken from the overall table
                submission_data = all_iqms[submission][f"{value}_per_dataset"][
                    DIMENSIONS[dimension] + ["backbone"]].copy()
                dimension_results = all_iqms[submission][f"{value}_overall"][
                    [dimension] + ["backbone"]].copy()
                submission_data = submission_data.merge(dimension_results,
                                                        how="left", on="backbone")
                submission_data["submission"] = submission
                submission_data = _attach_metadata(submission_data, parameters, backbones)

                submission_data_err = all_iqms[submission][f"{value}_per_dataset_err"][
                    DIMENSIONS[dimension] + ["backbone"]].copy()
                dimension_results_err = all_iqms[submission][f"{value}_overall_err"][
                    [dimension] + ["backbone"]].copy()
                submission_data_err = submission_data_err.merge(
                    dimension_results_err, how="left", on="backbone")
                submission_data_err["submission"] = submission
                submission_data_err = _attach_metadata(submission_data_err,
                                                       parameters, backbones)

                dimension_tables.append(submission_data)
                dimension_tables_err.append(submission_data_err)

            new_column = (f"Overall {dimension} IQM" if value == "normalized"
                          else f"Overall {dimension} Mean")

            dimension_tables = pd.concat(dimension_tables)
            dimension_tables.loc[:, DIMENSIONS[dimension]] = dimension_tables[
                DIMENSIONS[dimension]].round(DIGITS_FOR_VALUES).apply(
                lambda series: series.apply(format_values))
            dimension_tables.loc[:, dimension] = dimension_tables[dimension].round(
                DIGITS_FOR_VALUES).apply(format_values)
            dimension_tables = dimension_tables[
                COLUMN_ORDER["all_tables"] + [dimension]
                + COLUMN_ORDER[value]["dimension_tables"] + DIMENSIONS[dimension]]
            all_tables[dimension] = dimension_tables.rename(
                columns={dimension: new_column})

            dimension_tables_err = pd.concat(dimension_tables_err)
            dimension_tables_err.loc[:, DIMENSIONS[dimension]] = dimension_tables_err[
                DIMENSIONS[dimension]].round(DIGITS_FOR_ERRORS).apply(
                lambda series: series.apply(format_errors))
            dimension_tables_err.loc[:, dimension] = dimension_tables_err[dimension].round(
                DIGITS_FOR_ERRORS).apply(format_errors)
            dimension_tables_err = dimension_tables_err[
                COLUMN_ORDER["all_tables"] + [dimension]
                + COLUMN_ORDER[value]["dimension_tables"] + DIMENSIONS[dimension]]
            all_tables_err[f"{dimension}_err"] = dimension_tables_err.rename(
                columns={dimension: new_column})

        output[value] = all_tables
        output[f"{value}_err"] = all_tables_err
    return output


def get_datasets_tables(all_submission_results: dict, all_iqms: dict) -> Dict:
    """Create one table (and `_err` companion) per dataset in DATASETS."""
    output = {}
    for value in ("normalized", "raw"):
        all_tables = {}
        all_tables_err = {}
        column = "IQM" if value == "normalized" else "Mean"
        for dataset in DATASETS:
            dataset_tables = []
            dataset_tables_err = []
            for submission in all_submission_results:
                parameters = all_submission_results[submission]["results"]
                backbones = all_submission_results[submission]["all_backbones"]

                submission_data = all_iqms[submission][f"{value}_per_dataset"][
                    ["backbone", dataset]].copy()
                submission_data["submission"] = submission
                submission_data = _attach_metadata(submission_data, parameters, backbones)
                # rename once, after metadata is attached (the original
                # repeated this rename inside the per-backbone loop)
                submission_data = submission_data.rename(columns={dataset: column})

                submission_data_err = all_iqms[submission][f"{value}_per_dataset_err"][
                    ["backbone", dataset]].copy()
                submission_data_err["submission"] = submission
                submission_data_err = _attach_metadata(submission_data_err,
                                                       parameters, backbones)
                submission_data_err = submission_data_err.rename(columns={dataset: column})

                # TBD: columns to add: "submission", "Task Type", "HPO parameters",
                # "Date", "Followed Evaluation Protocol", "Reproducible", "Comments"
                dataset_tables.append(submission_data)
                dataset_tables_err.append(submission_data_err)

            dataset_tables = pd.concat(dataset_tables)
            dataset_tables.loc[:, column] = dataset_tables[column].round(
                DIGITS_FOR_VALUES).apply(format_values)
            all_tables[dataset] = dataset_tables[
                COLUMN_ORDER["all_tables"] + COLUMN_ORDER[value]["dataset_tables"]]

            dataset_tables_err = pd.concat(dataset_tables_err)
            dataset_tables_err.loc[:, column] = dataset_tables_err[column].round(
                DIGITS_FOR_ERRORS).apply(format_errors)
            all_tables_err[dataset] = dataset_tables_err[
                COLUMN_ORDER["all_tables"] + COLUMN_ORDER[value]["dataset_tables"]]

        output[value] = all_tables
        output[f"{value}_err"] = all_tables_err
    return output


if __name__ == "__main__":
    # load results
    all_submission_results, all_model_names, all_submissions = load_results(folder=RESULTS_DIR)

    # build the normalizer from the base submission's results
    benchmark_name = f"leaderboard_{NORM_BASE_SUBMISSION}_main"
    combined_results = all_submission_results[NORM_BASE_SUBMISSION]["results"].copy()
    compute_tools.make_normalizer(combined_results.reset_index(),
                                  metrics=("test metric",),
                                  benchmark_name=benchmark_name)

    all_iqms = compute_all_iqms(
        all_submission_results=all_submission_results,
        benchmark_name=benchmark_name,
    )

    # create tables to be rendered
    overall_performance_tables = get_overall_performance_table(
        all_submission_results=all_submission_results, all_iqms=all_iqms)
    performance_by_dimension_tables = get_performance_by_dimension_table(
        all_submission_results=all_submission_results, all_iqms=all_iqms)
    datasets_tables = get_datasets_tables(
        all_submission_results=all_submission_results, all_iqms=all_iqms)

    compiled_results = {
        "overall_performance_tables": overall_performance_tables,
        "performance_by_dimension_tables": performance_by_dimension_tables,
        "datasets_tables": datasets_tables,
    }
    with open(f'{RESULTS_DIR}/compiled.pkl', 'wb') as handle:
        pickle.dump(compiled_results, handle, protocol=pickle.HIGHEST_PROTOCOL)