import pandas as pd import json import os import numpy as np import uuid from utils.constants import (NEW_SUBMISSION_FOLDER, CSV_FILE, JSON_FILE, DIMENSIONS, NEW_SUBMISSION_COLUMN_INFO, NEW_SUBMISSION_COLUMN_NAMES, JSON_FORMAT, MODEL_INFO_FILE, RESULTS_DIR) def check_correct_file_type(folder_contents) -> bool: """ checks that folder has 2 items: a csv file and a json file """ contains_correct_files = (len(folder_contents) == 2) and (CSV_FILE in folder_contents) and (JSON_FILE in folder_contents) if not contains_correct_files: print("\nInput Validation Error: Please check that the {NEW_SUBMISSION_FOLDER} contains the files: \ {CSV_FILE} and {JSON_FILE}") return False return True def check_csv_columns_datatypes() -> bool: """ checks that csv file has only required columns and columns have correct data types """ #check for correct columns csv_data = pd.read_csv(f"{NEW_SUBMISSION_FOLDER}/{CSV_FILE}") submitted_csv_column_names = set(csv_data.columns) expected_column_names = set(NEW_SUBMISSION_COLUMN_NAMES) for item in expected_column_names: if item not in submitted_csv_column_names: print(f"The following column is missing: {item}") correct_columns = expected_column_names.issubset(submitted_csv_column_names) if not correct_columns: print(f"\nInput Validation Error: Please ensure that the csv file contains the following columns: {NEW_SUBMISSION_COLUMN_NAMES}") #check for correct dtype correct_dtypes = [] for col in NEW_SUBMISSION_COLUMN_INFO["string_cols"]: if col in csv_data.columns: print(f"{col} is string/object:{pd.api.types.is_object_dtype(csv_data[col])}") correct_dtypes.append(pd.api.types.is_object_dtype(csv_data[col])) for col in NEW_SUBMISSION_COLUMN_INFO["integer_cols"]: if col in csv_data.columns: print(f"{col} is numeric: {pd.api.types.is_numeric_dtype(csv_data[col])}") correct_dtypes.append(pd.api.types.is_numeric_dtype(csv_data[col])) for col in NEW_SUBMISSION_COLUMN_INFO["float_cols"]: if col in csv_data.columns: print(f"{col} is numeric: {pd.api.types.is_numeric_dtype(csv_data[col])}") correct_dtypes.append(pd.api.types.is_numeric_dtype(csv_data[col])) correct_dtypes = all(correct_dtypes) if not correct_dtypes: print(f"\nInput Validation Error: Please ensure that the csv columns have the correct datatypes as follows: \n\ string/object type columns: {NEW_SUBMISSION_COLUMN_INFO['string_cols']} \n\ numeric/integer type columns: {NEW_SUBMISSION_COLUMN_INFO['integer_cols']}\ {NEW_SUBMISSION_COLUMN_INFO['float_cols']}") return correct_columns, correct_dtypes def check_correct_entries_per_dataset(required_seeds: int = 10) -> bool: """ checks for correct number of runs per backbone/dataset combination checks for required number of unique seeds """ csv_data = pd.read_csv(f"{NEW_SUBMISSION_FOLDER}/{CSV_FILE}") count_values = csv_data.groupby(["backbone", "dataset"]).count() count_values = list(set(count_values["test metric"].tolist())) correct_num_values = (len(count_values) == 1) and (count_values[0] == required_seeds) if not correct_num_values: print(f"\nInput Validation Error: Please ensure that each backbone/dataset combination has {required_seeds} entries") count_seeds = csv_data.groupby(["backbone", "dataset"]).nunique() count_seeds = list(set(count_seeds["Seed"].tolist())) correct_num_seeds = (len(count_seeds) == 1) and (count_seeds[0] == required_seeds) if not correct_num_seeds: print(f"\nInput Validation Warning: Please ensure that each backbone/dataset combination has {required_seeds} unique seeds") return correct_num_values, correct_num_seeds def check_json_keys() -> bool: """ checks json file has required keys and subkeys, check json file values have correct data type """ with open(f"{NEW_SUBMISSION_FOLDER}/{JSON_FILE}") as f: json_submission_data = json.load(f) #TBD: check json file nested values have correct data type all_required_keys = [] for key, value in JSON_FORMAT.items(): if (key in json_submission_data) and (type(value) == type(json_submission_data[key])): all_required_keys.append(True) else: all_required_keys.append(False) all_required_keys = all(all_required_keys) if not all_required_keys: print(f"\nInput Validation Error: Please ensure that json file has the correct keys and datatypes") return all_required_keys def check_has_atleast_one_dimension() -> bool: """ check that submission contains datasets required for at least one submission """ csv_data = pd.read_csv(f"{NEW_SUBMISSION_FOLDER}/{CSV_FILE}") submitted_csv_datasets = set(csv_data["dataset"].tolist()) contains_atleast_one_dimension = [] for dimension, datasets in DIMENSIONS.items(): datasets = set(datasets) contains_atleast_one_dimension.append(datasets.issubset(submitted_csv_datasets)) contains_atleast_one_dimension = any(contains_atleast_one_dimension) if not contains_atleast_one_dimension: print("\nInput Validation Error: Please check that the submission contains all datasets for one or more dimensions") return contains_atleast_one_dimension def update_new_backbones_and_models(): """ checks if backbone exists in model_info.json (used to display results) if not, information on the new model is added to the json file """ with open(f"{NEW_SUBMISSION_FOLDER}/{JSON_FILE}") as f: json_submission_data = json.load(f) #read model info with open(MODEL_INFO_FILE) as f: existing_model_info = json.load(f) for item in json_submission_data["New model info"]: submitted_backbone = item["unique_backbone_key"] if submitted_backbone not in existing_model_info["BACKBONE_NAMES"]: existing_model_info["BACKBONE_NAMES"][submitted_backbone] = item["model_display_name"] existing_model_info["MODEL_SIZE"][submitted_backbone] = item["model_size"] #save new information with open(MODEL_INFO_FILE, 'w') as fp: json.dump(existing_model_info, fp) def validate_new_submission() -> bool: """ """ #get folder contents if not os.path.exists(NEW_SUBMISSION_FOLDER): return folder_contents = os.listdir(NEW_SUBMISSION_FOLDER) items_to_ignore = ['.DS_Store'] for item in items_to_ignore: if item in folder_contents: folder_contents.remove(item) if len(folder_contents) == 0: print("no new submissions") return #check all conditions correct_file_type = check_correct_file_type(folder_contents) correct_columns, correct_dtypes = check_csv_columns_datatypes() correct_num_values, correct_num_seeds = check_correct_entries_per_dataset() correct_json_keys = check_json_keys() contains_atleast_one_dimension = check_has_atleast_one_dimension() all_checks_passed = all([correct_file_type, correct_columns, correct_dtypes, correct_json_keys, correct_num_values, #correct_num_seeds, contains_atleast_one_dimension]) if all_checks_passed: submission_id = uuid.uuid4() os.makedirs(f"{RESULTS_DIR}/{submission_id}") #copy only required keys in json file to new submission folder with open(f"{NEW_SUBMISSION_FOLDER}/{JSON_FILE}") as f: json_submission_data = json.load(f) new_dict = {} for key, value in JSON_FORMAT.items(): if value == "TBD": continue new_dict[key] = json_submission_data[key] with open(f"{RESULTS_DIR}/{submission_id}/{JSON_FILE}", 'w') as fp: json.dump(new_dict, fp) #copy only required columns in csv file to new submission folder csv_data = pd.read_csv(f"{NEW_SUBMISSION_FOLDER}/{CSV_FILE}") csv_data = csv_data[NEW_SUBMISSION_COLUMN_NAMES] csv_data.to_csv(f"{RESULTS_DIR}/{submission_id}/{CSV_FILE}", index=False) #add any new model info to model_info.json update_new_backbones_and_models() #reset NEW_SUBMISSION_FOLDER os.system(f"rm -r {NEW_SUBMISSION_FOLDER}/") os.makedirs(NEW_SUBMISSION_FOLDER) return else: print("\nThe new sumbission has not been formatted correctly. Please fix the errors above") raise ValueError if __name__ == "__main__": validate_new_submission()