import os

import pandas as pd
from huggingface_hub import HfApi

script_dir = os.path.dirname(os.path.abspath(__file__))  # Directory of the running script
def download_files_from_hub(benchmark_types, repo_id="mgyigit/probe-data", repo_type="space"):
    api = HfApi(token=os.getenv("api_key"))  # Hugging Face token stored as the Space's api_key secret
    benchmark_types = benchmark_types + ["leaderboard"]  # Always fetch the leaderboard file as well

    for benchmark in benchmark_types:
        file_name = f"{benchmark}_results.csv"
        local_path = f"/tmp/{file_name}"

        try:
            # Download the file from the specified repo into /tmp
            api.hf_hub_download(
                repo_id=repo_id,
                filename=file_name,
                repo_type=repo_type,
                local_dir="/tmp",
            )
            print(f"Downloaded {file_name} from {repo_id} to {local_path}")
        except Exception as e:
            print(f"Failed to download {file_name}: {e}")

    return 0
def upload_to_hub(benchmark_types, repo_id="mgyigit/probe-data", repo_type="space"):
    api = HfApi(token=os.getenv("api_key"))  # Hugging Face token stored as the Space's api_key secret
    benchmark_types = benchmark_types + ["leaderboard"]  # Always push the leaderboard file as well

    for benchmark in benchmark_types:
        file_name = f"{benchmark}_results.csv"
        local_path = f"/tmp/{file_name}"

        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=file_name,
            repo_id=repo_id,
            repo_type=repo_type,
            commit_message=f"Updating {file_name}",
        )
        print(f"Uploaded {local_path} to {repo_id}/{file_name}")

        os.remove(local_path)
        print(f"Removed local file: {local_path}")

    return 0
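

# A minimal usage sketch of the round trip above (the benchmark name and ordering are
# illustrative assumptions, not taken from the repository):
#
#     download_files_from_hub(["similarity"])   # fetches similarity_results.csv and leaderboard_results.csv into /tmp
#     # ...update the local CSVs with the save_* helpers below...
#     upload_to_hub(["similarity"])             # pushes the updated CSVs back to the Space and removes the local copies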
def save_csv_locally(dataframe, file_name, save_dir="/tmp"):
    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Construct the full file path
    file_path = os.path.join(save_dir, file_name)

    # Save the DataFrame as a CSV
    dataframe.to_csv(file_path, index=False)
    print(f"Saved {file_name} to {file_path}")

    return file_path
def save_similarity_output(
    output_dict,
    method_name,
    leaderboard_path="/tmp/leaderboard_results.csv",
    similarity_path="/tmp/similarity_results.csv",
    repo_id="mgyigit/probe-data",
):
    # Load the existing DataFrames; both files must already be present
    if os.path.exists(leaderboard_path):
        leaderboard_df = pd.read_csv(leaderboard_path)
    else:
        print("Leaderboard file not found!")
        return -1

    if os.path.exists(similarity_path):
        similarity_df = pd.read_csv(similarity_path)
    else:
        print("Similarity file not found!")
        return -1

    # Ensure the method_name row exists in both DataFrames
    if method_name not in similarity_df['Method'].values:
        new_row = {col: None for col in similarity_df.columns}
        new_row['Method'] = method_name
        similarity_df = pd.concat([similarity_df, pd.DataFrame([new_row])], ignore_index=True)

    if method_name not in leaderboard_df['Method'].values:
        new_row = {col: None for col in leaderboard_df.columns}
        new_row['Method'] = method_name
        leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame([new_row])], ignore_index=True)

    averages = {}
    for dataset in ['sparse', '200', '500']:
        correlation_values = []
        pvalue_values = []

        for aspect in ['MF', 'BP', 'CC']:
            correlation_key = f"{dataset}_{aspect}_correlation"
            pvalue_key = f"{dataset}_{aspect}_pvalue"

            # Update correlation if present
            if correlation_key in output_dict:
                correlation = output_dict[correlation_key].item()
                correlation_values.append(correlation)
                similarity_df.loc[similarity_df['Method'] == method_name, correlation_key] = correlation
                leaderboard_df.loc[leaderboard_df['Method'] == method_name, f"sim_{correlation_key}"] = correlation

            # Update p-value if present
            if pvalue_key in output_dict:
                pvalue = output_dict[pvalue_key].item()
                pvalue_values.append(pvalue)
                similarity_df.loc[similarity_df['Method'] == method_name, pvalue_key] = pvalue
                leaderboard_df.loc[leaderboard_df['Method'] == method_name, f"sim_{pvalue_key}"] = pvalue

        # Calculate averages only if all three aspects are present
        if len(correlation_values) == 3:
            averages[f"{dataset}_Ave_correlation"] = sum(correlation_values) / 3
            similarity_df.loc[similarity_df['Method'] == method_name, f"{dataset}_Ave_correlation"] = averages[f"{dataset}_Ave_correlation"]
            leaderboard_df.loc[leaderboard_df['Method'] == method_name, f"sim_{dataset}_Ave_correlation"] = averages[f"{dataset}_Ave_correlation"]

        if len(pvalue_values) == 3:
            averages[f"{dataset}_Ave_pvalue"] = sum(pvalue_values) / 3
            similarity_df.loc[similarity_df['Method'] == method_name, f"{dataset}_Ave_pvalue"] = averages[f"{dataset}_Ave_pvalue"]
            leaderboard_df.loc[leaderboard_df['Method'] == method_name, f"sim_{dataset}_Ave_pvalue"] = averages[f"{dataset}_Ave_pvalue"]

    # Write the updated CSVs back to /tmp
    save_csv_locally(leaderboard_df, "leaderboard_results.csv")
    save_csv_locally(similarity_df, "similarity_results.csv")

    return 0
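

# Sketch of the output_dict shape save_similarity_output() expects, inferred from the access
# pattern above: keys of the form "<dataset>_<aspect>_correlation" / "<dataset>_<aspect>_pvalue"
# whose values expose .item() (e.g. NumPy/SciPy scalars). The numbers are illustrative only:
#
#     import numpy as np
#     example_output = {
#         "sparse_MF_correlation": np.float64(0.41), "sparse_MF_pvalue": np.float64(1e-5),
#         "sparse_BP_correlation": np.float64(0.38), "sparse_BP_pvalue": np.float64(2e-4),
#         "sparse_CC_correlation": np.float64(0.35), "sparse_CC_pvalue": np.float64(3e-4),
#     }
#     save_similarity_output(example_output, "my_method")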
def save_function_output(model_output, method_name, func_results_path="/home/user/app/src/data/function_results.csv", leaderboard_path="/home/user/app/src/data/leaderboard_results.csv"):
    # Load or initialize the DataFrames
    if os.path.exists(func_results_path):
        func_results_df = pd.read_csv(func_results_path)
    else:
        func_results_df = pd.DataFrame(columns=['Method'])

    if os.path.exists(leaderboard_path):
        leaderboard_df = pd.read_csv(leaderboard_path)
    else:
        leaderboard_df = pd.DataFrame(columns=['Method'])

    # Ensure the method_name row exists in both DataFrames
    if method_name not in func_results_df['Method'].values:
        func_results_df = pd.concat([func_results_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)
    if method_name not in leaderboard_df['Method'].values:
        leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)

    # Storage for averaging in leaderboard results
    metrics_sum = {
        'accuracy': {'BP': [], 'CC': [], 'MF': []},
        'F1': {'BP': [], 'CC': [], 'MF': []},
        'precision': {'BP': [], 'CC': [], 'MF': []},
        'recall': {'BP': [], 'CC': [], 'MF': []},
    }

    # Iterate over each entry in model_output
    for entry in model_output:
        key = entry[0]
        accuracy, f1, precision, recall = entry[1], entry[4], entry[7], entry[10]

        # Parse the key to extract the aspect and datasets
        aspect, dataset1, dataset2 = key.split('_')

        # Save each metric to function_results under its respective column
        func_results_df.loc[func_results_df['Method'] == method_name, f"{aspect}_{dataset1}_{dataset2}_accuracy"] = accuracy
        func_results_df.loc[func_results_df['Method'] == method_name, f"{aspect}_{dataset1}_{dataset2}_F1"] = f1
        func_results_df.loc[func_results_df['Method'] == method_name, f"{aspect}_{dataset1}_{dataset2}_precision"] = precision
        func_results_df.loc[func_results_df['Method'] == method_name, f"{aspect}_{dataset1}_{dataset2}_recall"] = recall

        # Add values for leaderboard averaging
        metrics_sum['accuracy'][aspect].append(accuracy)
        metrics_sum['F1'][aspect].append(f1)
        metrics_sum['precision'][aspect].append(precision)
        metrics_sum['recall'][aspect].append(recall)

    # Calculate averages for each aspect and overall (if all aspects have entries)
    for metric in ['accuracy', 'F1', 'precision', 'recall']:
        for aspect in ['BP', 'CC', 'MF']:
            if metrics_sum[metric][aspect]:
                aspect_average = sum(metrics_sum[metric][aspect]) / len(metrics_sum[metric][aspect])
                leaderboard_df.loc[leaderboard_df['Method'] == method_name, f"func_{aspect}_{metric}"] = aspect_average

        # Calculate the overall average only if every aspect has entries
        if all(metrics_sum[metric][aspect] for aspect in ['BP', 'CC', 'MF']):
            overall_average = sum(
                sum(metrics_sum[metric][aspect]) / len(metrics_sum[metric][aspect])
                for aspect in ['BP', 'CC', 'MF']
            ) / 3
            leaderboard_df.loc[leaderboard_df['Method'] == method_name, f"func_Ave_{metric}"] = overall_average

    # Save updated DataFrames to CSV
    func_results_df.to_csv(func_results_path, index=False)
    leaderboard_df.to_csv(leaderboard_path, index=False)

    return 0
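

# Sketch of the model_output layout save_function_output() expects, inferred from the indices
# read above: an iterable of rows whose first element is an "<aspect>_<dataset1>_<dataset2>"
# key and whose positions 1, 4, 7 and 10 hold accuracy, F1, precision and recall (any other
# positions are ignored here). The key and numbers below are illustrative assumptions only:
#
#     example_row = ["MF_heldout_test", 0.81, None, None, 0.76, None, None, 0.79, None, None, 0.74]
#     save_function_output([example_row], "my_method")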
def save_family_output(model_output, method_name, leaderboard_path="/home/user/app/src/data/leaderboard_results.csv", family_results_path="/home/user/app/src/data/family_results.csv"):
    # Load or initialize the DataFrames
    if os.path.exists(leaderboard_path):
        leaderboard_df = pd.read_csv(leaderboard_path)
    else:
        leaderboard_df = pd.DataFrame(columns=['Method'])

    if os.path.exists(family_results_path):
        family_results_df = pd.read_csv(family_results_path)
    else:
        family_results_df = pd.DataFrame(columns=['Method'])

    # Ensure the method_name row exists in the leaderboard results
    if method_name not in leaderboard_df['Method'].values:
        leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)

    # Ensure the method_name row exists in the family results
    if method_name not in family_results_df['Method'].values:
        family_results_df = pd.concat([family_results_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)

    # Iterate through the datasets and metrics
    for dataset, metrics in model_output.items():
        for metric, values in metrics.items():
            # Store the average across folds in the leaderboard results
            avg_value = sum(values) / len(values) if values else None
            leaderboard_df.loc[leaderboard_df['Method'] == method_name, f"fam_{dataset}_{metric}_ave"] = avg_value

            # Store each fold result in the family results
            for i, value in enumerate(values):
                family_results_df.loc[family_results_df['Method'] == method_name, f"{dataset}_{metric}_{i}"] = value

    # Save updated DataFrames to CSV
    leaderboard_df.to_csv(leaderboard_path, index=False)
    family_results_df.to_csv(family_results_path, index=False)

    return leaderboard_df, family_results_df
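

# Sketch of the model_output shape save_family_output() expects, inferred from the loops above:
# a nested dict mapping dataset -> metric -> list of per-fold values. Names and numbers below
# are illustrative assumptions only:
#
#     example_output = {
#         "dataset1": {"accuracy": [0.91, 0.89, 0.90], "F1": [0.88, 0.86, 0.87]},
#     }
#     save_family_output(example_output, "my_method")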
def save_affinity_output(model_output, method_name, leaderboard_path="/home/user/app/src/data/leaderboard_results.csv", affinity_results_path="/home/user/app/src/data/affinity_results.csv"):
    # Load or initialize the DataFrames
    if os.path.exists(leaderboard_path):
        leaderboard_df = pd.read_csv(leaderboard_path)
    else:
        leaderboard_df = pd.DataFrame(columns=['Method'])

    if os.path.exists(affinity_results_path):
        affinity_results_df = pd.read_csv(affinity_results_path)
    else:
        affinity_results_df = pd.DataFrame(columns=['Method'])

    # Ensure the method_name row exists in the leaderboard results
    if method_name not in leaderboard_df['Method'].values:
        leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)

    # Ensure the method_name row exists in the affinity results
    if method_name not in affinity_results_df['Method'].values:
        affinity_results_df = pd.concat([affinity_results_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)

    # Process the 'summary' section for the leaderboard results
    summary = model_output.get('summary', {})
    if summary:
        leaderboard_df.loc[leaderboard_df['Method'] == method_name, 'aff_mse_ave'] = summary.get('val_mse_error')
        leaderboard_df.loc[leaderboard_df['Method'] == method_name, 'aff_mae_ave'] = summary.get('val_mae_error')
        leaderboard_df.loc[leaderboard_df['Method'] == method_name, 'aff_corr_ave'] = summary.get('validation_corr')

    # Process the 'detail' section for the affinity results
    detail = model_output.get('detail', {})
    if detail:
        # Save each 10-fold cross-validation result for mse, mae, and corr
        for i in range(10):
            if 'val_mse_errors' in detail:
                affinity_results_df.loc[affinity_results_df['Method'] == method_name, f"mse_{i}"] = detail['val_mse_errors'][i]
            if 'val_mae_errors' in detail:
                affinity_results_df.loc[affinity_results_df['Method'] == method_name, f"mae_{i}"] = detail['val_mae_errors'][i]
            if 'validation_corrs' in detail:
                affinity_results_df.loc[affinity_results_df['Method'] == method_name, f"corr_{i}"] = detail['validation_corrs'][i]

    # Save updated DataFrames to CSV
    leaderboard_df.to_csv(leaderboard_path, index=False)
    affinity_results_df.to_csv(affinity_results_path, index=False)

    return 0
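

# Sketch of the model_output shape save_affinity_output() expects, inferred from the keys read
# above: a 'summary' dict with averaged validation errors/correlation and a 'detail' dict with
# ten per-fold values for each metric. The numbers below are illustrative only:
#
#     example_output = {
#         "summary": {"val_mse_error": 0.42, "val_mae_error": 0.51, "validation_corr": 0.63},
#         "detail": {
#             "val_mse_errors": [0.40] * 10,
#             "val_mae_errors": [0.50] * 10,
#             "validation_corrs": [0.62] * 10,
#         },
#     }
#     save_affinity_output(example_output, "my_method")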