Update src/bin/binding_affinity_estimator.py
src/bin/binding_affinity_estimator.py
CHANGED
@@ -12,8 +12,7 @@ sys.path.append('.')
 
 from sklearn import linear_model
 from sklearn.model_selection import KFold
-from sklearn.metrics import mean_squared_error,mean_absolute_error
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_squared_error, mean_absolute_error
 from sklearn.preprocessing import MinMaxScaler
 
 skempi_vectors_path = None
@@ -22,35 +21,35 @@ representation_name = None
 def load_representation(multi_col_representation_vector_file_path):
     print("\nLoading representation vectors...\n")
     multi_col_representation_vector = pd.read_csv(multi_col_representation_vector_file_path)
-    vals = multi_col_representation_vector.iloc[:,1:(len(multi_col_representation_vector.columns))]
-    original_values_as_df = pd.DataFrame({'PDB_ID': pd.Series([], dtype='str'),'Vector': pd.Series([], dtype='object')})
-    for index, row in tqdm.tqdm(vals.iterrows(), total
+    vals = multi_col_representation_vector.iloc[:, 1:(len(multi_col_representation_vector.columns))]
+    original_values_as_df = pd.DataFrame({'PDB_ID': pd.Series([], dtype='str'), 'Vector': pd.Series([], dtype='object')})
+    for index, row in tqdm.tqdm(vals.iterrows(), total=len(vals)):
         list_of_floats = [float(item) for item in list(row)]
         original_values_as_df.loc[index] = [multi_col_representation_vector.iloc[index]['PDB_ID']] + [list_of_floats]
     return original_values_as_df
 
 def calc_train_error(X_train, y_train, model):
-    '''
+    '''Returns in-sample error for an already fit model.'''
     predictions = model.predict(X_train)
     mse = mean_squared_error(y_train, predictions)
     mae = mean_absolute_error(y_train, predictions)
     corr = scipy.stats.pearsonr(y_train, predictions)
-    return mse,mae,corr
-
+    return mse, mae, corr
+
 def calc_validation_error(X_test, y_test, model):
-    '''
+    '''Returns out-of-sample error for an already fit model.'''
     predictions = model.predict(X_test)
     mse = mean_squared_error(y_test, predictions)
     mae = mean_absolute_error(y_test, predictions)
     corr = scipy.stats.pearsonr(y_test, predictions)
-    return mse,mae,corr
-
+    return mse, mae, corr
+
 def calc_metrics(X_train, y_train, X_test, y_test, model):
-    '''
+    '''Fits the model and returns the metrics for in-sample and out-of-sample errors.'''
     model.fit(X_train, y_train)
-    train_mse_error,train_mae_error,train_corr = calc_train_error(X_train, y_train, model)
-    val_mse_error,val_mae_error,val_corr = calc_validation_error(X_test, y_test, model)
-    return train_mse_error, val_mse_error, train_mae_error, val_mae_error,train_corr,val_corr
+    train_mse_error, train_mae_error, train_corr = calc_train_error(X_train, y_train, model)
+    val_mse_error, val_mae_error, val_corr = calc_validation_error(X_test, y_test, model)
+    return train_mse_error, val_mse_error, train_mae_error, val_mae_error, train_corr, val_corr
 
 def report_results(
     train_mse_error_list,
@@ -62,41 +61,35 @@ def report_results(
     train_corr_pval_list,
     validation_corr_pval_list,
 ):
-            "validation_corr_pval": list(np.multiply(validation_corr_pval_list, 100)),
-        },
-        index=range(len(train_mse_error_list)),
-    )
-    return result_df, result_detail_df
-
+    result_summary = {
+        "train_mse_error": round(np.mean(train_mse_error_list) * 100, 4),
+        "train_mse_std": round(np.std(train_mse_error_list) * 100, 4),
+        "val_mse_error": round(np.mean(validation_mse_error_list) * 100, 4),
+        "val_mse_std": round(np.std(validation_mse_error_list) * 100, 4),
+        "train_mae_error": round(np.mean(train_mae_error_list) * 100, 4),
+        "train_mae_std": round(np.std(train_mae_error_list) * 100, 4),
+        "val_mae_error": round(np.mean(validation_mae_error_list) * 100, 4),
+        "val_mae_std": round(np.std(validation_mae_error_list) * 100, 4),
+        "train_corr": round(np.mean(train_corr_list), 4),
+        "train_corr_pval": round(np.mean(train_corr_pval_list), 4),
+        "validation_corr": round(np.mean(validation_corr_list), 4),
+        "validation_corr_pval": round(np.mean(validation_corr_pval_list), 4),
+    }
+
+    result_detail = {
+        "train_mse_errors": list(np.multiply(train_mse_error_list, 100)),
+        "val_mse_errors": list(np.multiply(validation_mse_error_list, 100)),
+        "train_mae_errors": list(np.multiply(train_mae_error_list, 100)),
+        "val_mae_errors": list(np.multiply(validation_mae_error_list, 100)),
+        "train_corrs": list(np.multiply(train_corr_list, 100)),
+        "train_corr_pvals": list(np.multiply(train_corr_pval_list, 100)),
+        "validation_corrs": list(np.multiply(validation_corr_list, 100)),
+        "validation_corr_pvals": list(np.multiply(validation_corr_pval_list, 100)),
+    }
+
+    return result_summary, result_detail
 
-def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
+def predictAffinityWithModel(regressor_model, multiplied_vectors_df):
     K = 10
     kf = KFold(n_splits=K, shuffle=True, random_state=42)
 
@@ -110,13 +103,15 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
     validation_corr_pval_list = []
 
     data = np.array(np.asarray(multiplied_vectors_df["Vector"].tolist()), dtype=float)
-    ppi_affinity_filtered_df = ppi_affinity_df
+    ppi_affinity_filtered_df = ppi_affinity_df[
+        ppi_affinity_df['Protein1'].isin(multiplied_vectors_df['Protein1']) &
+        ppi_affinity_df['Protein2'].isin(multiplied_vectors_df['Protein2'])
+    ]
     target = np.array(ppi_affinity_filtered_df["Affinity"])
     scaler = MinMaxScaler()
     scaler.fit(target.reshape(-1, 1))
     target = scaler.transform(target.reshape(-1, 1))[:, 0]
+
     for train_index, val_index in tqdm.tqdm(kf.split(data, target), total=K):
 
         # split data
@@ -124,9 +119,9 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
         y_train, y_val = target[train_index], target[val_index]
 
         # instantiate model
-        reg = regressor_model
+        reg = regressor_model
 
-        # calculate
+        # calculate errors
         (
            train_mse_error,
            val_mse_error,
@@ -136,7 +131,7 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
            val_corr,
        ) = calc_metrics(X_train, y_train, X_val, y_val, reg)
 
-        # append to appropriate
+        # append to appropriate lists
        train_mse_error_list.append(train_mse_error)
        validation_mse_error_list.append(val_mse_error)
 
@@ -162,35 +157,40 @@ def predictAffinityWithModel(regressor_model,multiplied_vectors_df):
 
 ppi_affinity_file_path = "../data/auxilary_input/skempi_pipr/SKEMPI_all_dg_avg.txt"
 ppi_affinity_file = os.path.join(script_dir, ppi_affinity_file_path)
-ppi_affinity_df = pd.read_csv(ppi_affinity_file,sep="\t",header=None)
+ppi_affinity_df = pd.read_csv(ppi_affinity_file, sep="\t", header=None)
 ppi_affinity_df.columns = ['Protein1', 'Protein2', 'Affinity']
 
-#Calculate vector element-wise multiplication as described in https://academic.oup.com/bioinformatics/article/35/14/i305/5529260
-
 def calculate_vector_multiplications(skempi_vectors_df):
-    multiplied_vectors = pd.DataFrame({
+    multiplied_vectors = pd.DataFrame({
+        'Protein1': pd.Series([], dtype='str'),
+        'Protein2': pd.Series([], dtype='str'),
+        'Vector': pd.Series([], dtype='object')
+    })
     print("Element-wise vector multiplications are being calculated")
     rep_prot_list = list(skempi_vectors_df['PDB_ID'])
+
+    for index, row in tqdm.tqdm(ppi_affinity_df.iterrows()):
        if row['Protein1'] in rep_prot_list and row['Protein2'] in rep_prot_list:
-            vec1 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID']
+            vec1 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID'] == row['Protein1']]['Vector'])[0]
+            vec2 = list(skempi_vectors_df[skempi_vectors_df['PDB_ID'] == row['Protein2']]['Vector'])[0]
+            multiplied_vec = np.multiply(vec1, vec2)
+
+            multiplied_vectors = multiplied_vectors.append({
+                'Protein1': row['Protein1'],
+                'Protein2': row['Protein2'],
+                'Vector': multiplied_vec
+            }, ignore_index=True)
+
    return multiplied_vectors
 
 def predict_affinities_and_report_results():
     skempi_vectors_df = load_representation(skempi_vectors_path)
     multiplied_vectors_df = calculate_vector_multiplications(skempi_vectors_df)
     model = linear_model.BayesianRidge()
-    result_df.to_csv(os.path.join(script_dir, r"../results/Affinity_prediction_skempiv1_{0}.csv".format(representation_name)),index=False)
-    result_detail_df.to_csv(os.path.join(script_dir, r"../results/Affinity_prediction_skempiv1_{0}_detail.csv".format(representation_name)),index=False)
+    result_summary, result_detail = predictAffinityWithModel(model, multiplied_vectors_df)
 
+    # Return the results as a dictionary instead of writing to a file
+    return {
+        "summary": result_summary,
+        "detail": result_detail
+    }
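
For reference, a minimal usage sketch of the refactored entry point. It assumes the script can be imported as a module; the module name binding_affinity_estimator and the vector file path below are illustrative assumptions, not part of the commit, while skempi_vectors_path and predict_affinities_and_report_results are names the script itself defines.

import binding_affinity_estimator as bae  # hypothetical import path, for illustration only

# Point the module at a representation-vector CSV (this path is an assumption).
bae.skempi_vectors_path = "skempi_representation_vectors.csv"

results = bae.predict_affinities_and_report_results()

# The updated function returns a dict instead of writing CSV files:
# results["summary"] holds fold-averaged metrics, results["detail"] holds per-fold lists.
print(results["summary"]["val_mse_error"])
print(results["detail"]["val_mae_errors"])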
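
One caveat on the new calculate_vector_multiplications: it accumulates rows with DataFrame.append, which pandas deprecated in 1.4 and removed in 2.0. Below is a minimal append-free sketch of the same accumulation, assuming the same three-column layout; it is an illustration, not part of the commit.

import numpy as np
import pandas as pd

def multiply_pair_vectors(ppi_affinity_df, skempi_vectors_df):
    # Same element-wise multiplication of the two per-protein vectors,
    # but rows are collected in a list and the frame is built once at the end.
    vectors_by_id = dict(zip(skempi_vectors_df['PDB_ID'], skempi_vectors_df['Vector']))
    rows = []
    for _, row in ppi_affinity_df.iterrows():
        if row['Protein1'] in vectors_by_id and row['Protein2'] in vectors_by_id:
            rows.append({
                'Protein1': row['Protein1'],
                'Protein2': row['Protein2'],
                'Vector': np.multiply(vectors_by_id[row['Protein1']], vectors_by_id[row['Protein2']]),
            })
    return pd.DataFrame(rows, columns=['Protein1', 'Protein2', 'Vector'])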