import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from torch.utils.data import Dataset, DataLoader from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, f1_score from scipy.stats import spearmanr from collections import defaultdict import pandas as pd import logging import os import torch.optim as optim from datetime import datetime from transformers import AutoModel, AutoConfig, AutoTokenizer import os # point HF_ENDPOINT at your mirror # os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" class UnpooledBindingPredictor(nn.Module): def __init__(self, esm_model_name="facebook/esm2_t33_650M_UR50D", hidden_dim=512, kernel_sizes=[3, 5, 7], n_heads=8, n_layers=3, dropout=0.1, freeze_esm=True): super().__init__() # Define binding thresholds self.tight_threshold = 7.5 # Kd/Ki/IC50 ≤ ~30nM self.weak_threshold = 6.0 # Kd/Ki/IC50 > 1μM # Load ESM model for computing embeddings on the fly self.esm_model = AutoModel.from_pretrained(esm_model_name) self.config = AutoConfig.from_pretrained(esm_model_name) # Freeze ESM parameters if needed if freeze_esm: for param in self.esm_model.parameters(): param.requires_grad = False # Get ESM hidden size esm_dim = self.config.hidden_size # Output channels for CNN layers output_channels_per_kernel = 64 # CNN layers for handling variable length sequences self.protein_conv_layers = nn.ModuleList([ nn.Conv1d( in_channels=esm_dim, out_channels=output_channels_per_kernel, kernel_size=k, padding='same' ) for k in kernel_sizes ]) self.binder_conv_layers = nn.ModuleList([ nn.Conv1d( in_channels=esm_dim, out_channels=output_channels_per_kernel, kernel_size=k, padding='same' ) for k in kernel_sizes ]) # Calculate total features after convolution and pooling total_features_per_seq = output_channels_per_kernel * len(kernel_sizes) * 2 # Project to same dimension after CNN processing self.protein_projection = nn.Linear(total_features_per_seq, hidden_dim) self.binder_projection = nn.Linear(total_features_per_seq, hidden_dim) self.protein_norm = nn.LayerNorm(hidden_dim) self.binder_norm = nn.LayerNorm(hidden_dim) # Cross attention blocks with layer norm self.cross_attention_layers = nn.ModuleList([ nn.ModuleDict({ 'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout), 'norm1': nn.LayerNorm(hidden_dim), 'ffn': nn.Sequential( nn.Linear(hidden_dim, hidden_dim * 4), nn.ReLU(), nn.Dropout(dropout), nn.Linear(hidden_dim * 4, hidden_dim) ), 'norm2': nn.LayerNorm(hidden_dim) }) for _ in range(n_layers) ]) # Prediction heads self.shared_head = nn.Sequential( nn.Linear(hidden_dim * 2, hidden_dim), nn.ReLU(), nn.Dropout(dropout), ) # Regression head self.regression_head = nn.Linear(hidden_dim, 1) # Classification head (3 classes: tight, medium, loose binding) self.classification_head = nn.Linear(hidden_dim, 3) def get_binding_class(self, affinity): """Convert affinity values to class indices 0: tight binding (>= 7.5) 1: medium binding (6.0-7.5) 2: weak binding (< 6.0) """ if isinstance(affinity, torch.Tensor): tight_mask = affinity >= self.tight_threshold weak_mask = affinity < self.weak_threshold medium_mask = ~(tight_mask | weak_mask) classes = torch.zeros_like(affinity, dtype=torch.long) classes[medium_mask] = 1 classes[weak_mask] = 2 return classes else: if affinity >= self.tight_threshold: return 0 # tight binding elif affinity < self.weak_threshold: return 2 # weak binding else: return 1 # medium binding def compute_embeddings(self, input_ids, attention_mask=None): """Compute ESM embeddings on the fly""" esm_outputs = self.esm_model( input_ids=input_ids, attention_mask=attention_mask, return_dict=True ) # Get the unpooled last hidden states (batch_size x seq_length x hidden_size) return esm_outputs.last_hidden_state def process_sequence(self, unpooled_emb, conv_layers, attention_mask=None): """Process a sequence through CNN layers and pooling""" # Transpose for CNN: [batch_size, hidden_size, seq_length] x = unpooled_emb.transpose(1, 2) # Apply CNN layers and collect outputs conv_outputs = [] for conv in conv_layers: conv_out = F.relu(conv(x)) conv_outputs.append(conv_out) # Concatenate along channel dimension conv_output = torch.cat(conv_outputs, dim=1) # Global pooling (both max and average) # If attention mask is provided, use it to create a proper mask for pooling if attention_mask is not None: # Create a mask for pooling (1 for valid positions, 0 for padding) # Expand mask to match conv_output channels expanded_mask = attention_mask.unsqueeze(1).expand(-1, conv_output.size(1), -1) # Apply mask (set padding to large negative value for max pooling) masked_output = conv_output.clone() masked_output = masked_output.masked_fill(expanded_mask == 0, float('-inf')) # Max pooling along sequence dimension max_pooled = torch.max(masked_output, dim=2)[0] # Average pooling (sum divided by number of valid positions) sum_pooled = torch.sum(conv_output * expanded_mask, dim=2) valid_positions = torch.sum(expanded_mask, dim=2) valid_positions = torch.clamp(valid_positions, min=1.0) # Avoid division by zero avg_pooled = sum_pooled / valid_positions else: # If no mask, use standard pooling max_pooled = torch.max(conv_output, dim=2)[0] avg_pooled = torch.mean(conv_output, dim=2) # Concatenate the pooled features pooled = torch.cat([max_pooled, avg_pooled], dim=1) return pooled def forward(self, protein_input_ids, binder_input_ids, protein_mask=None, binder_mask=None): # Compute embeddings on the fly using the ESM model protein_unpooled = self.compute_embeddings(protein_input_ids, protein_mask) binder_unpooled = self.compute_embeddings(binder_input_ids, binder_mask) # Process protein and binder sequences through CNN layers protein_features = self.process_sequence(protein_unpooled, self.protein_conv_layers, protein_mask) binder_features = self.process_sequence(binder_unpooled, self.binder_conv_layers, binder_mask) # Project to same dimension protein = self.protein_norm(self.protein_projection(protein_features)) binder = self.binder_norm(self.binder_projection(binder_features)) # Reshape for attention: from [batch_size, hidden_dim] to [1, batch_size, hidden_dim] protein = protein.unsqueeze(0) binder = binder.unsqueeze(0) # Cross attention layers for layer in self.cross_attention_layers: # Protein attending to binder attended_protein = layer['attention']( protein, binder, binder )[0] protein = layer['norm1'](protein + attended_protein) protein = layer['norm2'](protein + layer['ffn'](protein)) # Binder attending to protein attended_binder = layer['attention']( binder, protein, protein )[0] binder = layer['norm1'](binder + attended_binder) binder = layer['norm2'](binder + layer['ffn'](binder)) # Remove sequence dimension protein_pool = protein.squeeze(0) binder_pool = binder.squeeze(0) # Concatenate both representations combined = torch.cat([protein_pool, binder_pool], dim=-1) # Shared features shared_features = self.shared_head(combined) regression_output = self.regression_head(shared_features) classification_logits = self.classification_head(shared_features) return regression_output, classification_logits def load_model(checkpoint_path, device): """Load trained model from checkpoint.""" checkpoint = torch.load(checkpoint_path, map_location=device) # Import the model class from your module or redefine it here # Initialize model with the same parameters used during training model = UnpooledBindingPredictor( esm_model_name="facebook/esm2_t33_650M_UR50D", hidden_dim=384, kernel_sizes=[3, 5, 7], n_heads=8, n_layers=4, dropout=0.14561457009902096, freeze_esm=True ).to(device) # Load the trained weights model.load_state_dict(checkpoint['model_state_dict']) model.eval() # Set to evaluation mode return model def prepare_inputs(protein_sequence, binder_sequence, tokenizer, max_length=1024, device='cuda'): """Tokenize protein and binder sequences.""" protein_tokens = tokenizer( protein_sequence, return_tensors="pt", padding="max_length", max_length=max_length, truncation=True ) binder_tokens = tokenizer( binder_sequence, return_tensors="pt", padding="max_length", max_length=max_length, truncation=True ) return { 'protein_input_ids': protein_tokens['input_ids'].to(device), 'protein_attention_mask': protein_tokens['attention_mask'].to(device), 'binder_input_ids': binder_tokens['input_ids'].to(device), 'binder_attention_mask': binder_tokens['attention_mask'].to(device) } # Perform prediction def predict_binding(model, protein_sequence, binder_sequence, device='cuda'): """Predict binding affinity between protein and binder sequences.""" tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D") inputs = prepare_inputs(protein_sequence, binder_sequence, tokenizer, device=device) with torch.no_grad(): regression_output, classification_logits = model( inputs['protein_input_ids'], inputs['binder_input_ids'], inputs['protein_attention_mask'], inputs['binder_attention_mask'] ) # Get numerical prediction (pKd/pKi) predicted_affinity = regression_output.item() # Get classification prediction (tight, medium, weak) predicted_class_idx = torch.argmax(classification_logits, dim=1).item() class_names = ['Tight binding', 'Medium binding', 'Weak binding'] predicted_class = class_names[predicted_class_idx] # Get class probabilities class_probs = F.softmax(classification_logits, dim=1).cpu().numpy()[0] return { 'predicted_affinity': predicted_affinity, 'binding_class': predicted_class, 'class_probabilities': {name: prob for name, prob in zip(class_names, class_probs)}, 'tight_threshold': model.tight_threshold, # 7.5 (≤ ~30nM) 'weak_threshold': model.weak_threshold # 6.0 (> 1μM) } # Example usage if __name__ == "__main__": # Set device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Load the model model = load_model('/scratch/pranamlab/tong/checkpoints/MOG-DFM/classifier_ckpt/binding_affinity_unpooled.pt', device) # Example protein sequences (replace with actual sequences) binders = ['GLSKGCFGLKLDRIGSMSGLGC', 'RGLSDGFLKLKMGISGSLGC'] protein_sequence = "RNLTLAVVLPEHNLSYAWAWPRVGPAVALAVEALGRALPVDLRFVSSELEGACSEYLAPLSAVDLKLYHDPDLLLGPGCVYPAASVARFASHWRLPLLTAGAVASGFSAKNDHYRTLVRTGPSAPKLGEFVVTLHGHFNWTARAALLYLDARTDDRPHYFTIEGVFEALQGSNLSVQHQVYAREPGGPEQATHFIRANGRIVYICGPLEMLHEILLQAQRENLTNGDYVFFYLDVFGESLRAGPTRATGRPWQDNRTREQAQALREAFQTVLVITYREPPNPEYQEFQNRLLIRAREDFGVELGPSLMNLIAGCFYDGILLYAEVLNETIQEGGTREDGLRIVEKMQGRRYHGVTGLVVMDKNNDRETDFVLWAMGDLDSGDFQPAAHYSGAEKQIWWTGRPIPWVKGAPPSDNPPCAFDLDDPSCDKTPLSTLAI" # name = "CLIC1_10_moppit" # print(name) # with open(f'/home/tc415/flow_matching/samples/unconditional_samples/12.txt', 'r') as f: # binders = f.readlines() # binders = [binder.strip() for binder in binders] # binders = binders[:100] # # Make prediction affinities = [] for binder in binders: result = predict_binding(model, protein_sequence, binder, device) print(result['predicted_affinity']) affinities.append(result['predicted_affinity']) # with open('/home/tc415/flow_matching/scores/affinity/EWSFLI1_12_unconditional.txt', 'w') as f: # for score in affinities: # f.write(str(score) + '\n') # print(sum(affinities) / len(affinities)) # with open(f'/home/tc415/flow_matching/scores/affinity/{name}.txt', 'w') as f: # for score in affinities: # f.write(str(round(score, 4)) + '\n') # Display results # print(f"Predicted binding affinity (pKd/pKi): {result['predicted_affinity']:.2f}") # print(f"Binding class: {result['binding_class']}") # print("Class probabilities:") # for class_name, prob in result['class_probabilities'].items(): # print(f" {class_name}: {prob:.2f}")