Spaces:
Sleeping
Sleeping
| import logging | |
| import sys | |
| import numpy as np | |
| sys.path.append("../") | |
| # from tdc.multi_pred import GDA | |
| import pandas as pd | |
| from torch.utils.data import Dataset | |
| LOGGER = logging.getLogger(__name__) | |
class GDA_Dataset(Dataset):
    """Candidate dataset over ALL gene-to-disease interactions.

    Thin indexable wrapper around three parallel columns:
    protein sequences, disease descriptions, and scores.
    """

    def __init__(self, data_examples):
        # data_examples packs three parallel columns, in this order:
        # sequences, descriptions, scores.
        seqs, descriptions, labels = (
            data_examples[0],
            data_examples[1],
            data_examples[2],
        )
        self.protein_seqs = seqs
        self.disease_dess = descriptions
        self.scores = labels

    def __getitem__(self, query_idx):
        # Return the aligned (protein sequence, disease description, score).
        return (
            self.protein_seqs[query_idx],
            self.disease_dess[query_idx],
            self.scores[query_idx],
        )

    def __len__(self):
        return len(self.protein_seqs)
class TDC_Pretrain_Dataset(Dataset):
    """
    Dataset of TDC:
    ALL gene-disease associations
    """

    # NOTE(review): `GDA` comes from `tdc.multi_pred`, but that import is
    # commented out at the top of this file — instantiating this class
    # raises NameError until the import is restored.
    def __init__(self, data_dir="../../data/pretrain/", test=False):
        # `test` is accepted for interface parity with the sibling pretrain
        # datasets but is not used in this class.
        LOGGER.info("Initializing TDC Pretraining Dataset ! ...")
        # Load the DisGeNET gene-disease association benchmark via TDC.
        data = GDA(name="DisGeNET")  # , path=data_dir
        # Add one sampled negative per positive pair (frac = 1).
        data.neg_sample(frac = 1)
        # Binarize raw association scores around threshold 0.
        data.binarize(threshold = 0, order = 'ascending')
        self.datasets = data.get_split()
        self.name = "DisGeNET"
        # Only the train split is used for pretraining.
        self.dataset_df = self.datasets['train']
        # self.dataset_df = pd.read_csv(f"{data_dir}/disgenet_gda.csv")
        self.dataset_df = self.dataset_df[
            ["Gene", "Disease", "Y"]
        ].dropna()  # Drop missing values.
        # print(self.dataset_df.head())
        print(
            f"{data_dir}TDC training dataset loaded, found associations: {len(self.dataset_df.index)}"
        )
        self.protein_seqs = self.dataset_df["Gene"].values
        self.disease_dess = self.dataset_df["Disease"].values
        # NOTE(review): every label is forced to 1, discarding the binarized
        # "Y" column — including the negatives sampled above. Confirm this is
        # intentional for the pretraining objective.
        self.scores = len(self.dataset_df["Y"].values) * [1]

    def __getitem__(self, query_idx):
        # Return the aligned (protein sequence, disease description, score).
        protein_seq = self.protein_seqs[query_idx]
        disease_des = self.disease_dess[query_idx]
        score = self.scores[query_idx]
        return protein_seq, disease_des, score

    def __len__(self):
        return len(self.protein_seqs)
class GDA_Pretrain_Dataset(Dataset):
    """
    Candidate Dataset for:
    ALL gene-disease associations

    Loads ``<data_dir>/disgenet_gda.csv``, shuffles it deterministically and
    carves it into train/validation partitions.

    Args:
        data_dir: Directory containing ``disgenet_gda.csv``.
        test: If True, truncate to the first 128 rows (smoke-test mode).
        split: "train" or "val"; any other value keeps the full dataframe.
        val_ratio: Fraction of rows reserved for validation.
    """

    def __init__(self, data_dir="../../data/pretrain/", test=False, split="train", val_ratio=0.2):
        # Same logger as the module-level LOGGER; inlined so the class is
        # self-contained.
        logging.getLogger(__name__).info("Initializing GDA Pretraining Dataset ! ...")
        self.dataset_df = pd.read_csv(f"{data_dir}/disgenet_gda.csv")
        self.dataset_df = self.dataset_df[["proteinSeq", "diseaseDes", "score"]].dropna()
        # Fixed seed so the train/val partition is reproducible across runs.
        self.dataset_df = self.dataset_df.sample(frac=1, random_state=42).reset_index(drop=True)
        num_val_samples = int(len(self.dataset_df) * val_ratio)
        # BUGFIX: the previous `df[:-num_val_samples]` / `df[-num_val_samples:]`
        # slices invert when num_val_samples == 0 (tiny dataset or
        # val_ratio=0): `[:-0]` is EMPTY and `[-0:]` is the FULL frame, so the
        # train split lost everything and "val" got it all. Computing the
        # boundary index explicitly makes the degenerate case correct.
        split_idx = len(self.dataset_df) - num_val_samples
        if split == "train":
            self.dataset_df = self.dataset_df.iloc[:split_idx]
            print(f"{data_dir}disgenet_gda.csv loaded, found train associations: {len(self.dataset_df.index)}")
        elif split == "val":
            self.dataset_df = self.dataset_df.iloc[split_idx:]
            print(f"{data_dir}disgenet_gda.csv loaded, found valid associations: {len(self.dataset_df.index)}")
        if test:
            self.protein_seqs = self.dataset_df["proteinSeq"].values[:128]
            self.disease_dess = self.dataset_df["diseaseDes"].values[:128]
            # Match the actual (possibly < 128) row count instead of a
            # hard-coded 128 so the three columns always stay aligned.
            self.scores = len(self.protein_seqs) * [1]
        else:
            self.protein_seqs = self.dataset_df["proteinSeq"].values
            self.disease_dess = self.dataset_df["diseaseDes"].values
            # All pairs are treated as positives; the "score" column only
            # supplies the count.
            self.scores = len(self.dataset_df["score"].values) * [1]

    def __getitem__(self, query_idx):
        # Return the aligned (protein sequence, disease description, score).
        protein_seq = self.protein_seqs[query_idx]
        disease_des = self.disease_dess[query_idx]
        score = self.scores[query_idx]
        return protein_seq, disease_des, score

    def __len__(self):
        return len(self.protein_seqs)
| # # Separate positive and negative samples | |
| # positive_samples = self.dataset_df[self.dataset_df["score"] == 1] | |
| # negative_samples = self.dataset_df[self.dataset_df["score"] == 0] | |
| # # Shuffle and split the positive samples | |
| # positive_samples = positive_samples.sample(frac=1, random_state=42).reset_index(drop=True) | |
| # num_pos_val_samples = int(len(positive_samples) * val_ratio) | |
| # # Shuffle and split the negative samples | |
| # negative_samples = negative_samples.sample(frac=1, random_state=42).reset_index(drop=True) | |
| # num_neg_val_samples = int(len(negative_samples) * val_ratio) | |
| # if split == "train": | |
| # self.dataset_df = pd.concat([positive_samples[:-num_pos_val_samples], negative_samples[:-num_neg_val_samples]]) | |
| # print(f"{data_dir}disgenet_gda.csv loaded, found associations: {len(self.dataset_df.index)}") | |
| # elif split == "val": | |
| # self.dataset_df = pd.concat([positive_samples[-num_pos_val_samples:], negative_samples[-num_neg_val_samples:]]) | |
| # print(f"{data_dir}disgenet_gda.csv loaded, found associations: {len(self.dataset_df.index)}") | |
| # Shuffle and split data | |
| # class GDA_Pretrain_Dataset(Dataset): | |
| # """ | |
| # Candidate Dataset for: | |
| # ALL gene-disease associations | |
| # """ | |
| # def __init__(self, data_dir="../../data/pretrain/", test=False): | |
| # LOGGER.info("Initializing GDA Pretraining Dataset ! ...") | |
| # updated = pd.read_csv(f"{data_dir}/disgenet_updated.csv") | |
| # data = GDA(name="DisGeNET") | |
| # data = data.get_data() | |
| # data = data[['Gene_ID','Disease_ID']].dropna() | |
| # self.dataset_df = pd.read_csv(f"{data_dir}/disgenet_gda.csv") | |
| # num_unique_diseaseId = self.dataset_df['diseaseId'].nunique() | |
| # num_unique_geneId = self.dataset_df['geneId'].nunique() | |
| # print(f"Number of unique 'diseaseId': {num_unique_diseaseId}") | |
| # print(f"Number of unique 'geneId': {num_unique_geneId}") | |
| # num_of_c0002395 = self.dataset_df[self.dataset_df['diseaseId'] == 'C0002395'].shape[0] | |
| # print(f"Alzheimer Number in 2020:{num_of_c0002395}") | |
| # Convert 'Gene_ID' and 'Disease_ID' to str before merge | |
| # data['Gene_ID'] = data['Gene_ID'].astype(str) | |
| # data['Disease_ID'] = data['Disease_ID'].astype(str) | |
| # Similarly for 'geneId' and 'diseaseId', if they're not already of type 'str' | |
| # self.dataset_df['geneId'] = self.dataset_df['geneId'].astype(str) | |
| # self.dataset_df['diseaseId'] = self.dataset_df['diseaseId'].astype(str) | |
| # # Merge the two DataFrames and find the rows that differ | |
| # merged = df.merge(self.dataset_df, how='outer', indicator=True) | |
| # differences = merged[merged['_merge'] != 'both'] | |
| # differences.to_csv('/nfs/dpa_pretrain/data/pretrain/differences.csv', index=False) | |
| # Check for overlap between TDC dataset and DisGeNET dataset | |
| # merged_df = pd.merge(data, self.dataset_df, how='inner', left_on=['Gene_ID','Disease_ID'], right_on=['geneId','diseaseId']) | |
| # num_matched_pairs = merged_df.shape[0] | |
| # print(f"Number of matched pairs TDC: {num_matched_pairs}") | |
| # merged_dis = pd.merge(data, updated, how='inner', left_on=['Gene','Disease'], right_on=['proteinSeq','diseaseDes']) | |
| # num_matched = merged_dis.shape[0] | |
| # print(f"Number of matched pairs DisGeNET_test: {num_matched}") | |
| # self.dataset_df = self.dataset_df[ | |
| # ["proteinSeq", "diseaseDes", "score"] | |
| # ].dropna() # Drop missing values. | |
| # print(self.dataset_df.head()) "proteinSeq", "diseaseDes", "score" | |
| # print( | |
| # f"{data_dir}disgenet_gda.csv loaded, found associations: {len(self.dataset_df.index)}" | |
| # ) | |
| # df1 = pd.read_csv(f"{data_dir}/disgenet_gda.csv") | |
| # df1 = df1[ | |
| # ["proteinSeq", "diseaseDes", "score"] | |
| # ].dropna() | |
| # # Merge the two DataFrames and find the rows that differ | |
| # merged = df1.merge(self.dataset_df, how='outer', indicator=True) | |
| # differences = merged[merged['_merge'] != 'both'] | |
| # # Save the result to a new file | |
| # differences.to_csv('/nfs/dpa_pretrain/data/pretrain/differences.csv', index=False) | |
| # if test: | |
| # self.protein_seqs = self.dataset_df["proteinSeq"].values[:128] | |
| # self.disease_dess = self.dataset_df["diseaseDes"].values[:128] | |
| # self.scores = 128 * [1] | |
| # else: | |
| # self.protein_seqs = self.dataset_df["proteinSeq"].values | |
| # self.disease_dess = self.dataset_df["diseaseDes"].values | |
| # self.scores = len(self.dataset_df["score"].values) * [1] | |
| # def __getitem__(self, query_idx): | |
| # protein_seq = self.protein_seqs[query_idx] | |
| # disease_des = self.disease_dess[query_idx] | |
| # score = self.scores[query_idx] | |
| # return protein_seq, disease_des, score | |
| # def __len__(self): | |
| # return len(self.protein_seqs) | |
class PPI_Pretrain_Dataset(Dataset):
    """Pretraining dataset over ALL protein-to-protein interactions.

    Loads ``<data_dir>/string_ppi_900_2m.csv`` and serves
    (sequence A, sequence B, score) triples; every score is fixed to 1.
    """

    def __init__(self, data_dir="../../data/pretrain/", test=False):
        LOGGER.info("Initializing metric learning data set! ...")
        frame = pd.read_csv(f"{data_dir}/string_ppi_900_2m.csv")
        frame = frame[["item_seq_a", "item_seq_b", "score"]].dropna()
        if test:
            # Smoke-test mode: keep a random sample of 100 interactions.
            frame = frame.sample(100)
        self.dataset_df = frame
        print(
            f"{data_dir}/string_ppi_900_2m.csv loaded, found interactions: {len(self.dataset_df.index)}"
        )
        self.protein_seq1 = self.dataset_df["item_seq_a"].values
        self.protein_seq2 = self.dataset_df["item_seq_b"].values
        # All pairs are treated as positives; the score column only sets the count.
        self.scores = [1] * len(self.dataset_df["score"].values)

    def __getitem__(self, query_idx):
        # Return the aligned (sequence A, sequence B, score) triple.
        return (
            self.protein_seq1[query_idx],
            self.protein_seq2[query_idx],
            self.scores[query_idx],
        )

    def __len__(self):
        return len(self.protein_seq1)
class PPI_Dataset(Dataset):
    """Candidate dataset over ALL protein-to-protein interactions.

    Indexable wrapper around three parallel sequences of protein pairs
    and their scores.
    """

    def __init__(self, protein_seq1, protein_seq2, score):
        self.protein_seq1 = protein_seq1
        self.protein_seq2 = protein_seq2
        self.scores = score

    def __getitem__(self, query_idx):
        # Return the aligned (sequence A, sequence B, score) triple.
        return (
            self.protein_seq1[query_idx],
            self.protein_seq2[query_idx],
            self.scores[query_idx],
        )

    def __len__(self):
        return len(self.protein_seq1)
class DDA_Dataset(Dataset):
    """Candidate dataset over ALL disease-to-disease associations.

    Indexable wrapper around three parallel sequences of disease
    descriptions and labels.
    """

    def __init__(self, diseaseDes1, diseaseDes2, label):
        self.diseaseDes1 = diseaseDes1
        self.diseaseDes2 = diseaseDes2
        self.label = label

    def __getitem__(self, query_idx):
        # Return the aligned (description 1, description 2, label) triple.
        return (
            self.diseaseDes1[query_idx],
            self.diseaseDes2[query_idx],
            self.label[query_idx],
        )

    def __len__(self):
        return len(self.diseaseDes1)
class DDA_Pretrain_Dataset(Dataset):
    """Pretraining dataset over ALL disease-to-disease associations.

    Loads ``<data_dir>disgenet_dda.csv`` (``data_dir`` must carry its own
    trailing path separator) and serves (description 1, description 2,
    score) triples; every score is fixed to 1.
    """

    def __init__(self, data_dir="../../data/pretrain/", test=False):
        LOGGER.info("Initializing metric learning data set! ...")
        frame = pd.read_csv(f"{data_dir}disgenet_dda.csv").dropna()  # Drop missing values.
        if test:
            # Smoke-test mode: keep a random sample of 100 associations.
            frame = frame.sample(100)
        self.dataset_df = frame
        print(
            f"{data_dir}disgenet_dda.csv loaded, found associations: {len(self.dataset_df.index)}"
        )
        self.disease_des1 = self.dataset_df["diseaseDes1"].values
        self.disease_des2 = self.dataset_df["diseaseDes2"].values
        # All pairs are treated as positives; the column only sets the count.
        self.scores = [1] * len(self.dataset_df["jaccard_variant"].values)

    def __getitem__(self, query_idx):
        # Return the aligned (description 1, description 2, score) triple.
        return (
            self.disease_des1[query_idx],
            self.disease_des2[query_idx],
            self.scores[query_idx],
        )

    def __len__(self):
        return len(self.disease_des1)