Upload new model safetensors with trained LMHead

#3
This view is limited to 50 files because it contains too many changes.  See the raw diff here.
Files changed (50) hide show
  1. .gitattributes +43 -1
  2. .gitignore +0 -24
  3. README.md +33 -37
  4. __init__.py +0 -0
  5. benchmarks/.DS_Store +0 -0
  6. .DS_Store → benchmarks/Generation/.DS_Store +0 -0
  7. benchmarks/Generation/ProtGPT2/protgpt2_finetune.py +70 -0
  8. benchmarks/Generation/ProtGPT2/protgpt2_generate.py +55 -0
  9. benchmarks/Generation/ProtGPT2/protgpt2_generated_sequences.csv +101 -0
  10. benchmarks/Generation/ProtGPT2/protgpt2_test.txt +0 -0
  11. benchmarks/Generation/ProtGPT2/protgpt2_train.txt +0 -0
  12. benchmarks/Generation/ProtGPT2/run_clm.py +657 -0
  13. benchmarks/Generation/Visualize/analyze_mdlm_denovo_gen.py +7 -0
  14. benchmarks/Generation/Visualize/esm_umap.png +0 -0
  15. benchmarks/Generation/Visualize/esm_umap.py +111 -0
  16. benchmarks/Generation/Visualize/mdlm_de-novo_generation_results.csv +101 -0
  17. benchmarks/MLM/config.py +14 -0
  18. benchmarks/MLM/data_loader.py +48 -0
  19. benchmarks/MLM/esm_utils.py +16 -0
  20. benchmarks/MLM/mlm_generate_utils.py +108 -0
  21. benchmarks/MLM/mlm_lowercase_results.csv +0 -0
  22. benchmarks/MLM/mlm_motif_benchmarking.py +39 -0
  23. benchmarks/MLM/mlm_uppercase_results.csv +0 -0
  24. benchmarks/MLM/model.py +65 -0
  25. benchmarks/MLM/pretrained_models.py +12 -0
  26. benchmarks/MLM/screen_mlm_cosine_hamming.py +17 -0
  27. benchmarks/MLM/train_and_test.py +184 -0
  28. benchmarks/Supervised/.DS_Store +0 -0
  29. benchmarks/Supervised/Localization/cell_localization_predictor.py +224 -0
  30. benchmarks/Supervised/Localization/process_cell_local_data.py +12 -0
  31. benchmarks/Supervised/Localization/true_deeploc2.0_cell-local_test.csv +0 -0
  32. memdlm_schematic.png → benchmarks/Supervised/Localization/true_deeploc2.0_cell-local_train-val.csv +2 -2
  33. benchmarks/Supervised/Membrane Type/membrane_type_predictor.py +226 -0
  34. benchmarks/Supervised/Membrane Type/membrane_type_test.csv +0 -0
  35. benchmarks/Supervised/Membrane Type/membrane_type_train.csv +3 -0
  36. benchmarks/Supervised/Membrane Type/split_membrane_type_data.py +15 -0
  37. benchmarks/Supervised/Membrane Type/unsplit_membrane_type_all.csv +3 -0
  38. benchmarks/Supervised/Solubility/solubility_transformer.py +353 -0
  39. checkpoints/.DS_Store +0 -0
  40. config.json +30 -0
  41. config.yaml +127 -0
  42. data/.DS_Store +0 -0
  43. data/membrane/test.csv +0 -0
  44. data/membrane/train.csv +0 -0
  45. data/membrane/val.csv +0 -0
  46. data/uniref/100k_seqs/check_data.ipynb +168 -0
  47. data/uniref/100k_seqs/test.csv +0 -0
  48. data/uniref/100k_seqs/train.csv +3 -0
  49. data/uniref/100k_seqs/val.csv +0 -0
  50. data/uniref/200k_seqs/check_data.ipynb +168 -0
.gitattributes CHANGED
@@ -1 +1,43 @@
1
- memdlm_schematic.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ benchmarks/DeepLoc/cell_localization_train_val.csv filter=lfs diff=lfs merge=lfs -text
37
+ benchmarks/DeepLoc/membrane_type_train.csv filter=lfs diff=lfs merge=lfs -text
38
+ benchmarks/DeepLoc/OG_membrane_type_all.csv filter=lfs diff=lfs merge=lfs -text
39
+ data/uniref/100k_seqs/train.csv filter=lfs diff=lfs merge=lfs -text
40
+ data/uniref/200k_seqs/train.csv filter=lfs diff=lfs merge=lfs -text
41
+ benchmarks/Supervised/Localization/true_deeploc2.0_cell-local_train-val.csv filter=lfs diff=lfs merge=lfs -text
42
+ benchmarks/Supervised/Membrane[[:space:]]Type/membrane_type_train.csv filter=lfs diff=lfs merge=lfs -text
43
+ benchmarks/Supervised/Membrane[[:space:]]Type/unsplit_membrane_type_all.csv filter=lfs diff=lfs merge=lfs -text
.gitignore DELETED
@@ -1,24 +0,0 @@
1
- # .gitignore
2
-
3
- /checkpoints/
4
- /data/
5
- /results/
6
- /build/
7
- /src/scripts/
8
- /src/benchmarks
9
-
10
- /src/lm/dplm
11
- /src/lm/evodiff
12
- /src/lm/dplm_playground.ipynb
13
- /src/lm/evoflow_playground.ipynb
14
- /src/utils/ubuntu_font
15
-
16
- /src/sampling/old_guidance.py
17
-
18
- /MeMDLM_v2.egg-info/
19
- *.pth
20
- *.ckpt
21
- *.err
22
- *.out
23
- *.csv
24
- __pycache__/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,51 +1,47 @@
1
  ---
2
  license: cc-by-nc-nd-4.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
4
 
5
- <h1 align='center'>Token-Level Guided Discrete Diffusion for Membrane Protein Design</h1>
6
 
7
- <div align="center">
8
- <a href="https://shreygoel09.github.io/" target="_blank">Shrey Goel</a><sup>1</sup>&ensp;<b>&middot;</b>&ensp;
9
- <a href="https://www.linkedin.com/in/perin-schray-96855a32b/" target="_blank">Perin Schray</a><sup>2</sup>&ensp;<b>&middot;</b>&ensp;
10
- <a href="https://www.linkedin.com/in/yinuozhang98/" target="_blank">Yinuo Zhang</a><sup>3</sup>&ensp;<b>&middot;</b>&ensp;
11
- <a href="https://www.linkedin.com/in/sophia-vincoff-185192146/" target="_blank">Sophia Vincoff</a><sup>4</sup>&ensp;<b>&middot;</b>&ensp;
12
- <a href="https://www.linkedin.com/in/htkratochvil/" target="_blank">Huong T. Kratochvil</a><sup>2</sup>&ensp;<b>&middot;</b>&ensp;
13
- <a href="https://www.chatterjeelab.com/" target="_blank">Pranam Chatterjee</a><sup>4<sup>
14
- <br>
15
- <p style="font-size: 16px;">
16
- <sup>1</sup> Duke University &emsp;
17
- <sup>2</sup> UNC—Chapel Hill &emsp;
18
- <sup>3</sup> Duke-NUS Medical School &emsp;
19
- <sup>4</sup> University of Pennsylvania &emsp;
20
- </div>
21
-
22
- <div align="center">
23
- <a href="https://arxiv.org/abs/2410.16735"><img src="https://img.shields.io/badge/Arxiv-2506.09007-red?style=for-the-badge&logo=Arxiv" alt="arXiv"/></a>
24
 
25
- </div>
26
 
 
27
 
 
28
 
 
 
29
 
30
- ![MemDLM diagram](./memdlm_schematic.png)
 
31
 
 
32
 
33
- Reparameterized diffusion models (RDMs) have recently matched autoregressive methods in protein generation, motivating their use for challenging tasks such as designing membrane proteins, which possess interleaved soluble and transmembrane (TM) regions.
 
34
 
35
- We introduce ***Membrane Diffusion Language Model (MemDLM)***, a fine-tuned RDM-based protein language model that enables controllable membrane protein sequence design. MemDLM-generated sequences recapitulate the TM residue density and structural features of natural membrane proteins, achieving comparable biological plausibility and outperforming state-of-the-art diffusion baselines in motif scaffolding tasks by producing:
 
36
 
37
- - Lower perplexity
38
- - Higher BLOSUM-62 scores
39
- - Improved pLDDT confidence
40
-
41
- To enhance controllability, we develop ***Per-Token Guidance (PET)***, a novel classifier-guided sampling strategy that selectively solubilizes residues while preserving conserved TM domains. This yields sequences with reduced TM density but intact functional cores.
42
-
43
- Importantly, MemDLM designs validated in TOXCAT β-lactamase growth assays demonstrate successful TM insertion, distinguishing high-quality generated sequences from poor ones.
44
-
45
- Together, our framework establishes the first experimentally validated diffusion-based model for rational membrane protein generation, integrating *de novo* design, motif scaffolding, and targeted property optimization.
46
-
47
-
48
-
49
- ## **Repository Authors**
50
- - <u>[Shrey Goel](https://shreygoel09.github.io/)</u> – undergraduate student at Duke University
51
- - <u>[Pranam Chatterjee](mailto:pranam@seas.upenn.edu)</u> – Assistant Professor at University of Pennsylvania
 
1
  ---
2
  license: cc-by-nc-nd-4.0
3
+ extra_gated_fields:
4
+ Name: text
5
+ Company: text
6
+ Country: country
7
+ Specific date: date_picker
8
+ I want to use this model for:
9
+ type: select
10
+ options:
11
+ - Research
12
+ - Education
13
+ - label: Other
14
+ value: other
15
+ I agree to share generated sequences and associated data with authors before publishing: checkbox
16
+ I agree not to file patents on any sequences generated by this model: checkbox
17
+ I agree to use this model for non-commercial use ONLY: checkbox
18
+ base_model:
19
+ - facebook/esm2_t30_150M_UR50D
20
+ pipeline_tag: fill-mask
21
  ---
22
 
23
+ # MeMDLM: De Novo Membrane Protein Design with Masked Diffusion Language Models
24
 
25
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/65bbea9a26c639b000501321/uWW6xnJZwQFWDS1QZNQTm.png)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ Masked Diffusion Language Models (MDLMs), introduced by Sahoo et al. ([arXiv:2406.07524](https://arxiv.org/abs/2406.07524)), add strong generative capabilities to BERT-style models. In this work, we pre-train and fine-tune ESM-2-150M on the MDLM objective so that it can both scaffold functional motifs and unconditionally generate realistic, high-quality membrane protein sequences.
28
 
29
+ ## Model Usage
30
 
31
+ The MDLM model leverages an internal backbone model, which is a fine-tuned version of ESM-2 (150M). This backbone model can be loaded directly from this repo:
32
 
33
+ ```python
34
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the fine-tuned ESM-2 backbone and its tokenizer from the Hub.
tokenizer = AutoTokenizer.from_pretrained("ChatterjeeLab/MeMDLM")
model = AutoModelForMaskedLM.from_pretrained("ChatterjeeLab/MeMDLM")

input_sequence = "QMMALTFITYIGCGLSSIFLSVTLVILIQLCAALLLLNLIFLLDSWIALYnTRGFCIAVAVFLHYFLLVSFTWMGLEAFHMYLKFCIVGWGIPAVVVSIVLTISPDNYGidFCWINSNVVFYITVVGYFCVIFLLNVSMFIVVLVQLCRIKKKKQLGDL"

inputs = tokenizer(input_sequence, return_tensors="pt")
output = model(**inputs)

# The forward pass returns an output object holding per-position vocabulary
# logits, not token ids, so pick the most likely token at every position
# before decoding.
predicted_ids = output.logits.argmax(dim=-1).squeeze(0)

# NOTE(review): the decoded string presumably still contains the tokenizer's
# special tokens and separators -- confirm and strip them if a bare sequence
# is needed.
filled_protein_seq = tokenizer.decode(predicted_ids) # contains the output protein sequence with filled mask tokens
45
+ ```
46
 
47
+ This backbone model can be integrated with the [MDLM formulation](https://github.com/kuleshov-group/mdlm) by setting the model backbone type to "hf_dit" and the Hugging Face model ID to "ChatterjeeLab/MeMDLM".
 
 
 
 
 
 
 
 
 
 
 
 
 
 
__init__.py DELETED
File without changes
benchmarks/.DS_Store ADDED
Binary file (6.15 kB). View file
 
.DS_Store → benchmarks/Generation/.DS_Store RENAMED
Binary files a/.DS_Store and b/benchmarks/Generation/.DS_Store differ
 
benchmarks/Generation/ProtGPT2/protgpt2_finetune.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import subprocess
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM
5
+
6
+
7
# Format sequence inputs based on ProtGPT fine-tuning requirements
def modify_sequences(sequence, line_width=60, separator="<|endoftext|>"):
    """Return ``sequence`` in the text layout written to the fine-tuning files.

    The sequence is upper-cased, split into consecutive chunks of at most
    ``line_width`` characters joined by newlines, and preceded by
    ``separator`` on a line of its own.

    Args:
        sequence: Sequence string to format.
        line_width: Maximum number of characters per line. Defaults to 60,
            the width that was previously hard-coded.
        separator: Token placed on the first line. Defaults to
            ``"<|endoftext|>"``.

    Returns:
        The formatted string. An empty ``sequence`` yields the separator
        followed by a single newline.

    Raises:
        ValueError: If ``line_width`` is smaller than 1.
    """
    if line_width < 1:
        raise ValueError("line_width must be a positive integer")

    upper = sequence.upper()
    wrapped = '\n'.join(
        upper[start:start + line_width]
        for start in range(0, len(upper), line_width)
    )
    return separator + "\n" + wrapped
16
+
17
# Function to save sequences to txt files
def to_txt_file(df, filename):
    """Write every entry of ``df['Sequence']`` to ``filename``, one per line.

    Args:
        df: Object whose ``'Sequence'`` item is an iterable of strings
            (the callers in this script pass a pandas DataFrame).
        filename: Path of the text file to create or overwrite.

    Raises:
        TypeError: If an entry is not a string (it cannot be concatenated
            with the trailing newline).
        OSError: If the file cannot be opened for writing.
    """
    # Pin the encoding so the output does not depend on the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(sequence + '\n' for sequence in df['Sequence'])
22
+
23
+
24
# Modify the sequences
import sys  # needed only to launch run_clm.py with the interpreter running this script

path = "/workspace/sg666/MDpLM"

# Every ProtGPT2 artefact (training script, data files, checkpoints) lives in
# this one directory. Build each location from it so that the files written
# below are exactly the files handed to run_clm.py, whatever the current
# working directory is.
protgpt2_dir = os.path.join(path, "benchmarks", "Generation", "ProtGPT2")
train_txt = os.path.join(protgpt2_dir, "protgpt2_train.txt")
test_txt = os.path.join(protgpt2_dir, "protgpt2_test.txt")

train = pd.read_csv(path + "/data/membrane/train.csv")
val = pd.read_csv(path + "/data/membrane/val.csv")
test = pd.read_csv(path + "/data/membrane/test.csv")

# The validation split is folded into the training data; the test split is
# the file passed to run_clm.py as --validation_file.
train = pd.concat([train, val])

train['Sequence'] = train['Sequence'].apply(modify_sequences)
test['Sequence'] = test['Sequence'].apply(modify_sequences)


# Save the modified sequences as txt files
to_txt_file(train, train_txt)
to_txt_file(test, test_txt)


# NOTE(review): neither object is used below -- run_clm.py loads the model by
# name itself. Presumably this call only warms the local Hugging Face cache;
# confirm before removing it.
tokenizer = AutoTokenizer.from_pretrained("nferruz/ProtGPT2")
model = AutoModelForCausalLM.from_pretrained("nferruz/ProtGPT2")

finetune_protgpt2_command = [
    sys.executable, os.path.join(protgpt2_dir, "run_clm.py"),
    "--model_name_or_path", "nferruz/ProtGPT2",
    "--train_file", train_txt,
    "--validation_file", test_txt,
    "--tokenizer_name", "nferruz/ProtGPT2",
    "--num_train_epochs", "10",
    "--logging_steps", "1",
    "--logging_dir", "test",
    "--do_train",
    "--do_eval",
    "--output_dir", os.path.join(protgpt2_dir, "finetuned_models"),
    "--overwrite_output_dir",
    "--learning_rate", "3e-04",
    "--per_device_train_batch_size", "2",
    "--evaluation_strategy", "epoch",
]

try:
    result = subprocess.run(finetune_protgpt2_command, check=True, text=True, capture_output=True)
except subprocess.CalledProcessError as e:
    print("Command failed with the following error:")
    print(e.stderr)  # Print standard error output
    print("Command output:")
    print(e.stdout)  # Print standard output if needed
    # Propagate the failure so callers and job schedulers do not see exit status 0.
    raise SystemExit(e.returncode) from e
70
+
benchmarks/Generation/ProtGPT2/protgpt2_generate.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ import math
4
+ import torch
5
+ import sys
6
+ import pandas as pd
7
+
8
# Function to calculate perplexity of each generated sequence
def calculate_perplexity(sequence, model, tokenizer):
    """Return the perplexity ``model`` assigns to ``sequence``.

    The sequence is wrapped in ``<|endoftext|>`` on both sides, encoded with
    ``tokenizer.encode``, and passed to ``model`` as both input and labels.
    The first element of the model output is taken as the loss and
    exponentiated.

    Args:
        sequence: Text to score.
        model: Callable as ``model(input_ids, labels=input_ids)`` whose output
            has the loss at index 0.
        tokenizer: Object providing ``encode(text)`` returning token ids.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    sequence = "<|endoftext|>" + sequence + "<|endoftext|>"
    input_ids = torch.tensor(tokenizer.encode(sequence)).unsqueeze(0)

    # Put the ids on whatever device the model's weights are on. The previous
    # version read a module-level ``device`` that only exists when this file is
    # run as a script, so importing the function and calling it failed.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    loss = outputs[0]
    return math.exp(loss.item())
17
+
18
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    path = "/workspace/sg666/MDpLM/benchmarks/Generation/ProtGPT2"

    # Load fine-tuned model and tokenizer once, and move the model to the
    # target device once instead of on every loop iteration.
    model_path = path + "/finetuned_models/checkpoint-4510"
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Generate sequences. The pipeline reuses the objects loaded above rather
    # than reading the checkpoint from disk a second time.
    protgpt2 = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device)
    sequences = protgpt2("", max_length=100, do_sample=True, top_k=950, repetition_penalty=1.5, num_return_sequences=100, eos_token_id=0)

    # Store generated sequences and their associated perplexities.
    # Perplexity is computed on the raw generated text, before cleaning.
    generated_sequences = [item['generated_text'] for item in sequences]
    perplexities = [
        calculate_perplexity(raw_sequence, model, tokenizer)
        for raw_sequence in generated_sequences
    ]

    # Clean the generated sequences
    cleaned_sequences = [seq.replace('\n', '').replace('<|endoftext|>', '') for seq in generated_sequences]

    # Create df with cleaned sequences and perplexities
    df = pd.DataFrame({"Sequence": cleaned_sequences, "Perplexity": perplexities})
    df.sort_values(by='Perplexity', inplace=True)

    # Save results
    df.to_csv(path + "/protgpt2_generated_sequences.csv", index=False)

    # View the average de novo generation perplexity
    avg_generation_ppl = df.loc[:, 'Perplexity'].mean()
    print(f'Average de novo generation perplexity: {avg_generation_ppl}')
benchmarks/Generation/ProtGPT2/protgpt2_generated_sequences.csv ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Sequence,Perplexity
2
+ LAPSVVTGVAQSSPLTIVTNPKEPRQPVPASDGADYLKTIPGFAVIRNGGSNGDPVLRGMFGSRLNILTNGGMMLGACPNRMDAPTSYISPETYDKLTVIKGPQTVLWGPGASAGTILFEREPERFGELGSRVNASLLAGSNGRFDKVLDAAAGNRLGYLRFTGNHAQSDDYEDGAGNTVPSRWKKWNGDVAVGWTPDEDTLIELTAGKGDGEARYAGRGMDGSQFKRESLGLRFVKSNVSDVLEKVEAQVYYNYADHIMDNFRLRTPDPSMPMT,2.6532732777535712
3
+ MPNFFIDRPIFAWVIAIIIMLAGGLAILKLPVAQYPTIAPPAVTISASYPGADAKTVQDTTVQIIEQNLNGLDNLLYMSSTSDDSGNATITITFAPGTNPDIAQVQVQNKLSLATPILPQAVQRQGVSVEKSSSSFLMVVGVINTDGTMTQEDISDYVAANMKDAISRTSGVGDVQLFGSQYAMRIWMNPNELNKFQLTPVDVITAIKAQNAQVAAGQLGGTPPVKGQQLNASIIAQTRLTSTEEFGKILLKVNQDGSRVLLRDVAKIELGGENYDIIAEFNGQPASGLGIKLATG,2.829348107084168
4
+ MAYRSTTLLALLALVLLYLVSGALVFRALEQPHEQQAQRELGEVREKFLRAHPCVSDQELGLLIKEVADALGGGADPETQSTSAWDLGSAFFFSGTIITTIGYGNVALRTDAGRLFCIFYAAXFGIPFTLLFLTAVGDRLGSSLRHGIGHIEAIFLKWHVPPELVRVLSEMLFLLVGCLLFVLTPTFVFCYMEDWSKLEAIYFVIVTLTTVGFGDYVAGADPRQDSPAYQPLVWFWILLGLAYFASVSAML,3.119025307842878
5
+ MPNFFIDRPIFAWVIAIIIMLAGGLAILKLPVAQYPTIAPPAVTISASYPGADAKTVQDTTVQIIEQQMNGLDGLRYISSNSAGNGQASIQLNFEQGVDPDIAQVQVQNKLQLAMPLLPQAVKEQGVSVEKSSSSFLMVVGVINTDGTMTQEDISDYVAANMKDAISRTSGVGDVQLFGSQYAMRIWMNPNELNKFQLTPVDVITAIKAQNAQVAAGQLGGTPPVKGQQLNASIIAQTRLTSTEEFGKILLKVNQDGSRVLLRDVAKIELGGENYDIIAEFNGQPASGLG,3.775355043694786
6
+ LFLTMAEAQLRYKTTEECLAYFGVSETTGLTPDQVKRHLEKYGHNELPAEEGKSLWELVIEQFEDLLVRILLLAACISFVLAWFEEGEETVTAFVEPFIILLILIANAIVGVWQERNAENAIEALKEYEPEMGKVYRADRKSVQRIKARDIVPGDIVEVAVGDKVPADIRILSIKSTTLRVDQSILTGESVSVAKSSDAVPDPRAVNQDKKNMLFSGTNIAAGKALGIVATTGVSTEIGKIRDQMAATEQDKTPLQQKLDEFGEQLSKVISLICIAVWLINIGHFNDPVHGGSWI,4.136701078251139
7
+ MPNFFIDRPIFAWVIAIIIMLAGGLAILKLPVAQYPTIAPPAVTISASYPGADAKTVQDTTVQIIEQNMNGIDNLMYMSSNSDSTGTAQITLTFESGTDADIAQVQVQNKLQLAMPLLPQAVQQQGVSVEKSSSSFLMVVGVINTDGTMTQEDISDYVAANMKDAISRTSGVGDVQLFGSQYAMRIWMNPNELNKFQLTPVDVITAIKAQNAQVAAGQLGGTPPVKGQQLNASIIAQTRLTSTEEFGKILLKVNQDGSRVLLRDVAKIELGGENYDIIAEFNGQPASGLG,4.210716900525416
8
+ MPNFFIDRPIFAWVIAIIIMLAGGLAILKLPVAQYPTIAPPAVTISASYPGADAKTVQDTTVQIIEQQMNGLDGLRYISSNSAGNGQASIQLTFESGTDADIAQVQVQNKLQLAMPLLPQEVAQQGVSVEKSSSSFLMVVGVINTDGTMTQEDISDYVAANMKDAISRTSGVGDVQLFGSQYAMRIWMNPNELNKFQLTPVDVITAIKAQNAQVAAGQLGGTPPVKGQQLNASIIAQTRLTSTEEFGKILLKVNQDGSRVLLRDVAKIELGGENYDIIAEFNGQPAS,4.526996795741569
9
+ MLKIIIPTTMLLPMTWMSKHNMIWINATVHSLLISLISLSLLNQLGENSLNFSLTFFSDSLSAPLLVLTTWLLPLMLMASQSHLSKETTTRKKLYITMLLILLQLFLIMTFTATELILFYIFESASLPTLLMITRWGNQTERLNAGLYFLMYTLAGSLPLLVALVYIQNTTGSLNFLIIHWSTHTSASFVSQTLLLMAWMAAMAVMAKMPLYGVHLWLPKAHVEAPIAGSMVLAAVLLKLGGYGMLRITTILNPLTNYMAYPFLMLCLWGMI,4.629232424547782
10
+ AKFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSLLIRAELGQPGTLLGDDQIYNVVVTGHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGCGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMSQYQTPLFVWSVMITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGHPXVLILILPFFGIVTEASAIPRIFNWMVTFHGQLMYHHMWIIGVL,5.0608380016313275
11
+ LVEKDPIKTSFEKWAKPGHFSRTLAKGPNTTTWIWNLHADAHDFDSYTSDLEEISRKVFSAHFGHLAVVTIWLSGMIFHGAKFSNYEAWLSDPLNVRPSAQVVWPLVGQDILNGDMGDGTYNGFQVMTSGLFQLWRASGITNEYQLYCTAIGGLVMAALMLFAGWFHYHKAAPKLAWFQDVETALNHHLSGLLGLGCLSWAGHQIHVSLPVNKLLDAGVAAKDIPLPHEFILDPAKFASLLPGLTQGLTPFFTLNWSEYSDFLTFKGGLNPVTGGL,5.597917119515088
12
+ MVRKVYVTLQGKVQGVFFRAHTQATAKQLGVVGWVRNTSDGTVEGEAQGPADKVDEMINWLHRGPPQAQIESHEFNSEKKELEAFSSFHIRY,5.635017933300935
13
+ EFGFWEIKFPEYLKGRPTTGRPEWVQDVDLVNKWAVPGLNPPHHFSPPVNLTGVEDTLPVSWVMVSMVVGFVLIVATAGNILVIIAVFTSRALKAPQNLFLVSLASADILVATLVIPFAMANEVMGYWYFGKAWCEIYLALDVLFCTSSAWHLCAISLDRYWSITQAIEYNLKRTPRRTKAIIITVWVISAVISFPPRCEINDQKWYYVISSCIGSFFAPCLIMILVYVRIYQIAKRRTRDLSRKSGRPSLLSEVHAAKSLAIL,6.370992471309986
14
+ MVYVSRISVFAFLGALASVAYGQVTPPNFGTEQDRVNFTKQIVPVLKEKCVVCHGPDKTKGKLRLDLRIEAFKGGESGESIDVIPGDPENSELLERITSKDPEFRMPPKSEHKPLTEAEIALLKQWILEGAKYDPAWAFTPPKRTDLPKVKRDEWAKNDVDRFILAKLESEGLTPNPEADKATLIRRVTLDLTGLPPTPAEVDAFLADKSPNAYEKVVDRLLASPHFGERWGRHWLDVARWAESNGFERNTIRNIWSYRDWVIKALNDDVPYDQFTVEQL,7.0499259667086145
15
+ SSNAKTVLITGGTGFVGRALVKRLLSTTKHTIVVPYREEADLHDVKVLQVKGDLRDAASLDAAFEGVDCVFHLASYGMSGPEMFELNVEGTRNVVEACLRHGVRRLIHVSSIAVMGEPSDHPRREADESLPARQATAYAKSKVEAERIVLEANGSDGLETVVVRPPMVWGPGDTQFLPRLVRMARRGLRPVIGNGKSLVSMVYIDNLVDGLIAAMDHPEARGKTYFLSNDGHASQREFIETVARAIGRPAPKLTLPVPVLYWAARLLG,7.429969652690046
16
+ SPELIEQLLQNYLQLPDAEKRKVADQLQTSNIRYCYLLASEKGWLDRVESCLAAEGCDVLQPDHTGRNLLQVVASVSPDHTARLIRALLARGADVHAQDSLGNTVLHILILQPNKTFACQMYNEILILGAKLCPTVNLEAVLNHQGLTPFKLAGVEGNTVMFQHLMQKRKHVQWTCGPLTSTLYDLTEIDSSGDDQSLLELIVTTKKREARQILEQTPVKELVSLKWKRYGRPYFCVLGAIYILYIICFTMCCVYRPLKPRITNRTNPRDNMTSLEL,7.910941817905356
17
+ ADVNLNARDLHGMTPLHLAAKNGHDKVVQLLLKKGALVNIQDKLGSTPLLEAIRGRREDTVKLLVEHGADIRAQDSLGNTVLHILILQPENSTSLKFAEMLYDMILLRSGTWELETTQPNDGLTALQLAAKMGKAEILKYILSREIKEKPLRSLSRKFTDWAYGPVSSSLYDLTNVDSSGNTVLHAMIMVADNTPQNSRFVKQMYNLLLSKGARLCPNVPNHQGLTPFKLAGVEGNIVMQEILRGTTISIPFTCITCGKKDTRFRGMSCEN,8.179497248919981
18
+ DPFNNFFRRSKIAVCGLVFFVLFIIYMVLGSMIFSAIERDHEQQAQRELGEVREKFLISHPCVSDQELGVLIEEVADALGGGADPETQSTSAWDLGSAFFFSGTIITTIGYGNVALRTDTMGRLFCIFYALVGIPLFGILLAGVGDRLGSSLRHGIGHIEAIFLKWHVPPGLVRVLSAMLFLAIGCLLFVTLPAYVFSHMEDWSKLEAIYFVIVTLTTVGFGDYVAGADPRQDSPQYQPLVWFWILL,8.306921086116862
19
+ GPQSFVHFTKQSLALIEQRIAERKSKEPKPSSDLEAGKQLPFIYGDIPPGMVSEPLEDLDPYYADKKTFIVLNKGKTIFRFNATPALYMLSPFSPLRRISIKILVHSLFSMLIMCTILTNCIFMTMNNPPDWTKNVEYTFTGIYTFESLVKILARGFCVGEFTFLRDPWNWLDFVVIVFAYVTEFVVAEFVSFSALRAFRVLRALKTISVIPGLKTIVGALIQSVKKLSDVMILTVFCLSVFALIGLQLFMGNLRKKCFFPDG,8.471762198050271
20
+ MLKIIIPTTMLLPMTWMSKHNMIWINATVHSLLISLISLSLLNQLGENSLNFSLTFFSDSLSAPLLVLTTWLLPLMLMASQSHLSKETTTRKKLYITQLILLQLFLIMTFTATELILFYIFESATLLPTLLIILRWGYQPERLQAGLYFLFYTLIGGVLVLLSILMIYVNTNSLLIHTLPMFNSTMETSLYTKIMWFACMMAFPTKMGLFPIHMWLPVVHSESPLAGSCILAGILLKLGGYGMMRVVTILNPLTNYMAYPFLML,8.583127806228307
21
+ MVLRLVVLALLCWTPGLWAQQADTLTLDEVVVTATRSEQNLQDVPASVSVITAEDLQRQAPRTLGEALRYVPGVFLDGTGRTNGQDINMRGYDHRGVLVLVDGIRQGTDTGHLNGTFLDPALIKRVEIVRGPSAALYGNGAAGGVVNFITRQPSDQLTGSVRLNTSLPQHDGDNSQQFYSLMAGNRLGEEGKLGMLASFSRQEKGQARDGAGNDIASLDEDSLSGKLLWQLTPEQQLDFSLDHYRFKTNAPHNPVNTDFTRHTRQESDSTVRRFFNQVQ,10.282136779067205
22
+ RPLVAIDFGTTYSGYAFSFKNQPETITLHWNSEISKALRKPTVLLIDSNMKEVAFGYEAENKFATLALDAEEKHFFFEKFKMALYDKNDRSILPSMRSANGTEKKAIDVFAEAIRYFKDHALKTINSTYPIDKQDLLWSVTVPSDWDARSKEFMRQAAVKAGLGEASLASEPEAASMYCVEHEVNKFGDEIKSGTKFLVVDVGGGTVDITVHEVLENNHLKELYKASGGPYGSVGIDQEFMKLFQLIVGAEAIEQFKIK,11.589466291126676
23
+ MKVSVIIPTYNERENLEELFSRIDQALQGLNYEIVVVDDDSPDRTWEKAQELSSKYPIKVCRRTKEKGLSSAVIRGFKEASGDVFVVMDADLQHPPEVIPKLIEAIKNGSDIAIGSRYVKGARVENWPFYRKLISKGALVVTKIPLKDLKDMRDFACGFIAIKREVIEKIEFDENLTYGKILKILKYCWGGFSKVVEVPFTFGIRARGESKLKGKTIFEYLRHIWSLNYTFFRILKLIFALGFTFFGVSLAYLTLVLMEKYFLWYIPGWAN,12.090375297427133
24
+ PGMQLNEFSSSGLGRAYSGEGAIADDAGNVSRNPALITMFDRPTFSAGAVYIDPDVNISGNSPLGAPGGTPSDREMKLVPTSHIALPINDRLAFGFAAYSNFGLATDYGDTFVGSTTPTDLEMKLNSLSIGGNAEITDQLSFGASITYQRAKIERFAGDLGQLVAGQIMQSPAGQTQQALLQAQSQGNLGSALAYANGIDSNTKIAHLNGNQWGYGWNAGILYELDKNNRYALTYRSEVKMTFKGNYSNDMPGYYEMNVPAWHNVSLYHE,12.173339409793382
25
+ DASRVYYEDRSVVKEDGSVVKEGPFDLQSTLTLSGVVRDYASGTPLADAEITLTGPAFRAHTNSYGKFVFEGLAAGTYTLSVSRFGYEPVSETIAVSAGQTVESNVALFALASEVEILEVTADADPVFNTGDVATSVGTREMKEIPTVVGDVDVIKSLQLLPGVASAGEGTSGFYVRGGGIDQNLYLLDNIPVYNVGHLFGFFSTFNSDAIKDVTLYKGGVPARYGGRLSSVLDITMKEGNSDKLSGTASIGLLPASAKLQGPI,12.228122271950522
26
+ GAVIDLSTATFDFGGSYTGVAVGDTITAVVTAPTEDDYVFQWFKDNVLQSGATGNSYTLTAAEAGKAIKVVVSGSKSGYTSTAKTAAVTTAITASSLTLTADKTKLTVGDTVTLTASLSDKNGNAVTGRTVKWSSSNTAVATVSSSGLVTGVAAGSATITASAEGQNGNGTANITVVAASVSSISLSPASASVAVGATQQFTASGYDSSGNVVTSGRVVTWASSNTSVATVSASGLVTAVAAGTATITVTSGGKSGNATVTVTAATLSSLSVSSSNL,12.23423450162324
27
+ MQTYNNPEVTYDWWAGNARFANLSGLFIAAHVAQAALIMFWAGAFTLYEISWLTADQSMGEQGLILLPHLATLGLGVGDGGQVTDTFPFFVVGAVHLIASAVLGAGALFHTFRAPSDLAAASGAAKRFQNFNPDLSKLGFISRHTHAAKPELWSQLIGGKHKTTTGFAWVGVANPDGSITGMGTAGIQVKQAEGVTVGLAHYIWPLIGAAALAATICFFGYNSVITDIAYPEKKLEAVTFGYQTQAFDAFTQAGQVIGSTT,12.368396953842797
28
+ AEGIRFAIVDEVDSILIDEARTPLIISGQAEDRTKELYKTLTRVLKSLEGGDYSVDLKNKKVSLTEKGVERTEKLLREAGIISDGTDNLYVVGAIFHAQKVATGKDYLFRKIVEKGRVEYTIDEKLKQVVIVDEFTGRMMPGRRYSDGLHQAIEAKEGVKVQRESKTLATITYQNYFRMFKKIMKLAGMTGTAETEAEEFKKIYNLDVVVIPTNEPMKRQDHSDQVYKTKREKYNAVLKEIEELYKKGQPVLVGTTSVEASEFLSNLLKKRKIPHNVLNAKPHAREAEIIAQAGRKG,12.697313288610662
29
+ MPNFFIDRPIFAWVIAIIIMLAGGLAILKLPVAQYPTIAPPAVTISASYPGADAKTVQDTTVQVIEQAMNGVDNLMYMSSNSDSTGTATITLTFESGTDADIAQVQVQNKLQLAMPLLPQAVQQQQGVSVEKSSSSFLMVVGVINTDGTMTQEDISDYVAANMKDAISRTSGVGDVQLFGSQYAMLIRMKPDLLNKFGVTANDVISALQAQNSQVEAGSIGQLPTLPGTPLQLSITAQSQLSSEQEYGDIMLRVNQDGSRVLLRDVAKIELGGENYDIIAE,12.907199708267516
30
+ DPLYYTNNGGLGFVLSALFGYIWWGYKSGTPKEVRSEAKYRMLTVVVPCYNEEKTIGRTLCSLLESDYPEDKLQIICVNDGSKDKTLKELEDFELRDVPLVVIDQENGGKARALNAGIDAASYEYFACVDADSQVEKDSLKKMVHHFADPSVGCVAGRVKIGNRWSWISRLIDLIQYLIAFNIGRRGINSITVVPGAIGAYRVSAIKKAGGFSGKTMTEDLDLTIAILRAGYKVVYEPEAICWTDVPETLKGFTRQRFRWTYGTMQ,12.993370901156627
31
+ DISAEDRMWSDAEKRMEWQRIDRQVANRKSHGKRGLLSRIFGWIFRRNMDEKALKLLPHIKCYTPAEIANAIQSMTPEDLQRYELRASMFSLADKSNSGTISLTEFRNILECLGVQMSPTELQTLFQVCDRDQNDMINFNEFANRFHEPAKEIGFNVAVLLTNLSEHVPHDPRLRNFLELAESVLNYFQPFLGRIEIMGSAKRIERVYFEISESSRTQWEKPQVKESREFRTMQEIYNHIYYHTKQKENENVQRNAERWKMIEENKL,13.119829828981848
32
+ SDITRLIVLVGTTLGVVLFLALAVWIVKSFWSPYQEINDWALALTIVDVLVVGVPAALPSTVTVTMALGAAYLAKKQALVKKLPIVESLSGVEILCSDKTGTLTKNKLSLQGAWLPGSEKPEQISGLVPEGSRQNITKCIHIAVLCNRASYKDGKLVGTPTEKAILKGLECWGVGYGEMRKKYPLVHQIPFNSTNKFQLSIHDKDNRYLLVMKGAPERVLEKCSTVLLQGKEQPLDEQWHTAFQTAYLSLGGLGERVLGFCQLYLSE,13.625918655212923
33
+ MEVTLFALLALVVASAIIAWGPVTKPLHPHEALVDVGGHKMHYICQGKGSPTVILEAGGGGGSIEWGWVQPQVAAVSRVCTYDRAGYGWSDPAPHARDAGIVAEELHRLLRAAQVPGPYVLVGHSIGGFNTLHFAARYPQDVAGLVLVDATHEDQYRRWKGYEQEMAPFTSGQALDNLAANVRVMESLPPVDAGKVRDLPVLVLSAGREHPPFDMKLYREQWQREVVDLSNVSDRQKHIVADRSGHHIQFDEPDLVVAAIRE,14.117540370332351
34
+ MDYHEDDKRFRREELCREAEFLKLKMPTKKVYHISETRGLLKTINSVLQKITDPIQPKVAEHRPQTTKRLSYPFSREKQHLFDLTDRDSFFDSKTRSTIVYEILKRTTCGITSLLANGIYWLAISTPTINEYPSFLSPSLYAAVLPFTFGFVVSFITLPRKALEYIEQNGQGKAAVHHHTHTHDHDAGDVKIVVNDKDLESHVVAGALMFVAALFSLVFHQWWSDYCDVAYTVFIRVRDVIFGHVKWT,14.986517088631075
35
+ PSNISAWWNFGSLLGACLILQITTGLFLAMHYSPDTTTAFSSLSHICRDVNFGWFIRNLHANGASFFFICIFLHIGRGLYYGSYLYKETWNTGVILLLTVMATAFMGYVLPWGQMSFWGATVITSLAVYLPWWGQHVQKLLFQLIPALLVLLTAWTPFLIGYTLIRETTETESTNYGTPLRLHRIISHHLLLLRAVAXXXXXXXXXXXXXXXXXXEIKAAFWSVFHFILPFMATALAAPRSLLLDEANSTNTLVTTNLIFNFIFFLLPIFPATLSMFSPNLLGDPENFTPANPLVTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAASVLILFLIPFLHKSKQRSTMTF,15.230674054330438
36
+ SRTSELAVGIFVIIFGIALFFLAMKVSGLVGTNLSDGYTMKAQFDNVNGLKPRAKVTMSGVTIGRVDSITLDPVTRLATVTFDLDGKLTSFNAEQLKEVQKNALDELRYSSDYTQATPAQQMKACSEQMMTLLAPQQKEKKTLEVGDIIATSKSSVIYNDMSTYLNDLIGDLGTIASGVNELWPTLQANFSTVKTMAQNLLTANQQLPQLLGNVQTTSQLLAQDNNNFNKLVTDFALTIDALNAVVSKSGANLDTAIATANDLNTVLTENRQ,15.441068484289472
37
+ DDVTVVYQNGLPVISVRLPSRRERCQFTLKPISDSVGVFLRQLQEEDRGIDRVAIYSPDGVRVAASTGIDLLLLDDQLIIREKYQIFINDMSPGAKVAQTAPAREIKWDHEALTEELTYEGQSEKLRDKDRTEVRRTMLNLERRLSDIRRQLAPLEKVRIEISRKMEDKTIQSYALWLMLAVVVCLMGLAWWQVLASLATFCVAVIIMVFVGRNWSAVLQRRRKRMGAEELRHRAYQTHQCHLCAICFTNQKMATLVPCGHVFCEECIKQHL,17.259236085949127
38
+ DNTTNIVHVPVHYVFIMALPIIMCILGLLLNVLALWVFYGHMKRTTSVVYVINLAIADLLFVLSLPMYIHYYFNKTHWVFGELLCRITGTLFYMNTYCSILFLTCISIHRFLGVCYPFRLNLVKRNYAVCVSVGVWAFVMLACMPTLVFNQTEDYEGNRTICYDHLEDAQRHWALYLQVKVNVFVIGFLIPFLIITFCYSQIVATLLKVEANLAKKKSKAIRLVLTVVTVFVLSQFPYNFILLAKTIKLQQINSSCEFEKIIE,17.688518287684857
39
+ MDYHEDDKRFRREELCREAEFLKLKMPTKKVYHISETRGLLKTINSVLQKITDPIQPKVAEHRPQTTKRLSYPFSREKQHLFDLTDRDSFFDSKTRSTIVYEILKRTTCGITSLLANGIYSAAYPLHDGDYEGDNVEFYGDYTIHAGDPENGGQCVVITLTDYGNYEPFYSASLEFSRKHFGFSALSVQCELSDVQSFTAVKQQFINLLSSRAPITVRKFVSPEFPRNSDSHDIFSLSCDVSNTGHVTAVTCQVSARFLTRYLTD,17.749448694031326
40
+ MDNKLTLALAAIMVVLIAFVGINVMNNVNTNPTVVKTATVERGEYVERVDATGKVVAAQSTDLSFPATGEVTWLKVKVGDRVSKGQLLAELDTTDLEAQKNLALSQLEQSRASLALTRQTLARQQALAQTQAVSQQDLDNATNALRVQEAQLNQLRSGSRPEDIAAAQSQLRMAQDDLNRLRNGSRSEELRSLQAQLDVDKAKLNWDQKIVRRNQVRAPFAGVIAERLAEPGALVSPSQPILSLVADDNLEIEANVSEADILHLKPGQKAWFT,18.826645316378666
41
+ MVSVIIPAYNEEKYLEKCLESVRNQTYKNLEIILVDDGSKDKTLEIAKEYAKKDERVKVVTQENGGVSSARNRGIEESKGEWIAFLDADDYWEENHLEELVKAIESNNCDMSICNAIWYYWWDENKRIIKRLPRESVIEAEDFFKELPIFMLTVVVWNKLFKKELFDSIRFPEGKTYEDTATIVDVLMKCKKVAYLNKALVNYRIREGSASTSFNPAKAKDHLKAIEVAFKEAHAEGLGDVALRAFQRRYVNSII,19.064942570982527
42
+ MRPNLFLLALPFIALAAPAHAESITVNGDARIRALGKQNYAEVRTHISDNGTKATVDATGHLRIDAPLGERAQVKAYGELEAIYAKPSGDKNKASNTERLAYAGLKFADHGSIDYGRNYGILYDTNAWTDVFPLWGADVLESNTAAYKRTYGNVLTYRNNNAFGYVDGLSFALQYQGKNPTTGEVVKGDRVNSDGRRLGAATVGYDFDGFGIGFAAASSKTEQNGIKKDTDGREYAVAGSAKVGAAQVAGTYAETRNATRFGQTGKGRVE,19.328842227230165
43
+ DDALPLSYYGTNKGLDRPATGPDRREHRFGFIADASAYPSQQLFIRGKVDVRDYQGSDTLRDDNAYVRLRNLTVGYDNLLPGSPLNVVAQFDLFNVLNATNVKDYQEVLSGGKAAAANFPIPRTYTLGLKLTF,21.699704462300236
44
+ MAGRKILRDPYIIKLLELTEHNPGKRVTARCTSEGILTVPPDLICCLLIQLPIDSIDHHSFILNLQCKDDYQLILKNGSVLHSSCKYTPGKPAEVKAEGGSISIAITKLQLSDSGLYSCQPPNHEPSHGQLNLTVYKQTGFISVSDTGVGIVRVRAYAERPDDLNVTLTCLVTGVFPHDVTVQWTKNNSPLSKDSSPAEEQQHEDGTFFLYSKLTVDKSRWERGDTYTCVVAHEALPNKITKTLDRSKCQGEGLAPL,21.725914279351123
45
+ MIMTMTLTMMMVMISNKTHWNSFQMNLMMTSLMILSLGGLPPLTGFLPKWIIITELMKNNNLITASMAMMALLNLFFYTRLIYSTSLMKLYPTNNQTKTKPKMMTHQMKLTALMTITMSSMTLPLAPQLITTELMAFAXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTTLIMMATSLPIIMKPMTPWWC,22.24920166064802
46
+ GVVKAAVSFCLAFCLVLCIAVTAVWFLSPTSDLDKKAVLPREYEQFKANQSSDQLRAFAAQYGLDATPAEGATDAMLAKGREIYVVNCSICHGSDARGASGLGVTLNPRPPSFTDPGFAAAHTDGEIFWVIRHGIRNSAMPAWKDKISEQDRWDLVHFLRTFKPESQKELTEAEIAALSVGEKVTMGQALFQEKCIVCHGANGQGNQTVGPVLNPSPRNFTSGVFKLRSTDQGELYAIRNGIRQHGMPPWGSQLKD,23.148315124172598
47
+ MRTHTGEKPFACDVCDKRFNQKAHLNRHKTIHTGERPFACDICNKKFSEAGHMKIHTRTHTGERPFSCDVCSKKFSQKIHLNTHMRIHTGDKPFVCSVCNKSFSRKGDLNKHMITHTGLKPYSCDICSKSFSLKYNLITHKRIHTGEKPFVCDVCGKAFTQKCNLNIHMLIHAGVKPYSCDLCNMSFTQKSSLNTHLRTHAGVKPYACDLCGKSFALRQTLSRHHKTHTGVKAFACDFCDKTFFANQHLKRHRLTHS,24.00786399231911
48
+ MRITKGFTLIELMIVVAIIGILAAFAVPAYNDYIARSQAAEGLTLADGLKVRISDHLEAGTCTADNTAVNGKTIGTEGTVGALPEGVSGDCKLSVAFTAGAAGKEITVKYDHKAGALTYQSATGKTISLVLPASLITKAGSWQGSVSWDYLKNLVPTNLRYAYVRSYMGPDYNPNNWPASGSTMPSDICWKSGDPNYTGTPGCTKNNSVAWGYPINPATCTFTPVADPTPTLAPVASVSLNKCYSAGTATLTATAA,24.29027038113483
49
+ MVGRVGGWIVSVDPDGRFGPKPYKRHRAGIKDALSYLYQLKCRLRIDPDTWREWASPLRESITLEECRYTMPSFAVQASFMTLYWSVCEALFGCRFVYGPFNPILGETYEAHVADSDDEGQKTRYFAEQVSHHPPISACHVDSEKFYLDGHTCIRSKLTGKAISVHHVGQSCLTFKRAGETYLIHMPNQYCRSILTVPWAEQETVHCPTENHSAILEFTKGGFSAKFTGRWSSVLHVISAPHAATAEEKYPVTQVD,24.583101326751542
50
+ GFHYFDITLAYFIPALLALLSSAWLIRAIRMDRADERAALTARIDELEQHNAALQARVDELERHVAMRTSELLETEQALAAERAALLDKGNHLASNFDTLKQRVAQLESERDALAADRDNLRGERDTLSGQVATLEAQRDEFARQLDAARQQAATAEERARQAEAAAASLRQRLDEALARVTELAGQNSELQAALARERQNNDALNARVRELEEQVARAQAGANQAQAARDRAQADAERLRQLEQQLAGANEAARRRIADLEDQLNRANRTIAEL,25.434296722653517
51
+ QDTVADETGFFETELTVGTKEDRYSTVFNYRRINRDLKEPQDVNVYYARYEWQVAEDWKLRPGIRLDHDDFFGLTSSPKAYLMYEHDNGDTYKLGVARAYKAPNLYQSNPNYILYSKGQGCYGSSSCYLQGNGDLKAETSVNKELGVEYHHDRFAAALNLFHNDYKDKIVAGISTGVSGNSEMTTANYMEGWMTSVKWDWQIADNWKTDTSISWSRNKPKTSSSLDYQLRPENTLNSTLTWQARENLDFGWRVVHYG,26.433123728733182
52
+ PPSECPPSPCGEKEYFDVCGQCCKKCKPMEGKISTACRKISDAVCDSGEWVEHPASDKCYACQKTCATRRPTQKACAAMRDCKCLDYFYRQLCVSCIPKCPRACDNQFCTAICNPGCVCPEGLFQDEFTGLCVPESECRTGCSNGQVYRECTSPCPSTCGNPNPRPSCSKTCFDGCACPEGMVLDDQNICVLPEQCGCTLYGRHYKPGETFTSDCGNPCEPTCENAYRTVVCTR,26.468925245048567
53
+ ENKYSLLYKNQTLFDEWGIKYQVKSRMIEKSLYSVVFNVNDKKYNIIMRLYDKETKRIYSKREIINYIKNNSSINYKIDLIENGEYYAIAMPYIKGCTLRQYINKHISEKDFINILQPLIETLKVLHDKGIYHRDLKPENILIEQDENLFMIDLGLAIDLTNAIPTIDYGTDGFMAPEQALGNKPTFASDIYSLGVIAIELLTLKNPFDSNISLSESNWISTLHKKDKPLSSVLSKLILKMLEPSPNDRPNIKDVLNSLNSLEVLQRGVN,27.369412815985804
54
+ EKKRKRDAVTWPPEKRQDAILFYLKNHNAPGMEFTEVAKAAGIHKSTVSRELKDPTFPPDASSRAGPGRPKKLSAKADELLNAWIKDTYVEGDLRREVTANILREKALEHGIIELSASTVWRILHKQLGYSSKKMSNRAIAADRRQVQEYRLEVIKAMHDNPYIYLDEIWINQNEAMNHVWFHDSETGLRSTMGLNKGSRGKRIIGVIDAEGFLHYEFKSTTDSTAAKTIVDFLEHNEGDNYLIVVDNAKYHSRL,29.60231093300791
55
+ MVLFRATLVLTLFCVQLALAQVGINTSTPKATLDITAKTTDGSKPEGLLIPRVDRQQAETIPANPQLTIYTDGKTGKGFFYLGTTTPAGTANILDISKNGYYFYNGTAWVALNSGTYGSGTSGTPSATTDKEIYTNSTDKKVGFYSPTGTLVGYNSLTTTDYNSLITSGVTPSYAIGTSNTAALSSFYTGSVSGTLVTTGLTPVIGAAATNIYTVLDGGTSSTITIGSGGTVTSVTPIGGVTSVSLPLSGVSAVSITGSGSTITMGSGGVVTSVTAPSSVSTISITPTSGSIT,29.77388588380658
56
+ LATLRQLWAGTFRRLWRAGDRDPDPAKVPLRARLVLMAALPVLALVLSAALTWQAASEQVRSATDRTLLGEVAEIGRTVSTAYGDVDTRLRGQLDGLARIPGVRSAAVVPLGAEGGTTVLGERTVPAADRSRWFSSLPLRSGSPDTVVSAPVLRGDRVLGSVQVVLDTDRVNALVSGLGWVLLLDWLAVTLLLWAAAMVLLRRQLRPLARMTAVAGAVAGGDLSRRVPDPGPDEVAQLGRAFNTMLDRIEQLLAGQRALLDDVSGELNARTVEL,30.911742604776983
57
+ LSSSCFPWSLGVSVMTFISLSLLSYGPDRPLCPLTPTLSSLQFLVGTWKMVEGSGMFQEFCNHSASQWTFTADGHMTSKAFYVQPQQGQQLRCEEMRLIAQKHHPDTHRCRSLGQPPDTPYHYEYRRDCQDPLTMQHYVTEVMSRRLILSRQKPWDPAPDHIPPGTKIRYVSSPWGPEFCEPVPTQGEAVTLHGTVTHHTLGPLWGEGNHTALTDGFPEGVSPDVFLSAWGPKGLEKLNSLAR,32.019316129846914
58
+ SPLQIVRDHFIREGRLIDPPEREFVPNDMPEYVLPSGERLPPIDVAKSRHRAVMPPPPSDYMAEYMAYADIMAPVTYYTRKDLGLGTKTILVAGAIGGLCGFLWFFMYVKGMGVLDALGITPFQIVRGDFSDTMSMANGFHMFFMITCGICFGGWATNWSRKAGFSDSMEASLMSAVVAYVLMVPMMMGATHTEMLANGHLMDLTHWTVAHLNPFHMMGFFAINVVAGLASIMVFALHLWYALTVRKTFDPEVELKTLKN,33.613951886997285
59
+ MAYRSLFTSESVSEGHPDKIADQISDAVLDAVIAADQASCGTAKAAVTTGLVTIGGESAMCWVMSDMIRTTLVDIGYSVTAVGDEGGFAPNIQSFHDALKVIGDTIVNTRKAQSDTNVQIGIDVCATSAKVLPTEYMGYEDRGASLIFSHRSGETEDSTIADFCVGVLAADIKQTLPPIVAELGKPARLRAMGQLAPLAEDAAFVGYDWNHTTGFPRFSAGSMSTADALAAADNTADAAAMANTALAEAAMAGDHATAARWSAAVEDLTAQAKAGTITTGKIAEAIRAACL,34.430385572117466
60
+ SPDVQIHPPKRDPDPWGIKGLSAFLLGGATLWGLAALAIHLAGLVPFPTVELGTADFHMTLPFMAAAAGGFLIAKHQPRDMFGIGMPEDRPLIATGAAVSFALVVIALVLYAVAPGTYTPRAIGLVGSLAVSAGILGVFGAVLGRLRPVRGIGLVPAAILEGIARQPEARGPVLVSMVAGFALGAVGLLAPHHFGLAFGFGAIGGLGAVALAGWTGALVGAPDISGPTAIAAKMQRFYLWATVLPVAAMVVALVAIATPHLNLGIGEGLLLGGMLAGPLCVAA,35.89969391101693
61
+ SAWNTNLNMDARSAWATYQRQNGEVIGWMPIVNYADTIHDRDFAQAQLIFSTQVSKLWWAEDLGVNAFVVTLSNDLYQLWLNSPDEKADLMKQININAYNINWGVDDGTYADFQVWNIARMLRNDPSTNGKRYFAYGSDAPLIAAYRDQGWETNTVRGYGEYVVLPKAAGTVDNEVAQAAVDNWYSGAIANRLGTMANTGAVVQTGTTDNGIYGYAMTDGKTLYFPRYNTKYYNTDQGGVAHEFGHHVDYAV,39.68959897551875
62
+ GEKWIMKFDGALNPSNISAVLAGGLIGLAVGLQATFFNVSTTSHVTGVLGGATVVGMATYYKWASPWAISAGTFFSLVLGTYLGSQLVKRLHVYKLPEPIAFFGGSFVMVWLWSWMTTYIYPASHALTPYASHLSYLCAMLLGALGGILGSLITPPLKDTFIASALGIIGGTGFAVSHLTMLNPTIPSTLYAIAYAATGIWGAITATRIARVLNLFEGALVCGAATVFYSFVKVVAPELLPVALASIVCAAGVLYVANLTKVV,45.78566418659657
63
+ LAPSPKVFFIDDTPIQWGFVIILLLLSSGGLFFDSKLAGIFTSLGIAVGLIGAALTTFADTRKGKVTPEQLDRVNSTLKTFFGWSLISGVLGLAIYAASLNIDGKLAFVDSLFYFTGTGLVTVGFGDIVPTTTAAKILVVVLIVGGIGFAGSMISTVASWIRSQQEKSELDKHTIRAHARNIVICHDDPRVSALCEYLQGYFLVDDKQSTYHVLPMYLDGNSLERRALRKKLFSNRVAKHFAREGSVRDLDAVRRANVAGARAVIVLSKADENID,47.50012378184719
64
+ GSTDLSTWQTYVQSTAATITSYYQDTASQAQKNQVLANVTQIINQLDSSTKTKAEVDSALTAINKIKAQIAGDAGGGSSTQATIQGVLDNLITKANNLLRQGQTISEVNALISDLNNLVTQAKGQARSDQESVYTKADSALSNLQNQLNQEREVGSNDRYISQTEKDNLIQNVNNYINNEYLWTDGTSNEGQRLTAAKNLISDTLTNDQKRAAQDAINQLIKDANDLLNQARDRAANQGVTQTEKDNAISNVKTVY,51.616257412346954
65
+ PVPVPVPVRRPSNTQLDSPGHLRTLLDRNHLPPPDTQLSPDNRLLQDNVPGSGRPLPERTRLSPDRRTLQDFPVHGRDLPEVHRDHGLPAPDHIPPGYGGFLTEAQRHKEWFHVSDTHMAPPDGTSYPIARFHVSAGRPGMPAPDRYFAALGGAQGMASHMHGSGMHSSHGMHGSMGMHGMGHGMFGGGAMGPVFFIVAALAIIIAIGVAVAAKAGGGEGA,52.07059943074766
66
+ APFAICRRCRRRRGLPVCARRRWRRRRGNIWCAVGSGGIWRPCCRCITRITCRLRVSAAWRICCAGCRGRTCCGSFWWSTTCGSRACTARWTPSPWRSTGRCTRGAWTRWWRCARSTATPSPATASTWRAGWRAATPCGSATSTTSSCRCSTAW,52.5490203150644
67
+ LVLFAPTFNLSDPEGTVFATLVAIATAVGGYVAIPISGIDSIAGGVVSGYAVAKAGQFTNALKTTAMGAAVGEILGEQLYFGGFGPLGIVAGLITAGAIHKWLVMNKVSVNIYDAIGGRRFEVVLAVMIVTGLIMSFFVPAPVGGFIDNAVSKVGQSAAIGFITDSGSTLLANGINPVIAIGFLFAMAGVLIGGFKVASAQMGTLMGAVAFITGAFGFAVHFGANMVGVGALIAGRFTGRAFSDKVNETWPAVTDAVNNRYRTMVNVLAGSVVGAIFGL,52.5972401908542
68
+ MAAIHPPNLSFLPKPSAIHLFAFWTGSMGCLCPLLLGSQPILWASTALLLGTLQLGMGLKASLYPSPFPSHHLFQTTNYFLSFFLPFSLLSYASFFPSTLFPPGAIVTLTGLTLHGVSAYTLGGATGAWINYNTNHIFTAENGTVTGIKEMDTYSMVTANRFWSQVFQILFWCTNALALATHFSRIWTISRAEKHQLHVEEEHHHTAAEMVLAENIGIKTLTDYDDDDKMISYYRKDGVHHMHVEDAELALKLQEEEDLKNKKN,52.98995197391265
69
+ GLFAVIVEIPFSLRLPSVVQAKGSFSDSLFSHSAYPVVQPYFSPETLFGFDILLPITGEPVSRGLYTGHQPLLVVGVETSFLLTVETRLTGEVYSKGGRNSWDIQNCNFFGSDGKKYSLPAFERKKVKDVKCVDQDGVFSEVILERTHTSFTLKYTLPDSEWLIHSRSQLVKREDSNMGRPRKHLSSLVARNSSFEATYQRVSEKETEVSVQFGFSVGWKVIYLFLVKHFPFVFHWISNVLFYLLLNTLFAYIPDFSTFDCLAFLVTL,53.005620188296234
70
+ LTPRQRMWYGILSTAVFLLASEGSFFAISLTALVSYAYYQSILAQTQPAAAPAISAGFAFMLGVVIFGWVVLGVIQALINAISEWIRALVINIYSRTVFAPYVRALSHTPEGVRVINLQSSQLAGLFVNEFVKGFVDGLALIASLLVSLLISLWMGGVLGLIVFLYFCFRVMRQVGENMGRLREAQGQMYEQTLGLVEGLKDIRAARREEVYKGRIESLFGELAGMEVAGAKVQAVSTLMMRVVTQVAYLCMLWVGAYGVFHGDLS,56.40990415587325
71
+ MNINQLVLKAREENKQHENFQQGRLNLRYQEISKIEYLNRCRKLAINGNRIQRINDLQFFYHLTYLDLSNNLITSIENLHCLPLLRNLNLQKNLIGHITGLETLVRLEYLNLSHNQISKLENLECLVNLERLDVSHNHLTKIEGVCFLKSNILKELNLESNLLQELKFCEHLDYVTISNNNISSFSQVCYLLEHMPRLKYLSFTGNPYEQKLKQYRMVVFSKLQYLDGFVITEEELCRGSEVVDWIDSGSEFQRFRYCVINFLKDENNRT,58.18325968813114
72
+ GWVRQLPVYKRFAPFLSKFTLVTSLIAVGAGSGATYIQNLRKPRVRDKIVVHTVPLTPEMSGGKRFSVAPPSGIPHASHRMIPIERQREEDAXRERALRKKMLRRTAMLASGAFCLVLFVALGATIGTLRSEGVLKKDEFIPRPAIVGADGKAYDMDHPYAPPVKYQVQWEPKMGEKYYFHDYAKHHPNDNPENPYNKVAARA,60.55516244953947
73
+ MSASLFQTQGNYLVAAAISLSGLFMLVGLLAGSPRRPTYRWLLASVTLFCVAVSYFFMLSATTLEQGLVVKTNTGERALVDAVNGSVQYADGHYEIEATLRNLGSQPVRVEISRLQVVGEKMFGDIQSRTVEVGPNETRQVKFLLNRVLTSSANFRDRVLFVITDAQGNRQFIEVPVAYQYAQITGLLIALAWLAVIVIGFPVAWRSRMRIASGNRPVASGPQIAYLTALLFAATWTLVLMIAGTQIIGSQAGL,60.895889009456674
74
+ AQTTLNVADNSGARQLMCIRVIGASGNCSFVNQQKCTGICGCTRNATPIESEEIFDCIMKCGGQPGDCEVFQTHQCQQRMANNAHHYRRHWLSHTDFCVLPEHFHLDQDRHFHFQQHHHNWHHGHRHHHHHHDFHFGKFFETFAAPFASIFGGHIHGGFEKFSEMLANGFGGFDMFFGGFGGHGHFGGYEQEATSFKILASVVAAILLIAIAIPLGWLVKSQVSGIKVITTTTSGANQIILMKTVVAIATILAIAIAIPTG,62.10915337352313
75
+ GKRKAAVSRAAKLAATRAVPFARAAAIGPYAAIAIAGTKMAIDDHYKKDREKNREFVFNQWMSRKQLYDYKRKFWMFGPEKMKQLYEESGAKGAEAFFKENAETFKKIRDEYLVDLKNGTANPLTGEKVPLNPALPEDIRFPKYTPPPGLVPEGENPYYIPPPGYVREAERAGMPPPKKREMRMRPAGSEPGTTFGGAGYNPFAADPEYPHTAYAXXXXXXXXXXXXXXXXXXXXXXXXXXXXKRKAALSRTARLVATRAIPFGRSAAIGPYAMTAVAVAKMAYKDD,62.61105811999597
76
+ GWVRQLPAEERPVVLDRDEIELDPPVIGMGRNLAIMAVSVFLFMILTAWFALGEIQESEIARGTLRADRTLLDRTFIPITERGVFTTLDSRWALADVEPGELVWIAVDKHPATLQPGQSVQVYVRAVNDKPDNSVITPYRAVFAEIEREGFRWIVSVDQERFDQFRAHVTESLRLVNRGEALVGADGAPIPTIDLEATPGLAPDIPVTLRFEAEDIDWRILDQSQVQVARANVASADVSQPGWQEVELTAVAPWQAGKT,66.68297956107664
77
+ GWTLHPVSLYFSNHLGYVRLYQLWLTSVDKKSTNAFYHEVSDSQRKLVKRITRMELCFLGVMTLISLASLAIYAKFDQTSLPMLNKVFPRQNDIVTPVKFSLSASFFVFFLLLACFLSHAVNQVAKLASFCSALEDIQEFYVRIREELDSLRSYVENLEKRSAVSEEKLRLQASQTEMLLKRLPSFSSFCLLTLDRPILLSSHCPSLLPTVKGILNRGYKLSVYDPPPFQLGLCKDTHISDTQIYYNNGSRLEGATFHL,71.44388492712908
78
+ ALVPSDVSDQAEATLAFARQNLAKIEPEKIEIKQEPASGVNPADQPSQLDIYLTCTLKNEIRAPPGTTMPQLNFLRNQLEKNLLVPASQRDAYIQANPQQTLILDQPSPLTPEQKEDLAQLTITYGKNNLEVNTQRWPLPSLQVAMQTLESGEAHLEYRIHALPKAAGQPPVPVLKLVSKTTLPATAPVPNTASPTLSVRLPPRRTPPPPPIADEDLDDSPIVRDSRTLLKILLPTVLALVIALIAWRLWSSFTSHRIEAIATVPLPSATATPTP,72.99576740691371
79
+ MATPSFVSEPFAGLTPRQRQAIAAAMRSSLGYVQESVALNRMYSSALQGLVKPAGNAATIVASTGNVGTALSTLSGIQTAFSQYLKGKGSLVGSATNTLIAAQGKLISDLGSLITQEKTFMDSISKKLISDMDIAVSRTQTINSEVTKLTQERNALVAQLEAARKDADSAQKATITTELSNIIGTVAGAFLTAGFTAGIVLSIWELWAWGATLAAIAVGVGILLIIYATSRSSASNRKAELDAANSNLQNAQSTLKSDQQ,76.45022543295501
80
+ TPGLIDKLLGGGVQLPPGLLMALAVLAIQLGFIALIGKRVQFGAVARRYKIDAPETSLITAVLLGLAGYLAIFFAMRGMPWSATGELRWISGPHLNPLTFTAKFAMCALIVVPAAMRGWWAFSGPGADERSRHNARYAFWGSIVAVTALVVEGFLIMAPSLTEARFSPFYYARLLTYFVVTTALLVWTTVRESETPGRTLMGFALFSAAMVGLEMLSFTRFAVQFPTWWNVEVANLMYFGTMMIVLGLFFAMGGNIRWMVAA,77.96717901193621
81
+ MRYFKIRSTTLLIYLAAISVCALSICAPGFITPDEPAHFNYIRYLADHGQLPRIDPYAYASWGSTLSSLSYEFFAALFSWIPLETARSTVIFFAILNAVIIFATARRIAARYGSTGAFAAAAVFLLSPRVLAQSSFNNYDSLGIALMLAAWIFYEKVLTEKRLLPAVLSAVAVSIALLTNYQGYFIFAAVLLFSLPFPKLFFSRKNILFSAGVLSAAVIAAGLFAVFYKDLFLYSVFDVRLMSVFKMMTHQYPFSDAMTIYGGYFTVLF,81.23947975232642
82
+ LSPDLVAQLKAKTGVSYKEAKEALEATNGDIVAATIWLHEQARTSTFFFFFFFFNLVVGMGLFGPDRPLWLPGHALRLQPRHGLPGHRAAGLHRPRALPRLRLRPLPRLPRLRQGRPHLRHAPARPLRQPRLLPGDHHGRLLPRPRRLPLRLRLRHPRLLLRHLRLRPVLLPRRPLRALRLPRQPRRPLLPLLRRLLQGHPHRGARLPLRPVRRHLLHLRVRPLLRRLLGLLRARVRPQLLRPAPQRLHRLPRAQARPQGPRDGLLLPGQGPAQG,81.88453571822619
83
+ VGRINTAVTKVEGLKGVFDTASVFIIMRLILGALPGHDYFWHVATHKVLSTTWYELFSNVFLQVPSFITTFFMGAMLVQTMAQKSPEMQEFLKKGGIIFMTLAWFFFAPSGDYVVMRVISACTALVFIVTSMLEMNHVTPPPDTGLPRPIALCLRAFFYIGFLEWCVQQNFYAMCVLFFFMLGGVFTHYTALFVARYMKFFETFVPPIVHSGFSIAWMMWATQEGFITPMGQEPLLLTVLSVMVFFSVMSMC,86.07420147138896
84
+ MVYRITTIILVISTLTSFLIMFIPLTFRTFHYVMAFMVLLETSMFMWWYFDMSTSSYWNQERVHYEENGVPEFSLSFWSGLMFQMASVCYTYGKVYLSALRFGDMDHVQGQFIDLSNHFAMKTGLNPNDFKMRWPIQLMHNIINTMVEETEKLNAKQQREGITAEVEGEGRPQTFYEIQMLWHCITIILDELKRCTTVSNAIMTKETVDRMVHLCEKGIIPPDLEDFVFKLVFFTPPFEMILNFAI,92.19736060379442
85
+ MFSKLSLDAVPFARAPQWQRHLLRVACLISLFSLAYLAIVIAADTTNSIFTVGIGILLAAGVWFYWRDAVREELSHNPLGTRAAGIILGSGLVMLGLQLSAHLTGTWGYVTPTTFRWLAIMALAWPAAFLALRLTRDEEPVSEAMDNFDRAMAIMLVVSLVLWTFSPLLRGAVQHLHWLLFADYCFVVVDVVAVVMIYHMVRFLLAPLRETHPDAAQAIARKADAMVLWLFLWALYPIAYLVPAFVWGFHFPEGSIW,94.53532369154783
86
+ MVIVAIDRAIKATTILISPLIVIDLISAFIIGFRYQLVHDVIARICFAMVFVYYFALFFEIYYSRHFQGYQSALIKRCFLTLVPWLIYGPLLLLYRPVGDWYFPMTLLAFTIFALLAKRFVIEEETRDVMLEKERRMHFFAMVLFVGAMAIAFALSHFGVLEAFMPYRAFWMRGVTLIYFTSFYLVLLHHYGLREEIAFHKRGEVKPYPAYLAYTVINLTAWAVFFFFTHYAPTSAFARWWAWANFICIPFYAIG,95.44574656037318
87
+ MLLVFFAVMAGLLYGWWLRGSPKHARVTLGFYFITFFLLALLVWTHLGPSQAWSGFSVTLNRFYFWYLIITANAGAVLSAFGLVHRKTYVPEAERKRISLQFDAVFLILWLASALVCTFVMCEYLRWGWTGTDTLFGNHYLTPVLGPLLFWEWVTGLGLVVFAVLCWIYVRKFHYHDNLTARFAYSLLFVAPLIYLWMWVAGHPYQLAWTQDTAWLQSMGYWNGYPFMNPAHMIAFLGAGALFSLAMVAHGFRSERDGY,99.45883530953378
88
+ FGKDVVPVAATMVPFFGAIGFVLALRQPHFYPPAILIHGFIAAHFIGLYGENDFGEDFVPYFVAGLFVFWGFVAFILNVYFPPTPQNKRTLREEKYHEQVSVLTQAAIDGQEPQEIEVALGQVQANFDTAKSALEADRLIANQKLRAAVSTAATLIVMTAVIVGVHSQYDLLGLVLAMAAAISTLAGLYVFVGVSRAVLTFFTLRRGKTDEFLADADNFLKRNPVPVAALSALAKGHRDQAVAAAQSAIDNINPNPTTSSSSSTSASSSSSWAFDPLG,100.05271078832499
89
+ MRCGGTMPSTRSTTTGRCTRGWRRRICGTGTATSWTRPTGCTSRSRTSTTSWAGSPTCRTSRTGSSTPWTCPCTCTWRTRSSRSSCRTSPSWTSRRSTRSSRCTKTCWWTRSSRTTWCISGRCSWRSGSTCTRALSWSTWGRCATRRPTWSARSTSPTRRWSSACTARSGRRWCSTTPTSCTWGSCTPARTTGAWKPCSGSTSSPTPTSRPQARCSSATWSRACPTWRSTSSTRRRTSSAATPSSGPW,101.66820225573242
90
+ RIHDTILPFLMLGVGAFLSGIATLIEKSPNIMKCLPLLLTIGCCIPFLGWVSPIVLPFFSMKTQTTLSDGAIYGNSSISRVYENGIVEETQYVCGLNIFTSRIEVDGDFLFPKYYAPTNDTELQYVTEIPTSAHGTNPAELNATRKNLLNTLGPRYTLVLTDTDGVVRDYVVGNIPQGSPSPNLRYKGLRLELAVDQLPAYTISPPDGTSAFTFINKHWLIDIPTTLISETMVRKLVKAAGPLGPAYIVITEQSPNPIVATAGQAR,103.31987729217545
91
+ CRFGTCTVQKLAHQIYQFTDKDKDNVAPRSKISPQGYVNPNNEPTSYISPGHLRTKKSNMIPAKEVTRIDPNIVPNPNVQYPNLPAPYMYSGRAKRRRNLGLLMGRPNENPDNQHEMQDGYESAAYSNSYKGTYGKLTRWTSRWINNHYIDIERKVHFKDGRIFKTRAESSRINPKIGDFKTTKYITRGEKEALGFKIGGRLLLRPSSKLEKNFTVTETRTIRNGYTTTISRTIRWDDLEKCPLGNCAVGDLVTIDVTD,103.52201921223038
92
+ MWWWRLLVVALLRIGLALEDPARNPCSRVFFEGLTGCQQKVLRAVYPDPSRCLKACSEMKEAANSWGTRYAVATSVLGLEWLAYSWIQDKVACRCRGLSIPPTQKPSLFEKLLHSPLLLQGLQRAAEPVLGFFTQATQALKEAVWSALQWLGGQAGHILAFSRHFAFCLMAFSTLTLLSVCCNWWAIRRRFHQLESVTEQLLRCQQYVLQFRAVSRRHYISWALQLYFAHAFILRACAQLVSVLTTVSNMVSDSFSL,113.06700569292313
93
+ DPLSIILGILAGLFLIILIVLYFCGPYCTCIKRTGCCGNRWCYRWRCCCRRRRWRRRCCRWRTCYRYRWYSTRVRKCVKVPVVKTYKYRSKCGTCYGYVTRTRKVCCSYSSSSKKVCYTACRKKVYKTRYTYKVKVRNCKPCTKYRTKTVCSKCSYKTKIRTRTYKVRVAKCPRKSYKVVTMCKKKPSYRTCSRTSRRSRKVCLTCGSKAYRTKRTIKVPVKKTCSRKVSYKVCARTSHRTH,118.16791808043538
94
+ MQTYNNPEVTYFDRSQTDVEYGWWSGNSAWKNEQWLVMTKEEAKEFFRRSWIKLLDAFLPTTYIIVRWYQMYNYGCPLWCEKNDGKAHCKDWDYHPTCGKGPWWANNPLPTVKGQCEVYTTHRAGSSKECRSYYDLTSAQKAALQSSDCKATTGVYPFYSQAGTCRLNADYPLEKIPEGICNVHLNHKTRASHWGDPDIPTSQIWYFAAYDQAEKEWRTLSGTLEHTWVALSHEDYQRLVDIESKVPWSVSP,119.47290840438525
95
+ VQHFTGYVEDGRGIFYSLPMTNKGLDRIMLCIAVIVAFGMLLCPLASLYFSSEPVLVREDIFSALRTLSIFAAVWQIADVLRRTYVVVSKNPLLLLGLALELTFYTVYFGLDKLYPYPLAVFLPLQFGGILLRHITSIYLQAVSSRNESIIAQLRREREREARRTRERNIAQKRRIDAALWRQMSAVVIFLLLWLIAFTSSALALYNNLLASQQLSIAGLTPSQAASLNTGELLLRVIYGLVISCSAVLFTMTVEARDKIMHD,126.04829832003558
96
+ MLARFFRRQRTASFSLATVVALSALALHTSGMHRPALYASATAVHAITLITLGVMYARSMAPRAEGADHDLRHFMTAYLVTALAWPLAMVLTFALTHFLPGTDPLVPDRTLRLVTLINLAFMASATAHFAFALHTGWNVPRAIAATIVVFALVALTAWLIEIFTGGSTHWSFWAVLIASAAIWLGLALYFRRHAAAIAAFERRHNAQILARFIAAQDETHEQAGGGARSLAHNLDSPLTAAALFADDLSGKVDAPVREHLRLIRRSAND,146.856677830673
97
+ GWFDAMLASVSEFAPIFVVLIIFIVRVYKPFGSEWIVHVLHIADKRPGLNALIHRLLPRTAVHVPQAVKDKYVFLNSEHCIQFGCRHDPVPQYLELLSKGTYSLKVDVWFKHDRAREFYNMLLNEAQTASENHHASKIRHWTDEKMSELFAMAKKAYIPLNETREHSHDKAHSHNHAHSHNHSHDHGHSHEHTHGHDGVHAHDHSNTSDAHVHSHKHLHLHVHVHDKKNIIRRMNSALRKMKAAGVNTHEVAHVHDETTP,166.7331315912073
98
+ GAPPITGEALEKDISRREKGVGGFLSRLFSLVNSTNPFAVGVEGEKLLEEIENIRDSMGHQTAAQLYFAQQQSLLQAEYARWQESHNATLQATKDHIFNAQLGHILMLAGAVVCYTAGLRAWAXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXFLAITNTAINTYFGGSITQSLEQVSAAVEHAKKVGLLSQETVGQIEAAYNSATGKALSYNNIADTEAAFQEFSRNHVLRSQLDKENADAAWTRVQSEGASLRAYLDHASRAITSALNGAIFAFGGP,169.6224292611664
99
+ MSAMSVQIDRLQDQLNHLGELVAQNSKVIAALTQRIQVLERIVTERLRIPYIPLEKRTALMFPLHDEEKQSEITLFINAELHLGTAPGKHKVYYTTVEEMIQHFKEGKCLPQNWPQSDNPFWPCYRELADEMKSNTAAYNNFIKMQDEMRKLCIMLSSGVYHISRNPGGAKDLYTDPKLFIQIYTNECLRNAIPAEILDQMIIDLYANYTEADIHNMAEVRASRNFNHLEKQYMHKLLKLKKTLPFAIQASMDVVL,187.47480920419864
100
+ AAGVAAWLPFARAAAIGWMPVASTAPRAMTATASWPIWMIWAMAMPAMTGRRWRRSRWATAPMARSSGAAPMMARPVMTAMPVAAATIRPSALRSISATASATAGSASAMTRSAAMSPIRTIWRRCSRARRAVASGAWSAISATSSARRRTTSTARALSCAAMASGLPSASMKAAAGGGSSNTMPRCSGSSASRTRACGTASPSCCRAATAASASARAARARSCRASSRARSAIWRAMSVRSRTWSRSARLRRSTSRPSMRSATAA,216.8557544805602
101
+ EKKEVCSVFLTNRVPLDDKRFRRERVYLPGESPFIDPDLFLSREHPLRAQVRGTIIEWLRASIYGIYPYPEQRDPNLWCTERFKQEVMPDGHCEPTLGFVPLTFSTCLTRDMIAASSYNWRKTMEVPGAKMLLHVGPLGTGGHYDYAFTFLQPDNTFAYVKGNKLVRQTKIWNDAGFQLVTEEATLLDAQEYFGAANKLGVCIFCGNCVEYCPTNCLSMCEEVLPRGNALQESWTILERVFMPEDPEHENFKYRRLRTSDGAKFINYTS,520.388790480398
benchmarks/Generation/ProtGPT2/protgpt2_test.txt ADDED
The diff for this file is too large to render. See raw diff
 
benchmarks/Generation/ProtGPT2/protgpt2_train.txt ADDED
The diff for this file is too large to render. See raw diff
 
benchmarks/Generation/ProtGPT2/run_clm.py ADDED
@@ -0,0 +1,657 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
18
+
19
+ Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
20
+ https://huggingface.co/models?filter=text-generation
21
+ """
22
+ # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
23
+
24
+ import logging
25
+ import math
26
+ import os
27
+ import sys
28
+ from dataclasses import dataclass, field
29
+ from itertools import chain
30
+ from typing import Optional
31
+
32
+ import datasets
33
+ import evaluate
34
+ import torch
35
+ from datasets import load_dataset
36
+
37
+ import transformers
38
+ from transformers import (
39
+ CONFIG_MAPPING,
40
+ MODEL_FOR_CAUSAL_LM_MAPPING,
41
+ AutoConfig,
42
+ AutoModelForCausalLM,
43
+ AutoTokenizer,
44
+ HfArgumentParser,
45
+ Trainer,
46
+ TrainingArguments,
47
+ default_data_collator,
48
+ is_torch_xla_available,
49
+ set_seed,
50
+ )
51
+ from transformers.testing_utils import CaptureLogger
52
+ from transformers.trainer_utils import get_last_checkpoint
53
+ from transformers.utils import check_min_version, send_example_telemetry
54
+ from transformers.utils.versions import require_version
55
+
56
+
57
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.45.0.dev0")

require_version(
    "datasets>=2.14.0",
    "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt",
)

logger = logging.getLogger(__name__)


# Config classes registered in the causal-LM mapping, and the `model_type` string of each one.
# MODEL_TYPES is what the `--model_type` help text enumerates.
MODEL_CONFIG_CLASSES = [*MODEL_FOR_CAUSAL_LM_MAPPING.keys()]
MODEL_TYPES = tuple(config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES)
67
+
68
+
69
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.

    Every field carries its command-line help text in ``metadata["help"]``. All fields are optional;
    ``__post_init__`` rejects the one combination that is contradictory (``config_overrides`` together
    with an existing config or checkpoint).
    """

    # --- What to load -------------------------------------------------------------------------
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
            )
        },
    )
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override some existing default config settings when a model is trained from scratch. Example: "
                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
            )
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )

    # --- How to fetch / load it ---------------------------------------------------------------
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    # Annotated Optional because the default is None (no explicit token supplied).
    token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to trust the execution of code from datasets/models defined on the Hub."
                " This option should only be set to `True` for repositories you trust and in which you have read the"
                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    low_cpu_mem_usage: bool = field(
        default=False,
        metadata={
            "help": (
                "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. "
                "set True will benefit LLM loading time and RAM consumption."
            )
        },
    )

    def __post_init__(self):
        """
        Reject `config_overrides` when a config name or a model checkpoint is also given.

        Raises:
            ValueError: if `config_overrides` is set together with `config_name` or `model_name_or_path`.
        """
        has_existing_config = self.config_name is not None or self.model_name_or_path is not None
        if self.config_overrides is not None and has_existing_config:
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )
157
+ )
158
+
159
+
160
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Every field carries its command-line help text in ``metadata["help"]``. ``__post_init__`` checks
    that some data source was given and that any local file has a supported extension.
    """

    # --- Data source: a named dataset and/or local files --------------------------------------
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )

    # --- Sub-sampling and preprocessing knobs -------------------------------------------------
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
    block_size: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in block of this size for training. "
                "Default to the model max input length for single sentence inputs (take into account special tokens)."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    keep_linebreaks: bool = field(
        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
    )

    def __post_init__(self):
        """
        Validate the combination of data arguments.

        Raises:
            ValueError: if no dataset name, training file or validation file is given, or if
                `train_file` / `validation_file` does not end in `.csv`, `.json` or `.txt`.
        """
        if self.streaming:
            require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")

        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")

        # Explicit raises rather than `assert`: assertions are stripped under `python -O`, which would
        # let an unsupported file type through this check.
        for arg_name in ("train_file", "validation_file"):
            path = getattr(self, arg_name)
            if path is None:
                continue
            # The extension is whatever follows the last dot; the comparison is case-sensitive.
            extension = path.split(".")[-1]
            if extension not in ("csv", "json", "txt"):
                raise ValueError(f"`{arg_name}` should be a csv, a json or a txt file.")
236
+
237
+
238
+ def main():
239
+ # See all possible arguments in src/transformers/training_args.py
240
+ # or by passing the --help flag to this script.
241
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
242
+
243
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
244
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
245
+ # If we pass only one argument to the script and it's the path to a json file,
246
+ # let's parse it to get our arguments.
247
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
248
+ else:
249
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
250
+
251
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
252
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
253
+ send_example_telemetry("run_clm", model_args, data_args)
254
+
255
+ # Setup logging
256
+ logging.basicConfig(
257
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
258
+ datefmt="%m/%d/%Y %H:%M:%S",
259
+ handlers=[logging.StreamHandler(sys.stdout)],
260
+ )
261
+
262
+ if training_args.should_log:
263
+ # The default of training_args.log_level is passive, so we set log level at info here to have that default.
264
+ transformers.utils.logging.set_verbosity_info()
265
+
266
+ log_level = training_args.get_process_log_level()
267
+ logger.setLevel(log_level)
268
+ datasets.utils.logging.set_verbosity(log_level)
269
+ transformers.utils.logging.set_verbosity(log_level)
270
+ transformers.utils.logging.enable_default_handler()
271
+ transformers.utils.logging.enable_explicit_format()
272
+
273
+ # Log on each process the small summary:
274
+ logger.warning(
275
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
276
+ + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
277
+ )
278
+ logger.info(f"Training/evaluation parameters {training_args}")
279
+
280
+ # Detecting last checkpoint.
281
+ last_checkpoint = None
282
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
283
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
284
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
285
+ raise ValueError(
286
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
287
+ "Use --overwrite_output_dir to overcome."
288
+ )
289
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
290
+ logger.info(
291
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
292
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
293
+ )
294
+
295
+ # Set seed before initializing model.
296
+ set_seed(training_args.seed)
297
+
298
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
299
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
300
+ # (the dataset will be downloaded automatically from the datasets Hub).
301
+ #
302
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
303
+ # 'text' is found. You can easily tweak this behavior (see below).
304
+ #
305
+ # In distributed training, the load_dataset function guarantee that only one local process can concurrently
306
+ # download the dataset.
307
+ if data_args.dataset_name is not None:
308
+ # Downloading and loading a dataset from the hub.
309
+ raw_datasets = load_dataset(
310
+ data_args.dataset_name,
311
+ data_args.dataset_config_name,
312
+ cache_dir=model_args.cache_dir,
313
+ token=model_args.token,
314
+ streaming=data_args.streaming,
315
+ trust_remote_code=model_args.trust_remote_code,
316
+ )
317
+ if "validation" not in raw_datasets.keys():
318
+ raw_datasets["validation"] = load_dataset(
319
+ data_args.dataset_name,
320
+ data_args.dataset_config_name,
321
+ split=f"train[:{data_args.validation_split_percentage}%]",
322
+ cache_dir=model_args.cache_dir,
323
+ token=model_args.token,
324
+ streaming=data_args.streaming,
325
+ trust_remote_code=model_args.trust_remote_code,
326
+ )
327
+ raw_datasets["train"] = load_dataset(
328
+ data_args.dataset_name,
329
+ data_args.dataset_config_name,
330
+ split=f"train[{data_args.validation_split_percentage}%:]",
331
+ cache_dir=model_args.cache_dir,
332
+ token=model_args.token,
333
+ streaming=data_args.streaming,
334
+ trust_remote_code=model_args.trust_remote_code,
335
+ )
336
+ else:
337
+ data_files = {}
338
+ dataset_args = {}
339
+ if data_args.train_file is not None:
340
+ data_files["train"] = data_args.train_file
341
+ if data_args.validation_file is not None:
342
+ data_files["validation"] = data_args.validation_file
343
+ extension = (
344
+ data_args.train_file.split(".")[-1]
345
+ if data_args.train_file is not None
346
+ else data_args.validation_file.split(".")[-1]
347
+ )
348
+ if extension == "txt":
349
+ extension = "text"
350
+ dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
351
+ raw_datasets = load_dataset(
352
+ extension,
353
+ data_files=data_files,
354
+ cache_dir=model_args.cache_dir,
355
+ token=model_args.token,
356
+ **dataset_args,
357
+ )
358
+ # If no validation data is there, validation_split_percentage will be used to divide the dataset.
359
+ if "validation" not in raw_datasets.keys():
360
+ raw_datasets["validation"] = load_dataset(
361
+ extension,
362
+ data_files=data_files,
363
+ split=f"train[:{data_args.validation_split_percentage}%]",
364
+ cache_dir=model_args.cache_dir,
365
+ token=model_args.token,
366
+ **dataset_args,
367
+ )
368
+ raw_datasets["train"] = load_dataset(
369
+ extension,
370
+ data_files=data_files,
371
+ split=f"train[{data_args.validation_split_percentage}%:]",
372
+ cache_dir=model_args.cache_dir,
373
+ token=model_args.token,
374
+ **dataset_args,
375
+ )
376
+
377
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
378
+ # https://huggingface.co/docs/datasets/loading_datasets.
379
+
380
+ # Load pretrained model and tokenizer
381
+ #
382
+ # Distributed training:
383
+ # The .from_pretrained methods guarantee that only one local process can concurrently
384
+ # download model & vocab.
385
+
386
+ config_kwargs = {
387
+ "cache_dir": model_args.cache_dir,
388
+ "revision": model_args.model_revision,
389
+ "token": model_args.token,
390
+ "trust_remote_code": model_args.trust_remote_code,
391
+ }
392
+ if model_args.config_name:
393
+ config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
394
+ elif model_args.model_name_or_path:
395
+ config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
396
+ else:
397
+ config = CONFIG_MAPPING[model_args.model_type]()
398
+ logger.warning("You are instantiating a new config instance from scratch.")
399
+ if model_args.config_overrides is not None:
400
+ logger.info(f"Overriding config: {model_args.config_overrides}")
401
+ config.update_from_string(model_args.config_overrides)
402
+ logger.info(f"New config: {config}")
403
+
404
+ tokenizer_kwargs = {
405
+ "cache_dir": model_args.cache_dir,
406
+ "use_fast": model_args.use_fast_tokenizer,
407
+ "revision": model_args.model_revision,
408
+ "token": model_args.token,
409
+ "trust_remote_code": model_args.trust_remote_code,
410
+ }
411
+ if model_args.tokenizer_name:
412
+ tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
413
+ elif model_args.model_name_or_path:
414
+ tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
415
+ else:
416
+ raise ValueError(
417
+ "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
418
+ "You can do it from another script, save it, and load it from here, using --tokenizer_name."
419
+ )
420
+
421
+ if model_args.model_name_or_path:
422
+ torch_dtype = (
423
+ model_args.torch_dtype
424
+ if model_args.torch_dtype in ["auto", None]
425
+ else getattr(torch, model_args.torch_dtype)
426
+ )
427
+ model = AutoModelForCausalLM.from_pretrained(
428
+ model_args.model_name_or_path,
429
+ from_tf=bool(".ckpt" in model_args.model_name_or_path),
430
+ config=config,
431
+ cache_dir=model_args.cache_dir,
432
+ revision=model_args.model_revision,
433
+ token=model_args.token,
434
+ trust_remote_code=model_args.trust_remote_code,
435
+ torch_dtype=torch_dtype,
436
+ low_cpu_mem_usage=model_args.low_cpu_mem_usage,
437
+ )
438
+ else:
439
+ model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
440
+ n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
441
+ logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
442
+
443
+ # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
444
+ # on a small vocab and want a smaller embedding size, remove this test.
445
+ embedding_size = model.get_input_embeddings().weight.shape[0]
446
+ if len(tokenizer) > embedding_size:
447
+ model.resize_token_embeddings(len(tokenizer))
448
+
449
+ # Preprocessing the datasets.
450
+ # First we tokenize all the texts.
451
+ if training_args.do_train:
452
+ column_names = list(raw_datasets["train"].features)
453
+ else:
454
+ column_names = list(raw_datasets["validation"].features)
455
+ text_column_name = "text" if "text" in column_names else column_names[0]
456
+
457
+ # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
458
+ tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
459
+
460
+ def tokenize_function(examples):
461
+ with CaptureLogger(tok_logger) as cl:
462
+ output = tokenizer(examples[text_column_name])
463
+ # clm input could be much much longer than block_size
464
+ if "Token indices sequence length is longer than the" in cl.out:
465
+ tok_logger.warning(
466
+ "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
467
+ " before being passed to the model."
468
+ )
469
+ return output
470
+
471
+ with training_args.main_process_first(desc="dataset map tokenization"):
472
+ if not data_args.streaming:
473
+ tokenized_datasets = raw_datasets.map(
474
+ tokenize_function,
475
+ batched=True,
476
+ num_proc=data_args.preprocessing_num_workers,
477
+ remove_columns=column_names,
478
+ load_from_cache_file=not data_args.overwrite_cache,
479
+ desc="Running tokenizer on dataset",
480
+ )
481
+ else:
482
+ tokenized_datasets = raw_datasets.map(
483
+ tokenize_function,
484
+ batched=True,
485
+ remove_columns=column_names,
486
+ )
487
+ if hasattr(config, "max_position_embeddings"):
488
+ max_pos_embeddings = config.max_position_embeddings
489
+ else:
490
+ # Define a default value if the attribute is missing in the config.
491
+ max_pos_embeddings = 1024
492
+
493
+ if data_args.block_size is None:
494
+ block_size = tokenizer.model_max_length
495
+ if block_size > max_pos_embeddings:
496
+ logger.warning(
497
+ f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
498
+ f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx."
499
+ )
500
+ if max_pos_embeddings > 0:
501
+ block_size = min(1024, max_pos_embeddings)
502
+ else:
503
+ block_size = 1024
504
+ else:
505
+ if data_args.block_size > tokenizer.model_max_length:
506
+ logger.warning(
507
+ f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
508
+ f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
509
+ )
510
+ block_size = min(data_args.block_size, tokenizer.model_max_length)
511
+
512
+ # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
513
+ def group_texts(examples):
514
+ # Concatenate all texts.
515
+ concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
516
+ total_length = len(concatenated_examples[list(examples.keys())[0]])
517
+ # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
518
+ # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
519
+ total_length = (total_length // block_size) * block_size
520
+ # Split by chunks of max_len.
521
+ result = {
522
+ k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
523
+ for k, t in concatenated_examples.items()
524
+ }
525
+ result["labels"] = result["input_ids"].copy()
526
+ return result
527
+
528
+ # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
529
+ # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
530
+ # to preprocess.
531
+ #
532
+ # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
533
+ # https://huggingface.co/docs/datasets/process#map
534
+
535
+ with training_args.main_process_first(desc="grouping texts together"):
536
+ if not data_args.streaming:
537
+ lm_datasets = tokenized_datasets.map(
538
+ group_texts,
539
+ batched=True,
540
+ num_proc=data_args.preprocessing_num_workers,
541
+ load_from_cache_file=not data_args.overwrite_cache,
542
+ desc=f"Grouping texts in chunks of {block_size}",
543
+ )
544
+ else:
545
+ lm_datasets = tokenized_datasets.map(
546
+ group_texts,
547
+ batched=True,
548
+ )
549
+
550
+ if training_args.do_train:
551
+ if "train" not in tokenized_datasets:
552
+ raise ValueError("--do_train requires a train dataset")
553
+ train_dataset = lm_datasets["train"]
554
+ if data_args.max_train_samples is not None:
555
+ max_train_samples = min(len(train_dataset), data_args.max_train_samples)
556
+ train_dataset = train_dataset.select(range(max_train_samples))
557
+
558
+ if training_args.do_eval:
559
+ if "validation" not in tokenized_datasets:
560
+ raise ValueError("--do_eval requires a validation dataset")
561
+ eval_dataset = lm_datasets["validation"]
562
+ if data_args.max_eval_samples is not None:
563
+ max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
564
+ eval_dataset = eval_dataset.select(range(max_eval_samples))
565
+
566
+ def preprocess_logits_for_metrics(logits, labels):
567
+ if isinstance(logits, tuple):
568
+ # Depending on the model and config, logits may contain extra tensors,
569
+ # like past_key_values, but logits always come first
570
+ logits = logits[0]
571
+ return logits.argmax(dim=-1)
572
+
573
+ metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir)
574
+
575
+ def compute_metrics(eval_preds):
576
+ preds, labels = eval_preds
577
+ # preds have the same shape as the labels, after the argmax(-1) has been calculated
578
+ # by preprocess_logits_for_metrics but we need to shift the labels
579
+ labels = labels[:, 1:].reshape(-1)
580
+ preds = preds[:, :-1].reshape(-1)
581
+ return metric.compute(predictions=preds, references=labels)
582
+
583
+ # Initialize our Trainer
584
+ trainer = Trainer(
585
+ model=model,
586
+ args=training_args,
587
+ train_dataset=train_dataset if training_args.do_train else None,
588
+ eval_dataset=eval_dataset if training_args.do_eval else None,
589
+ tokenizer=tokenizer,
590
+ # Data collator will default to DataCollatorWithPadding, so we change it.
591
+ data_collator=default_data_collator,
592
+ compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None,
593
+ preprocess_logits_for_metrics=preprocess_logits_for_metrics
594
+ if training_args.do_eval and not is_torch_xla_available()
595
+ else None,
596
+ )
597
+
598
+ # Training
599
+ if training_args.do_train:
600
+ checkpoint = None
601
+ if training_args.resume_from_checkpoint is not None:
602
+ checkpoint = training_args.resume_from_checkpoint
603
+ elif last_checkpoint is not None:
604
+ checkpoint = last_checkpoint
605
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
606
+ trainer.save_model() # Saves the tokenizer too for easy upload
607
+
608
+ metrics = train_result.metrics
609
+
610
+ max_train_samples = (
611
+ data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
612
+ )
613
+ metrics["train_samples"] = min(max_train_samples, len(train_dataset))
614
+
615
+ trainer.log_metrics("train", metrics)
616
+ trainer.save_metrics("train", metrics)
617
+ trainer.save_state()
618
+
619
+ # Evaluation
620
+ if training_args.do_eval:
621
+ logger.info("*** Evaluate ***")
622
+
623
+ metrics = trainer.evaluate()
624
+
625
+ max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
626
+ metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
627
+ try:
628
+ perplexity = math.exp(metrics["eval_loss"])
629
+ except OverflowError:
630
+ perplexity = float("inf")
631
+ metrics["perplexity"] = perplexity
632
+
633
+ trainer.log_metrics("eval", metrics)
634
+ trainer.save_metrics("eval", metrics)
635
+
636
+ kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
637
+ if data_args.dataset_name is not None:
638
+ kwargs["dataset_tags"] = data_args.dataset_name
639
+ if data_args.dataset_config_name is not None:
640
+ kwargs["dataset_args"] = data_args.dataset_config_name
641
+ kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
642
+ else:
643
+ kwargs["dataset"] = data_args.dataset_name
644
+
645
+ if training_args.push_to_hub:
646
+ trainer.push_to_hub(**kwargs)
647
+ else:
648
+ trainer.create_model_card(**kwargs)
649
+
650
+
651
def _mp_fn(index):
    """Per-process entry point for xla_spawn (TPUs).

    ``index`` is accepted but not used here; the call simply delegates to
    ``main()``.
    """
    main()


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
benchmarks/Generation/Visualize/analyze_mdlm_denovo_gen.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
import pandas as pd

# Base directory the results CSV is read from.
path = "/home/sg666/MDpLM/benchmarks/Generation"

# Read the generation results and report the mean of the "Perplexity" column.
results_file = path + "/mdlm_de-novo_generation_results.csv"
results = pd.read_csv(results_file)
average_ppl = results["Perplexity"].mean()
print(average_ppl)
benchmarks/Generation/Visualize/esm_umap.png ADDED
benchmarks/Generation/Visualize/esm_umap.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
+ from umap import UMAP
6
+ from sklearn.manifold import TSNE
7
+ from sklearn.decomposition import PCA
8
+ from transformers import AutoModel, AutoTokenizer
9
+
10
# Base directory that the generated-sequence CSV paths below are built from.
path = "/workspace/sg666/MDpLM/benchmarks/Generation"

# Use the GPU when one is available, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

# Model identifier passed to load_esm2_model().
esm_model_path = "facebook/esm2_t33_650M_UR50D"
13
+
14
+ # Loads ESM model and tokenizer to embed the sequences
15
def load_esm2_model(model_name):
    """Load the pretrained tokenizer and model identified by ``model_name``.

    The model is moved to the module-level ``device`` before it is returned.

    Args:
        model_name: Name or path handed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    esm_tokenizer = AutoTokenizer.from_pretrained(model_name)
    esm_model = AutoModel.from_pretrained(model_name)
    esm_model = esm_model.to(device)
    return esm_tokenizer, esm_model
19
+
20
def get_latents(model, tokenizer, sequence):
    """Return a mean-pooled embedding of ``sequence`` as a plain Python list.

    The sequence is tokenized, moved to the module-level ``device`` and run
    through ``model`` with gradients disabled. The last hidden state is then
    averaged over dimension 1 (presumably the token axis -- confirm against
    the model's output layout).

    Args:
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Callable that tokenizes ``sequence`` into model inputs.
        sequence: The sequence to embed.

    Returns:
        The pooled embedding converted to a (nested) list of floats.
    """
    encoded = tokenizer(sequence, return_tensors="pt").to(device)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze(0)
    return pooled.cpu().numpy().tolist()
26
+
27
+ # Load a random set of 100 human and reviewed sequences from uniprot
28
def parse_fasta_file(file_path, n_samples=100):
    """Parse a FASTA file and return a random sample of its sequences.

    Header lines (starting with ``>``) delimit records; the residue lines of
    each record are concatenated into one sequence string. Blank lines are
    ignored so they cannot produce empty-sequence records.

    Args:
        file_path: Path of the FASTA file to read.
        n_samples: Maximum number of sequences to return. Defaults to 100.
            If the file holds fewer records, all of them are returned
            (in random order) instead of raising.

    Returns:
        A DataFrame with the columns ``"Sequence"`` and ``"Sequence Source"``
        (the latter is always ``"UniProt"``), holding at most ``n_samples``
        randomly chosen rows with a fresh 0..k-1 index.
    """
    source_label = "UniProt"
    sequences = []
    residue_chunks = []

    # Stream the file line by line instead of loading it all with readlines().
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if line.startswith('>'):
                # A header closes whatever record was being collected.
                if residue_chunks:
                    sequences.append(("".join(residue_chunks), source_label))
                    residue_chunks = []
            elif line:
                residue_chunks.append(line)

    # The last record has no trailing header to flush it.
    if residue_chunks:
        sequences.append(("".join(residue_chunks), source_label))

    frame = pd.DataFrame(sequences, columns=["Sequence", "Sequence Source"])
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request at the number of records.
    sample_size = min(n_samples, len(frame))
    return frame.sample(sample_size).reset_index(drop=True)
48
+
49
+
50
# Obtain/clean sequences generated from ProtGPT2 fine-tuned on membrane sequences
protgpt2_sequences = pd.read_csv(path + "/ProtGPT2/protgpt2_generated_sequences.csv")
# Literal (non-regex) clean-up of generation artifacts; 'X' residues are
# rewritten to 'G'.
for old, new in (('<|ENDOFTEXT|>', ''), ('""', ''), ('\n', ''), ('X', 'G')):
    protgpt2_sequences['Sequence'] = protgpt2_sequences['Sequence'].str.replace(old, new, regex=False)
protgpt2_sequences.drop(columns=['Perplexity'], inplace=True)
protgpt2_sequences['Sequence Source'] = "ProtGPT2"
# Discard every sequence that contains a B, U, Z or O residue.
has_bad_residue = protgpt2_sequences['Sequence'].str.contains('[BUZO]', regex=True)
protgpt2_sequences = protgpt2_sequences[~has_bad_residue]


# Load MDpLM generated sequences
memdlm_sequences = pd.read_csv(path + "/mdlm_de-novo_generation_results.csv")
memdlm_sequences.rename(columns={"Generated Sequence": "Sequence"}, inplace=True)
memdlm_sequences.drop(columns=['Perplexity'], inplace=True)
memdlm_sequences['Sequence Source'] = "MeMDLM"
memdlm_sequences.reset_index(drop=True, inplace=True)

# Alternative reference set: UniProt sequences (currently disabled)
# fasta_file_path = path + "/uniprot_human_and_reviewed.fasta"
# other_sequences = parse_fasta_file(fasta_file_path)

# Load test set sequences and keep a random sample of 100 of them
other_sequences = pd.read_csv("/workspace/sg666/MDpLM/data/membrane/test.csv")
other_sequences['Sequence Source'] = "Test Set"
other_sequences = other_sequences.sample(100)

# Combine all sequences
data = pd.concat([memdlm_sequences, protgpt2_sequences, other_sequences])


# Load ESM model and tokenizer for embeddings.
# load_esm2_model already returns the model on `device`, so no extra .to() is needed.
tokenizer, model = load_esm2_model(esm_model_path)


# Embed the sequences
data['Embeddings'] = data['Sequence'].apply(lambda sequence: get_latents(model, tokenizer, sequence))
data = data.reset_index(drop=True)
# One row per sequence, one column per embedding dimension, indexed by source
# so the source label can be used as the plot hue.
umap_df = pd.DataFrame(data['Embeddings'].tolist())
umap_df.index = data['Sequence Source']


# Project the embeddings to two dimensions with UMAP
umap = UMAP(n_components=2)
umap_features = umap.fit_transform(umap_df)
umap_df['UMAP1'] = umap_features[:, 0]
umap_df['UMAP2'] = umap_features[:, 1]

# Visualize the UMAP projection, colored by sequence source
plt.figure(figsize=(8, 5), dpi=300)
sns.scatterplot(x='UMAP1', y='UMAP2', hue='Sequence Source', data=umap_df, palette=['#297272', '#ff7477', "#9A77D0"], s=100)
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
plt.title('ESM-650M Embeddings of Membrane Protein Sequences')
plt.savefig('esm_umap.png')
plt.show()
benchmarks/Generation/Visualize/mdlm_de-novo_generation_results.csv ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Generated Sequence,Perplexity
2
+ GEGQPTLDAEGMPKADEGKMMTFKSENFTDDSVENLVLTSYGVYNPVIFTDLVIRTPKEGAVVPPTVVLMNGEWTEVMPNLTGAETFDTQSKYLVNGLKRYGVSKKKHVQVYQMARRTKDLLTMIPDGMASADFSFEAPGRANTMPAVGLSMDSAVGQPNLSRLRGVDVFFRYIVYTADPFGSETQNLEVQASERTNILFLNQQKKKVKSGIVVQMQKGILFERFGEVMDGQRPSNQRVGSQDMLIGVGALVKLNQKKIRTRIIQLFNLGYDDSEAIDWLPTTVAYLDSTYYVAMTTIQSIWVTDYYGLQGLFPFNQNKIGKHGVEVKHVQYFLEFVEAYVDQLEDLFTEYNERNSKLSNSNAIQAITIAEYQQLKDQLQLLTTENPIVDSSMIALRIKKLDNSATRELVSQFNRDVERATPNITAAQISVLKDNMTILLQDELMHMSDLNGEAADATYTLQAARESLEQLTTAAEFAPEYLTIEEQDISDFKARMELLKEIVGSLSNRIESAVKNKQDKEGIQYAMYKRPNRIDILIKNINLKFKGIQFQIDSIVAKVRNMEAFIKALVYRLDNVRISLVQRVGNRRHLAKKEKEPETVLIVNLRDYRSTLILFDIMTNLRITDEGQPENILRMKPVLDNADIPTENERIPSLSMPLMVRYTTVVINLPELDEHKAPLGINIVVAKDAVVSRLEWEWEGDVFKNKPYRIKRAGYGPDYVRAGALAQVFIARSDTATQSIAVRKTANEKFLLRLPRLPGSLMGEVVLKSFATFHQAFGTGRNNVYQRDEDSDKKYNQTLIDYWFDLNRFFGLSQREEGVQMMLLVEEPFTAGILSKAIVFDDDKKSAFLMMARAFLVYLPLHHSPDAPLEVANNSPKNIRLNLQATIAARG,18.2131
3
+ WTTGWGVSQDLIDSASMSPGMIWILLVDSYKERWFGTYWWGTSTCKEGAFPFEDVMQRIELRILKKYFYYLAIISSVLTLLMIIAKLVTNCLSFANIHKSHRYFFCVNCFWFISQLCNDLSAFPVLKKLESATRFVIYPSPVKAVQLDTMPDKIVLYLIFLNIFSTHTVLVFQSMSLGLITGIIDIPTAKRIIVPNLGILVIKTFSSKNCKLSLLAPEMWPKCMYDYVAFKNIEAQIVITSTSVGAVLCLLLILKGSVFVSSSYMFVGGKPANPGTGTRMLLPKDDHFEHKFCHNFSNVEKISASSYAASPEESILLLVNKEEHNKLRVLAVVPKGARNVLVIEIMKLKPFQTTYNDLYLPRDENNQLQKNKKVVSVGKIVLKDPASWVYLPQGRLKMNFKKAYIKSGAAPILLSFGQRLISVDNAVPLAKMRTTGITVLEMAPRGSRVQAIVVLPGQLKCGKSETVYWFTVSSIDNNRRGIAKYMGGVTYRGRAFIDMDKNLAGPPLVSDAYQMLFNDWLEMLCGAMKATESEKVKSRKGASELRVIHRSHHGCIVAILDDLYRLRFDLVDIERIGMINEEGRINGKIRSFEFQNFMLTSKNDMKTGFVNMPESFKPRTILTGDLIDNDWAPSFDLAAIRHGNIQVLVDGNDLEGSEEATNCHHGNAFSLGPQGRKVVVGAVVAPKTATPKCNSISINLQELPANYVVLGAKALTAQHVANFSVNGTKHDKKTCKQHEEMWKMYIQKSGQISKQASIKQCAILLSLGKLRATVKWFLGKYLERIFHVLLKGCKTVETIIDKGRMHKIKLVKFRFGIWIFLSVVCLAELFCIMIFVAPEIVRGLHTLILFLLFMMLLLLNYADTGHEVTGVPYY,18.1991
4
+ GIISVIDLGAKLIVPGDLFFVLCGKNRPPTSGALQYTILHTKKKLFCCGPTHAKHICLINGECIRDGREKLQNLCKTSGKWTEVRRPKSTYSCSLVYRRVQRRFCGPQKARAPVWILYIVLLSAIGVIIAVTINWVLQVCIILGAVVANGFLIRVLSIVDTRNQIITRGLRRYGIYRNSVKVACTSGSVVIVRIKFMEDQISGGWRPASFIRTSFKEFLPASATAFSRLADCNEKLIEALV,17.785
5
+ RARILNRSLESARKYLDFLKIDKVEFYENEMTFRVFIEEAPEFKRMEITEDKIRLRLKPDKIRKFVELGNLFKYTDALQLAVQLEKQNKEELVIAEPEVIHALHKHHNHMPIFHLLEAFNDETVAEIILANIGKMPAFLFWVWNRMSDPTEDRGKGFNEKVKRKNPTIVSILDKQVYTIKGGFGAALTKSILGPLLATQNIKIKADYDESLADVFVGLHFVDGSIILRPWPITGKEVALSEEVTATDKVISASEVGSEEDKFQLTSIENNFTKTLLEIKKRRENAFEGAYTETGSKVSDKPVRELKANLKLIPEYQAERIDQTYWRKILRLSRSLISEPRGARAYLLRIGERVDPHRYRPAIGAEMLMGIPNSITTGFKISKTLGLQAAGLDLIQTFRSLSIRRMITKNFMAILIDKPGLKAVFWFIPLVPFIAVQLLIYGVLVGRAKPGNLVEIIDSMVDGKFETRPNGSPAASHKMVIGVKYSLIYPGNQAKKISLVRWNTALSKDVRGQAKEIDLWQLISYYLEKMRIGPSAVSNVFQSVHDGLKRNELAVLLIMDPKTRDDSMILDIMNLRIERGFVSLIKSYIHDYDEEVYMTYREILNQNVFLMKYEEYATMSADLEAYWLKSIEETNLRALRLPGAMRKQLFLANLCRISEHLDTPTEQDAFSNPEGITIDEGFTDEARAQGIGFVAGFVDEKEFEQRYALLAKVYIASLKALAAALADSGVKTGIKIGLGTVERIEVHKDGMMIDHVKMEGPGRFPIIVGGEVSPIVNGATIIPSFIKILADGKVDEGKSPNKTPTEKGEITPQSLYRGMGKTVVLNDNGGLQAHALTWAINDEYSYFVAMGTSNKSDEKQLAALSNSLDMTTFEDSAGRLFSSIVKAKTLSENGVITENGAEFL,17.7595
6
+ DIMLPKSPLFEEMATLGFIGHTLLAPIKPWTKATATMVGITGIGVTIYWGVPDMFPFSPTNHWWVKGMKAIVPSIIALQIIDLFYVVLTGLTSRFVYPIVATFYDHYFVNVQILVTGIACTLVYPETHGDVVSVDLLQCRTDGKLTQIPMEEALELINFIDQIMEKTKCKFNISEGYYEVLMTKKFIVKGGKGVGPDTDPEPCEKYATWGRLRDPTGPIRPNRAEKSQIAIYAVCGGAVLQKLGVPLLDNEAPIPSQLLIFAIGTVGIAAITIVALIFGGIDIAMSNPVYRPILAYSPHNKLLYPLSTWDVGYYNVPNVTSTYVVVVVPPALTIYSVANEAIKVETTPIPVKFAEILPTGETALLSSYPLTIAQTDLNARYERREADIFTKHEGMQQTFKGEVVPLVSNNRLKSPSGVQIGCAASLMTVPEEDGMTPPRIATTWFEGPYSAPASDPLMRMPGDFYGYGKGTMDGTVSNEMNGISHRPVTLATGGNVKFSPVMLPWYIGARYGLDIQHSGNRRMAHWLTSRAVMGFFKKNVARLADRVNAITLQVPSKDPDLNNRPNHAVMSRQWITAVEAIKELAICSLLNQFREGLGHKNDTIEADLNPFSGVIAQSSMAILKTAMGATRSSVAQLSMGMALEAFRHQMTGTAGIHYLMSVTGNGPGRISTALNKLDSPLPPAIAED,18.2088
7
+ MASLAAANKRRSITSIAVFLGIAAAVVVLGISGKVTDKNTVLDFSYGKNRGFKPQHLSEYVRSRYPAVVIAKGENLLRGGRFLPWVRPPGLRYVAYYDAIESWDIEDSLKELVDLRAQGLYADVETAAALKKASVSRSTITEMVGMHTNQSEPQFRTEQTKVAKSNGTMFVFATMSFTRGRYEIGSLMLSAIGKRWVEEPTPDTKMRYMKLRPSIQLLCRTGGIYRPIFQLGGPEGIFYHDGYGENSYQVDDFIWKHLERRA,17.9845
8
+ VYGLLTTGSASSAMATTTLMIGVLFGLFSSFALPLWSWLQYLTTTGITTATGARYFKNIMIEAFSSYSAAIIGTVSIVPLGSSIPAAASASAVGAFGAITGFALGIYSMLFKKMDSFTHWLFSAGAGLSAGLAGLISGIGAIQIGNAATATSGTAIPLFGLVLRIIVSSVQGLLGTIAAILLISN,14.5579
9
+ IGHVFHLLHMALPIWRPPLPVTPGEPHPRPIADLVTPAFEYKTLLRCPHPHVSPIFLSVVLWMIVALVLAGVMVAQALPAPTGARLLACYP,17.2759
10
+ SIIFIFMYLTNCLQWRQNGHLQMSGLLFATLGVSTVTHTMMLASRGLCQAQKQRIRRECTLFGLAVHFCLAVGLFIASVSFAVWSSLEGLDDDANTVAVMRWWGWTFSFERYATVKVLFDQGIQSTYIMSWLLLMTKREDFRLLLFFMLTMFASILVPFSRGAHFYSLSVAFSNFATVILPGVNGVGNEIVQQIIFVLLFTFPMFLLIVDIAFFVNLIFKAAHP,17.039
11
+ LPDVLYHYEERKFVITRSEVVLEPNELFIGKIAVDVANYNIKVKVDLRISSKYVVFNNSQLTWNDHFLQPLISDRLRWVIFRVCVGTSSPQLLIHIDMIENFLQQLLSFLKGIVVQQFVGKVTLIQEDMKIEEETALLIEIETYPEAELLKLVRNLIIKVEDRTIGSIRHEAQLAKINDWSAKRIISDLNIGDIDNGEHVLVSAQEELESSIMERLAGHLRKFVNVGTWTESAAIDIVARIYGSLSVELHEEWLVMLEYLFTYPNDYFPGMYTVQYYQNADPGELLKNHALIIDELQRLELYEG,17.8596
12
+ SQGLDDLIMTTVADSEKDTDLTTTADLNMVPIYVGSNETATSQLGMIVKRKRPEKPIYVPVHSCSKDDRACAFVNFFNLARDLGYIEQDEQRVTPDWRAIILTMAEDHIHLWSHPNVAILKLRVIGEKLADQRETMKDPLNTRVEQVALVQAPKIDLIRASYGSLLEYQGEQKKYRINNTLSRCESLCADAGVGYMVATDAKVCQVEGETVDNNTGKDGDKTEILRAHFKQPSAFNKGSGVLRGHIMMTLGIGLLILLLYVIIVFFLHKIQNATFKFRIPRVAIATSLVADACMSVLAAGIAYALANFPVFTAKIYAETAVVLVLLVKGRLFIGKNKEIPMTARITIIRLAGAVLGFAATIIGIVILDPVLSIDGVSLAPGKVPQLLCLAQTAVREGGHQTWDLQLFELSKLSGMKIQPGKNIVRDPDKSNTAEEPTVALWDMVDVPGDIDSALQKDPVVKAMFKPHTGETMLMRQDIWAVQRWVMNSLGKLRLGKEVAILRKYVDTTHPAGCCDTGDAAIRQAQTEGKTVHSDGVRVLEDSVRMVGLDGDGKTCVGQAEEQLIQKFPCEKKMADDVFTSARALALNASTLIEQTDNGGDEWGENDTIKQVIKTGRKVEGDAAEIATPDASWNDGVYRKSAVFSSVTSDCITDIAGRTNTIVTIKELKGPRSLPMITNLRKRTALILAEMKTLIGPGGLYIDKTGIDCKVKDEGKIDSQIKYEIRGIELYGNMTPAPGIKPVAFTGKGGSGKSTIIRVKGSVVPSFVPINKFGKGRGERRTEKNADALLIPFSNAKKLEGETKVLIPDFLFKITTRNVVTVGRIVVGGVLDNSDVFEGFDSIKLVQAAMVEGQKQVTIVGINRKEGPVYGDNLLLSADIETEEYMTYGADQALAKAAILRRSGAVLFALVFGGNPNPRIFKGTEIDDVWLKIKPRAQMASVKFDEYIKQGTIAVHGGGINNGKYLVEGEDDPCDPNDQPLP,17.9744
13
+ MDILKKLIGLSSLLALFLLTPDLLAEIVRDIVGVSIGEMPEIYIYLLAFYLLGLMLASMTTSPPGFSFLTTRIIYCIFYAWYYILLTLIVTILLIIGKTEGNYKISKQFGVTENGVIMNMIQKAWNFLSDISTGNYLITMWPLNHFGVVPWFNKAAGIAWFIGPYFTYRLSQRPVNFIFSALSFVIKKWLSKIWGKFVRMACAFTSWVFLMGVATTLVLVIFNEMKWIKCSILNSKQWFGKLLGYMRNSLTVLCQKTSIGINMMLVSVLILILIGTIGNGDTAIYWHILFLIYSAIGIFAFVFVVQIVVCNKDRGKKTDLSPAVYAGLAELLPSLSTSYDNVNLAPQDYLTALNVIDSLLIKLVLEIIVAGFLSPLLYDFRLSSDTKLIFCFISLILFGYVFLGFEKDKAESEIGHSRVPSIPRNIHNHTAATVVRLREVLYELFTSQDEAHLGAHEKQNVSKVLLFALFFLFVSTYLTISTPVSNVNCTSYRLDTPFSKRKRQLSLALFSIGCCLDGFSTMQHMVCGEEFQLDSFFKFFVRFGKVIGKRVAMFFFWTLAKALASYSDAIIAPGYSEKMAHFPPDQFNGRAVDFIDVDEDLFANGFSEGKTRVAGPGEIVIFYQIGGNKFEAMFTVSEPIKLLYYDKGIIQAEYKSEFGCELITVGFTTPTVYDYLSPVPAYYFSLVKDPTGTFFDLISVPQGSIGYVNAKFQAYGEFWFGRIVHTGQNRNYITSLPLLIHLKAGNILFWLICVVDLTTTSILGKGNKRAVEVYGLSLLSQCDSFHTEVKIIEEVKRFFSLKQRKYLSSIFYTSYMNIFLALQYKAFAMPINAGVFVTDLDEQAGILIQAKKTRRRIPPRLIFVRDRVSDPNIKIENSSPLFNVYLLSCGTDYTSKKIISIDNRIWALLDGIHKEELSYEFNYE,17.5258
14
+ FRFCYTWVVLILVPSIFIRSFLRWESRFYFLKELERKMSGGDDLVQRSEQVETSCPVSSRCNQISEKILNWIKCEHKRVLVGGDVEQIIFPYTSSPTQSAEFQKMHQFQFLDDSGTQEANYVYQRIDETGYFRFADAAEEFTGALDVEGMCENWNVFLCMNVSTQISLILNQAKAYMFTQVLLQDGTPLVQFLDPDDQRLLVNCEDNEASNEMQDANRYQQILDVDYLLLEIQNQYYPAYFLVNLNHADCFKGTPLFTPKILEGVQDVVTCRRLVWLKFALNRYDPYDSVGSLCNTPRYMRLTRRLMENWDLSGVFWTSLTFLLGRCW,19.2309
15
+ TPGGFIDQNREISHATRNADVNYSLLLLGHYTALAGMHAMYLGPDNVVILTEGGDFATLPYTAAPTFTAQFWQMKILAAGFSVVIAFGHFVFGVSRFWHGMLDVTMGHSSALSTLFAGHIGHLICHAGGSLIFFNFDSEPVVGTTVTIVLPLWFTHHNLVLIPWHEWTLNHSQLVQVLFKPNMSFIFAFGGHFRGMHWGIGGVNEFADGHNTGLQYHHSFFILLSLLALAVHALQISRGIIWPARNWNRAKDFWNSDTVPWISYVVYTDAFWALLAYTLGAYWAFASSGNLWTSIHQRYLQSEQVTVTTHATAMFGNMAWGGFITPVHSWIFNQGKLWSLSQISHVSRGWRFVLNSFHHGLWFIALVGIATWFYWRLHFRWGDTSLAVEAGFWNIHWVTSNAAPLMFFILAYILVATETSNKY,18.3108
16
+ RVFFVQNLAMMLFLLLILIPLFGDKYDVLVSCTDEAFVELNYLIVLAKQWEIGCSERVVPMAAILAFLINFGLICITLVVGGNVIQWYSPKLHEKNHFIWSCLETIMVITVVLIFLQVLVCLGFTNLITALCWLPGWKLVAPWQSCALRQRITLALLLLKVPAFLGILLHVFSKQGWLIISGVQQLSYILSSLTMIAVDLWGGSIAIQDCRGKHSLIVLKVRVLMLYAPLLSSYVYWFEYAVGTCSRYFLEIVDFLVLAFMIVVLLILYGREYVERLDNIYSLVDGANVAESLTHTYILILIAYPRTNPNRIAVHIKLISFYVWMIVIMVFARKALRRLIPCSHPFGPKPTVSDKINAVQSGSTKAEWESIEYFVLFVFVILVLLLVGLGCIKLYDTFWNPQEDLDTDIKTDPFNNSLMTIIGVPYLVTVVIRKTMSLLPLTTMFKIVGVLIAGRLNLTIGWAMAYVTAFWLVTIECIRFFHPQPGVSAKPECLRFALTTATVLVASATIVFSADINRNHKGEISKAAGFAWLFAYVLLMFLAVAFVIWKIAERGVLGQSATVSRSESEEAVVYQTAKDARADIFPLVDDIRVTEPKDSAGRVINLGMPLTPSGQSKNLLPAPGANPVPTRPCVFNACWTVPDTVLVFCTVAAQMTLTFRELVLERVKDAIHTGRCVHFWPDLAICADVFDKTGLDNISGVRMNIAHGESELSAIEFNIKFQ,17.5525
17
+ RWSLATLFMVISLLPVAFVNFFSFKEGFHFFGFVFAIITLGLSGIAWITKLRDPVDKMFFFRVRMLGWLRPTLVYFIMYGILGIISRLTALVRFKAMPIFMQLGHFFPVVNGILIFVANRPMKAVRLQARILNRIARGRWTGASYPEDKPGETMTDEEFICLTQSGMNLGDSFQIIENGTLIPNWLTSSPLKVEVLLYTYFLLGLFGLCVSLRLAGCGCLPEVIRRMWCWIWFALFFFSSFWQVFRQLSALRIALGRARWKKFDFGPVSFGLVVLFLVAVFLSQVLLVAILDIDEMRQKFAEVQTALTIPRNLPELKDSIKSTLLPFQGELQWYSQWTSLIVYLTHLIMTGMGKAMELSWQLFNLIWAFLCSNNGFLCFFQEYFLKLFLWGSAASILLFLPSILNLVQRLVPFTILIFFCVPPLIAVHSLYNRGLGVFENDVGTLKAKAVQTSAAPDWQITETNGPDEDYESHIMAIVTFVNLLCLHIIALLMTGTNSAQPLLFELKFDAKVFMAVFNGLIRTMSVLYSRGCETYALLNLLASILLKLALWFWAEIEEDEFASNISLGTLLREATRHIPVLITPVAMVNGAAGAGLTLLWPTRGHVYLRGAGNKRPAGRTSLGYMAGPSGEQFYIRGAFWMASIEISAGTGDVEINGSHIAFFALKGKMIKLTLDGKPASKIKDLVTRYADVAVDDKIDFEWDALAERSIWDKKQLKKGMQLNGSWPKTNVMPLITGQDMSEKYLYARVDLQNFNYNVKGASNKEGIHIIDPFGSLLPLVVFSSFGIIGLGFLYKYNVWQDTSTLQEHFQKRKTTISSSKPVKFTMDEPKLLGPMVFLTFQIVVMFLLGFHKFPWLYFAYIYKN,17.5249
18
+ VYPAGAALAKAAQKALENAIQEHYEVAMREELEANPERRLIAILKGLARVRSAYAEIDIMRDQAQNAIEESIEYANDMYKEGSYAIVTPTVRIHHSIAPVEMQQAIELMADLAALGYGDAGPVVAQVIQLPGLKFRGQTHPGASPYKIDVDVAIAAINLAVERLLDALQEVYQQPPKTVKVHRVSASHDVPLVFQVQVFVQFKINGAREGDFIYPGRDVSPQNRKVEKFDERRKSTRIIPIYRLRLQNPGAAFALKDHEAITLGFAQHYLLGNWPIEVGETPRTTQDGLPSMAEKAADSTNYLLAAANFMHGKPDLEVQMILKANLCSTEKKVTKLDRLALAMNYAVYLVMLAQDLSLFLKVPQNVKVHDGMGGDYQIMMSTLPSSVTEAEIEKGGDMHVQLKALLPVSEPFDAEDVIFGQTARVDEYLVDAKMAKLLRPPTNCGTNYKENSENAQYFPLGRNINLVPCTMEDGSLGLTGLEGFSLSQAGNQRMWAVNLIDRQGKVALLAEFVLNELISLSDNAEQIADSHTLKVVGVRGNVGNLWTTGTMSDKIEYTPVLMSGETKLHASLGHLIASTPDLTATMQEKKLTLLLSPPAYDETPPKIEKLVWPYEKNAEVTGPWRNITKAIGTSISDLLSLSNKMAKVEQERDAKMETSELQKYDNPEIRISRKLVSALVIIISLLIFALRHRFALATWRGCVVAIGTPSTPLKRLSGIVRQSADAGITTAGSKCSRIRIIIKRGIFMTTLGISSTIITLVFAYQ,17.9413
19
+ YMKRGMVHRTLTLLLIFMVLALGFAIDIRGWAMFLPEATLMVLSMLGFFRQGASDPNYDAVMPGVVCEIHMRTSMLFFSWSIALAYLAMLVNSAGQITQSPKIVDSFTKIVSAGKGLLALIINGITVAKPTDGDELFSQFSLTLTLTNIGSMSIVPQIRLQIYRWLMKPPHGFLGIFPVLSAGTSLAIALFNFWRNSLTEQYFKFLSDMTNSINAQVASMVAHRSMAFNWIGTIWQYCMITVFLLGFVYFYFLSETNGIQLRLDIDHSCGFSNVPIVFAHEFLTIACKAAAEILKSDSDNKVKVHVMSKAENIAGSGQLWLEVFESGSLPGANASIIQVIPNRKREADIVGPGTVDGLDGATLLLSPNNIFVPPGDLVAARGNKAAAGATLGANGTLTVDARKYGASKINVGYDTEAVGLAVCTLVIGTGDSVLPTAKKQMDNVVAFLIKEEDAWALQQPLKRSAERGYFALTMAMNPDTVAFATETGLRVCDLMNTLGDLFSIGPAGLDAEAVGAQGLSTTIDMNKHVFFGLEAKFSSSNINQPSSWLGALEAGLGNWLSLRNALRGDGPQPQGP,17.4011
20
+ AAYAQKNAKIKRKLEETVLCGGCDEGEYRRESSFGAISASDGFTPDWEHNLDGQPGLYVTKLIYKYIQHPQYLYEILAVALLGVIGAKTSLFEGLSHPKRRTESLAITFNSAHVSACLTVLTDYTRQLTYTLSACLVTLVSTLYAVNLIVRDKKIAADIQFFVEASDYLKMGLEVTRNENVTPVNDDDFFSHILWLIDHTKPTMIEGHFREYKLVNKFFILEEHGLVGKRGSMMDPINTFIKCEKLLQLIDTKYGGSVKKLKSSKVAFYNAVSEECAPVKITLPKTSDILAHRYVSVRDIPARGVPYTHSFSSNVVSAITDGRVMDKAGDFDEDLAIKIMGLKLDGFTVMVYRLDGFRMGETSVSKIATLEALIKDDIVTHELITKSSFTRDYRSMERHDFVLGSNFPYCSPAHEDTIEFKQKRQSYVGRAVADAKVEELELPATGDRGEVKDQVAKNMKVLTNQAMKVHVGTMLAPDGDIYSITKENVLPACYVDVKGYLTRRNILGKLKKFMDVFEDFAKVINILDDTGSGNHRFNRYWTRRDSRLGKPLLLTHEDDLETNVADNRRIRTNKQRERCLVRVLNLECEKCHLPEMVVLGIFIGSSAILFTLFTLMSINGVNVLLDQVPPSGFGASIEGAMREAKVLVRLGEFVANKANMFSQERGGDVPAIVPMTEEQRSDLNKPCKEERKISKCFTRMHGSWGGVKRMDPPFTRGGYLMMRQTRMGIWISFDKRKFGKTQKFKYLDCGMKDPNVWKRNINVGCHLVNTYADTNFNCCTQTIQAVVESHWTEPLFVARTFQPVSICLIGMLQFSYGPVMAGLKTPKPHPGTLRVVNVSTVNLMLFVLFNYLRPAAYNGFYGKYTKPFTLGVSQKPRAWSHKIITPPGPKQDLFISFFSHLVVLIFMVVMWIYFAGTVTPFDFQYYRQVSLDVV,18.5249
21
+ MKYNNYALLSTVTILGLVFTIFNWANDWNLHLNFGLTTYLFVGGTFLILTTFGVGQDDPSYLKGFTINLAGKMIIGTHLPPLVPTPFSPFIDKISKHLAGAHVTISAVTVDNIIGTLLKLLESGDNREGNWRAHRLAFHAIGATIFLWEIVISIYIGFNDGFNVDNGEKVSKTAELITTPSGTLDHTGSAESWSFDDDSSPLKWYNGFLISKEIKYKHRPLFSLTTQLSILMYNRIFIVLQLVIVHSERMKLSLFDLFFQNFFFFSTIPLDLEGLSGEYRSIGGKSDIRTLIVSCFGSLLHG,17.1279
22
+ KNEKTNSSSKKVQVADEIYAPGKPVAYVNTGTTQETIASDAILWLASEFSAIIEIKVVLFGPVTNDVYSCNIANYSPIPQGLEVVHKKYTNKNNLWLFTTGYDLNITFLNTDMLNLESSFLIIEGAISTSRMTSDKEITNFEVPGNAVVLCTYNAPSITSKGAKAHEASGGLAANLPREEQLQAILRSHEQYVSRKMKADCFPTTKAVNDGRILLFYLSAKNLVDSLPMERGDFNLIYQKMEVKIYLDDLLKTREEIQAARAFMTEFIVRQNGDIKLLGLSEISDTSDGRAEVLDLPLESGNSLSSEVDAVLVVGQLRAMIHGTGTFTGVFIPHDLISSGIDPESDREGIGNFSRFDRESLVLFGIGVYIDGVNEIGWFKKTPIAIGIGNASYRRSNCLQISFYCDVDANTHEDTGTSKGKTLIMATNEYSIAGAICEACGYDVEGDDKTDRQIVVNQPSAVTVAGMPGLVAKTHNGFRNFEKNFEYLNFPVSKVLAEEGGLDYFWTIPPGNYQNNVPWNPVRAQMTSWGVSTTATLVFGVTYSRTLLVSLRVNATLTTNSLFAFFASKLSHINTFRTGGIISGGLCSVLILNFVVAIYGVSLRAFGGALLTYAMVMIVVLFCREVWKVMYYADIYGKQDLIIFELLNFFVNFGFILTIPLLSTASPPGSIDIKLPGILRTLSLYNDNQQRRTFIGKLLWDPESKVYNLKSGEAKLLGANASGLMAGGSEGAVHEVETDTSNLVFRSDVSSP,17.7156
23
+ EGEVNRIVLDLSGTSDGSVIIEANKVTRDNVSDALLKGKNFNAPAKTSSYPAYVAASLERQDPKKTFISFFKHAHNNAHGGQGRIINLAFAHATQTKRFNVFFEAYKKHGLKFDQNTFKFHVPEDMSRKGTIAFKGNDGEITLVDVFTSSFRQQISQITIRQGLWDWKSTRKNELGYFNNTIQFQGSKTTGSADLIFALSLLGAIRTIREYYPFKEQYVLLHRTWTNLQKKNKASWEWASARDKGQLNTGTKQTFATSLITELPIKSFELMTSARSLPEMEVNQQYEHYRIREYYKCRGAGEDTLMKGITGSGADATKVLMISYMLNEGLVLILDYSQQNIKTGNTIAILKEQGLAIKTSPSYSIQRLTKLHIYAMENLEIFPLHREQVNVMNAVLLGELGVAADEVSKANFNNMPLPSRQATVLSINLQDKDKNRKVLLRALGDQNSFIPPFDKSDVQNTVNLMESITKNQAITFDLRQGNGRSNQLIDVDI,18.2889
24
+ NTTRNPTENMPTPRSWLTEGRPYIAYACAKCKSETDKANKGLLFVTKDKIIIKSVPGIADQIAREVKEFFNVQTPAEGWDLVVGDVTADASAGVRGVLGGIVFTQKGSVLQALAVAVTSIGTMILFLNLFSWGGGWVTMFGAAENIITSLAMIAKLVLENKVLLLNIGMSGAGICLMMTTDVSPSVLTANFLAYAMIDTIAFAGDAITYPFTIDIGDAFFKFYGGAEIESVYNKQSKPWPSWVAEQLSFASASNTGAGVTWTFSLTSINKSYTLQFLQAVGLLLSAQSRPPDLLGSEAALTFNVTYVPLGFEVNAARIKTMLSPTKNVNQIGNLKPFLKHLTESLGNLKTLLKQRTQITEDDVDVRKLATSIEVPEKNLLNNELNKIRYANFVSKRLAVALNDEIPDLYKVNDLKSSHIFLKSNANLGNGIERLNTGIDIVSNQEPQMLTMLFLKGRLIKNNGTTAPLTLWLKYISNLTTLTNIGISVEESANRFRNLQNSFFKNNNILEINIQVIANPTDAEKELNLVGY,17.3426
25
+ GAQNTDVILGALSNFILGAFGLYYWFQWGNVILHQAVIMSFIHLVLSPDWTIWFYPYFVSEGCLYRVVLAIVQRTAMTLHISPEVSKYGIRAALSSPQEMYSLSRGDLRWFFKDLAIQKVRWRRMPPAVMILVLFIAYQLLQTKTITPTQLLLIQGLLFRVYGNLMITITILGTVMGVSPFTVIYNGWGKPKGITYCEFPSAFLFLLEDYGSGEEMTSIALPASLFVEYTEKASVIRAGYILSQVDEFSIKNMITRERNLPKSELLYVAADSGVNHTLNICQFPVSDTYLIKYSFIPYKLYIEDGKKVEMPPNKVWDAIVIGHYSQDDYWQLAAFCNQEWDFANFEKMLARPQRLVDTCGMALAATYWALLVQVLGAPILDNCLWINTFAILFAAGILWQIPPLRQDMRIDLSARFKHSVIVVAAYPYVLRLTWSGQSQQKFDLFIYFFLAIFTLSFNSVHYTADPAREQFEWRDSTGKDIPCVFLFGLTVTYWYGALHTGHDPESNTGLSTAKTSFDWKSQFQPFDNQYTRQATELLGIIPCATLHRKCRETWTRQRVFNVMVDMQQGSARFIFLIQDTAFNRNFKGGLIQDRQDLRKMLAISPGEALRAVIHRREHAAIEKQLNDVRADELVVAAQTAPGERVQELLRGSGVSYSLTNFVTFKKNISDDERRVPAPELVFQIVIVCCWDSRIVKALLAIITITSLAVGDLSGVFILFRS,18.2048
26
+ GELPALAGNRCGEAKLFDILARPDLPRRWYIHLGSVFTLMLVLTFLGAFIGTGCWVDGGGFGKFIDRGLSQAPTFGPQVLTHLYPEAWAHFFGIADPAGGYWLYHIILFSGAHGVFIFAGGALARTLRLGRLLGMARALGMRPKHCAVGAVGVILFLTAFYYLPDGNPTFTPDQGYESGSTGTIMVIDNGAVGLLFHPLFGAGLTGTFHTLTLAHEGTASGEGLSNLSEGGTESETYAAARLNALFRLVANQGRAWRALHIYTLPFLSLGVCAALGLTVAHAWTAFDYNNFVAAARADSFKFGANNWVLAANDIRAGAGKFVHAGDEVLPGELIR,16.0959
27
+ ADFFVRRQSTKKLYGLPLDGSVNDSVACIWGFAVFWNGLVFPWVFAFVGLIGWRLQIRFVPGSVIGLFKFELILSLIPDALAHFGVEDIYANPEYVFNFPRGVLTFASTHGIRTLRALRFAYPFVALFGRKAAGLFRRMGVVCLMAMVIGVGFAVAAFFFGELMPTMRWTFGEGGIIQTPVFAAGFRSSDVPATALEAAHFLVFFLLGLIFMAIHTGAAIFYAGESAARKNEDSQTFSWSSASSARLTRQRDREILVRRNGTSGESPGLA,16.437
28
+ MSYLYLVFFMILILFHLNLLTYNIVKKKPPFNGKYKKWEFKRAFDRYPVGYIYYGHGQWKDERNKTEKHPRDQ,29.4763
29
+ KVAAIGVPFFGLLIALLLNITMVFLSQTTLSKYWFAWHIFAIILILLGLLVNVLVNQGSSGSTTSNFDSGMLAMISVGKALGWNIMARYTPWQTGTLNSISWFNIGGAVTVAVMGKMAGIELIERENSRTPEGFSSPWPVGQTPAWMGAGPIGGVIAIVGISVSAVAVSALANISVVDVSNISLLLEIPVNSIIMGEGVGFYYLIMVLIMGMITLAYSGGFFSAKFGGYSERLGADLAGARTPLNVYGENIPKVMRATASVPALFRRPVANLALSLWILASLGVMVTYFESVAIFNRTIENIGKVAITNGQSVDVMGFTDVYPLDVDESNFIAWRTAIPPGVLVGVTPPIFGRIELVAVNAGLLKLERKGVAQVIDTGPESFELEAKMLAPSMTGSFSTQAAIGGSAFAAMFQSSTGANSVFVSFSKGSVAFSIMAGVFIGLVVALMLAGLNWNPGTVMKKLMMSMTVVSAGVSSIFAMVKPLALTTSSFLLVESVVIFSNSIGASEFIGFAGGAAFMVNKQFVRALASGTGALVIGGPVFAIGYIAAGLGSVTAAADVGRAAFIMAGIAGVLTGVSMLTGSLVGSAKFPDRSEGKMKVLRNWWPGYSIARLAGRFETSNLLMFFTYVADQLGLLSKDLVRNAHNFAN,16.5548
30
+ NWYNIRAHNYVAGTTMVDAATKPALATSIATQLLGTSDYDTISKLEHNAKEGGKINLIMTNQFPASGKMVIQQGYFGRGSAVPYTNRLPLIQLLSLVDSAATADKEQVLSVGWAIDAIVERRASKMVLYNASKSFLLGKISNIMGSMLVNIQISAAGQYTILTSYDSILTSKFLSYNRPVVDQGAGMINMATGTTVGANGQLLLRKVKEYITKVQGIDASLLAFAQRGLGSVTQASISARRPTRNRMEENAQKGAPGEFSKVTDAGGGHLPGSKMVFKRILIPVFMRYAIMDVRVKMAKTTYCPQTQTPFDKWYYTLNFTLRGTGYTTVVANPDKTGKDVMRTTMHRADCTGFEVAGSVDLGLQDIQVLEMGQFKNFDVYLFLGQGEGSDKYAVAKLTNAPPIAILNGFSSTMTLKAIWYTWRWPTMTRFSLAVLYFAAGHIMTRKFQNTAFMRDGQARQV,18.0029
31
+ DFDMPDGGVVTPLKAGETVGNLSAKGTLFNPPDDLHMRGDHNETLKYHSVTAVVIAGLQHEEIIGTAQDESCGYSAEQNTHCVAIHAAHKGDHDSSIALETEKVAVLCGDTEEGGYIWKERRHLSDSLLARIKAMFDVRFYDSHYGDKPGMSWPALRPWMKRGDLRGAWPVFLGAGGFAFNLGSMLGDGYTWNIYAILPALNGLQRLLFALGRPIRAVKYVKDTFDGTATTSFLLFYPAPSVFFLIAFFFGAISALAAGYMFLLEGRASLPQAITASIVAVSVCWQYNALFVGLMLVGEFCPRFAGTPAGVMAILGQMHDVLPHLLMVNEAVLAFIKTILYLLSGSGEPPLEASQMEYSAIVGGLVRITPAKDLDDPADYAVTGYAMITLVGFAIVLAMQVHLDGMCGDFSGVRFANPLHVGVKVIFNVDPDILCGPDTVTVGTLLFWAGGRFVFFRAASRILLPVFLSPVYKRWGSRVSVVATAFLTCTIIVGVRIRYQNDEVYANGAIYSRSDCAPGMFEEDKRFRNLLPTLEYLNINCYFYKLKGHNQINVHTFNWASMVFALYKKKEFIKQALLGWLNGDKIDLERQKEKSPNSENHDSDDWRGDVTVSGFTRPNCGHQRTTTLLQKVRFRTRCMMSRLLHVPFRRVAVHFFSFVFIMRLFSK,17.9999
32
+ GRFRTYVKFYLRFGACHLPVTVFVFVNVAALVPFILIARLKFTSDPVHVTVEMFVEGMTFLTGSASIMLFGILMAFTDRRSELMSWWFESEGATSAGLYNEIGFWLFITIEFGTGLIGFGLRTVEIARALGFKPVINFMYFAPLMGLVSVLASIRLGMALSLALDMSPVVIVLTGLSGRDDGTNFAWLYGGIGGSGTYGTGLGDSPGGSSFLAVMFARGVAKLGSKVPEIAWAIIYALLPAVLGLGVNALPKYYLGELRVTGIRGIPFGDPAIVTRSLTKLLRQEAPVDLLVEPLLIRHAILVRSVRTMKIGELVQIRVDVPLESFEDSKIRSVDDPLLDGDDVISTTGQ,16.3933
33
+ GVSKWFDPSKVNEAYSLSLRGDKYETTKANKTELFGEISLRVKEYANLSSIYYSSTSGYKDGFKWSDNSSKNKKVKLFNHFNAGDYQAMWEASRYIHLNQAKDCTLSYSAWNGTDAVSVTQAAGDSSLTLYRTINSTNDTTYFLLGSMNGGFSHQEQTDCSTSIPNCSAQFPAANVPTQRATYCVVCSLHNDHCKSTDVSEGCAGKNLLKESCQASFTNYKN,20.0769
34
+ YLLWMHDKSAYMQKSRTPSVQWGYGVAAVEKLAQWWASAKGRGGWFVDPPSPKVQAIPNGCLRNIASGFWKPPVNYSHETSKWKFIYVTLAFENLYSAFWRFFPGFMGFLSPEWNRKANKWNVVGKYDYLAAFVLKFGASYTDQTHIITWARGVRDRISNISLTVYVGANKLGNVLLSLGGGLSFLRGEFQPYNHYRAKFQAVALYDWRMSMTYSAKYLQVLSGQSGLKETVMTSGFHFFRLTAPASVFRTSQRTEVYTLFLGGLGEAQKDKEVYYITLPTLGITYYSATLTGSFDFSFHVGLKEDWRSIRRGHITLHFGAGSHDGKLTLRNVVDITRGIPLKYVDFRGLEFKWRDKAFYIHAKPDPQAFWVGIAPSDGVKSKIGPLPTITRLTPQLLVAIDINYPMFPKDGVDGYGAVEGESRSYYVHVFTAFDMQSLFNGQVHANYQKNKPKKDVIVTAATTPSSEELIKQLTQKCGKRATFMSIDMQDK,18.4373
35
+ MNPLPYKVRFLEWTNDSAPDTCSEAATAEPALRCSNIVGVKNPREFDTLWEKRKTRLESGTLTTKLESPSRMAILKRSIFRIFINFVVALGALVLLVISVSLNVRNNLLDPAYRIGVSQNKIARIGIDLFNGPKLQVAEFKICLGQTVFHLNVLHTILGLLVFYFTLGGADEDSARYDHDQYLPFSFVTNYTFHFEVAHYAMEQFGVGALANLLFLILVAHTIFVVSEEIRRGMANRVNLKKTSKLNPSGPARIIEEFQYCAYFVNQVLKIGKWAEPAAAQFIGRHDMIARELGQKLFDDNPSQSEVNEGVTAARVKVINGCSKEPCGKPPVMAQDLASKILDQFGTYSDTPIIGRINTIMLNGNTENGQTVIDGWLHHLQQRLEVHHIPLAESYDNFIFGLDNTETTLFHPFWTDMEEGEYGNPNYITSGERLINYRRALHNTWGSVFLPLYVFFWNWSILRPPPDAETLLKYQISMPSSIRATAVIHYHIHWLTDEEKHYVQGKITQCQGATIICESTATEDLIEFVTLDPAWSHLTGGRN,18.4536
36
+ MRRANLTRADSIADGEVDSLVRASPSLPRTEDDAVYLDGFERRAPDFEAIAQLSKMRYAGMSGLMDELKKLHDATDLNELISMGEMALVESENRTNAIVRQGLSEVLAAEDLSICDIQIAGESGSVGFGRGLRNLTNYVIDVEVRPNGHLIIQAQCFHTEDKSYEKADSKPLDSVQYDDRKVGYQGDSVNAGIPEVAAAGAGRKVLYAEIAVGGDRGDTGWKLAPIGSVLGGGDGAGIRGWATAAAQIYNWTRLAEGIASIDRGLAINGGARLDGTQYALGVGDANQASPVLFTGGLTGAGPAHVRQFERLVPDHPLSKTLVVLSSINGTVLADNSAVGHVVARGNTGLEILTADTAKVANGYTLPVRGEFDVSSAGNITAVTAIAGPGEDISRQAP,17.1274
37
+ GSTKDQKQTFTSFVGWIIFCSVATLSSFVYQQVLLKGLSQVLDYLAVTGSFGGIGSILCFFISTIGSGSGTVRTNNLYQHAASIFWTIIGFFGIAEAAGLVASLVFYFFQ,15.303
38
+ SGLPAFLAGIYPVIGGSLAVSIAKIGPTVPILQAGQAACHSKLLPSNEKPVTIPVILSLAYGVLGWTLGGLGEDLLGELGQVIGIGGPKL,14.4576
39
+ RTSQIFEAFLLRTKALKWCWIVLHLVTLLLLTSLACAYYQVESAHSQQPVLDCAYHYKRLGDGWWVGYSQGVIGFGVTAFILLISHQEASGVQDETGKFARYWKLNCTIFLTFLVTWIGLHFMIEGIDTFIGYILMVAVASALLGQVLISINEVAKTTLLGNNLDGITLSYGASPEPVSKNLEGDPAVYAQIANSGISIRLWWIIWALFAALGILLFVMLTDRHPTPQPFVEAGYLEKGIMTVLLLALSNYPILPAVFLIVLTSADIRTHRNKVVYSCNDSKFISKLSAYFEQTNKEVTVMMETAEPIVHVGNYSSPVGAIITISAIIVSTLGSLGKRKSAFPVTLTFVVVLITVIAIANNVISPSDQPVGDNSFFLFEITIALGVDFSSFILAICSFLKLELNTIFGSFPKCCYFLLSFVIMLFSSETFIAEPLFSQILLALISVITLPETTSYFGQKAVSFIKFPCIKDGFSILPTLLAVLELFGIVRNLRLLRLLRSFRAFRIVSEAKVFCITKTVLAHFYGPLRHRLLMHTVKGRKEKLMQALMCLGILAFLVSAIVEAIVLLFASYYLSTCYLLPAFSFSTVTLSLLHVYLSYIHVNTILVALVVSIFVIGILMSLILRIHKNMKAQANN,16.9812
40
+ VGSETIGAPIENLPDPLQAPAITAKIPTGATVQYLAQEPGIVGVWLQPRMVAFKVNRAIGSISFLIFFFLTTFAWLYITPGQINVVGTCVGVSVGGVLIGWGILIPGDPAKASFKADKYRWVESLALKFGETAARACYGYLFLSVAAGLEYLNLFIF,16.1419
41
+ MNAMHLVRLNSAGRGSSVAILNDNLATGAAGVSSHMSEDDRIDVIVDFSRGGGGMQQEALAQYLTARLSSDGFLLADINKPNVNIQSVATSSQFEVQPRIQSNMDVLVINWLIQGKDSDFSTLIIQRGKTPYINSAHREKILLSLNSINVADKDIELDFDGYQTGPTQQLPPNVFAASIGTSLAIFFAKGEIPLRYMINSETNGIKLLQYISQSSPADMEREVVLVNHEKEIQQSLNTEKLADSELFLEGWSEKIDNSVYVANLFEDCFHRAVVGCVATARLDDMMGTVEFAAWLNVDSQGKLLISEIYTSFTPELVAGQAVVGGKFSTVDISTGEYEIFEKRAAFGINTQTASALIYLPMPRALAPRVEFWQLIEKLMKASNQSVMISAGVAGTFSGGRGLLYVNGLNAQLVGMLDALLKLQKIFAANLGANPNLSNVLIIGDTDSVLALSQGIKLPNGMSLELKEVNKLNNTFLDELSEIIGDFSGSSEVRSKIWTSTQEVKLGDLTEPLFVGVSSDIVALVANGNIELIIANAGVSPRANLDTAQVFQRGKQVIKSRTGPSLNAKGLYLVLSDQESIRSCQLTGAQNLLAMNIQINLKVVVRDVLSAAAMAFLAKECAIVDIGGCEVSAPAYPEVVTLRYDTQTSRSFGQRIIQKQTLGNAAVNCSVSDAGQSAPGSSGHAKGNNTAYISVIAARVGGGIGDLAIVLAGLIAGATAATAPNLAYKWKGNIAPQAKDVLSSVKNGDRSLNTRDLSVEPVKNELAGTTTLNWHTTFAMNSDSGWRNVHPYPSNGNFP,17.5854
42
+ AKAPGLVGLGIGSVSGLVVGLALSFLLGCVCTDHRWAKYDGAGLAILEGMALNDALLWVYPLQWTLIGGVSLDSSSVSLVLVIVACTAALAGVGRVLRAILRFFAPRTRSQRLLLALVLSEVAVQLVVFFAQPLATALPLITAFTDHTLQVCYGGYTTLSPMDTLGQWVTYVKANSTGGTSLRDPYRALSILLVSFGLVTVAVGVTLKRFTASAGDCQ,15.0093
43
+ ALAQCVLLALASGVSAVLAIIPRKETYIRAKIVSIKKAKYGLSMYERGGRLKGLGIPPWSKAPRSNHHLGVYADEIGILGTIFGYTVPMGALVIAILITFAHLMPSYIKKYVYLTQVEIENYSPVPHQVPAE,16.8979
44
+ KECARRIKGCLNFTGSASWLSFVNLFVKQIYTGYVFAHASLMTLLVWQAAMHHIVNMNLCDEYHWTFTTATSGPLGYRNFTTLAWIDSMANFVALHRHFLVYGGLYHVASASLFTAFVAHFIRRRSPPTSFWAYLNFEQKKFLSAYSHGHHLILGSFLAFLTHLDYFFDKFSVHTNAFSQSWVFRGELTPELAVNLGLMFHVKHFSLFHFSNSVLILALHFSHSGAFIDEMRSLTALESAYGTMRWVREGMGWHRGVVERWYHGSFQVKHTEEGSMQFAKNFLLYLPELPRAECYAAFYLRTDFKGNLALRRHAEYRRKFYMMEGKTLFWRATQKGLECQKTWGAGFARTAALTSSTHAVAHVANVTTGFVLGFFFVWRQVHK,18.4019
45
+ FIATDRKWIPLWILNMMYTLSGMGGVVAFSAFLAITRMGYDIKWTGALFVAGSVIEYVEKLFPQAGPAGTLVVLLIPAAATGHGMLPMICVVCCMRIGFIGWGAKILILPLLGNGKLLAIYGIRSPWGVAFTVPAVLILVAAGLVFEHTWKLVVHAYDIGFLLTALAVALLALSKLLWYKEPILYALLAFSVTALVGFIASAAGSFFVGRGCTTCQHPPSTIFSDNGRGKSVWTFFLAIGVTIQLLALFAFLPKVGTHQTVKDLFNIIGTGDITIMLEQAAKAKRRGVYVLNLFNDKCPKSPAVLDRTVSYLPPNVSCAVKATKDTNMPLVTLTDDMHFHLEDYGHRALQEFPTLPFNHTKCYLSQADLYLTGTDMSGIILVSLNNWMGEIGHLAAHTLECPDERAVFSLPIGDDTFKYLLYHEQTLKGIRLFSHLLSQSSRQVTEGAGGRDILMTHQTALITLRSLIAAEVFVMTNGTLKLIPIGRTDVLWEYRATAYHDCTMVGIPSTNHLTWDGQVVESEPLWPLSGYMSLKTGSVILVMIDTVGRTTYMLQNCIIYLGLLTVRLPASVATMEDRDCVLMGYLAFLVKTLLTEKGYCFLRRACELIAIGVFAAWFMSIKYIAVGAFTGGILPWVLSYAVLGMMFIGILYCLIMFRMGQMVERGIVVYGRSDDVSMQNRLPDVADPMPATSLVLSHEMFSGCLPNDVHFEIREPVGVPMIRFFDWYGERVLPCQQPFKEVSKLIALVLQQLAHMHEKNLDPPIWNVLRIHVTPARPFRGLGAMGVNVIISYMILILVKFLGITISEKWL,17.7597
46
+ LFKSSKINSRNPISMLNIKMNLGSRYQVLAQIQLSPNKINSDDDTISFCINTENLLSWFLPGDHFNFADLRVMWALLIVTIICGGILFATLSMLYGIAPTRSTTKMIKINDQPAFKLGLIRTHVTFSSAGILLVGVSYNLSEELVKVPYIRGGNLYFQFSTPFALEFTSICFHNSYEPLYNWLAGYDPYTGTEVFFNFGPFLAAWGAGVAGTIALVAHACVAVELFKQLKFKIKISKICSTRIILPVALTGALIAWIVPLISSPDILKTTGKNIHDGDTLIMIPVLLKRIFAQMGKPESHSIEHALAHNHAAPSEAQFRLAIDDSYYNQAISTCTSRELKPLLNRNVVRLLCADGKKTIRDPKRILESYCEAINRVFGGTFKDFLFGVVENSKLTKFFKYFLGVLDIADLSNYSNGALTTEADQFLVEFLDIYPEYHKFSQNKTYIRK,17.6359
47
+ FWNWRRRFLGFLIGVVVTLFFVEATGTFVDNWSTIRAMHKMTGMTFGDWLGTIEALLTFGFLIAHLTGGGTPFGWVDDVFVVVTIALFARQRIFRLALVGLRGFRLERAGSTLKAVGALRPLSSTRKLAAWLMGWLSMLAFFGLVTGVLVYVDVRGNWFETAPYTFETVTVVYNFYQEHGYGDDALRYGLALSVLAVSPFIIGILGISFNWLVVPLSGWDYD,15.5532
48
+ TLLICYGASASNYSDSTRANAYLNMPITLSDVVVGLIYAISLGSVFQVDAILLAVILGNIVLGAVAFVVASAVATALERLVGRVSLIPAFETAVSGSISGDLSSYPDLYKRTAQSVIAMAIVGEVEEQVRGAENAGEGILDVLDWQEGGGEARTTLNQIGDGVLQGVNIGEEELRSLKPLEVGNLDVASDVTDYDKAVKIDIQFALRRARAGGDVVLLDARNKSSIDFGTDIIVGTAGFGPVGTAPFIELAAKAGFNVMVRGGIEDGIALTDIEVVKHARIKGVAISGGTTASIGSAARRIGRARISVSVGKARFQSLKKVCDVAALDIQETFAVEQILLLATGGKQIRSVSSAIGKPYIQGPDGSLGDLLASIENTVTVVSVKQNKAAIINDLGPSDLASIEDRTPEEFLETTEDDVAEQNDCVLMNALGLNIVFEDNVVLIMDIELGEFIPAGREVQLLNSRLEKKQSKLRIAEVLLTLSSRALPGITRGNYDIEYHDLAAFALGFQPVFIGSAAREGTTREALIAAIILLLESLILAGMAILAVGVRKLVGFQVQPFSGLFRAIFSVVIGTAIVGLGLAWAYGPLHRLGEEEEVAQTKVGWGGSFALIISIVNAVLYLTAAIFLIILLVALALFFVVETIDHIFTFEIENTVSAVDTFLAGFGCMTPQLQKFNRQLHKIPNFFHTLDEFKGLMAHQDIIADFNRSIASLFLDYINAVMIFLDGEIATKILRALDAILGGVVFSAIIIGAQASVADSIITGRDITLELAVALLLGAGLAASVLALGVGVTIAGVGGAEKTASNEGQARNCRILFYFCVGCTSVIVTGVAVAKSIEIL,16.4406
49
+ TLTFMMEGTQAWIPWYIIMMVYHLLTQTYNMAGLLLFGLLFAGIIGLLASRPRLFDLEQRERINWTMQPLPRALTLVIYMLLPFSSLVVIFAIAEATTYSPPKQDEHPHRLTTAINVVVAPPYNFDAGVSWIPLALLGLAVALLQKVQLTPTNRYNRLFKLVQFSQININLYSGKAPITIAMDSKDTYPIDETMRTFAVLRETSKKDTVYIPVEVENCLKGTESLYPAADTSVNLYLVHGGQNHFATKATMHSIFIVPVGIVGPFLAVVHLIILGIAEAGKREEYYLLYLFGYLSVLTLKLNGTAIIDALIRDGIHCARLPGRYNVLNYVVPKVSAEMKIIQDTLIYWEPAATQWETKLFDKSRILRNSPGYKFAKLLSVHLITMAAYCTLILVLPTVLSEYGQRNSGPEKRVLFSCRLLKSRVKGKSRICFHQVPGRKMTDTAKKLTSGVKNIFRNPGYKYMESNEILLIYTINLDYKNNALYENGPAIQTAAVINNNHAGTLFLQDIDVIPNLMALSPFVFLVCGYAPEDTELVFCWVLNKCGGKEVYIAFSINRNQIEDPLSKLEVIANNIIRVIKDDYNHRKAAAKEYAEWIAVAEASIGTLPLSIAKGFFASEETPKELRMSFILRAMKQWLVVRRKHKHDCKNMDFKQRCKSVATIKRKPMEQSLCVPIEKHQPAMRKYLIITLLEQNLDRWAHEAERVTSHFLPFFNNNSETNCHICECLNEYYQDAELLLQNAKISSGCNEYGAIYYSGIPISGAVAQKMTNIFISGSSVVLLITSYGGDE,18.2811
50
+ VAQMEFPEGTTSSCIWKQGYHNPAVVIQQLTLHRCSASSDTICTLMTSQSNSTQLMDDLLASVLKILVGLISSDYTLIDVGGFTVDVDSLSLVYRNFHTNISPCNIDTVTKTPDDTLTFEQYRDDMRAQVEQTYKAYVSADPPETRSVKASSYTHVYRPIGMPHNIIQPIMMIVEDTPQTESGTGIKMCSNQRQDVVTGNPVEAFQTLAQGDHYKLMDSSTNKSILAVSNGWNLCLGSFDSLENNPTITDKEGEKHYKFMKNADDTLNSYLYLNAVYINDPTLPVTILSSNAGCKGLLEAIYKNNIRILYDSYPAPNAEASGNNVKSIGTITVASCMGKGSMCPCGDDYQTLAVAVSLVNYEYWDLNGSKSNNNVIRTSGTFSIAILTDRGNYTANRSALLKAYINLLENYAEERKKQIKATIWLYQRDGRSSGKKEMSCNDDPSDTGYVAAEYPGAAQVLDTDDLETMPGSILPSFQNFAQVKLFKQQYKGKVPVKWMHGYVRHNLKANYFANGYYFAPSEGSIINPVLGGATE,18.9275
51
+ TGPYKKLADWNERVPTPSITQYIASHYNYPDLVAVRRVLRVPVAVDATGKEVTVDKQDCFFKSDGVVYTTNYKSYPKTIISESYFAYAIPGDVQQKMHTIPLTSNVYKDDREFFQYKVSFQFTPPTPPDIQYPARADNDSEGVLDWTKEVPWYAKDCNGPLAKCYARVNTDEFYESSAARLHPWDFPWASRLHIPAGIYYR,19.3561
52
+ SRKILPIVGLIIGIFSVIAMIFYVLLKDKNHATNIETTPADVETIWNMTGLLSQSIEKAYTNPTREYITHADVLEQLKKTFNFDSEILNKAMNTVTQYMSENQGDAAVKLTEDFFQTCAIETQTRNPGQFASSYGPDHKLAKDQATDETIGEDNKSPFNDPTVFGIMKALLASMTNIIKIAMETLNLFTIESNVIQLLPLVHAMNPRSIEELRLTLSYFHKNLNVTLEEDRQKLASILEILRHLLQLFFLYLFSVQDTLQNWLLNIHFNPPLETIVPTIPPNDNEIAQMLIQLNTDSSSHLITILDKASPKMHMIVLGEQILNQSLKDFTDGLHSVKDWAEPTDVLTRLGISPIDNPMSELSKLWQNVLLYIKHQFTSISNSSTLIGQLKTLAHVRYQLLEIKPALQSLASYYLNIDTTMIMSYNLYAFDELAIKENLEEEEIHSKIPEEQDYLDIIAQDDLDLYLKNLIEYNGSIDQQARNRIGFSVISFVHNKLFEILPWLFGKDQRTIKIGLVIKNIKGYIPGLLAGKMEQIRNLSTDENLQLNDKLVVFSGMKQTNGFAKLTLLNMSPLISYYFSSKAAGLTWTSDFIPVLKISQLIALLQVYFLVMKSKTITGKYMLRYTDTAVKKNHVFSFHEVAGHFEGQYSSPLNAFFNHISRNTIPGNRKIIQTYPLLFGSLLAVIILLLILKLSLYPVKLNLATLFALNIVLVTAFLVKTGKDRLKATALLLLGLAYAADLLLGFKSFGGQGESSSREHKLANLIIFPLLMIKTIFVIVSIFALYYIG,17.5915
53
+ TPDSEYMSQTQNRYSENTCNHQYPTEWSEVIDHTSVGILVVSINRFWHQDKCHQKASFLREGAFFRAGILLGALSVLLCFSKWSVPPIPLTLLINVYVSEQWIFLGLFVGDNNEIANHYQIEVLLDFARPYKRYAYEILILFSTHVIIAVVFRNLVVYSPDSLLISISQTDRLQHGFCNLSAVLETVGILDIIVSLFLYSLAETSVALIIGLVVIGSAFAVHQAWAGMWIPGRNTSRVLREVKWFVIIFVAGRCTLWFYLFSCSQNNLIRQTSMHVFVTGLQICHLFQASAPHLVSYLVHLVVRFTQISSVLRRNNVYYFLGAPFTSSSSLIGGIAICVFPDYSGFEKLIFLSENAILMIASNLLLRDGPRRAFLVAREHQEVTLNWLSRSLIWRKEVDIVLMGLLMLLLIVATGNIFTIGEIARVVSSSITLNSVLSLIYWFIFNGEHVKPFTWVSLREMVNLQLNLSVMRSKRGATNRLQKMNAIQEIMAVDLSGGQRRAVLIARELAIAPRLVILDEHTATIDTVETSVLALLSPLLRKGTTAIVILAMHGRDLLHQLIGLIYLNMRVLDYLRHKKWNDMKLFKAMAELLKKYMTEPGFLRWMDRLLLYQLRNQTVEDLKFTFVAQQPA,17.687
54
+ PRSRLRLFMLRLTGMSAKGASPTMLLGLGLLLPPVTLFYGGGVAEHGLPDPYALGNVTIVFATPSVLQHGVHWPIPELGIMALLSFIPIFAPEWRAPTMMAYGLLTGFLLGAIYGPPVVVLPLLWGKIKMWWKLAQALLGASQLYFTIQTAIPLLVTTESETYNPDSRFVMQLLWSHIHTFIPILFIIKAFTIGLQPLQMQHQPGIWALFALTMFLVSWTLARDPYITPDGYFADQKAMGDLLTFNLLQRIPVGNHPALSPPGPYSLLGHISTQIIVAPFIWYWRSA,17.041
55
+ NGYVEQISYHETITSDKLRIDCLLDLNLRFLAMVLKLDIKPLKGELFAAFDCAKMWNCPVERSKDGEPVNQDFVAEAQLRGKVFSCVIIEESQSEYIPCSAPSTVSLEICDNLGKMMPVLRATFQLQLNLGTIMKSTVKESDCRLPAYHLKLECPNENELVGVPQPGPVRKAIDPISLFEELAAHIKFDKNGERKFVQILSYSKKPIKYSVKFDFCNSAREERLEVASYKLEIVSLQEMRKDSTKERSLRTMILLQSSTISFQRLDMYLYKILYLCFLDTKYADVMFRFGVLADISRLCSMMPELKGTWCTGTFVWFIKWAFKVPTLNLGGQDEQMSQFLRYMFKAKKMVIHDPPDWKMACKDSFMPKRNVRLTLCNQKETSTRQALIETKLLEEICTDVDVVMRGEENTVEKSNLFWLVGDSKTVPRNDRLLVGLGQTRNLNASKFEVSHVNIPPGATAVETHPTRIVKLPQIQGALLFYLLYNCWALTPWFRLSKLETVTAITFFSRALYAKLYVTNAHCAQDMLKMCTAVRQLFSGSNFGYLMFHKLITTKQTMKKHFNVQQILSIAVTNVALVVTVGQTECVRPSIFSYVQTVNQAKAIQGVLVSILSPDQAGLAILFIEEGLTRFVLVDYLKMLGKSRQSKPFTLIVGTKETLEEWALYLTGEYVPT,18.533
56
+ SGSFNLTNAIWVRKYKWYELAPLRYLGRCMVMDKSGNKHVILSQVGLLDSPGDELIQGANLPLRITIVDGDDNDFFDQFGEVFELMNLGERAEFNNFVQPADIATTISVQDFIRYSRYLGKGGTFVNEFRDRYLEDGRITEASVGGGFIANLLDVEDLSLVPEREMVFGEKERGFSEAFGSLNRLVENNFSQENGRAEYLAGSNGSSYTTGKIGYVTNWQSQFVITRDLVSMGDFTQKLFSYHQGNIGSYRPGFEKGARVKFGDPIQDWTNGSDPVTSDWSDNYGFKYFVDTPTSTLWRGVVVSNPAIFNMDEIGKNLKVTSLTSYYIKADGNIGRGTKVLAGNNYQVNFELIYFGTTWTLFDANLYYDNGDEWGLSDWTDVVYNSMVAAYQAVDDGHMTISVLGTIYVLMMLVSISFGTIYVYDLYTAMALAASGYLFTRRGLS,17.983
57
+ NDIRSTTEDVLMPVPKDLAGKFFIIEESVVIATETLQKDSMISFHEFGMGSADVYYTVASRPQYISDSTLSLNDTAISDDVTVIKSIGLTVILDLTAYDVSRVTMADRQSYEDREKVSYREIDLFTILVAEAFSCGILTPTYLAERLEQLGRIDIHDGGWNKTINAYELVISASTFADGKNFSTAVTIVPNLPAVGSEIGRIKANDGDIRDALGWVFGETTEQSISPVEYGMILITHGSPGGLLTAKPPLDNSVQEKLFQVLASGWKRGLYLQGGTLVSRAAYLGLDHWLKLPRGLSIIDVSMIDNSSLGIPLYIRYQVSVDQTDKIYEGGKPIPERMDQNRRSFFGTLNLPLAITQAKNITNKSAHNIGQEWWLKIFFTLIRVTVCMCILGFPHTGIEASFMFYLCSQYYSHWFVKWGLEVLSWENVMSVAGMNKKPGFEFALFSDGVILGVVLFTAYIVILFVIMLKRPLMLPIKRMKFIGALLVWSLSVVVGFLQGSPRDKKKFLIKSAIWAVFFSLVAFPNVFLWFFTWKIARLSANASVFYSGTTTMFLSLLVTTATEFSVVQYTVFLEFFIMLTSGILVLVVWLISSQKTSSVSIT,17.4521
58
+ KARYVRLVVAVCLCPFVRYLEIQLQDELEAEAAKKMQLVGREKFNAEKLTTEDLIAVDAVGRAMAEAQMDPATIQRKIPGEVPANLLEEQLKSFLLAQEAKLEARRRRKKLQASGSKSNRVMARERQYLKRCDCSIDEAKRNLLDTTVDALAARSTIREDILLADSKISQLADTSPGIEYPNAFPEQLPYLKEEYYFIRTSRFAFDERVHALQSNLSLLGFDDDLTDATKAYTEFGEAFGMCLEKLDISGILDFLKFIPASSKWNPI,17.6976
59
+ SLIGDLMSDFSGYAEIVTEEYMMKHWMPLGLIDSENTFKYSYQAKMGLAGIENTGIDTSYVRSPAAGKVPVLPARDAGQRLGFTQLLMDLYLNSPGILQTLVYSWMEVQASWMRETRFGSLSNEFETTEQYLPGFKKARAPLEAEQIPKNGGRVPGGDRIVGVFEDSPVSGRSPEEHFQSFSILYIKWNAFWFFSVQCILTLILIIGFLLTVDGLHPCMQPMRYLSLTEFLMEFEGWVVSRRVVYIRDYDFTLTFMEIGNVAGVRLESYHWFLFWTAGLILGSIFFETLRHYIGAMGVVFPTDPPPSEKSDTFSGVTFIAGFSGAMRVALVYTAPQCCRYGEIAADVGHILAGGGGGYDQSCDEYLVIYPLSGGGWALERARKRGVIVFPYNATPWAGILERFLPLVGTARYIAFLVWLISLAVIIYGVYAYALIARKNPKGLMNEKGIKTARLATGWSWFILIKGLINMLPLRGVGTKVFLSQIVRWLPEYALGK,17.7286
60
+ LSFKIFIKLLIYLILIILFILSLFCKTTQTIGMPDLFVKKVDWIYTTYYTFYNDYWIVSVKGVSVEEAIRDLETSFELSKRNVMQLVDAVVWTEASDINPGTDFYHWQWKKLLEEDPLFAKTERLTLVTTFNCMFLAWFANVYALAITTMPTGLFIFVLRFFLLIYALFASISGSGYKDTWLVPFGGAPIRGNLAAPTGRKAFLDCLEYDIVVTNLGAATRATASVLITLFTILRLFTGKWNMIVDVTVRRVSMDCDEELAGATSTTSNMREIERATDVFASVCQLIRSFLDGRNYSQAVANMEYLLRMPESKIMLAWKWNEQAQYPVFRYFVLDAMNEMRVMNQQWMSELDGLFVQGPLRNVFDYLQEQVQQLRVAKQNSFMRFKTKFARGKELWNVWLSKSNNLCQCSDEGLTTLEVAAILLAVCWMVYGFTGTIRIITEDATPKSFTGHLYYQRLHYLRPMMQKIDNNPLVSLLPPRIILDDSTNWKNLVPELINVYIEPLTIPASQQVYELLVVLHHISPSFSGWRRETMVRPNFATDDVGWMKMEVSFINYDQVFYLTEMYPFSQAPFFKLLSQLRIMSQANFRVTIADLSNEIFNYQSLDFEAMKALDHLYQDLGPIDFLFVNTVLVRILNVLRYIRFLRVSRFILPYLRKIARGVFNIFDWYNIVRILFYAFGVSNLLSTIMCSSEPNEDTCDIMQPLDMYLTIFVLDLMLFLSYPQYGFIKALHGFLYHLNTLGTTMFGLAKNNLIYFTLVFSILLILFLGKILAFYAKRNNLEELIR,18.0488
61
+ ASIHFVVASLVATGLVIGTLIGNLIHSAGVAPVIAIALLIIFFCYIFHVMTTSYMNSMSSQGPVDAWTCVGQAIAIGISGFIAAVEGLVATIFFAGLAGAIISPISIYLIATIAIPLTIGLVLASLLVIVLKHICKAALPSVSVVKGISLALTLLVASSLIWRAADSAKCSNCLTASSFVHATFDAISYGAMIEVMAAAASLGEGVIVTAFAVIIALVFVEGLAFALTNIFCGLFDG,15.3219
62
+ KWLKSKEATKRARDKVYVKIMNRETPMAIYTGHHTYWELATNPVVPDKRFVLGEVCENRDDLPYYHWIEHFRSAIDKGARSKEDEGKDRKTSGIYTFRPLTQYQREEDMPTARVQLVCKGVTVEGMSINEIYFHIIHFAADDLNDMAAEVDWGVTEVQLDALVDEPSECEIVTDKKSRPKHIVITTNDKDLPTVRALVDKICLAEVGDHEIQMTRCVTEEQESYIKYLSRHKSDAVLMAGGAISDIQNCSEGRFPITYTDVCLKDDSKWSSANIISHFRGFEEILAEYINEQRWLNGVALRRGFTVQGVSDENPILLITDIQLDDLELAFRQNSINQTSLVSIGSHVLRDIGYFSRGQWGHEGPNQYRTRRHASCWVLNVRHNAILPTEVVIEDGWIHSVFTLYPSAPPHPLGYVQAQWRGFNKENVKDIREAFLKVRDLEWKRHEEVGKLINDIFNTMGYAGNTFWEAHFKRPLFSGLGRIKEAIRRKLIFIRTVISFENLKALVIAAVVTAYLIMAFIILEGKAWGRFEDYGNTTSGWFNLTGHVPRYRYFINEFCLSWHKQTRCREGNFITQIEDKLQACLKFFGDIINSYKGSLFKHSVWGLCPDIICLDKGISRWNVDWSPERTKEICGINPEPRASKSTWRSIQSICDLFNLDGDEYGTYDIDMRVRLTHNSTTPCFPISIGLPCKFDSAGYWTLARIIFEKYSLAFLRRIQIFSPGVAEPLVMVTKGLNTAFAILTLGLAGGLITALYLTFGKPMEGWIESIRILVLVLSLFLVALVLSGVTHGVQYRTFKDDRIKISVRLWVFTRRIE,18.7487
63
+ QEETPSSDRFICKNIVVLSGVAAILIGLGNILICVTTKYVKVLRYPNLRSVLTVVALAGFVANGLLVFIATNGSPTIGVSWLSMAVEVGFAVGLLMCLATTNILADNQNGETGPSDNDFLGSTQAELVMKGNKVAWWPMGFFVVDVYYAKLFAGVNNRILMGKIKGNTWEKNSWNKPGQIMAQVFYIIMTIAIFLSPLLVLVPMHRFPLNVVATSVSVSVLLGAAFTGLPDVMNWCTASFGIRYLGFTSGLAVKIISLILRISGRLGSIQLNFAEKLGVLLVSIAAGLISGIVWIGGLLVQLFTFIVDSFLNTKAASDQPLDIIIMLWFGITWHVLVLASCTGIFYMYNIFILGQSKNYGSISAVSTGLITANQGVELKGYPLASCCVFQITNMKVHEDIKECWTLIENCKDERNVHDIFTITVMHGKKILLTGGNTYRGVEIRVNEAGQVVPNHELYVLAKEVVYSNPRTLTSVRVAKNVELCGLTVRVADKAAMNSMLDKQISNLVHLGMSLHKMEKNVVLSGGQRKRFAIARAMISNNFLVLLDEPTSALSTSGENALFTDLPVKENGTTLVVVSHRITLLKFGDVVIMLAHGEVRVHHLCIYTKLDEFSLKIATYFMRHIGYFLDIVWAFILACIFGLAIFNLSVFGYNPSASVKLVPVITLFITSFLVAINQFFGQSAKGKLAYMHHMVRRDLFGKCH,17.4792
64
+ WNRAAHFLCMMLTFGVTTVSITTRDTYYTGLTKAVKDMSYTNWLIVFQFKMDTPSRTPWWRFENRWLNVPVLASVIWRGIPELFYGSTLVDHFSGVWNIVHWKHRLPTFKRLRGINSDYKPPLRIIIFLSLTFIPDLPRVLIVLGNIPKVTVRFFMLVMTCQPQTDLKQQDGFGFWRYKKPTTANEHNWKELRAADRPLMTYPTTAPKHMHPFGSLLFYGLRVATDQARVYMNEHPTSRAFLNLILALLELIPDPSG,18.5118
65
+ KDFATYDPTALSPGANRRHDTWKPMGTATKVERLLRWGYLTFAMLTTLTHIVILVFVPFSWSVWGNMRYGVEPPEMKDQGVVKFFLLVFSFFLHTYVLFTALR,17.4756
66
+ AFVPITKFYNLRREGTIFKTPELRKMGIKVWLVDLAVVPVAKPGVRASARRIIAYILEFNKKASKLIIRVDASTGFFLTDNLIGFAFKQGIRKVRFITDAPKSGSIIQALFGQHDVVISGADIVGTEFEVGHELEELDIAIGIGAREATRVFAAISACVPSQKIIGGGGTVLEYCATTGSATKGIFLIRGWMEYVNLLPELVRLSAVFSMARLMSTSIHIRRGQGSTPSYAILVGCFVLWIGILAWLVGFFDLSEQEWLFTLPILQLGLAAFAGLGLAIIAKELANITTAFGVLAADLVGGAFCIGGFNAMVHKLPFLYNLTVGIIGLISLAGYIHIIIGGSWWPGPRDREGVLANFFWTPTSNENEDFDILPLEAEDEKTSIDNPSKGGEVNRINLFDDQVLVKQSMTPCGSNWPHLPFVFPDWIMNNLFKAIFWKIVEGSINGAAIIAEDAIARMKVHVKPISYVNESRRFLKLAAFMVHVILEIYVFFCITLEFEQNLFGKSNGAGVPKLLILLMIALFLLAAVGGDITTKWATDVVQQLALYLTPDLLPMWWFETALGDAFECERPGIGTSVQYEKTAFEDKGEPTFDDTLARLIPEVLNVVFPETSPNAVILHWVNFLTFMLALAQSGICVLTGSFFFNQARSLRLCQFQKVTLLATSQDQSADRIVAVLKKWPPEKSGRVAYVNRIFVQLLVDPNKMRVLSGLAFTASVEVLSSVHAKRGAFITKPFTVIFVLLLVLVPLVGGYAFRILQNNFHYLQLLCFIDRDPNLNY,17.2646
67
+ SVSDVDVVKDKGITHHNTIVAAQSKIEIRVMSVAPVTVNQKGTLILDFSNREPNEISVTDKSTAGNCIYAEKHYKKDCVLAEEGGKVRLAGVGKSSSQSVKSKAAIAVQPQAGFACGQNGAGAFQREDELWDELITRNKIAATVALLLGGVMGKTVEKVNSIILLREKESQYIKSIAVQIKGGDKYQICVALVLEQDVLFHGVDKQAPLRNLIIKMKVCNTREMIPKIYETCKDAGKTDVATEVAQSHVLREELVAYTEIIVGVYSPNLLEVVYKMIPDDSVKLELPFDVNGSKIMAVDGKRVLKEKFMFGWAIGNRFGCIMDGKHEKVEKDVVAATLMGIDEPGTNVSELLKYLSYPRAVAAENTLSVEDVNMTMISNHINLGDVSKFKRECLALMDYLRSISSLVTLYNSENVSWQAKTKTRTFGFSFNGNGFPSSLQLVKLVTIIVANYDVQYAWYTGAEESEVNSPERFGCCYKCVRKPIRAGCKTSKMSPTFIILPEKTWEIDDHNLEYRCMGKPALSITLKYDRDDDNSKEDNKLYALAVGLVDSAVTVHGWETFQVSCWIPIPDKKTVKMPGFSDLYLAVSLCFPMDEEKKLRLHAPTLPEIVFVTHASTYIGDEAELVLHILKRNGVCKSLGFEDNHEIWSFIAWISQYHSTNWRHSGSVVCGKIRQLLQDLIPSADQDTQVQAYCEECKNQENANIDDSTLMLVIAYKNLQYLRAGILPDYYTFDNIIQVGSNNVISGAAMHFLDQIEPFFVANTDPQKNIQLIRNKEDEFYWRFWAFDAYANEERTSNNRDIAKFEIATKTIPRLQYERKYSEALEVIKGFSIAYNEKY,18.7321
68
+ GENFEELFARVRRQHPEAFFLYLPVILIIGTGVAMELFPFAKKYWRFSSALGRAFLILLLSILVLKLLLGKLDEFRIESWALADFLHVVQAKTAPISPTIAVLRYFRVFRALRDNMLDRTHDLSKPVIGYLFMAGLPILLILSTAIELGVIQLDGLTILPLLTSAAFWGILPQRVTSGGDGSLLAVLTGFAPSLKEGFRYGFIIGLFMALLGFIYTIAAENDNEALRQG,15.1469
69
+ PYVRSGNVLMAMPQWLQDMKKTLSSKRSQKELVKDGDRIGQKLVKERKMSTVAMDLYWMSLMLAHPYAHPVLSGQATVYHAVGDGAVVKVHDGDTLFGVALYFSENMWFAFFNFAPGMQAPNVSSRDGSIGLWGHLLPAPNFSFAQLMMIWFVVIDFLAGLSRLLMLYYYNLAKTFRFHLLFASTVSFLEVQAVATSFWAYSGNSALLVMVSFLITYTGTTFLAGSMHATGFYVIHLTDILSQHVTFLMTLIEAMNSHQSATSMSGSRADGKTTVNIMLYSASLLNFTTFKGGFAKFMCYISALLWLVILLYAFVDGAACIGFGGRLRRFSHMAIVKDQSWKFYVTQKGPGIIQAAEYMMNGPNIAWSFVIHVTASHHRGDIIVSGGWIGSLLPMAMQFAGQWAPLIVRAPKNPRVLKLYLLVTMLPAGILIAITVYTLWQPVKKRPQRPSNESNMLIVIVGMALGAGTACLPFVLGEYNSNIVVAWAVSPVLVRNCFIIFMTVPQMACMQDTICSVDRGEHVTGLNSLTVVSVTVMSMPSYVIAVQTVSVSKSMLGIPFPFVELSLKADASLEQLAGPINIKDTVLKQCGAVVIILMLVFGILRLTFGGVGVVDLYSPKLLSIAEAKVVFLFMTITWGSGITNSTKVFD,17.7269
70
+ KWAQGYAVLEVVFTVPFVFILLFMFITTCILYDAKTDFVEFVLSIAFFLTNSGIEWKVCAVTASSDSQLLAVMCLVGLAYYKIYDYDCCDCPFSIDPKKREKTVNLKQCSQLIAFELPPKFVAREEVVVSQVPHRFKNSEIEDLTFELEGLIYDHNYPIEDGFEAWRVHFLVDVGGGEIGELAYPVYHAPVMSIGYISQRPIGIKAIVVRNQKDQMAELINEKDVLSISYSVGLSYELNEYQLTTIRNLRNSAAGLSGLKIAVDSIIGLCRTPGLFPFNLSHATSQAITVVLGKSKRFNLDLSKIKGVPALKSFARYAVKSVKRLIADADSLPPTLVAVFYKTGNVSTVKGLSPPLIKLNVLSDASVVPLGKKINGIGSTGAVCTIFNGVCWATTIVSQDDVPTVVVQITQFRLGSQLDRWGKRAQTSDDMFWDTAAGVRLIVQMGVGSPIATIIVAVRPADMYNNMVENSEEKLKLRNNRNRQADDEIYVAIRMTGGNARQVRLGLSEVQQKQRFVLDIPTAGLIFIGKEFTSVIAKVAGVYPTILLAERTPEDNSVSIYLRNVNYIKGRPTSFLGTGFNNSKGEFLDPFFTLDPGPQDAVNGLRIAKEPESHKILEEQHGPPCLTYNQHESMLQILKRARLSIAVPDNRVAD,17.9063
71
+ KSVVILVGCASSDPSDSIEFHFFGDNTAITKGRIGRRRFVVIGGPSADLDDEEGEYGATHVTVFDIAGSIMPIGFTRGMTRLYGISHLTEKPLPGGFVMVLPPGGWRELQNLQFYEAEKYIRLSESVMQDVNGGLTILEDLGDEIIKTSFPFGQPPDKG,17.3791
72
+ ESNISKIFKDPICAEFKKVLVSKIRPIKTTAVLAGLAGFFCGGGFFLGAITADVFMIGTIVMVYFAAVLKMSDARGYAWFFSFFSRFLIGATNFADFGELIRAFLKDVNLRKEKVHKGNYLALFGVFGITWIVLIMTSLLALGEFIFMVGDIFKQSGKKAKAKLNAEETIAIANPVIYALIMIISFLLSVATILTSSTGARAKRIQSKRRNLGVVLVGIFALIVVAILFILIVELCTSIGIQASYSLLAERLIGTSEYMEGIPNTNEYWNAQGVKQMLGVASLWHTKIYEWWNAIFGFFVIKLIKFISDQAFRDWKNGLHSLQIFVGLSVLSAGAGSISSILVLSDIIKNANTGSFIIVPVVFFIGLMINVAIFAIYYGD,16.4909
73
+ KTYSEDMTFLNTPIDRYDKPLIDRVPPEHHTYVRKIITVFLVSGILAVLLLMWATPQMHSKVRWLEAGNSPGVGRIKLDVRVPEIHPQTMHILNAILRFTKKQKDGPVLVEAKSDGDTIGTADEFAPLAAIARIEQDLKASLVVRQIVPHACSVPYPMWITEGDKAWDGVFYKVEECDTMDDFVRILNFMIGAEYLPGSNTTNEYCGASRKIVCFEPVMIRNGDDAEWKASVVVTMEILELVMQQVITCTDSAEDGFLISQKGQFVGEGELGILSVNLEKQLYKAVEIRSQDDRLKTLIMLIVSFIAVAVGAAMVSGYIPRRRYQVTVNKVPYRLQDDAPVEGDVFEHGLTEMRIPVLFSLVDKLECVAAIDRQFKLRRAERALVSFGTYLQQGKSLAQSWAPVFFGFAEYLKTIGVCIIDNVEGKYLKNTVAATMTLFLLFLCWLTMCFPLQHPPRLPLADFRYLRNLPGGSTTMLTVFIYAHGLDETLDTEKGFWFWSDTGLLGTNGNMSGCYFVSFAVETQAFVAMMLGRSHSLSHRFHGLLQYTTLGWAFFTSFIPFVRDRNFTERHYVIPQHTKASTITCQNKSVRTTDKPQARKYQEDLSHETTCHCQTVEKTFNHPRVKLTTVKACGEVWTECPVDIAYTLASELFYSFDLQGGTLLRPQFEHPWGRSNLFAAFFHMDEGFSCHLGRDPDMQEYINSSSYLLNSTNVSEVVLLEFCDNIPPNILLITATFFYGNMMGKDAIAIPYDEYVGKHAYELAPEAAVIVLIVFAVKFLLRPLLLKCLWAAEFADHALNRRSINTPTAFPVIHLFFDVSTVAAIIGKQKNYDRFFPQLAVDLEALVDEEGKEEISFLIREMQRFIDGVMVMLFMRKWRKRRTLAQLRPPAVSSPDAPRNCLNVPDREKACSLNNELKTNLAIAADY,18.5435
74
+ TDAIKVLVVVIVGIITYLLVMWYWSGVVFEYGPVFIFFLLITPFTGEEYNFVAIFDAIAK,13.894
75
+ LGRRRIYGGLFHVLFFFTAFIFLVQGLRDGGRLGVPPHTLVALFEVCVILVLSASYASTFYCDYLSTTVHIMHIIQQLLDCYYYHNTVLTDNNPWAIDPVTLSLDKTYEEEEDERVEDVTLNILKKQYWHQAFQFEKCRMNMRQEEEESWLLLGNPQKQCESCKVRVSDPIGPDSPRPDVTIRISSVDISRVLLILRGSESLACEDKVRFSLYRAYQEPSPLFAEEFTEDLRCIVHTVRVEENALDTLDASPELFYLPSCNAQLIACSSFLRLNFLLKWTERLALNDHFWANKGNLYHKSWQSVKEHEYVHFPYCLVRLGGQDLERVEAHEKKKPLNICLSITVPIYWGDPRRKEFCKICLETDYTGYDMFKKRRLAMIFFLMFLFFWVFSLYLVEHAGQAMKNKGKAVKLKEKLDRLLDCSEMKPKPIRDGNLKMLQMPGTFIDCSSDKGVLSEFVAGYLLIGVFFLGFLVYVSCSCTILLWLYWCIVIVWIIIMFYWWVSAVTVDLQMAKFARQRERKVVAQIELAWRLSLFWVLYPFLLLLFLLSYFGVSPLPKKVKGIGRAKDHPKLFYQLKILFNLTLLLFGSLGFNDSMKFPWDLVLFLFIEFVLMLYFFTVANSQGFSFLYRLTKPLRNVSALLIIHTFASFLSHVVKRIYESALFSMMLQTLVVSFFQQYVVIIYQVTATNFISRLIKTLKLNLPTFVITMVFAFLANFVCKLLMRVNNENYTFILMAVPLVVPSLFVPFTTLGLNSVAMGYFLRGFFCETLQEKAKQSVVKFKKDEPTSRYELTPTR,18.1273
76
+ SEKSISRALVTTMRYKVKFTHGQAVFERQYRHVLDGPFGERDVTGKLRLPPDPDRQLSLKNVYIVSAHFDPKGVEAKNDYVSVSDSIAKRAACVIADLRHQGCRIMYPGTAKGVEIMADSDGVHLQLLLQATKGPGGKKTAIADLRVPLIDYDPMATIIHAHVQGGPVFLREWTVIPVYVQLKFKNDNKVKINFIVPGDIQPPLIQDNDHWDTNRYKDDGQYKFDLLPEQIEIRGGYEDIVIDPGFTNSFGRVLTHCNSDAVERRTLPLPLWKGLYTRHKARSQDICEVPVLDQRVVLPSSRVRSIPELAIEDMWTPSLSDSVDKNVKHAGNKCNMTNMKREFTGIRPGDFKQALLGLTCHTGREMNINCLKSVLKGNKAKTMVFLHGPMTNTLGALEKLNKENPKRKYMVFKAFHIDADGLSISLMISSSGANLPYSTGLHLLNPDGQIVALNVIAPISIGGQSVEHLQENLLQKVLRNNKAKYSINTCVLSIDWMADLTSRPTKLLPRLYGSGYSISDIVTSPSDALFDIGAQEFVAAPLMQGPIDWLRAIAIDEAEHEGMTMSNVVAEAIENARREYTLGLSGVDTSGIAIGHARETVGREGTASMPTSSAAAAQGFWWATSILNLPPTMTALSDMIGGQVVNGGSIVVVGDGVSNIDEEQRIMAQQMVEQIIHLMSVNQVSALALKNLIEQENNTVGQDMLMRPDQTAVSLILKTSAISELCELTDYVYQHSVVQSQRILVGPSGTAIEQRTSKAALLDKQLTMPALYMEGDAG,18.4702
77
+ GPVSNYTYTRRDGLRAWFAQTEPRTIAKPDPADYLPTHLPAREKLATYKRVVIERITNSMGYIEYLDARTFNYISSPDANHIIIETVKMIACMLFAIGIIFSVHDQVTATQRTAVALLVAIEVLPNGVCPSGTNHPSVFQKILTGEGIFCADVASVGAARAVFITPQVQGGSLLATKVIMHEQPRPTEIVRDPILNQAGVHALCGTKVEGDVRQSANFTISFYSYSSTEGINYESTQSDIYHDKSPSNPITLLCVRYSANGKRDLEDGEVLSRPVTTQVTHESDGGTEKVRKDNITDIVIVLEKAFPAAIDEFRITTIKILIDHPITEYIVVCNDPNPVGTFRLAKYILNTYPDGSVVVRHENTFMKMLSSIFIHVNPDPSRLLNVIPVTNSLVKSGRYVMGDSDVVEKDNMKAVLKPLFEKVVGSWMGNSTFAMVAGFASLASFVFAFGHVSTQYAGGIHSEPLSILFGVNFSLSTTWVSAFYLLAIILMGILAGLETLLEGEQA,17.9375
78
+ IGFNTTTLCVCLMVALAITLTYFIKNKKSAYTIRLECTQASNCEVINFPRGMTTLNSLPSDDIQLFHTYSAVRLALCLGGSLILGAVLKIIFTNTELGRVLHAKMLKNGSLSALAIVTIFVVFFIAMSNLALIHAKGSNKAAASIPCGFLTYSVLALLGVFMSKCWNRQMPMLYGLSKGHC,17.0854
79
+ NGQVRERMVVLALKDPANSDRINDHSMHIESYTFVYYPAQGGCIHVGIIRLRKPQKLSLQEVLAANGDSVILAGLGVDACSFPDHIMENFFEWGDTQQKMDPRVGPNAFVYDQAFLDEIEGHDLFFLREILDSVTIGNRLQNPLQIFRPYAQARNKITRSHLGCSVTKMMLRNLETVCTASFQTREPELEPFLQEMRADNVSPVLDLLEEFSFKIPPGANIVWLAPVAWPILQMVKRQLRGTGRCPEVNHVSPGDVPKGATQGTWDALDAAQLFIGGDSNIRGSRFFFQLCGLVRLFRTARVASVFTKADPLSRTALPEQGAGMERLVADIFESVHENERVANLADLDRELCDYPARVQAEEWARACGRAKSHAAYLQSGTVDTNVKTHAAHYVLKKEELDFAMGFQGKTLALSGHRCLVRKRVASTKPEIFTQLEQLRVNNPGIARAEYTELFAQGSFVVVVALAEFRNAVRQVQDDSLCQELIEKAQMFPRVLQEVRKDPTPKRLSFTLTLIVVGLPRSDFALLNDAFLACFIPNPRGVANVILDTNAYDLEDLTSKPEVIEFEGICDLRGVAAVAPKTPIPAPRPMEGGKMFLISDMEEDGKGYDVPLPYALGAAVVVAACDQCGHPTFGTNGIAERIVIYVYLHFPAVVALFNPIGWACFIAHSCDPAFNSSFCRLGLPMLIMVVAAIVSGTLVFTLMIVTETECLDSGESKQKALAGDMLIAFPLMGLLELVLMELAILKGSAPSSSRHVHKDDGFVAMSPLNGLALFIVLLIGTFHGSTTVSGQVRSGRRTDMGLQGITGRAAGVRGHHFIMLSRADNIVTKLVPPWATALMLLLLPFLLIAEIEMGAGPLTLMDGVRSWISCLLVAIATAAFFFLIGTFQWVLGYWHRSNDSFISILTALYLLINIAKLGFGFYLH,17.9518
80
+ VPISYDIKVPTGWFIDGDKVRYKPCQGLKVITLVNDWWIILEVFACVTLPSTLILEKQDYFHKRRCTTIFESVAEFTQAYQVSIQESTQQHLTMAVQQVLGIRNRVDRKYVILVANDSPVVCYLEGSKVLFTLLQGPKPSSTVLLGRNGKTGLLLRDVTYMKTNGFDVVGGDLSVGVKNAVKAGFYPVPLTQIVQLSPVLTQAFFDDESVTVLDGPIGGHMSNKVNSQAQQNWKINNDNGFMVKREARYLGVTVIKNDLVRGFEDLTDVLGGCSKSILGALEMRDASELNHKGISVVAETISNAMTLGVSELTCCTKHRKEVWLQRRLRGKWLRLMLNIVFAWQNDLFFTITFVPNIVHFQRKIFMPAEAVFNFLIASLLFVLIGEFGFFDVEWRRWHWRRFNMIFYVERYFLFKAFLANEVKRGIEEAKKLLSLAFTFVLISALHLVNRIVNLLADCSHSRFLVDNLIDLRFIDIYCTKKYNHMTFMLLLLAATIITFLLIGINAAMVCCARNDQVLQLIQSLESLFNLIAHLNYMTVNKFTFGLMLRMNAYSLLIIVNSLTGYAELRIASAIMLRLEQAFYDLMSRFDVTLNGVIADRVGVASYSELAVAILQLLELLVMEIYEYSIILGLVAIYFMVTIGCCVKTLKFQGLDAFNP,17.7417
81
+ IFFLLSNSQDAYADKFKILVPLLWFLLSVGFAVLLHWKQSIMKAIMFNLSLVAFLCWLVWAVSNFLYKLSDIKMVFCLFIVVIWVTALWSLSTPISYRYTVKNFVVNERITGMFPNLILFAEIVPAITYIYFLFTFLDFTYRLQALNDVTVLGTKPMRLIQVLLHLRVAIGFLVVNLVGTFYDETMEGGEVWELFNSLTPDINSSVTVIIAVLFVFFNFFFVVLTPQHTKPCKKADNSSKPLAILVNGVTLILVLSLGYFFGSLGICAFSANSASMLQAFSMHTTIIMLFKIGVASAWGQVWNQRTDLEVTDHDPPLFILTLLLAGWVIESTTNFGGDSEIMNLLGFLAGVLTSIEIFGLQINLPTFGSFVQDWGAIATTGFQSQEQFFWITYGLIFVLPVAMLFFKIFHEIIEFNTMLIICTALNASFSLLHLSRALKFETKMVGKKRCSADEEFGARMEDAMDGAYAFFSKLLNTLFVAVFRVIVIYLVAFFIFKMLWIFIPTIVDKTNEWSVGGFVCELLFSIAGNLMGIILLAAPNFKMWFLLDVNTVFLFVGLLGLINEISGLRLWEMRFSNWFIKFHMWFFLPLILLLPSAIITFSGFIGAFRIEIVYVFSLIGLYSVPLILSALRQR,16.9356
82
+ AVITQRVIGIVAVTLLLLIALAGGILTPMGETGSFRFPEVSWSVLTLLKETGWGSAEDGPTLQLGRLVTRAIVVMVFAALIGGWIGAILAWLVGRRS,12.8882
83
+ FISRALFETAVILILVLSGILQGMVLHSSAPITDDMELHKLLAKEIYTAFCIVAAYLVVCVGYPLEREDRLMIGVYGSCSAGPLDNVKEWGYRTIAPLLVAYFGLFTLWYYRVFGEDAEKIWIQTAIFVAAVLGMSILNLLVYLGRFPDAKARLSILLKDLAPNVLFHFVFFLINAVALMPFIYALIESIIQGVGIKKFIIEQNGVNVPLTIARALKIRIIDGVQITTPNQNIRRVSPTVDDRMGPETSRFHNTLEQDDSIVFIFLFVQDLKILGLGSQYFTLFINVPAFYYVIDIINVGFTVVIIFVVIELIKGRFRSLVGLFWVGSSIKSSDFLAGIKNFVIFNIAEVPGVLITTIVEIAWGADFNSAKITIMDAVLIVFWFPILDFAWTNIAFATTGNFYFLIIVAGLGMKKADPMLLATLIYAVLSGACTPMLEVIMESVYVVGAVDAIALFIAPMFLRFNLPILVTYETRRPNLIWLMALIYFVDSYHLYFKSWWFFILSIWGGVIGMDVVGLVWILGNYSTVSIIGMG,16.8602
84
+ KATQSDKTFPLEVSFGFTASSGNIVDAHAASMATYITLQAVADLVDSPTECPISKDISTEQKIWDCLPTVNTFTARTGRQAEIKSTSLGQNLGLPYFDSSKSDKLHVDMNADGRTISDTLFLRDTQKQMHDSRKIFLPNTAAPNTGTKDLIDEYGEGLIVNHDTNDASGYLLTDELDCHRPTMKAGSLNPDYPSILRGLKVNIEDIIQDENKVSGFYQVLSYLLSKGSRLKRTIKFCHERDRYIHSDNHKFVFSGIGDQAKMEKELDVAKATGVTIGLEDSMVKKPSTKGNLVGIIPMNGTFLLVPQDPLSGQYGSIIFAHCIGKLDEDTAANIHTYYKAWVMMGNLSLYDKFMSAKSFSALKVQLRVLIARAGYIPVLQVQTNILDLSITEDANIFEEVLSSVSNPFFGAKKTIRQIDHDRNVGFSVNPRGIDSWRNDMPIVLADVAGSLTLTTLGLGFRAGLSDADLCYYHDLLLEGRVDNEKNAVPEAKARKVSQTRAGLFKRLLLQMNGTDFSIRGCQGQQLDLIASNGYQSLKNLQACKKNGSVTLSISVVMRMYRFVADFAKNNEALNINKYDFLKHTDVWYYPGEHNRDIGDLNLREIKFKPFFTCDNSPRNTVAVFKQLYTLPLDGRYFVSTKRETSKLITGEIYSISEFGEHKGWVALANENPGLVSTIRQTGRVVVQFMAGKSVKDAPISKRIMLVPCDKVIVMYKSLFYRLPEQIDTCYESEEFYDTKKDCIKTALLLARKIGYGMNALGHDERLSALQDVLRTLKPCDVLTLKLVTDLGQATGNMILLIISVHRSKLRKVISILGNCEAIARVLQTMKVGGVDAVILLGNNINDKDIEPTVYVGYSIMKEGINVPFTGGINNGAVLACMIKILEPIPIFVAPARPFRYYLRALLGFLDLGVGYLSNADEKATFASSYRPGMTAKELLAQLAGELNIPSDLPRTMEIVKDL,18.0526
85
+ RPKTAEAIGLLRATMLFALILAFIALAELLAGAMGLPGSWVGLPVPLIVLAILLVLIGFFLGLWAIGGFER,11.2268
86
+ AGVCRVPEHHEMPTIFCDRDGNNGPILADNAQVMAKLEERQQGRKIRAYSSSHFPTRSVFLRMGSGALAAIQANEGNVRLLEFDSSEPRAPLSKESDLFGYGGIGQPNSPVKNDYKQKIFGGMIITGPPSKVSLSLSGYAQAQPTTSLGTLYGDAAPAENSKVKSLLAEGNTRWRFVDNTDDATIMAYGMRSIRFERPFGMATLNAPVNTDLAPEGRAIIKCPTAFTFKFTGDDEYYTHSPPAYVGAIEDMKKEKYRPPVSLSGKRPLVIEDKRAGVMDIYSFTPKITVVPGGRNDVYYDIDIRISGGIEAVDRQPMFMFAVAIALCAALAVGFVMAAACELMGRSPRKAGTQYRKKRILVLTSNTLICYLLSPLLLEATLIDDSSCDCDLDLEGNRFDVTYGLVSCDDLQFQTLFSFFFFNLIASVIFVELRHARRIYIIEYPLFESPMSLVGCVFLALFAILLPDTVADGSGDQPMTFYTDIYYGLTSLGEMVTAYRLINLATGFLIGIIVELAATLFIMLAAEFFRSSQHNSADPRKFQREQRSFLRLRRELLPTTSGVVPTNW,17.8225
87
+ NGKNLAERFVWQQLSAAPFTMIFTMVQIGSPDVTEYGWNFDKRALAGVYISGQDVRLIGGARNKNVTVTIESVVIGGYGRPSSTDLAGHEVLAYFSQSPARSRFTNIYARLQNTRGGYTAKFTSSFRPEAQTYETGALSIRFQGTADAPSHLQERTIGQLESDGTLTGDEKSYRTGINRGLIGKWEDAMAHTLGAVGTSGSALVLISGNHFGNGYAFYGAGNKSLTSKLVYDNPFTQVNTQERFAKDRYPDLTGLELLPENVQVTAVGNTSDWLKGSIMFAAGALAGLGSGQIIAGFAAVRNAAEGLGVALLIAGGTVVGSKAN,17.1103
88
+ AGLFPNELNELRRRLASDATTFIAPINFKVMLTREFQLLHLVFGFAVGLAWNLLMGQNWPFFPLIHGSADDLPKLTSFGVIVHMHEAVEPIWAWYLSLISVQIHSGKFLQSIANTRLVGSLNGMFPAWQGGKMIGRLIPRHKIAATLAIPSLPVLWGVTHIDLMPESLEWSMNLVEGDIALFQETGTFVDIFLLAGGPRYLFQVTFDEKINRLMRKRPILIVSQKIGSHHFKDVEEYAIAMRQGIHLEEAEINIPGGKVTYTPNYLAPSYREGRNRTVGIWQTFLDEAWESEAELAILHKDLVISGEPVLYPHQFRQGGRELVGKFFRLVTVDPRAFVAMQNGAISKEELVAPLITARERTSWYIFGIGAVSATLVATGPVNLNCTQIAYGPSLSAGLAHGLIFLLIAFHLYHVLKCSQAFQGLAAIKALNLIKPTEQALPERIDLDPLVLFNVGHTLIVTLFLYLSILGRGDVGLNLTMAGVVGVMTVLTYAKFRHCPIQSQSNLNMDRADQYRVIDGLQYVLKEIEKDFETATGL,17.607
89
+ WAQKLIILMLSVILGGLFYFSLLPLLHPSTTLRNAPIIMPLLVLSSIIFYWFIHDNMIHHFDWNFKEFRLISQAISLNAFAFGLMLGTPDTLKPGCQHIRNPSFILYYLVYFTWYTGLLAKQQKELLKSLWSFLIIDSPFNLSVMRITRANLIENFSISQGNYLGQILLILLTHHSPLTILSWRGSPDRINDHASKNVPIKVDNETDNGELDKLACGALWSYYSQLWIETMLYRPTNGTEKNQYKDFINLVNLESYSTNVTSDVREGSPKAALLVNELHTYVILNASVVLFVTSRRDYNSLKKRNEA,17.6565
90
+ KGYDIRSNASWLVTRADGKRANAVAEPSALKPGPQSGVGNILPKSRASYFILNNIGAKIIYLLDILDTATVGALAKAPPANTNRDNQAKFKFTATATGGASFSGTVPTDIVGIKVGPTAPILWGQVVGGGQAAAGGTKGVTVEGGSGYFVAGFVLDDKENSLLPNSENVATIYIIPRGNIIVNNISEKTGPGVIIAAEGLAAKGGEMLVARGNSQSSTVVDVSKKAESKSIITEELLKTAQGNNFRADINQLVTSLVDSWDLGTEFTVGLNNATPAGGIFASGGTAVNGKQSNAAVAYGGVQIPQNGKAYGTMVIYKGSSQSKLDAVKVRFSTTNAPEYWFLVGAQDQLAGNNTGYFTGKNALAFAAALAQTEANKVVLAALTNKDNPVPQNKSGVVAKGIAEAFTEKFTVDAVGTDSIVANFNTKLAPGQIVFIGPDLEITIAYNGTVLSDAVGNNAGEAAILNPNISKRIQEKVEIGFSPKKNIGEEYISANGSIDSIGKCAADETAEFSALASTFTSVDFLVKSEYSSYSSNNTFDRFSLRDTSFTDDSNSKNSTRLKASDSSKFYDNYKMVKTAVFNGVATPAAAALGSLSQFTGTTTLRLEFDKGAASLKGEKFSDSKGNSVTEKKFQDTLRVNTLGRGVAFIGVKVDSPKALVIAVAGALGIGGNVLLKGGKIVASSKLKALSAKQARNQLPLFGEYNFLSGLFSNGATVAFIDPLGIPATKAIIYPGESIPIEVFTKTPTRVKFLEKGAKLGNTLSALFVFTETANLTSSLLVRANPGVAGNTKPKNLSDTSAGTSPQFAVTAISHRLALTA,16.9213
91
+ VLGVAEKKDNDDQAQSNSSDDIKKADRESTLYGQISAGVQVGAVGTAQQQVTFQLGYASLWGNSKWYHGLNKRDAVASGYESLMGSMTQAGNGISVRGQNSSSDHMSSLNNNSNNQYAGDNLLFSGGNVIQDMGMAQSLSYQGPFSGIQYSSQSYTNTNIFWWSGGDNASDIKAKLVYKAVGYDNYLGEVPGEATIQVRNLKFANNGTLAYAVHSQILLNGGKVAYNGRCMVSNNSTVYYSKTLQSALAQTWYEQGLVDANTLLVSAQGKKSDLYSLAKQNIAGNRRASFAYGASANPSAQVNASLSNTFTDTYTYFSGTPTYSRSSFP,18.6621
92
+ VGIIMPDSAAAFSVAGSLDPLQQVADAIEEFAEKKLSHFDIDDTSFINIVSYTKQVVRALFVQPCRQMIQPFRDPYSEIKFVNLDISKQLMPSPRRNSSVVKQQLIPFGKVWNILHKVGLNIIFKDVTVVSIALALAANLIKKSEFLSLKMAGRSVGTEEKLGFMGFIFMNTENYTKGNIPGKEIVAMYFLYLQNILFHPPEIGSPEPAKYSEQSGTYPCADAAKKYKKYPLQEKFMFIHASIGVGDVGKKVFSQRPEKGHLAEMLGAAVLFFGENFPQADFNYLPSKEAEYNLSLALFKFGTQFVVNNQPAFCYNEEGNGWLPVNKLESNEILDCDTPTKGTKVSGHLTPVTAGWLHLLQNLGMMCGSRQAQWQVYPFHNATCANLQHTKISPMAGLGEAISTGGRIRPTYRKILLGIPEDHYNPSSDLNMIESQLVQLDKLKEYEAYHFGPQMYAQNFPEKTQLTRLMQLSILLSDDRSLARLNNKSLQKMTESPEKNVTKSVPSTITFGYASELRRKKDRTKNTWVNLRRKENGAFDELFVSQESIHSNLSVAEFTIDFKRNDGVEIRACGVLCVTDAFRVKSQFKAIHVVGMSSTTLAKVNCQVLSPNNVTFDVNNPQYLHRQQSVLANASTWPQFRWQGEVSGPTLYQLDSIGVLPKADPQKPKWQAGIMFVLKLCLYLILFCYLPFIGVMYLKPAIFEQSTPPYKQAEAMHLLICCIVIMLFYSLSPVNLQASKQVEGSGVNLLVLFMTLLSYWSNLFRWFGHLFLMLTIASSLIYEAVAKLKTITPKNLKSIWHIETWQVFEPFMVFYIVVYSVALMSLKVFVDTWWVTFFYSGPVIITGSILGHGVNLLDRKMPYSSNIKLALHNVHQLLLNINVMVDEFTGQPTSPFVTNVAEPAKRAASLIIIAVGDALLAYMLGYTVPLVPRPK,18.4627
93
+ KILLGSSITQSWLTYIPFVFLLVIPLFMIRHYGILMTNVLTILILCVGAKVLANSKGDDPTSVRNLKDVWQKAFATALQITIKYYFGKSTKDFLQSVSMIKADSTIFVRKAPSWPFNLSVATVRGATTNGMSFTLPCTGEGNGGFCLLSQEAYAVTGPLLEDVGVLAPEGAGKLTQAPELVVGKVGDVDSKALLSLMIHLLAKIGVATVALSLIKGELEQLRVEGTDIARELATKDSKDDNKGSSLATIINPPMNIIVTVSATKPNNTVGGRASRALTQYLMDAGTKVVISTSTYRDLVPKARNDSSLVKTILAFKEDRVPLEGIISKRKDTVELRVVLIRLIDRGRIAEWLDKDVKAIDSSDDVTEDLIVDSKPMVHPMRVGGTIRSDYVRNQTIIHIYEKEWDDLSRVIEEKQEMKEVPFMWIQSGKNMEDEILP,17.3175
94
+ MFFYSLIRTAPGTLPLRQSLIIFVSGGGGDGSVAEAGTSLGAPAAEVFHVTIAARFSHELFATILLAYCVASGDTLSKVVSDRAHLVQLVTHHVGLARLRMLVTVIHFTALCPFGGAILFTAPLDINTRQPDPDPVALWWYYIAPVTGQMTREFGGTIINPNIANSYHLVYFKVLFRHFVAEYVGWLHGPGMHPTDVLDIKAALKKSPTHGPEIPHYYSPPRVPRAIPPIFSVFNEIGDARYTTIYDGSVMGLLEKARTYDMEEVYTPRQVGYIFVHSKGHNVFRLVAELESAIGDLFTAYFDSLTSEDGKQQNMISAYLKGLVASHGCGLASAFSFGEQEKWRNAFNYLLWGRYQIESWRTVEAIGPDLLSFWRKRFNELKQAGVWITTSPTCWEAGSVKDNGLFIINSMKYALGRDAVWSANMPRVNKHITIEVKGAAEHQQIVDALIALVKDYDNLGFYSAEERADHRFFVAMVKEKGSAGSSKIDTARVWDVHLIRSRYFYYDASAWYHSAQRMTDPIVNRGYNIGLFAAIVAAGMLLLVLRVDRRKITCPFRISCPDERFSLKSHEIPLDGNLRVYGELKSELDHEDPFGDLTVFRGTDTELSSGGFPLHWEFVKEPEIGMLETLIQAVVGVYFTTSLYPGYADEPGRTEMGLYNMGPFGWWLVKYSDR,18.1742
95
+ SNTARSTQMVGTGIDINSTQMYPYNIMLTGFEVLIRLTPSAIENWQIRGEEALDSFFTSLSNAIGNACVTVFLMRILLAVYTTKSSAESDRAIGYATAGLPNNITAIVAQINAVVATAVVNSMNACLDLAPILYWETLQKIDNISNYYPPFDRDCLKARAMTYQPQEVRMDMPITVACQSGRLMNTAVRKETVIYAIIKEEPKNNFYLLTDPVYQRADTVVQAQYGHEPEFDTEDNLYPNKYGWIQYHEEYYEPIWWRWKIRSYFRTTQLETKTSLLARDSWEPFYASPFSRHIPISITDRPGMDHFMDDLYQSTPSFLTNAICCHNTNGHFPTELLGTIDTLRSALGGLDLHQSSHKSHLLLLRSTIRDLCEASGSGMTTQFTYLLLGNVYIARSDNLKHAFDANAKKGFKLRVIKGAIPPHVKMQVVIGASAGRILLMKTSKLKFVFSDGNLQRPLSEYVELAGDDISEAVFHAGKDTFEFEYEVTDDQFFFHFRAELIEPWKRENLYDNSLYLFRIGDKKFVRTLFTTLSCNKSVMLYFQKFAKLKITASKKKGISFTPDRPACGISVVPHLDKQFVLQVVLQTLMKITWKPCKKNRFQSRFVDHGGFFDFVTYSEIYLKLFAGENVVFTRMSWYAKLSTPHDFQPRSLVGVSTMGIFDEADGKYHLIGTGNFGFKIWRFLYVLDSVFSIEGMFAATITEYILWSGIVRYFRTFFTLEAGIPSHSSGTEGVYVCFKELIFEWPKDTPSVQISLAESTDPSAGIWIREIENRNQFNKVSMLVKTAVDVAQLVFTLEAFAPFEQSLNVIFDNEVDVSLTKALGPTASNTYESSQLALGNRLVLSEAGDVTALDRMVTTITCNTLCFFRHYNGITVVNCIEKAAAVVCIHITNPMPGFVQTQLGIGGLGPYICKSCSLAELQCEDRKIRFYSQIPGSAGDFFEDIAWQKRLEELKNLPK,18.5522
96
+ LVKVETEVKVYVRPTKPLPYTVETAYGGSPEQQFYNIRKLEPGLFADMGGFFTPPMSTASLGSTYQIFRQIYDGTLLWKNNMSDTDPNNVYRALQMKDTVSSLMFVLVLPGSDVYIKLGLVHIEETTKIDGTPDDAQPSSTDLSPGRFVEEQEVMSEDDELALLEDLRSLKFVCQDVLKRQKRHIFNNASITESLTIAFALNNTDNRLSWLMYLWIFALFGLVLIILVLVAGFDIWWSPTKQYGMIIFNLIGNFSSYKTMSEASMKSGIANVGQAWTTIKTVTAVLNNKALVVVNAGEDALALLKSLPKQTDAMVGHIQMTDRAIGMKPNDLWTFIAPIGYVGKGTRMFIIFPKSILSPNPSIGRTVVVVAGIKDVLQDMIQISKTKTDKVSGVNKGDQVDFRYKVETSKNNSLVAGAITEALRGSSASGIKIGSFVQDPNPLFGDLENNFAYGASAMLFDAFLTRFNKENNLIVLVGNRSALNTDTRRLVQWVDALHFNTQLFIIAMEKNAQVQNALTTANKLGILKPTVNVVDQYFPQGLLISLDINRAIHASLKGLPPKGFVVTVIGEEPDSSQLVAKVKAFGLKLFVYAKSTSDVAKLNDLGFATLETGGSLEFFKMDQFKLEIANEVTKTMRSFALIVVLDDDMQQGTKILTDIHPHHPQFTPGPKEASLEKKLALVLGMAVVYRLRLTTVRLEVVTRIPAVIVVNDIQIFTDMAYTNVSGNLPRLPADKVKLGKYSYHAADADGINYKVTGDGSKLKGSIVAQIMVNDVVVLNTKWPIETSKWKIAVNAEVRLDIGPFSTNIPRTMTGYESYLGNRIVLIGRKNRVFGRSTIVEGLIGFLFDVFLHVWGYVFTWITLAIHYWAGPRITHILSRAGDILEIVMSAMRLEKFNSMTKDWLRLLEIPILAELAERIVEGDKRFGVIDKSGTPYELIKKTVEPENVPTALSKVESL,17.6058
97
+ EFRTALGTLAAFVAIFFISVQFLFRFYPETWLPIYHLAFKRLSVPPAAIVAIASVTIFCIGAIFGLFPGPALALRRLTGNVAFLIGIATAIGIGVTFLIKGISNSNNTSSMSIIIRTVAGAIAVVLLTLPALVVRIHGNFGRAAVGGGAAEGANAAIFQSLTGSNNAFRDALFNFGVKLLFGLAIETREIILLEYIYNLLLTVGYDLNFASRGRLQLNAITGLLVVSAAIVSGYRTVAAERKVFDFALLKARKSVYPALRELRLVPLITFIGVLFVTTK,15.2478
98
+ KGLQFTRKGWNHKGRRHWRDFDTVAGNALLGIEGQTGPRMVETGENVTTEPGNRRTRPTLLATGTEPADAGIEETRIEQDVILPLTTKANGGMIRVHHYDVRKIAGVEIDLESDILEARLTDGEDLHNCKFTTTVKAHIKTEPTPVSAADSEILLKGQYVSSDFEVLDSDLDANVSRDSRMWFEVAYICDILDKQTLLNEGMTFTVTSDGYSSGAADVWVLSTIKTQCRHAQGQQWLYRAGNLKPVVEMEIVYSAARDVTGGSLFGAVNSAAPFTVEMLFFPATIEQLRPGTPRAGKTITNPENATSGGNIEVFEEVKHFSDSQFRNEVRFITDDDSVYTATERRRIPNAPQNGIIRYWMKNGYASWNTEKVYARQPDGDITRQESFENAAQSMSTADNYYNHYYKEALRMHLAGGVEDDLEDDVTQEVRVSKDGEVEIDLDLNYTSKRYREGISWFLGCNAAHGIPINDAGVGFAFAAIPQYA,18.6838
99
+ FSSVATATTTAIAFAIAAGVGGAIGGAVVGSLVIASLRGTVTAASALKAPLVPLALTVGAASLGATIGLAASWGVNLTL,10.8479
100
+ SSITAAIQLYKPDSISILDDDSPDDLFETVEFLTEKQKNKQTSDNSYKLFADSFLSIVDSPNWTNMLLIAARVLLVLYTICPCCRADWVGAIGTDDVSYDIVCDLLGININFFKITKVLTAQYLPGRTKVGYMKHPLKTSYFVSIYVEDISDCARHPYGFSYAWQYVKKPYGTVSVDIYNGNPREKLFCLEGLNWATGLGLVVGAGAYKSLGTSVERVNTLVIFLETGELFVWAYAALWFRKRYTEDSEAKVNLYIAGLIVLFAVEKVSYAVPDIVFKEQILKSMVIFAKFSIINLYLDALFDFVFICIIILLLRVLSKDLEAVVGPTLSVFL,17.2645
101
+ LECYGQQSSLIEMYRDYTIKVRDRYANNERIILDHYLVLNGDFYVRLASNKIVLGPDDANSVVAILQIGDMGLFLANGKNVTEMKRMLEKLEILYFTGSEAAVGSVTGHVCLMITNIWKDNKKLVEMLEFLGTEIIYNSVGLVFMIGKMSDKQGVYAKNKFSDSILEIAVKLQNFTWRNHVLFIGAYLYQWELYAEPEVVINNNISVIRVLWDPDGKSLYIIRPEKPPNIFEYLMHGICTFGGVGAIAGGMGVPASHIGGLIYKADFSISSWCEPGSVNVGALPYGSNCVVVQEGGNVVTFSLPTGSDVPIFALEHFPEPGKWKWEGFYWINPTDYRIMISGLKYTLAANAIAGIGAYLESYNIKISTWQYLVNGNPYDSVGVYNQHEYPLYPSLPMSDFTIFPVLTFAP,18.4071
benchmarks/MLM/config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Root of the project checkout; every filesystem path below is derived from it.
PATH = "/workspace/sg666/MDpLM"

# CSV splits of the membrane-protein dataset.
TRAIN_DATA = f"{PATH}/data/membrane/train.csv"
TEST_DATA = f"{PATH}/data/membrane/test.csv"
VAL_DATA = f"{PATH}/data/membrane/val.csv"

# Pretrained backbone identifier and the locations of fine-tuning artifacts.
ESM_MODEL_PATH = "facebook/esm2_t30_150M_UR50D"
MLM_MODEL_PATH = f"{PATH}/benchmarks/MLM"
CKPT_DIR = f"{PATH}/benchmarks/MLM/model_ckpts"

# Training hyperparameters.
ESM_LAYERS = 3
BATCH_SIZE = 8
NUM_EPOCHS = 10
LEARNING_RATE = 5e-3
MASKING_RATE = 0.40
benchmarks/MLM/data_loader.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ import config
4
+ import random
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from torch.nn.utils.rnn import pad_sequence
7
+ from pretrained_models import load_esm2_model
8
+
9
class ProteinDataset(Dataset):
    """Masked-language-modelling dataset read from a CSV with a ``Sequence`` column.

    Each item is one sequence in which a random subset of residues has been
    replaced by ``<mask>``, together with labels that hold the original token
    id at every masked position and ``-100`` everywhere else.

    Args:
        csv_file: Path to a CSV file containing a ``Sequence`` column.
        tokenizer: Callable tokenizer exposing ``mask_token_id``.
        mask_rate: Fraction of residues to mask per sequence (default 0.15).
    """

    def __init__(self, csv_file, tokenizer, mask_rate=0.15):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(csv_file)
        self.mask_rate = mask_rate
        # Length (in residues) of the longest sequence; used as the padded length.
        # NOTE(review): if the tokenizer adds special tokens, the longest
        # sequences get truncated by that many tokens - confirm this is intended.
        self.max_len = max(len(seq) for seq in self.data['Sequence'].tolist())

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` padded/truncated to ``self.max_len``."""
        return self.tokenizer(text, padding="max_length", truncation=True,
                              max_length=self.max_len, return_tensors='pt')

    def __getitem__(self, idx):
        sequence = self.data.iloc[idx]['Sequence'].upper()

        # Randomly choose which residues to hide (set gives O(1) membership tests).
        num_masks = int(len(sequence) * self.mask_rate)
        mask_positions = set(random.sample(range(len(sequence)), num_masks))
        masked_sequence = ''.join(
            "<mask>" if i in mask_positions else residue
            for i, residue in enumerate(sequence)
        )

        inputs = self._encode(masked_sequence)
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        # Targets must come from the ORIGINAL sequence: tokenizing the masked
        # string would make every label the mask token itself.
        target_ids = self._encode(sequence)['input_ids'].squeeze(0)
        labels = torch.where(input_ids == self.tokenizer.mask_token_id, target_ids, -100)

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
34
+
35
+
36
+
37
def get_dataloaders(config):
    """Build the train, validation and test DataLoaders described by ``config``.

    Args:
        config: Object exposing ``ESM_MODEL_PATH``, ``TRAIN_DATA``, ``VAL_DATA``,
            ``TEST_DATA`` and ``BATCH_SIZE``.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    # Only the tokenizer is needed here; the model returned alongside it is unused.
    tokenizer, _ = load_esm2_model(config.ESM_MODEL_PATH)

    split_files = (config.TRAIN_DATA, config.VAL_DATA, config.TEST_DATA)
    datasets = [ProteinDataset(csv_path, tokenizer) for csv_path in split_files]

    # Only the training split (index 0) is shuffled.
    return tuple(
        DataLoader(dataset, batch_size=config.BATCH_SIZE, shuffle=(position == 0))
        for position, dataset in enumerate(datasets)
    )
benchmarks/MLM/esm_utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
3
+
4
def load_esm2_model(model_name):
    """Load the tokenizer, masked-LM model and bare encoder for ``model_name``.

    Returns:
        Tuple ``(tokenizer, masked_model, embedding_model)``.
    """
    loaders = (AutoTokenizer, AutoModelForMaskedLM, AutoModel)
    return tuple(loader.from_pretrained(model_name) for loader in loaders)
9
+
10
+
11
+
12
def get_latents(model, tokenizer, sequence):
    """Return the last hidden state of ``model`` for one sequence.

    The sequence is tokenized, moved to ``model.device`` and run without
    gradient tracking; the leading dimension of the output is squeezed away.
    """
    encoded = tokenizer(sequence, return_tensors="pt")
    encoded = encoded.to(model.device)
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    return hidden.squeeze(0)
benchmarks/MLM/mlm_generate_utils.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import math
3
+ import config
4
+ import sys
5
+ import pandas as pd
6
+ from esm_utils import get_latents
7
+ from transformers import AutoModelForMaskedLM, AutoModel, AutoTokenizer
8
+
9
+
10
def mask_for_de_novo(sequence_length):
    """Return a string of ``sequence_length`` consecutive ``<mask>`` tokens."""
    return "".join("<mask>" for _ in range(sequence_length))
12
+
13
def generate_de_novo(sequence_length, tokenizer, model):
    """Generate a sequence from an all-mask input and score it.

    Every position is filled by sampling from the softmax over the model's
    top-3 logits at that position.

    Args:
        sequence_length: Number of residues to generate.
        tokenizer: Tokenizer exposing ``mask_token_id`` and ``decode``.
        model: Masked-LM model exposing ``device`` and returning ``.logits``.

    Returns:
        Tuple ``(generated_sequence, perplexity)``.
    """
    masked_sequence = mask_for_de_novo(sequence_length)
    inputs = tokenizer(masked_sequence, return_tensors='pt').to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits
    mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    logits_at_masks = logits[0, mask_token_indices]

    pred_tokens = []
    # Iterate over the rows of logits_at_masks directly: mask_token_indices holds
    # absolute token positions and must not be used to index the gathered rows.
    for position_logits in logits_at_masks:
        topk_logits, topk_indices = position_logits.topk(k=3, dim=-1)
        probabilities = torch.nn.functional.softmax(topk_logits, dim=-1)
        predicted_index = torch.distributions.categorical.Categorical(probabilities).sample()
        predicted_token_id = topk_indices[predicted_index].item()
        predicted_token = tokenizer.decode([predicted_token_id], skip_special_tokens=True)
        # A sampled special token decodes to ''; fall back to 'G' (as
        # generate_scaffold does) so positions stay aligned for perplexity.
        pred_tokens.append('G' if predicted_token == '' else predicted_token)

    generated_sequence = ''.join(pred_tokens)
    perplexity = calculate_perplexity(model, tokenizer, generated_sequence, mask_token_indices)

    return (generated_sequence, perplexity)
35
+
36
+
37
def mask_for_scaffold(sequence, generate_type):
    """Replace residues of one letter case with ``<mask>``.

    ``"uppercase"`` masks upper-case residues and upper-cases the remaining
    ones; ``"lowercase"`` masks lower-case residues and leaves the rest
    untouched. Any other ``generate_type`` returns ``sequence`` unchanged.
    """
    if generate_type == "uppercase":
        pieces = []
        for residue in sequence:
            pieces.append("<mask>" if residue.isupper() else residue.upper())
        return ''.join(pieces)

    if generate_type == "lowercase":
        pieces = []
        for residue in sequence:
            pieces.append("<mask>" if residue.islower() else residue)
        return ''.join(pieces)

    return sequence
43
+
44
+
45
def generate_scaffold(sequence, generate_type, tokenizer, model):
    """Mask one letter case of ``sequence`` and let the model fill the gaps.

    Each masked position is filled by sampling from the softmax over the
    model's top-3 logits; a token that decodes to the empty string is
    replaced by ``'G'``.

    Returns:
        Tuple ``(generated_sequence, mask_token_indices)`` where the second
        element holds the token positions that were masked.
    """
    masked_sequence = mask_for_scaffold(sequence, generate_type)
    encoded = tokenizer(masked_sequence, return_tensors='pt').to(model.device)

    with torch.no_grad():
        logits = model(**encoded).logits
    is_mask = encoded["input_ids"] == tokenizer.mask_token_id
    mask_token_indices = is_mask.nonzero(as_tuple=True)[1]
    logits_at_masks = logits[0, mask_token_indices]

    fills = []
    for position_logits in logits_at_masks:
        top_values, top_ids = position_logits.topk(k=3, dim=-1)
        weights = torch.nn.functional.softmax(top_values, dim=-1)
        choice = torch.distributions.categorical.Categorical(weights).sample()
        token_text = tokenizer.decode([top_ids[choice].item()], skip_special_tokens=True)
        fills.append(token_text if token_text != '' else 'G')

    # Substitute the sampled residues into the mask slots from left to right.
    generated_sequence = masked_sequence
    for fill in fills:
        generated_sequence = generated_sequence.replace("<mask>", fill, 1)

    return generated_sequence, mask_token_indices
69
+
70
+
71
def calculate_perplexity(model, tokenizer, generated_sequence, mask_token_indices):
    """Pseudo-perplexity of ``generated_sequence`` over the given positions.

    Each position in ``mask_token_indices`` is masked in turn and the model's
    loss for recovering the original token is recorded; the result is the
    exponential of the mean loss. When no positions are given the sentinel
    value ``10000`` is returned.
    """
    token_ids = tokenizer.encode(generated_sequence, return_tensors='pt').to(model.device)

    losses = []
    for position in mask_token_indices:
        probe = token_ids.clone()
        probe[0, position] = tokenizer.mask_token_id

        # Only the probed position contributes to the loss.
        targets = torch.full(token_ids.shape, -100).to(model.device)
        targets[0, position] = token_ids[0, position]

        with torch.no_grad():
            losses.append(model(probe, labels=targets).loss.item())

    if not losses:
        return 10000
    return math.exp(sum(losses) / len(losses))
94
+
95
+
96
def calculate_cosine_sim(original_sequence, generated_sequence, tokenizer, esm_model, device):
    """Mean per-position cosine similarity between two sequences' embeddings.

    The original sequence is upper-cased before embedding; the generated one
    is embedded as given.
    """
    reference = get_latents(esm_model, tokenizer, original_sequence.upper()).to(device)
    candidate = get_latents(esm_model, tokenizer, generated_sequence).to(device)

    per_position = torch.nn.functional.cosine_similarity(reference, candidate, dim=-1)
    return per_position.mean().item()
103
+
104
+
105
def calculate_hamming_dist(original_sequence, generated_sequence):
    """Count case-insensitive mismatches, position by position.

    Positions are taken from ``original_sequence``; ``generated_sequence``
    must be at least as long, otherwise indexing raises ``IndexError``.
    """
    reference = original_sequence.upper()
    candidate = generated_sequence.upper()

    mismatches = 0
    for position, residue in enumerate(reference):
        if residue != candidate[position]:
            mismatches += 1
    return mismatches
benchmarks/MLM/mlm_lowercase_results.csv ADDED
The diff for this file is too large to render. See raw diff
 
benchmarks/MLM/mlm_motif_benchmarking.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import config
3
+ import sys
4
+ import pandas as pd
5
+ from mlm_generate_utils import generate_scaffold, calculate_perplexity, calculate_cosine_sim, calculate_hamming_dist
6
+ from transformers import AutoModelForMaskedLM, AutoModel, AutoTokenizer
7
+
8
def motif_benchmarking():
    """Run scaffold generation over the test set for both masking modes.

    For each of the ``'uppercase'`` and ``'lowercase'`` modes, every test
    sequence is regenerated and scored (perplexity, cosine similarity,
    Hamming distance), and the rows are written to
    ``mlm_<mode>_results.csv``.
    """
    path = "/workspace/sg666/MDpLM"

    test_frame = pd.read_csv(path + "/data/membrane/test.csv")
    test_sequences = test_frame['Sequence'].tolist()

    best_ckpt = config.CKPT_DIR + "/best_model_epoch"
    tokenizer = AutoTokenizer.from_pretrained(best_ckpt)
    mlm_model = AutoModelForMaskedLM.from_pretrained(best_ckpt)
    esm_model = AutoModel.from_pretrained("facebook/esm2_t36_3B_UR50D")

    device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
    mlm_model.to(device)
    esm_model.to(device)

    columns = ['Original Sequence', 'Generated Sequence', 'Perplexity', 'Cosine Similarity', 'Hamming Distance']

    for generate_case in ('uppercase', 'lowercase'):
        case_results = []
        for original_sequence in test_sequences:
            generated_sequence, mask_token_idx = generate_scaffold(original_sequence, generate_case, tokenizer, mlm_model)
            row = [
                original_sequence,
                generated_sequence,
                calculate_perplexity(mlm_model, tokenizer, generated_sequence, mask_token_idx),
                calculate_cosine_sim(original_sequence, generated_sequence, tokenizer, esm_model, device),
                calculate_hamming_dist(original_sequence, generated_sequence),
            ]
            case_results.append(row)

            # NOTE(review): progress print assumed to run once per sequence - confirm nesting.
            print(case_results)
            sys.stdout.flush()

        results = pd.DataFrame(case_results, columns=columns)
        results.to_csv(path + f'/benchmarks/MLM/mlm_{generate_case}_results.csv', index=False)
36
+
37
+
38
+ if __name__ == "__main__":
39
+ motif_benchmarking()
benchmarks/MLM/mlm_uppercase_results.csv ADDED
The diff for this file is too large to render. See raw diff
 
benchmarks/MLM/model.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import config
2
+ import torch
3
+ import torch.nn as nn
4
+ from pretrained_models import load_esm2_model
5
+ from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel
6
+
7
class MembraneTokenizer:
    """Thin wrapper that delegates to a Hugging Face tokenizer.

    Unknown attributes and calls are forwarded to the wrapped tokenizer.

    Args:
        esm_model_path: Model name or directory passed to
            ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, esm_model_path=config.ESM_MODEL_PATH):
        self.tokenizer = AutoTokenizer.from_pretrained(esm_model_path)

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If 'tokenizer' itself
        # is missing (e.g. before __init__ ran), delegating would recurse forever.
        if name == "tokenizer":
            raise AttributeError(name)
        return getattr(self.tokenizer, name)

    def __call__(self, *args, **kwargs):
        return self.tokenizer(*args, **kwargs)

    def save_tokenizer(self, save_dir):
        """Write the wrapped tokenizer to ``save_dir``."""
        self.tokenizer.save_pretrained(save_dir)

    def load_tokenizer(self, load_dir):
        """Replace the wrapped tokenizer with the one stored in ``load_dir``."""
        # Previously this called save_pretrained, overwriting the directory
        # instead of loading from it.
        self.tokenizer = AutoTokenizer.from_pretrained(load_dir)
22
+
23
class MembraneMLM:
    """Wrapper around a masked-LM model and its tokenizer.

    Unknown attributes and calls are forwarded to the wrapped model.

    Args:
        esm_model_path: Model name or directory used to load both the
            masked-LM model and the tokenizer.
    """

    def __init__(self, esm_model_path=config.ESM_MODEL_PATH):
        self.model = AutoModelForMaskedLM.from_pretrained(esm_model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(esm_model_path)

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If 'model' itself is
        # missing (e.g. before __init__ ran), delegating would recurse forever.
        if name == "model":
            raise AttributeError(name)
        return getattr(self.model, name)

    def __call__(self, *args, **kwargs):
        return self.model(*args, **kwargs)

    def freeze_model(self):
        """Disable parameter updates for every parameter of the model."""
        for param in self.model.parameters():
            param.requires_grad = False

    def unfreeze_n_layers(self):
        """Re-enable gradients for the key/query/value projections of the last
        ``config.ESM_LAYERS`` encoder layers."""
        encoder_layers = self.model.esm.encoder.layer
        first_trainable = len(encoder_layers) - config.ESM_LAYERS

        for i, layer in enumerate(encoder_layers):
            if i < first_trainable:
                continue
            attention = layer.attention.self
            for projection in (attention.key, attention.query, attention.value):
                for param in projection.parameters():
                    param.requires_grad = True

    def forward(self, **inputs):
        return self.model(**inputs)

    def save_model(self, save_dir):
        """Write both the model and the tokenizer to ``save_dir``."""
        self.model.save_pretrained(save_dir)
        self.tokenizer.save_pretrained(save_dir)

    def load_model(self, load_dir):
        """Reload the masked-LM model and tokenizer from ``load_dir``."""
        # Use the masked-LM class so the reloaded model keeps its LM head and
        # matches what __init__ constructs; AutoModel loads the bare encoder.
        self.model = AutoModelForMaskedLM.from_pretrained(load_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(load_dir)
benchmarks/MLM/pretrained_models.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModel, EsmForMaskedLM, AutoModelForMaskedLM
3
+
4
def load_esm2_model(esm_model_path):
    """Load the tokenizer and masked-LM model stored at ``esm_model_path``.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    return (
        AutoTokenizer.from_pretrained(esm_model_path),
        AutoModelForMaskedLM.from_pretrained(esm_model_path),
    )
8
+
9
def load_mlm_model(esm_model_path, ckpt_path):
    """Load a tokenizer from ``esm_model_path`` and a masked-LM model from ``ckpt_path``.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    return (
        AutoTokenizer.from_pretrained(esm_model_path),
        AutoModelForMaskedLM.from_pretrained(ckpt_path),
    )
benchmarks/MLM/screen_mlm_cosine_hamming.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# Summarise the uppercase-motif benchmark: mean perplexity, mean cosine
# similarity and Hamming distance normalised by total sequence length.
path = "/home/sg666/MDpLM/benchmarks/MLM"

df = pd.read_csv(f"{path}/mlm_uppercase_results.csv")

all_sequences = df['Original Sequence'].tolist()
seq_len_sum = sum(map(len, all_sequences))

# 10000 is the sentinel written when a sequence had no masked positions.
ppls = [value for value in df['Perplexity'].tolist() if value != 10000]

ppl_mean = sum(ppls) / len(ppls)
cos_mean = df['Cosine Similarity'].mean()
hamming_mean = sum(df['Hamming Distance'].tolist()) / seq_len_sum

for statistic in (ppl_mean, cos_mean, hamming_mean):
    print(statistic)
benchmarks/MLM/train_and_test.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import config
3
+ import math
4
+ import sys
5
+ import os
6
+ from tqdm import tqdm
7
+ from torch.optim import Adam
8
+ from torch.optim.lr_scheduler import CosineAnnealingLR
9
+ from transformers import AutoModelForMaskedLM, AutoModel, AutoTokenizer, AutoConfig
10
+ from pretrained_models import load_esm2_model
11
+ from model import MembraneMLM, MembraneTokenizer
12
+ from data_loader import get_dataloaders
13
+
14
def save_hyperparams(ckpt_dir):
    """Write every upper-case attribute of ``config`` to ``hyperparameters.txt``.

    The file is created inside ``ckpt_dir`` with one ``NAME: value`` line per
    setting.
    """
    out_path = os.path.join(ckpt_dir, "hyperparameters.txt")
    with open(out_path, 'w') as handle:
        handle.writelines(
            f"{name}: {value}\n"
            for name, value in vars(config).items()
            if name.isupper()
        )
20
+
21
def train_and_validate(model, optimizer, device, train_loader, val_loader, num_epochs, ckpt_dir):
    """Fine-tune ``model`` and validate after every epoch.

    A checkpoint is written to ``<ckpt_dir>/epoch<N>`` after each epoch, and
    the model with the lowest average validation loss is also written to
    ``<ckpt_dir>/best_model_epoch``.

    Args:
        model: Object supporting ``train()``, ``eval()``, ``save_model(dir)``
            and ``model(**batch)`` returning an output with ``.loss``.
        optimizer: Optimizer over the trainable parameters.
        device: Device the batches are moved to.
        train_loader: Iterable of dict batches containing ``labels``.
        val_loader: Validation batches; validation is skipped when falsy.
        num_epochs: Number of passes over ``train_loader``.
        ckpt_dir: Directory under which checkpoints are written.

    Returns:
        ``(avg_train_loss, train_perplexity, avg_val_loss, val_perplexity)``
        from the final epoch. The validation entries are ``None`` when no
        validation was run.
    """
    best_val_loss = float('inf')
    avg_train_loss = train_perplexity = None
    avg_val_loss = val_perplexity = None

    for epoch in range(num_epochs):
        print(f"EPOCH {epoch+1}/{num_epochs}")
        sys.stdout.flush()
        total_train_loss = 0.0
        weighted_total_train_loss = 0.0
        total_masked_train_tokens = 0

        model.train()

        # tqdm advances itself while iterating; no manual update() is needed.
        for inputs in tqdm(train_loader, desc="Training batch", total=len(train_loader), leave=True, position=0, ncols=100):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            optimizer.zero_grad()
            outputs = model(**inputs)
            train_loss = outputs.loss
            train_loss.backward()
            optimizer.step()

            # Positions that contribute to the loss are those whose label is not
            # -100; counting them avoids depending on a global tokenizer.
            num_mask_tokens = (inputs["labels"] != -100).sum().item()
            total_masked_train_tokens += num_mask_tokens

            total_train_loss += train_loss.item()
            weighted_total_train_loss += train_loss.item() * num_mask_tokens

        avg_train_loss = total_train_loss / len(train_loader)
        if total_masked_train_tokens:
            train_perplexity = math.exp(weighted_total_train_loss / total_masked_train_tokens)
        else:
            # No masked token was seen, so perplexity is undefined.
            train_perplexity = float('nan')

        # Save model every epoch
        train_ckpt_path = os.path.join(ckpt_dir, f'epoch{epoch+1}')
        model.save_model(train_ckpt_path)
        save_hyperparams(train_ckpt_path)

        # Validate model
        if val_loader:
            model.eval()
            total_val_loss = 0.0
            weighted_total_val_loss = 0.0
            total_masked_val_tokens = 0

            with torch.no_grad():
                for inputs in tqdm(val_loader, desc='Validiation batch', total=len(val_loader), leave=True, position=0):
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                    val_loss = model(**inputs).loss.item()

                    num_mask_tokens = (inputs["labels"] != -100).sum().item()
                    total_masked_val_tokens += num_mask_tokens

                    total_val_loss += val_loss
                    weighted_total_val_loss += val_loss * num_mask_tokens

            avg_val_loss = total_val_loss / len(val_loader)
            if total_masked_val_tokens:
                val_perplexity = math.exp(weighted_total_val_loss / total_masked_val_tokens)
            else:
                val_perplexity = float('nan')

            # Save the best model based on validation loss
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                val_ckpt_path = os.path.join(ckpt_dir, "best_model_epoch")
                model.save_model(val_ckpt_path)
                save_hyperparams(val_ckpt_path)

        print(f"Average train loss: {avg_train_loss}")
        print(f"Average train perplexity: {train_perplexity}\n")
        sys.stdout.flush()

        if val_loader:
            print(f"Average validation loss: {avg_val_loss}")
            print(f"Average validation perplexity: {val_perplexity}\n")
            sys.stdout.flush()

    return avg_train_loss, train_perplexity, avg_val_loss, val_perplexity
107
+
108
+
109
def test(model, test_loader, device):
    """Evaluate ``model`` on ``test_loader``.

    Args:
        model: Model supporting ``to(device).eval()`` and ``model(**batch)``
            returning an output with ``.loss``.
        test_loader: Iterable of dict batches containing ``labels``.
        device: Device the model and batches are moved to.

    Returns:
        ``(avg_test_loss, test_perplexity)`` where the perplexity is the
        exponential of the loss averaged with per-batch masked-token weights.
    """
    model.to(device).eval()
    total_test_loss = 0.0
    weighted_total_test_loss = 0.0
    total_masked_test_tokens = 0

    with torch.no_grad():
        for inputs in test_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            test_loss = model(**inputs).loss.item()

            # Positions that contribute to the loss are those whose label is not
            # -100; counting them avoids depending on a global tokenizer.
            num_mask_tokens = (inputs["labels"] != -100).sum().item()
            total_masked_test_tokens += num_mask_tokens

            total_test_loss += test_loss
            weighted_total_test_loss += test_loss * num_mask_tokens

    avg_test_loss = total_test_loss / len(test_loader)
    if total_masked_test_tokens:
        test_perplexity = math.exp(weighted_total_test_loss / total_masked_test_tokens)
    else:
        # No masked token was seen, so perplexity is undefined.
        test_perplexity = float('nan')

    return avg_test_loss, test_perplexity
132
+
133
+
134
if __name__ == "__main__":
    # Pick the GPU when one is available.
    device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
    print(device)

    # Freeze everything, then re-enable only the selected attention projections.
    model = MembraneMLM()
    model.to(device)
    model.freeze_model()
    model.unfreeze_n_layers()
    tokenizer = model.tokenizer

    train_loader, val_loader, test_loader = get_dataloaders(config)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = Adam(trainable_params, lr=config.LEARNING_RATE)

    # Train and test the model
    avg_train_loss, train_ppl, avg_val_loss, val_ppl = train_and_validate(
        model, optimizer, device, train_loader, val_loader, config.NUM_EPOCHS, config.CKPT_DIR
    )
    avg_test_loss, test_ppl = test(model, test_loader, device)

    results_dict = {
        "Average train loss": avg_train_loss,
        "Average train perplexity": train_ppl,
        "Average val loss": avg_val_loss,
        "Average val perplexity": val_ppl,
        "Average test loss": avg_test_loss,
        "Average test perplexity": test_ppl,
    }

    print("TRAIN AND TEST RESULTS")
    for metric_name, metric_value in results_dict.items():
        print(f"{metric_name}: {metric_value}\n")

    # Save training and test performance
    with open(f"{config.CKPT_DIR}/train_test_results.txt", 'w') as results_file:
        results_file.writelines(f'{metric_name}: {metric_value}\n' for metric_name, metric_value in results_dict.items())
167
+
168
+
169
+ ### Get embeddings from model
170
+ # best_model_pth = config.MLM_MODEL_PATH + "/best_model"
171
+
172
+ # model = AutoModel.from_pretrained(best_model_pth)
173
+ # tokenizer = AutoTokenizer.from_pretrained(best_model_pth)
174
+ # model.eval().to(device)
175
+
176
+ # random_seq = "WPIQMVYSLGQHADYMQWFTIMPPPIEMIFVWHNCTQHDYSFRERAGEVDQARMKTEMAR"
177
+ # inputs = tokenizer(random_seq, return_tensors='pt')
178
+ # inputs = {k: v.to(device) for k, v in inputs.items()}
179
+ # inputs = inputs['input_ids']
180
+ # print(inputs)
181
+ # with torch.no_grad():
182
+ # outputs = model(inputs).last_hidden_state
183
+ # print(outputs)
184
+ # print(outputs.size())
benchmarks/Supervised/.DS_Store ADDED
Binary file (6.15 kB). View file
 
benchmarks/Supervised/Localization/cell_localization_predictor.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.optim as optim
4
+ from torch.utils.data import DataLoader, Dataset
5
+ from transformers import AutoModel, AutoTokenizer
6
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
7
+
8
+ from tqdm import tqdm
9
+ from datetime import datetime
10
+ import pandas as pd
11
+ import numpy as np
12
+ import pickle
13
+ import os
14
+
15
+ # Hyperparameters dictionary
16
+ path = "/workspace/sg666/MDpLM"
17
+
18
+ hyperparams = {
19
+ "batch_size": 1,
20
+ "learning_rate": 5e-4,
21
+ "num_epochs": 5,
22
+ "esm_model_path": "facebook/esm2_t33_650M_UR50D",
23
+ 'mlm_model_path': path + "/benchmarks/MLM/model_ckpts/best_model_epoch",
24
+ "mdlm_model_path": path + "/checkpoints/membrane_automodel/epochs30_lr3e-4_bsz16_gradclip1_beta-one0.9_beta-two0.999_bf16_all-params",
25
+ "train_data": path + "/benchmarks/Supervised/Localization/true_deeploc2.0_cell-local_train-val.csv",
26
+ "test_data" : path + "/benchmarks/Supervised/Localization/true_deeploc2.0_cell-local_test.csv",
27
+ }
28
+
29
+ # Helper functions to obtain all embeddings for a sequence
30
def load_models(esm_model_path, mlm_model_path, mdlm_model_path, device=None):
    """Load the shared tokenizer and the three embedding models.

    Args:
        esm_model_path: Name or path of the ESM checkpoint (also provides the tokenizer).
        mlm_model_path: Path of the fine-tuned MLM checkpoint.
        mdlm_model_path: Path of the MDLM checkpoint.
        device: Device to place the models on. When omitted, CUDA is used if
            available and CPU otherwise, so the function no longer depends on
            a module-level ``device`` being defined by the caller.

    Returns:
        Tuple ``(esm_tokenizer, esm_model, mlm_model, mdlm_model)``.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    esm_tokenizer = AutoTokenizer.from_pretrained(esm_model_path)
    esm_model = AutoModel.from_pretrained(esm_model_path).to(device)
    mlm_model = AutoModel.from_pretrained(mlm_model_path).to(device)
    mdlm_model = AutoModel.from_pretrained(mdlm_model_path).to(device)

    return esm_tokenizer, esm_model, mlm_model, mdlm_model
37
+
38
def get_latents(embedding_type, tokenizer, esm_model, mlm_model, mdlm_model, sequence, device):
    """Embed ``sequence`` with the model selected by ``embedding_type``.

    Args:
        embedding_type: One of ``"esm"``, ``"mlm"`` or ``"mdlm"``.
        tokenizer: Tokenizer shared by all three models.
        esm_model, mlm_model, mdlm_model: Candidate embedding models.
        sequence: Sequence string to embed.
        device: Device the tokenized inputs are moved to.

    Returns:
        The selected model's ``last_hidden_state`` with the leading dimension
        squeezed away.

    Raises:
        ValueError: If ``embedding_type`` is not one of the supported names.
    """
    if embedding_type == "esm":
        # The ESM model receives the full tokenizer output as keyword arguments.
        inputs = tokenizer(sequence, return_tensors='pt').to(device)
        with torch.no_grad():
            return esm_model(**inputs).last_hidden_state.squeeze(0)

    models_by_type = {"mlm": mlm_model, "mdlm": mdlm_model}
    if embedding_type not in models_by_type:
        raise ValueError(
            f"Unknown embedding_type {embedding_type!r}; expected 'esm', 'mlm' or 'mdlm'"
        )

    # The MLM and MDLM models are called with the input ids only.
    input_ids = tokenizer(sequence, return_tensors='pt')['input_ids'].to(device)
    with torch.no_grad():
        return models_by_type[embedding_type](input_ids).last_hidden_state.squeeze(0)
55
+
56
+
57
+ # Dataset class can load pickle file
58
class LocalizationDataset(Dataset):
    """Dataset yielding ``(embeddings, label)`` pairs for cell-membrane localization.

    Rows whose sequence has 1024 or more characters are dropped. The label is
    ``0`` when the ``Cell membrane`` column equals 0 and ``1`` otherwise.

    Args:
        embedding_type: Which model produces the embeddings (``"esm"``,
            ``"mlm"`` or ``"mdlm"``).
        csv_file: CSV with ``Sequence`` and ``Cell membrane`` columns.
        esm_model_path, mlm_model_path, mdlm_model_path: Checkpoints to load.
        device: Device used when computing embeddings.
    """

    def __init__(self, embedding_type, csv_file, esm_model_path, mlm_model_path, mdlm_model_path, device):
        self.data = pd.read_csv(csv_file)
        self.data = self.data[self.data['Sequence'].apply(len) < 1024].reset_index(drop=True)
        self.embedding_type = embedding_type
        self.tokenizer, self.esm_model, self.mlm_model, self.mdlm_model = load_models(esm_model_path, mlm_model_path, mdlm_model_path)
        self.device = device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence = self.data.iloc[idx]['Sequence']
        # Argument order must match get_latents(..., esm_model, mlm_model, mdlm_model, ...);
        # the ESM and MLM models were previously passed in swapped positions.
        embeddings = get_latents(self.embedding_type, self.tokenizer, self.esm_model, self.mlm_model, self.mdlm_model,
                                 sequence, self.device)

        label = 0 if self.data.iloc[idx]['Cell membrane'] == 0 else 1
        # Shape (1,) so that batching gives (batch, 1), matching the predictor output.
        labels = torch.tensor(label, dtype=torch.float32).view(1, 1).squeeze(-1)

        return embeddings, labels
78
+
79
+ # Predict localization with MLP head using pooled embeddings
80
class LocalizationPredictor(nn.Module):
    """Two-layer MLP head producing one membrane-localization probability.

    Args:
        input_dim: Size of the last dimension of the incoming embeddings.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, 640),
            nn.ReLU(),
            nn.Linear(640, 1)
        )

    def forward(self, embeddings):
        """Return probabilities in (0, 1), averaged over dimension 1 of the logits."""
        logits = self.classifier(embeddings)
        logits = torch.mean(logits, dim=1)
        # A single logit needs a sigmoid: softmax over one value is always 1.0,
        # which made every prediction positive and the BCE loss uninformative.
        probs = torch.sigmoid(logits)
        return probs
94
+
95
+ # Training function
96
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean batch loss.

    Each batch of ``(embeddings, labels)`` is moved to ``device``, passed
    through ``model``, scored with ``criterion`` and used for one optimizer
    step.
    """
    model.train()
    batch_losses = []
    for embeddings, labels in tqdm(dataloader):
        embeddings = embeddings.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(embeddings), labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    return sum(batch_losses) / len(dataloader)
108
+
109
+ # Evaluation function
110
def evaluate(model, dataloader, device):
    """Collect model outputs and labels for every batch without gradients.

    Returns:
        Tuple ``(preds, true_labels)``: two lists with one NumPy array per batch.
    """
    model.eval()
    preds = []
    true_labels = []
    with torch.no_grad():
        for embeddings, labels in tqdm(dataloader):
            batch_output = model(embeddings.to(device))
            preds.append(batch_output.cpu().numpy())
            true_labels.append(labels.to(device).cpu().numpy())
    return preds, true_labels
120
+
121
+ # Metrics calculation
122
def calculate_metrics(preds, labels, threshold=0.5):
    """Compute classification metrics over all predictions at once.

    Metrics were previously computed per batch and averaged; with small
    batches (a single sample at batch size 1) precision, recall and F1 each
    reduce to a 0/1 hit indicator, so all five numbers equalled accuracy.
    Pooling every prediction first gives dataset-level metrics.

    Args:
        preds: Sequence of per-batch arrays of predicted probabilities.
        labels: Sequence of per-batch arrays of true labels.
        threshold: Probabilities strictly above this value count as positive.

    Returns:
        NumPy array ``[accuracy, precision, recall, f1_macro, f1_micro]``.

    Raises:
        ValueError: If ``preds`` is empty.
    """
    if len(preds) == 0:
        raise ValueError("calculate_metrics requires at least one batch of predictions")

    y_prob = np.concatenate([np.asarray(batch).reshape(-1) for batch in preds])
    y_true = np.concatenate([np.asarray(batch).reshape(-1) for batch in labels]).astype(int)
    y_pred = (y_prob > threshold).astype(int)

    avg_metrics = np.array([
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='macro'),
        recall_score(y_true, y_pred, average='macro'),
        f1_score(y_true, y_pred, average='macro'),
        f1_score(y_true, y_pred, average='micro'),
    ])
    print(avg_metrics)
    return avg_metrics
138
+
139
+
140
+
141
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Arguments shared by the train and test datasets.
    dataset_args = (
        hyperparams['esm_model_path'],
        hyperparams['mlm_model_path'],
        hyperparams['mdlm_model_path'],
        device,
    )

    for embedding_type in ['mdlm', 'esm', 'mlm']:
        # Initialize datasets
        train_dataset = LocalizationDataset(embedding_type, hyperparams['train_data'], *dataset_args)
        test_dataset = LocalizationDataset(embedding_type, hyperparams['test_data'], *dataset_args)

        # Prepare dataloaders
        train_dataloader = DataLoader(train_dataset, batch_size=hyperparams["batch_size"], shuffle=True)
        test_dataloader = DataLoader(test_dataset, batch_size=hyperparams["batch_size"], shuffle=False)

        # Initialize model, optimizer, and loss function.
        # The embedding width is read from an actual embedding instead of being
        # hard-coded per type, so the head always matches the model in use.
        input_dim = train_dataset[0][0].shape[-1]
        model = LocalizationPredictor(input_dim=input_dim).to(device)
        optimizer = optim.Adam(model.parameters(), lr=hyperparams["learning_rate"])
        criterion = nn.BCELoss()

        # Initialize main directory model checkpoints
        base_checkpoint_dir = f"{path}/benchmarks/Supervised/Localization/model_checkpoints/{embedding_type}"
        # Initialize subdirectory and name it based on hyperparameters
        hyperparam_str = f"batch_{hyperparams['batch_size']}_lr_{hyperparams['learning_rate']}_epochs_{hyperparams['num_epochs']}"
        model_checkpoint_dir = os.path.join(base_checkpoint_dir, hyperparam_str)
        os.makedirs(model_checkpoint_dir, exist_ok=True)

        # Training loop
        for epoch in range(hyperparams["num_epochs"]):
            # Train the model
            train_loss = train(model, train_dataloader, optimizer, criterion, device)
            print(f"EPOCH {epoch+1}/{hyperparams['num_epochs']}")
            print(f"TRAIN LOSS: {train_loss:.4f}")
            print("\n")

            # Save the model checkpoint for the current epoch
            checkpoint_path = os.path.join(model_checkpoint_dir, f"epoch{epoch + 1}.pth")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': train_loss,
            }, checkpoint_path)
            print(f"Checkpoint saved at {checkpoint_path}\n")

            # Save hyperparameters only once
            if epoch == 0:  # Hyperparameters don't change midway through training
                hyperparams_file = os.path.join(model_checkpoint_dir, "hyperparams.txt")
                with open(hyperparams_file, 'w') as f:
                    for key, value in hyperparams.items():
                        f.write(f"{key}: {value}\n")
                print(f"Hyperparameters saved at {hyperparams_file}\n")

        # Evaluate model on test dataset
        # NOTE(review): evaluation is assumed to run once after the last epoch - confirm nesting.
        print("Test set")
        test_preds, test_labels = evaluate(model, test_dataloader, device)
        test_metrics = calculate_metrics(test_preds, test_labels)
        print(test_metrics)
        print("TEST METRICS:")
        print(f"Accuracy: {test_metrics[0]:.4f}")
        print(f"Precision: {test_metrics[1]:.4f}")
        print(f"Recall: {test_metrics[2]:.4f}")
        print(f"F1 Macro Score: {test_metrics[3]:.4f}")
        print(f"F1 Micro Score: {test_metrics[4]:.4f}")

        # Save test results
        test_results_file = os.path.join(model_checkpoint_dir, "test_results.txt")
        with open(test_results_file, 'w') as f:
            f.write("TEST METRICS:\n")
            f.write(f"Accuracy: {test_metrics[0]:.4f}\n")
            f.write(f"Precision: {test_metrics[1]:.4f}\n")
            f.write(f"Recall: {test_metrics[2]:.4f}\n")
            f.write(f"F1 Macro Score: {test_metrics[3]:.4f}\n")
            f.write(f"F1 Micro: {test_metrics[4]:.4f}\n")
        print(f"Test results saved at {test_results_file}\n")
benchmarks/Supervised/Localization/process_cell_local_data.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Filter the DeepLoc 2.0 localization CSVs down to sequences shorter than 1024 residues."""
import pandas as pd

path = "/home/sg666/MDpLM/benchmarks/Supervised/Localization"

# Exclusive upper bound on sequence length.
# NOTE(review): presumably chosen to fit the encoder's context window - confirm.
MAX_SEQ_LEN = 1024


def _drop_long_sequences(df, max_len=MAX_SEQ_LEN):
    """Return the rows of *df* whose 'Sequence' is shorter than *max_len*, re-indexed from 0."""
    return df[df['Sequence'].apply(len) < max_len].reset_index(drop=True)


train_val = _drop_long_sequences(pd.read_csv(path + "/deeploc2.0_train_val.csv"))
test = _drop_long_sequences(pd.read_csv(path + "/deeploc2.0_test.csv"))

train_val.to_csv(path + "/true_deeploc2.0_cell-local_train-val.csv", index=False)
test.to_csv(path + "/true_deeploc2.0_cell-local_test.csv", index=False)
benchmarks/Supervised/Localization/true_deeploc2.0_cell-local_test.csv ADDED
The diff for this file is too large to render. See raw diff
 
memdlm_schematic.png → benchmarks/Supervised/Localization/true_deeploc2.0_cell-local_train-val.csv RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e27d6e53b463f6265e4ea6cc6c156d2d6bb11b69284a5988f29648ece581cb19
3
- size 228019
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ca38d78cc8fbc8777a23f456477901f5af4bbfda7a0908081effd09adbe7e94
3
+ size 12568908
benchmarks/Supervised/Membrane Type/membrane_type_predictor.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.optim as optim
4
+ from torch.utils.data import DataLoader, Dataset
5
+ from transformers import AutoModel, AutoTokenizer
6
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
7
+
8
+ from tqdm import tqdm
9
+ from datetime import datetime
10
+ import pandas as pd
11
+ import numpy as np
12
+ import pickle
13
+ import os
14
+
15
# Repository root. Defaults to the original cluster location; set the
# MDPLM_ROOT environment variable to run from a different checkout.
path = os.environ.get("MDPLM_ROOT", "/workspace/sg666/MDpLM")

# Hyperparameters dictionary: training settings plus model and data locations
hyperparams = {
    "batch_size": 1,
    "learning_rate": 5e-4,
    "num_epochs": 5,
    "esm_model_path": "facebook/esm2_t33_650M_UR50D",
    'mlm_model_path': path + "/benchmarks/MLM/model_ckpts/best_model_epoch",
    "mdlm_model_path": path + "/checkpoints/membrane_automodel/epochs30_lr3e-4_bsz16_gradclip1_beta-one0.9_beta-two0.999_bf16_all-params",
    "train_data": path + "/benchmarks/Supervised/Membrane Type/membrane_type_train.csv",
    "test_data": path + "/benchmarks/Supervised/Membrane Type/membrane_type_test.csv",
}
28
+
29
+ # Helper functions to obtain all embeddings for a sequence
30
def load_models(esm_model_path, mlm_model_path, mdlm_model_path, device=None):
    """Load the shared tokenizer and the three encoders used for embeddings.

    Args:
        esm_model_path: name or path passed to ``from_pretrained`` for both
            the tokenizer and the ESM encoder.
        mlm_model_path: name or path of the MLM encoder.
        mdlm_model_path: name or path of the MDLM encoder.
        device: where to place the encoders. When omitted, CUDA is used if
            available, otherwise the CPU. This is the same rule the script's
            entry point uses, so the function no longer depends on a global
            that only exists when the file is run as a script.

    Returns:
        tuple: ``(esm_tokenizer, esm_model, mlm_model, mdlm_model)``.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    esm_tokenizer = AutoTokenizer.from_pretrained(esm_model_path)
    esm_model = AutoModel.from_pretrained(esm_model_path).to(device)
    mlm_model = AutoModel.from_pretrained(mlm_model_path).to(device)
    mdlm_model = AutoModel.from_pretrained(mdlm_model_path).to(device)
    return esm_tokenizer, esm_model, mlm_model, mdlm_model
36
+
37
def get_latents(embedding_type, tokenizer, esm_model, mlm_model, mdlm_model, sequence, device):
    """Embed one sequence with the encoder selected by *embedding_type*.

    Args:
        embedding_type: one of ``"esm"``, ``"mlm"`` or ``"mdlm"``.
        tokenizer: callable producing model inputs for *sequence*.
        esm_model, mlm_model, mdlm_model: candidate encoders; only the one
            named by *embedding_type* is run.
        sequence: the sequence string to embed.
        device: device the tokenized inputs are moved to.

    Returns:
        The encoder's ``last_hidden_state`` with dimension 0 squeezed. No
        token positions are stripped.

    Raises:
        ValueError: if *embedding_type* is not a supported name (previously
            this surfaced as an unbound-local error at the return statement).
    """
    encoders = {"esm": esm_model, "mlm": mlm_model, "mdlm": mdlm_model}
    if embedding_type not in encoders:
        raise ValueError(
            f"Unknown embedding_type {embedding_type!r}; expected one of {sorted(encoders)}"
        )
    encoder = encoders[embedding_type]

    inputs = tokenizer(sequence, return_tensors="pt").to(device)
    # Inference only: the encoders are feature extractors, so no gradients are tracked.
    with torch.no_grad():
        embeddings = encoder(**inputs).last_hidden_state.squeeze(0)

    return embeddings
55
+
56
+
57
# Dataset that reads a CSV and embeds each sequence on the fly
class MembraneDataset(Dataset):
    """Pairs each sequence's encoder embeddings with its multi-label target.

    Rows whose 'Sequence' has 1024 or more characters are dropped at load
    time. The label vector is taken positionally from CSV columns 3-6.
    """

    def __init__(self, embedding_type, csv_file, esm_model_path, mlm_model_path, mdlm_model_path, device):
        """Read *csv_file*, filter long sequences, load encoders and build labels.

        Args:
            embedding_type: encoder name forwarded to ``get_latents``.
            csv_file: path to a CSV with a 'Sequence' column and label
                columns at positions 3-6.
            esm_model_path, mlm_model_path, mdlm_model_path: forwarded to
                ``load_models``.
            device: device forwarded to ``get_latents`` for tokenized inputs.
        """
        frame = pd.read_csv(csv_file)
        self.data = frame[frame['Sequence'].apply(len) < 1024].reset_index(drop=True)

        self.embedding_type = embedding_type
        self.device = device

        self.tokenizer, self.esm_model, self.mlm_model, self.mdlm_model = load_models(esm_model_path, mlm_model_path, mdlm_model_path)

        # Create multi-label target list (one list of four values per row)
        self.data['label'] = self.data.iloc[:, 3:7].values.tolist()

    def __len__(self):
        """Return the number of sequences kept after length filtering."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embeddings, labels)`` for row *idx*; labels are float32."""
        row = self.data.iloc[idx]
        embeddings = get_latents(self.embedding_type, self.tokenizer, self.esm_model, self.mlm_model, self.mdlm_model,
                                 row['Sequence'], self.device)
        labels = torch.tensor(row['label'], dtype=torch.float32)

        return embeddings, labels
82
+
83
+
84
# Predict per-class probabilities with an MLP head applied to token embeddings
class MembranePredictor(nn.Module):
    """Two-layer MLP applied per token, mean-pooled over dim 1, then sigmoid.

    Args:
        input_dim: size of the last dimension of the incoming embeddings.
        num_classes: number of independent output probabilities.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, 640),
            nn.ReLU(),
            nn.Linear(640, num_classes)
        )

    def forward(self, embeddings):
        """Return sigmoid probabilities, one per class, for each batch element."""
        logits = self.classifier(embeddings)
        # Pool the per-token logits over dim 1 before squashing.
        logits = torch.mean(logits, dim=1)
        probs = torch.sigmoid(logits)
        # These are probabilities, not logits: pair them with a loss that
        # expects probabilities (e.g. nn.BCELoss), not one that applies softmax.
        return probs
99
+
100
+ # Training function
101
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module being optimized; switched to training mode here.
        dataloader: iterable of ``(embeddings, labels)`` batches.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        float: the sum of batch losses divided by ``len(dataloader)``.
    """
    model.train()
    total_loss = 0
    for embeddings, labels in tqdm(dataloader):
        embeddings, labels = embeddings.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)
113
+
114
+ # Evaluation function
115
def evaluate(model, dataloader, device):
    """Run the model over *dataloader* without gradients and collect outputs.

    Args:
        model: trained module; switched to evaluation mode here.
        dataloader: iterable of ``(embeddings, labels)`` batches.
        device: device the batch tensors are moved to.

    Returns:
        tuple: ``(preds, true_labels)``, two lists holding one NumPy array
        per batch (model outputs and ground-truth labels respectively).
    """
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for embeddings, labels in tqdm(dataloader):
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            preds.append(outputs.cpu().numpy())
            true_labels.append(labels.cpu().numpy())
    return preds, true_labels
125
+
126
+ # Metrics calculation
127
def calculate_metrics(preds, labels, threshold=0.5):
    """Compute multi-label classification metrics over the whole evaluation set.

    The per-batch arrays are stacked and each metric is computed once on the
    full matrix. Computing macro/micro scores per batch (a single sample when
    the batch size is 1) and averaging them does not give dataset-level macro
    or micro scores, because most classes are absent from any one sample.

    Args:
        preds: list of per-batch probability arrays, each (batch, num_classes).
        labels: list of per-batch ground-truth arrays with matching shapes.
            NOTE(review): assumed to hold 0/1 indicators - confirm against the CSV.
        threshold: probabilities strictly above this count as positive.

    Returns:
        numpy.ndarray: ``[accuracy, precision_macro, recall_macro, f1_macro, f1_micro]``.
    """
    pred_matrix = (np.concatenate(preds, axis=0) > threshold).astype(int)
    label_matrix = np.concatenate(labels, axis=0).astype(int)

    accuracy = accuracy_score(label_matrix, pred_matrix)
    # zero_division=0 keeps the default score for empty classes but silences the warning.
    precision = precision_score(label_matrix, pred_matrix, average='macro', zero_division=0)
    recall = recall_score(label_matrix, pred_matrix, average='macro', zero_division=0)
    f1_macro = f1_score(label_matrix, pred_matrix, average='macro', zero_division=0)
    f1_micro = f1_score(label_matrix, pred_matrix, average='micro', zero_division=0)

    return np.array([accuracy, precision, recall, f1_macro, f1_micro])
142
+
143
+
144
if __name__ == "__main__":
    # Train and test one classification head per embedding source.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for embedding_type in ['mdlm', 'mlm', 'esm']:
        # Initialize datasets
        train_dataset = MembraneDataset(embedding_type,
                                        hyperparams['train_data'],
                                        hyperparams['esm_model_path'],
                                        hyperparams['mlm_model_path'],
                                        hyperparams['mdlm_model_path'],
                                        device)
        test_dataset = MembraneDataset(embedding_type,
                                       hyperparams['test_data'],
                                       hyperparams['esm_model_path'],
                                       hyperparams['mlm_model_path'],
                                       hyperparams['mdlm_model_path'],
                                       device)

        # Prepare dataloaders
        train_dataloader = DataLoader(train_dataset, batch_size=hyperparams["batch_size"], shuffle=True)
        test_dataloader = DataLoader(test_dataset, batch_size=hyperparams["batch_size"], shuffle=False)

        # Initialize model, optimizer, and loss function
        input_dim=640 if embedding_type=="mdlm" else 1280
        model = MembranePredictor(input_dim=input_dim, num_classes=4).to(device)
        optimizer = optim.Adam(model.parameters(), lr=hyperparams["learning_rate"])
        # The head emits independent sigmoid probabilities for a multi-label
        # target, so binary cross-entropy is the matching loss. CrossEntropyLoss
        # would apply a softmax on top of the sigmoid outputs.
        criterion = nn.BCELoss()

        # Initialize main directory model checkpoints
        base_checkpoint_dir = f"{path}/benchmarks/Supervised/Membrane Type/model_checkpoints/{embedding_type}"
        # Initialize subdirectory and name it based on hyperparameters
        hyperparam_str = f"batch_{hyperparams['batch_size']}_lr_{hyperparams['learning_rate']}_epochs_{hyperparams['num_epochs']}"
        model_checkpoint_dir = os.path.join(base_checkpoint_dir, hyperparam_str)
        os.makedirs(model_checkpoint_dir, exist_ok=True)

        # Training loop
        for epoch in range(hyperparams["num_epochs"]):
            # Train the model
            train_loss = train(model, train_dataloader, optimizer, criterion, device)
            print(f"EPOCH {epoch+1}/{hyperparams['num_epochs']}")
            print(f"TRAIN LOSS: {train_loss:.4f}")
            print("\n")

            # Save the model checkpoint for the current epoch
            checkpoint_path = os.path.join(model_checkpoint_dir, f"epoch{epoch + 1}.pth")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': train_loss,
            }, checkpoint_path)
            print(f"Checkpoint saved at {checkpoint_path}\n")

            # Save hyperparameters only once
            if epoch == 0:  # Hyperparameters don't change midway through training
                hyperparams_file = os.path.join(model_checkpoint_dir, "hyperparams.txt")
                with open(hyperparams_file, 'w') as f:
                    for key, value in hyperparams.items():
                        f.write(f"{key}: {value}\n")
                print(f"Hyperparameters saved at {hyperparams_file}\n")

        # Evaluate model on test dataset
        print("Test set")
        test_preds, test_labels = evaluate(model, test_dataloader, device)
        test_metrics = calculate_metrics(test_preds, test_labels)
        print("TEST METRICS:")
        print(f"Accuracy: {test_metrics[0]:.4f}")
        print(f"Precision: {test_metrics[1]:.4f}")
        print(f"Recall: {test_metrics[2]:.4f}")
        print(f"F1 Macro Score: {test_metrics[3]:.4f}")
        print(f"F1 Micro Score: {test_metrics[4]:.4f}")

        # Save test results
        test_results_file = os.path.join(model_checkpoint_dir, "test_results.txt")
        with open(test_results_file, 'w') as f:
            f.write("TEST METRICS:\n")
            f.write(f"Accuracy: {test_metrics[0]:.4f}\n")
            f.write(f"Precision: {test_metrics[1]:.4f}\n")
            f.write(f"Recall: {test_metrics[2]:.4f}\n")
            f.write(f"F1 Macro Score: {test_metrics[3]:.4f}\n")
            f.write(f"F1 Micro: {test_metrics[4]:.4f}\n")
        print(f"Test results saved at {test_results_file}\n")
benchmarks/Supervised/Membrane Type/membrane_type_test.csv ADDED
The diff for this file is too large to render. See raw diff
 
benchmarks/Supervised/Membrane Type/membrane_type_train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b8eec677afa2de578d04ee1a0fc9582b2f8cfc47622cbd6374309cd6ab96f3
3
+ size 12335695
benchmarks/Supervised/Membrane Type/split_membrane_type_data.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Splits the DeepLoc 2.1 membrane type data into training and testing CSVs.
# A Partition value of 4 marks testing data; every other partition goes to training.

import pandas as pd

# NOTE(review): this points at ".../benchmarks/DeepLoc/Membrane Type", while the
# script itself sits under "benchmarks/Supervised/Membrane Type" - confirm the path.
path = "/workspace/a03-sgoel/MDpLM/benchmarks/DeepLoc/Membrane Type"

# Partition value reserved for the held-out test split
TEST_PARTITION = 4

df = pd.read_csv(path + "/unsplit_membrane_type_all.csv")
# Drop the stray index column left by an earlier to_csv call; tolerate its
# absence so the script also works on a CSV that was saved without an index.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

is_test = df['Partition'] == TEST_PARTITION
train = df[~is_test]
test = df[is_test]

train.to_csv(path + "/membrane_type_train.csv", index=False)
test.to_csv(path + "/membrane_type_test.csv", index=False)
benchmarks/Supervised/Membrane Type/unsplit_membrane_type_all.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d878da32a06092f880262048e3c1eb692721c274b0a458fcc712a0dcbd80c71
3
+ size 15683507
benchmarks/Supervised/Solubility/solubility_transformer.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.optim as optim
4
+ from torch.utils.data import DataLoader, Dataset
5
+ from transformers import AutoModel, AutoTokenizer
6
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
7
+ from sklearn.model_selection import ParameterGrid
8
+ from tqdm import tqdm
9
+ import pandas as pd
10
+ import numpy as np
11
+ import sys
12
+ import os
13
+ from datetime import datetime
14
+ import logging
15
+
16
# Only surface errors from the transformers library
logging.getLogger("transformers").setLevel(logging.ERROR)

# Repository root. Defaults to the original cluster location; set the
# MDPLM_ROOT environment variable to run from a different checkout.
path = os.environ.get("MDPLM_ROOT", "/workspace/sg666/MDpLM")

# Hyperparameters dictionary: data and model locations plus training settings.
# The entry point's grid search overwrites the training entries via update().
hyperparams = {
    "train_data": path + "/data/membrane/train.csv",
    "val_data": path + "/data/membrane/val.csv",
    "test_data": path + "/data/membrane/test.csv",
    'esm_model_path': "facebook/esm2_t33_650M_UR50D",
    'mlm_model_path': path + "/benchmarks/MLM/model_ckpts/best_model_epoch",
    "mdlm_model_path": path + "/checkpoints/membrane_automodel/epochs30_lr3e-4_bsz16_gradclip1_beta-one0.9_beta-two0.999_bf16_all-params",
    "batch_size": 1,
    "learning_rate": 5e-5,
    "num_epochs": 2,
    "num_layers": 4,
    "num_heads": 16,
    "dropout": 0.5,
}
34
+
35
+
36
+ # Helper functions to obtain all embeddings for a sequence
37
def load_models(esm_model_path, mlm_model_path, mdlm_model_path, device=None):
    """Load the shared tokenizer and the three encoders used for embeddings.

    Args:
        esm_model_path: name or path passed to ``from_pretrained`` for both
            the tokenizer and the ESM encoder.
        mlm_model_path: name or path of the MLM encoder.
        mdlm_model_path: name or path of the MDLM encoder.
        device: where to place the encoders. When omitted, CUDA is used if
            available, otherwise the CPU. This is the same rule the script's
            entry point uses, so the function no longer depends on a global
            that only exists when the file is run as a script.

    Returns:
        tuple: ``(esm_tokenizer, esm_model, mlm_model, mdlm_model)``.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    esm_tokenizer = AutoTokenizer.from_pretrained(esm_model_path)
    esm_model = AutoModel.from_pretrained(esm_model_path).to(device)
    mlm_model = AutoModel.from_pretrained(mlm_model_path).to(device)
    mdlm_model = AutoModel.from_pretrained(mdlm_model_path).to(device)
    return esm_tokenizer, esm_model, mlm_model, mdlm_model
43
+
44
+
45
# Loaded (tokenizer, esm, mlm, mdlm) tuples keyed by their three model paths.
# Without this cache every call re-loaded all three encoders from scratch.
_LOADED_MODELS = {}


def get_latents(embedding_type, esm_model_path, mlm_model_path, mdlm_model_path, sequence, device):
    """Return per-residue embeddings of *sequence* from the selected encoder.

    Args:
        embedding_type: one of ``"esm"``, ``"mlm"`` or ``"mdlm"``.
        esm_model_path, mlm_model_path, mdlm_model_path: forwarded to
            ``load_models``; the loaded models are reused on later calls
            that pass the same three paths.
        sequence: sequence string; it is upper-cased before tokenization.
        device: device the tokenized inputs are moved to.

    Returns:
        The encoder's ``last_hidden_state`` with dimension 0 squeezed and the
        first and last positions sliced off.

    Raises:
        ValueError: if *embedding_type* is not a supported name (previously
            this surfaced as an unbound-local error after loading the models).
    """
    supported = ("esm", "mlm", "mdlm")
    if embedding_type not in supported:
        raise ValueError(
            f"Unknown embedding_type {embedding_type!r}; expected one of {list(supported)}"
        )

    cache_key = (esm_model_path, mlm_model_path, mdlm_model_path)
    if cache_key not in _LOADED_MODELS:
        _LOADED_MODELS[cache_key] = load_models(*cache_key)
    tokenizer, esm_model, mlm_model, mdlm_model = _LOADED_MODELS[cache_key]

    model = {"esm": esm_model, "mlm": mlm_model, "mdlm": mdlm_model}[embedding_type]

    inputs = tokenizer(sequence.upper(), return_tensors="pt").to(device)['input_ids']
    with torch.no_grad():
        # [1:-1] drops the first and last token positions so the result lines
        # up with the residues (presumably the tokenizer's special tokens).
        embeddings = model(inputs).last_hidden_state.squeeze(0)[1:-1]

    return embeddings
60
+
61
+
62
+ # Dataset class that loads embeddings and labels
63
class SolubilityDataset(Dataset):
    """Yields per-residue embeddings and case-derived binary labels.

    Labels come from letter case in the raw sequence: lowercase residues map
    to 0 and all other characters to 1.
    """

    def __init__(self, embedding_type, csv_file, esm_model_path, mlm_model_path, mdlm_model_path, device, max_rows=5):
        """Read *csv_file* and remember which encoder to use.

        Args:
            embedding_type: encoder name forwarded to ``get_latents``.
            csv_file: path to a CSV with a 'Sequence' column.
            esm_model_path, mlm_model_path, mdlm_model_path: forwarded to
                ``get_latents``.
            device: device forwarded to ``get_latents``.
            max_rows: keep only the first *max_rows* rows, or every row when
                ``None``.
                NOTE(review): the default of 5 preserves the previously
                hard-coded ``.head(5)``, which looks like a debugging
                leftover; pass ``max_rows=None`` to use the full CSV.
        """
        frame = pd.read_csv(csv_file)
        if max_rows is not None:
            frame = frame.head(max_rows)
        self.data = frame
        self.embedding_type = embedding_type
        self.esm_model_path = esm_model_path
        self.mlm_model_path = mlm_model_path
        self.mdlm_model_path = mdlm_model_path
        self.device = device

    def __len__(self):
        """Return the number of rows kept."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embeddings, labels, seq_len)`` for row *idx*."""
        sequence = self.data.iloc[idx]['Sequence']
        seq_len = len(sequence)
        embeddings = get_latents(self.embedding_type, self.esm_model_path, self.mlm_model_path, self.mdlm_model_path,
                                 sequence, self.device)
        # Lowercase residues = soluble, uppercase = insoluble
        label = [0 if residue.islower() else 1 for residue in sequence]
        labels = torch.tensor(label, dtype=torch.float32)

        return embeddings, labels, seq_len
86
+
87
# Per-residue classification head (a plain MLP; no attention layers are used)
class SolubilityPredictor(nn.Module):
    """Two-layer MLP mapping each residue embedding to a probability.

    Args:
        input_dim: size of the last dimension of the incoming embeddings.
        hidden_dim, num_heads, num_layers, dropout: accepted so existing call
            sites keep working, but not used by the current architecture.
    """

    def __init__(self, input_dim, hidden_dim, num_heads, num_layers, dropout):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, 320),
            nn.ReLU(),
            nn.Linear(320, 1)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, embeddings):
        """Return one sigmoid probability per position of *embeddings*."""
        logits = self.classifier(embeddings)
        # Drop the trailing size-1 output dimension before squashing.
        probs = self.sigmoid(logits.squeeze(-1))

        return probs
116
+
117
+
118
+ # Training function
119
def train(model, train_loader, val_loader, optimizer, criterion, device):
    """
    Trains the model for a single epoch, then measures validation loss.
    Args:
        model (nn.Module): model that will be trained
        train_loader (DataLoader): batches of (embeddings, labels, seq_len) to train on
        val_loader (DataLoader): batches of (embeddings, labels, seq_len) to validate on
        optimizer (torch.optim): optimizer
        criterion (nn.Module): loss function called as criterion(outputs, labels)
        device (torch.device): device the batch tensors are moved to
    Returns:
        tuple: (mean training loss per batch, mean validation loss per batch)
    """
    # Training loop
    model.train()
    train_loss = 0

    prog_bar = tqdm(total=len(train_loader), leave=True, file=sys.stdout)
    for step, batch in enumerate(train_loader, start=1):
        embeddings, labels, seq_len = batch
        embeddings, labels = embeddings.to(device), labels.to(device)
        # No squeeze(1) here: it only ever acted on length-1 sequences, where it
        # made the output shape disagree with the labels, and the validation
        # loop below never applied it.
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        prog_bar.update()
        sys.stdout.flush()
    prog_bar.close()

    # Validation loop
    model.eval()
    val_loss = 0.0

    prog_bar = tqdm(total=len(val_loader), leave=True, file=sys.stdout)
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            embeddings, labels, seq_len = batch
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            prog_bar.update()
            sys.stdout.flush()
    prog_bar.close()

    return train_loss/len(train_loader), val_loss/len(val_loader)
167
+
168
+
169
+
170
+ # Evaluation function
171
def evaluate(model, dataloader, device):
    """
    Performs inference with a trained model
    Args:
        model (nn.Module): the trained model
        dataloader (DataLoader): batches of (embeddings, labels, seq_len) to run on
        device (torch.device): device (GPU or CPU) to be used for inference
    Returns:
        preds (list): one NumPy array of predicted per-residue probabilities per batch
        true_labels (list): one NumPy array of ground truth per-residue labels per batch
    """
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for embeddings, labels, seq_len in tqdm(dataloader):
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            preds.append(outputs.cpu().numpy())
            true_labels.append(labels.cpu().numpy())
    return preds, true_labels
191
+
192
+ # Metrics calculation
193
def calculate_metrics(preds, labels, threshold=0.5):
    """
    Calculates metrics to assess model performance over all residues
    Args:
        preds (list): per-batch arrays of predicted probabilities (must be non-empty)
        labels (list): per-batch arrays of ground truth labels
        threshold (float): probabilities strictly above this count as the positive class
    Returns:
        accuracy (float): accuracy
        precision (float): precision
        recall (float): recall
        f1 (float): F1 score
        roc_auc (float): AUROC score, computed from the raw probabilities
    """
    # Pool every residue of every batch into one flat vector per quantity.
    flat_prob_preds = np.concatenate([np.ravel(pred) for pred in preds])
    flat_labels = np.concatenate([np.ravel(label) for label in labels])
    flat_binary_preds = (flat_prob_preds > threshold).astype(int)

    accuracy = accuracy_score(flat_labels, flat_binary_preds)
    precision = precision_score(flat_labels, flat_binary_preds)
    recall = recall_score(flat_labels, flat_binary_preds)
    f1 = f1_score(flat_labels, flat_binary_preds)
    roc_auc = roc_auc_score(flat_labels, flat_prob_preds)

    return accuracy, precision, recall, f1, roc_auc
225
+
226
+
227
if __name__ == "__main__":
    # Train, validate and test one classification head per embedding source.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    for embedding_type in ['mlm', 'esm', 'mdlm']:
        best_val_loss = float('inf')
        best_model = None
        best_hyperparams = None

        # Load train, test and validation datasets
        train_dataset = SolubilityDataset(embedding_type,
                                          hyperparams['train_data'],
                                          hyperparams['esm_model_path'],
                                          hyperparams['mlm_model_path'],
                                          hyperparams['mdlm_model_path'],
                                          device)
        test_dataset = SolubilityDataset(embedding_type,
                                         hyperparams['test_data'],
                                         hyperparams['esm_model_path'],
                                         hyperparams['mlm_model_path'],
                                         hyperparams['mdlm_model_path'],
                                         device)
        val_dataset = SolubilityDataset(embedding_type,
                                        hyperparams['val_data'],
                                        hyperparams['esm_model_path'],
                                        hyperparams['mlm_model_path'],
                                        hyperparams['mdlm_model_path'],
                                        device)

        # Load PyTorch datasets into DataLoaders.
        # NOTE: these are built before the grid loop, so a 'batch_size' value
        # from the grid does not change the loaders.
        train_dataloader = DataLoader(train_dataset, batch_size=hyperparams["batch_size"], shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=hyperparams["batch_size"], shuffle=False)
        test_dataloader = DataLoader(test_dataset, batch_size=hyperparams["batch_size"], shuffle=False)

        ### Grid search to explore hyperparameter space
        # Define hyperparameters
        param_grid = {
            'learning_rate': [5e-4],
            'batch_size': [1],
            'num_heads': [4],
            'num_layers': [2],
            'dropout': [0.5],
            'num_epochs': [5]
        }

        # Loop over the parameter grid
        grid = ParameterGrid(param_grid)
        for params in grid:
            # Update hyperparameters
            hyperparams.update(params)

            # Update model with the new set of hyperparams
            input_dim=640 if embedding_type=="mdlm" else 1280
            hidden_dim = input_dim
            model = SolubilityPredictor(
                input_dim=input_dim,
                hidden_dim=hidden_dim,
                num_layers=hyperparams["num_layers"],
                num_heads=hyperparams["num_heads"],
                dropout=hyperparams['dropout']
            )
            model = model.to(device)  # Push model to the selected device

            # Update optimizer
            optimizer = optim.Adam(model.parameters(), lr=hyperparams["learning_rate"])
            criterion = nn.BCELoss()

            # Train
            for epoch in range(hyperparams["num_epochs"]):
                print(f"EPOCH {epoch+1}/{hyperparams['num_epochs']}")
                train_loss, val_loss = train(model, train_dataloader, val_dataloader, optimizer, criterion, device)
                print(f"TRAIN LOSS: {train_loss:.4f}")
                print(f"VALIDATION LOSS: {val_loss:.4f}\n")
                sys.stdout.flush()

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    # state_dict() returns references to the live parameters, so
                    # clone them; otherwise later epochs overwrite the "best" weights.
                    best_model = {
                        name: tensor.detach().clone()
                        for name, tensor in model.state_dict().items()
                    }
                    # Snapshot the settings that produced these weights.
                    best_hyperparams = dict(hyperparams)

            # Evaluate model on test sequences
            print("TEST METRICS:")
            test_preds, test_labels = evaluate(model, test_dataloader, device)
            test_metrics = calculate_metrics(test_preds, test_labels)
            print(f"Accuracy: {test_metrics[0]:.4f}")
            print(f"Precision: {test_metrics[1]:.4f}")
            print(f"Recall: {test_metrics[2]:.4f}")
            print(f"F1 Score: {test_metrics[3]:.4f}")
            print(f"ROC AUC: {test_metrics[4]:.4f}")
            print(f"\n")
            sys.stdout.flush()

            ### Save model and metrics for this hyperparameter combination
            folder_name = f"{path}/benchmarks/Supervised/Solubility/transformer_models/{embedding_type}/lr{hyperparams['learning_rate']}_bs{hyperparams['batch_size']}_epochs{hyperparams['num_epochs']}_layers{hyperparams['num_layers']}_heads{hyperparams['num_heads']}_drpt{hyperparams['dropout']}"
            os.makedirs(folder_name, exist_ok=True)

            # Save current model for this hyperparameter combination
            model_file_path = os.path.join(folder_name, "model.pth")
            torch.save(model.state_dict(), model_file_path)

            # Save hyperparameters and test metrics to txt file
            output_file_path = os.path.join(folder_name, "hyperparams_and_test_results.txt")
            with open(output_file_path, 'w') as out_file:
                for key, value in hyperparams.items():
                    out_file.write(f"{key}: {value}\n")

                out_file.write("\nTEST METRICS:\n")
                out_file.write(f"Accuracy: {test_metrics[0]:.4f}\n")
                out_file.write(f"Precision: {test_metrics[1]:.4f}\n")
                out_file.write(f"Recall: {test_metrics[2]:.4f}\n")
                out_file.write(f"F1 Score: {test_metrics[3]:.4f}\n")
                out_file.write(f"ROC AUC: {test_metrics[4]:.4f}\n")

        # Save the best model and its hyperparameters
        if best_model is not None:
            best_model_dir = f"{path}/benchmarks/Supervised/Solubility/transformer_models/{embedding_type}"
            os.makedirs(best_model_dir, exist_ok=True)
            best_model_path = os.path.join(best_model_dir, "best_model.pth")
            torch.save(best_model, best_model_path)

            # Save the hyperparameters for the best model
            best_hyperparams_path = f"{path}/benchmarks/Supervised/Solubility/transformer_models/{embedding_type}/best_model_hyperparams.txt"
            with open(best_hyperparams_path, 'w') as out_file:
                out_file.write("Best Validation Loss: {:.4f}\n".format(best_val_loss))
                for key, value in best_hyperparams.items():
                    out_file.write(f"{key}: {value}\n")
checkpoints/.DS_Store ADDED
Binary file (8.2 kB). View file
 
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/esm2_t30_150M_UR50D",
3
+ "architectures": [
4
+ "EsmModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "classifier_dropout": null,
8
+ "emb_layer_norm_before": false,
9
+ "esmfold_config": null,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.0,
12
+ "hidden_size": 640,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2560,
15
+ "is_folding_model": false,
16
+ "layer_norm_eps": 1e-05,
17
+ "mask_token_id": 32,
18
+ "max_position_embeddings": 1026,
19
+ "model_type": "esm",
20
+ "num_attention_heads": 20,
21
+ "num_hidden_layers": 30,
22
+ "pad_token_id": 1,
23
+ "position_embedding_type": "rotary",
24
+ "token_dropout": true,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.44.2",
27
+ "use_cache": true,
28
+ "vocab_list": null,
29
+ "vocab_size": 33
30
+ }
config.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - _self_
3
+ - /callbacks: [checkpoint_every_n_steps, checkpoint_monitor, learning_rate_monitor]
4
+ - /model: small
5
+ - /strategy: ddp
6
+ - /noise: loglinear
7
+ - /lr_scheduler: constant_warmup
8
+
9
+ mode: sample_eval # train / ppl_eval / sample_eval
10
+ diffusion: absorbing_state
11
+ backbone: membrane_esm_finetune # dit / dimamba / ar / vanilla_esm_pretrain / membrane_esm_finetune
12
+ parameterization: subs # subs / d3pm / sedd
13
+ time_conditioning: False
14
+ T: 0 # 0 (continuous time) / 1000
15
+ subs_masking: False
16
+
17
+ seed: 42
18
+
19
+ data:
20
+ train:
21
+ vanilla_esm_train_path: /workspace/sg666/MDpLM/data/uniref50/200k_seqs/train.csv
22
+ membrane_esm_train_path: /workspace/sg666/MDpLM/data/membrane/train.csv
23
+ wrap: null
24
+ test:
25
+ vanilla_esm_test_path: /workspace/sg666/MDpLM/data/uniref50/200k_seqs/test.csv
26
+ membrane_esm_test_path: /workspace/sg666/MDpLM/data/membrane/test.csv
27
+ wrap: null
28
+ valid:
29
+ vanilla_esm_valid_path: /workspace/sg666/MDpLM/data/uniref50/200k_seqs/val.csv
30
+ membrane_esm_valid_path: /workspace/sg666/MDpLM/data/membrane/val.csv
31
+ wrap: null
32
+ wrapping: True
33
+
34
+ loader:
35
+ global_batch_size: 8
36
+ eval_global_batch_size: ${.global_batch_size}
37
+ # Note: batch_size and eval_batch_size are **per machine**
38
+ batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
39
+ eval_batch_size: ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
40
+ num_workers: ${eval:"len(__import__('os').sched_getaffinity(0))"}
41
+ pin_memory: True
42
+
43
+ sampling:
44
+ predictor: ddpm_cache # analytic, ddpm, ddpm_cache
45
+ steps: 128
46
+ noise_removal: True
47
+ # TODO(yair): @subham, why aren't these params under `eval`?
48
+ num_sample_batches: 2 # Total samples: `num_gpus` * `loader.eval_batch_size` * num_sample_batches
49
+ num_sample_log: 2
50
+ semi_ar: False
51
+ stride_length: 1
52
+ num_strides: 1
53
+
54
+ training:
55
+ ema: 0.9999
56
+ antithetic_sampling: True
57
+ importance_sampling: False
58
+ sampling_eps: 1e-3
59
+ change_of_variables: False
60
+ mlm_model_path: /workspace/sg666/MDpLM/benchmarks/MLM/model_ckpts_650M/best_model_epoch
61
+ esm_model_path: facebook/esm2_t30_150M_UR50D
62
+ focus_mask: False
63
+
64
+ eval:
65
+ checkpoint_path: /workspace/sg666/MDpLM/checkpoints/membrane_mdlm/eos-wrapping_epochs60_lr3e-4_200k-seqs_bsz16_all-params_no-compile_gradclip1_beta-one0.9_beta-two0.999_bf16/checkpoints/best.ckpt # Used to evaluate a checkpoint after training.
66
+ disable_ema: False
67
+ compute_generative_perplexity: False
68
+ perplexity_batch_size: 8
69
+ compute_perplexity_on_sanity: False
70
+ gen_ppl_eval_model_name_or_path: gpt2-large # gpt2-large, meta-llama/Llama-2-7b-hf
71
+ generate_samples: True
72
+ generation_model: /workspace/sg666/MDpLM/checkpoints/membrane_automodel/epochs60_lr3e-4_200k-seqs_bsz16_all-params_no-compile_gradclip1_beta-one0.9_beta-two0.999_bf16/
73
+
74
+ optim:
75
+ weight_decay: 0.075
76
+ lr: 3e-4
77
+ beta1: 0.9
78
+ beta2: 0.999
79
+ eps: 1e-8
80
+
81
+ Model:
82
+ hidden_size: 1280
83
+ cond_dim: 256
84
+ n_heads: 20
85
+ n_blocks: 4
86
+ dropout: 0.5
87
+ length: null #512
88
+ scale_by_sigma: True
89
+
90
+ trainer:
91
+ _target_: lightning.Trainer
92
+ accelerator: cuda
93
+ num_nodes: 1
94
+ devices: ${device_count:}
95
+ accumulate_grad_batches: ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
96
+ gradient_clip_val: 1.0
97
+ precision: bf16
98
+ num_sanity_val_steps: 2
99
+ max_epochs: 60
100
+ max_steps: 1_000_000
101
+ log_every_n_steps: 10
102
+ limit_train_batches: 1.0 # train on full dataset, can be used to toggle quick run
103
+ limit_val_batches: 1.0 # validate on full dataset, can be used to toggle quick run
104
+ val_check_interval: 955
105
+
106
+ wandb:
107
+ project: MDpLM_finetune_membrane_200k-seqs
108
+ notes: null
109
+ group: programmablebio
110
+ job_type: null
111
+ name: dit_test #dit_wrapping_epochs60_lr3e-4_200k-seqs_bsz16_all-params_no-compile_gradclip1_beta-one0.9_beta-two0.999_bf16
112
+ id: ${.name}_${seed}
113
+
114
+ hydra:
115
+ run:
116
+ dir: /workspace/sg666/MDpLM/outputs/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}
117
+ job:
118
+ chdir: true
119
+
120
+ checkpointing:
121
+ # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
122
+ save_dir: /workspace/sg666/MDpLM/checkpoints/membrane_mdlm/
123
+ # Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
124
+ resume_from_ckpt: false
125
+ resume_ckpt_path: ${.save_dir}/epochs30_lr3e-4_bsz8_gradclip1_beta-one0.9_beta-two0.999_bf16_all-params_no-compile/checkpoints/last.ckpt #/checkpoints/last.ckpt
126
+ pretrained_esm_mdlm_automodel_path: /workspace/sg666/MDpLM/checkpoints/vanilla_esm_pretrained_automodel/epochs10_lr3e-4_200k-seqs_bsz16_all-params_no-compile_gradclip1_beta-one0.9_beta-two0.999_bf16/
127
+ finetuned_esm_mdlm_automodel_path: /workspace/sg666/MDpLM/checkpoints/membrane_mdlm/
data/.DS_Store ADDED
Binary file (6.15 kB). View file
 
data/membrane/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/membrane/train.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/membrane/val.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/uniref/100k_seqs/check_data.ipynb ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 21,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import seaborn as sns\n",
11
+ "import matplotlib.pyplot as plt\n",
12
+ "import numpy as np"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 4,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "path = \"/home/sg666/MDpLM/data/uniref50/100k_seqs\""
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 5,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "train = pd.read_csv(path + \"/train.csv\")\n",
31
+ "test = pd.read_csv(path + \"/test.csv\")\n",
32
+ "val = pd.read_csv(path + '/val.csv')"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 23,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "valid_residues = ['A','R','N','D','C','E','Q','G','H','I','L','K','M','F','P','S','T','W','Y','V']\n",
42
+ "\n",
43
+ "for df in [train, test, val]:\n",
44
+ " df['Length'] = df['Sequence'].str.len()\n",
45
+ "\n",
46
+ " for residue in valid_residues:\n",
47
+ " df[residue] = 0\n",
48
+ "\n",
49
+ " for idx, row in df.iterrows():\n",
50
+ " sequence = row['Sequence']\n",
51
+ "\n",
52
+ " for residue in valid_residues:\n",
53
+ " df.at[idx, residue] = sequence.count(residue)"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 28,
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "image/png": "",
64
+ "text/plain": [
65
+ "<Figure size 1200x600 with 1 Axes>"
66
+ ]
67
+ },
68
+ "metadata": {},
69
+ "output_type": "display_data"
70
+ }
71
+ ],
72
+ "source": [
73
+ "amino_acid_frequencies = {}\n",
74
+ "\n",
75
+ "datasets = {'Train': train, 'Test': test, 'Val': val}\n",
76
+ "\n",
77
+ "\n",
78
+ "for name, df in datasets.items():\n",
79
+ " # Relative frequency of each amino acid: summed residue counts divided by the summed sequence lengths of the dataset\n",
80
+ " amino_acid_frequencies[name] = df[valid_residues].sum() / df['Length'].sum()\n",
81
+ "\n",
82
+ "# Convert frequencies to a dataframe for easier manipulation\n",
83
+ "freq_df = pd.DataFrame(amino_acid_frequencies)\n",
84
+ "\n",
85
+ "# Plot a grouped bar chart comparing amino acid frequencies across the splits\n",
86
+ "plt.figure(figsize=(12, 6))\n",
87
+ "\n",
88
+ "# Set bar width\n",
89
+ "bar_width = 0.2\n",
90
+ "\n",
91
+ "# Generate positions for the bars\n",
92
+ "amino_acids = list(valid_residues)\n",
93
+ "x = np.arange(len(amino_acids)) # positions for the amino acids\n",
94
+ "\n",
95
+ "# Plot the bars for each dataset with an offset\n",
96
+ "plt.bar(x - bar_width, freq_df['Train'], width=bar_width, label='Train', align='center')\n",
97
+ "plt.bar(x, freq_df['Test'], width=bar_width, label='Test', align='center')\n",
98
+ "plt.bar(x + bar_width, freq_df['Val'], width=bar_width, label='Val', align='center')\n",
99
+ "\n",
100
+ "plt.xticks(x, amino_acids)\n",
101
+ "\n",
102
+ "plt.title('UniRef50 (100k Sequences) - Train, Test, and Val Residue Frequencies')\n",
103
+ "plt.xlabel('Amino Acid')\n",
104
+ "plt.ylabel('Frequency')\n",
105
+ "\n",
106
+ "plt.legend()\n",
107
+ "plt.show()"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 30,
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "data": {
117
+ "image/png": "",
118
+ "text/plain": [
119
+ "<Figure size 1200x600 with 1 Axes>"
120
+ ]
121
+ },
122
+ "metadata": {},
123
+ "output_type": "display_data"
124
+ }
125
+ ],
126
+ "source": [
127
+ "# 2. Sequence lengths histogram\n",
128
+ "plt.figure(figsize=(12, 6))\n",
129
+ "for name, df in datasets.items():\n",
130
+ " plt.hist(df['Length'], bins=30, alpha=0.7, label=name, density=True)\n",
131
+ "\n",
132
+ "plt.title('UniRef50 (100k Sequences) - Train, Test, and Val Sequence Lengths')\n",
133
+ "plt.xlabel('Sequence Length')\n",
134
+ "plt.ylabel('Frequency')\n",
135
+ "plt.legend()\n",
136
+ "plt.show()"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": []
145
+ }
146
+ ],
147
+ "metadata": {
148
+ "kernelspec": {
149
+ "display_name": "Python 3",
150
+ "language": "python",
151
+ "name": "python3"
152
+ },
153
+ "language_info": {
154
+ "codemirror_mode": {
155
+ "name": "ipython",
156
+ "version": 3
157
+ },
158
+ "file_extension": ".py",
159
+ "mimetype": "text/x-python",
160
+ "name": "python",
161
+ "nbconvert_exporter": "python",
162
+ "pygments_lexer": "ipython3",
163
+ "version": "3.10.12"
164
+ }
165
+ },
166
+ "nbformat": 4,
167
+ "nbformat_minor": 2
168
+ }
data/uniref/100k_seqs/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/uniref/100k_seqs/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baa07e6863c4d4b3fdc707b539d9520d66fc8d52be68c9d1c444fa96abc3b77f
3
+ size 20059182
data/uniref/100k_seqs/val.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/uniref/200k_seqs/check_data.ipynb ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import seaborn as sns\n",
11
+ "import matplotlib.pyplot as plt\n",
12
+ "import numpy as np"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 2,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "path = \"/home/sg666/MDpLM/data/uniref50/200k_seqs\""
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 3,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "train = pd.read_csv(path + \"/train.csv\")\n",
31
+ "test = pd.read_csv(path + \"/test.csv\")\n",
32
+ "val = pd.read_csv(path + '/val.csv')"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 4,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "valid_residues = ['A','R','N','D','C','E','Q','G','H','I','L','K','M','F','P','S','T','W','Y','V']\n",
42
+ "\n",
43
+ "for df in [train, test, val]:\n",
44
+ " df['Length'] = df['Sequence'].str.len()\n",
45
+ "\n",
46
+ " for residue in valid_residues:\n",
47
+ " df[residue] = 0\n",
48
+ "\n",
49
+ " for idx, row in df.iterrows():\n",
50
+ " sequence = row['Sequence']\n",
51
+ "\n",
52
+ " for residue in valid_residues:\n",
53
+ " df.at[idx, residue] = sequence.count(residue)"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 5,
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "image/png": "",
64
+ "text/plain": [
65
+ "<Figure size 1200x600 with 1 Axes>"
66
+ ]
67
+ },
68
+ "metadata": {},
69
+ "output_type": "display_data"
70
+ }
71
+ ],
72
+ "source": [
73
+ "amino_acid_frequencies = {}\n",
74
+ "\n",
75
+ "datasets = {'Train': train, 'Test': test, 'Val': val}\n",
76
+ "\n",
77
+ "\n",
78
+ "for name, df in datasets.items():\n",
79
+ " # Relative frequency of each amino acid: summed residue counts divided by the summed sequence lengths of the dataset\n",
80
+ " amino_acid_frequencies[name] = df[valid_residues].sum() / df['Length'].sum()\n",
81
+ "\n",
82
+ "# Convert frequencies to a dataframe for easier manipulation\n",
83
+ "freq_df = pd.DataFrame(amino_acid_frequencies)\n",
84
+ "\n",
85
+ "# Plot a grouped bar chart comparing amino acid frequencies across the splits\n",
86
+ "plt.figure(figsize=(12, 6))\n",
87
+ "\n",
88
+ "# Set bar width\n",
89
+ "bar_width = 0.2\n",
90
+ "\n",
91
+ "# Generate positions for the bars\n",
92
+ "amino_acids = list(valid_residues)\n",
93
+ "x = np.arange(len(amino_acids)) # positions for the amino acids\n",
94
+ "\n",
95
+ "# Plot the bars for each dataset with an offset\n",
96
+ "plt.bar(x - bar_width, freq_df['Train'], width=bar_width, label='Train', align='center')\n",
97
+ "plt.bar(x, freq_df['Test'], width=bar_width, label='Test', align='center')\n",
98
+ "plt.bar(x + bar_width, freq_df['Val'], width=bar_width, label='Val', align='center')\n",
99
+ "\n",
100
+ "plt.xticks(x, amino_acids)\n",
101
+ "\n",
102
+ "plt.title('UniRef50 (200k Sequences) - Train, Test, and Val Residue Frequencies')\n",
103
+ "plt.xlabel('Amino Acid')\n",
104
+ "plt.ylabel('Frequency')\n",
105
+ "\n",
106
+ "plt.legend()\n",
107
+ "plt.show()"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 6,
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "data": {
117
+ "image/png": "",
118
+ "text/plain": [
119
+ "<Figure size 1200x600 with 1 Axes>"
120
+ ]
121
+ },
122
+ "metadata": {},
123
+ "output_type": "display_data"
124
+ }
125
+ ],
126
+ "source": [
127
+ "# 2. Sequence lengths histogram\n",
128
+ "plt.figure(figsize=(12, 6))\n",
129
+ "for name, df in datasets.items():\n",
130
+ " plt.hist(df['Length'], bins=30, alpha=0.7, label=name)\n",
131
+ "\n",
132
+ "plt.title('UniRef50 (200k Sequences) - Train, Test, and Val Sequence Lengths')\n",
133
+ "plt.xlabel('Sequence Length')\n",
134
+ "plt.ylabel('Frequency')\n",
135
+ "plt.legend()\n",
136
+ "plt.show()"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": []
145
+ }
146
+ ],
147
+ "metadata": {
148
+ "kernelspec": {
149
+ "display_name": "Python 3",
150
+ "language": "python",
151
+ "name": "python3"
152
+ },
153
+ "language_info": {
154
+ "codemirror_mode": {
155
+ "name": "ipython",
156
+ "version": 3
157
+ },
158
+ "file_extension": ".py",
159
+ "mimetype": "text/x-python",
160
+ "name": "python",
161
+ "nbconvert_exporter": "python",
162
+ "pygments_lexer": "ipython3",
163
+ "version": "3.10.12"
164
+ }
165
+ },
166
+ "nbformat": 4,
167
+ "nbformat_minor": 2
168
+ }