-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_PMGen.py
183 lines (164 loc) · 12.1 KB
/
run_PMGen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import argparse
import pandas as pd
from run_utils import run_PMGen_wrapper, run_PMGen_modeling, run_proteinmpnn, run_single_proteinmpnn, protein_mpnn_wrapper, MultipleAnchors, get_best_structres
from Bio import SeqIO
import warnings
import os
from Bio import BiopythonDeprecationWarning
warnings.filterwarnings("ignore", category=BiopythonDeprecationWarning)
def remove_files_in_directory(directory):
"""Remove all files inside the given directory (excluding directories and symlinks)."""
for filename in os.listdir(directory):
file_path = os.path.join(directory, filename)
if os.path.isfile(file_path): # Only remove files, not directories or links
os.remove(file_path)
def main():
parser = argparse.ArgumentParser(description="Run PMGen wrapper or modeling.")
# Default settings
parser.add_argument('--run', choices=['parallel', 'single'], required=False, default='parallel',
help='Run mode: parallel or single (one by one). parallel only works if --mode wrapper')
parser.add_argument('--mode', choices=['wrapper', 'modeling'], required=False, default='wrapper',
help='Recommended. Select wrapper or modeling mode')
# Required arguments for modeling mode
parser.add_argument('--peptide', type=str, help='Peptide sequence')
parser.add_argument('--mhc_seq', type=str, help='MHC Sequence. for MHC-I provide one chain sequence.'
'for MHC-II provide chain_A/chain_B separated by "/", respectively', default=None)
parser.add_argument('--mhc_fasta', type=str, help='a Fasta file with single a header and sequence For MHC-I'
'only provide one chain. For MHC-II provide >Alpha and >Beta Sequences.'
'If you want to run for multiple structures, use --df and --mode wrapper', default=None)
parser.add_argument('--mhc_allele', type=str, help='MHC allele (optional). We recommend to use --mhc_seq instead.')
parser.add_argument('--mhc_type', type=int, help='MHC type', choices=[1,2])
parser.add_argument('--id', type=str, help='Identifier')
parser.add_argument('--anchors', type=str, default=None, help='Anchor positions as a list, e.g [2,9]. (optional)')
parser.add_argument('--predict_anchor', action='store_true', help='Recommended. Enable anchor prediction')
# General arguments
parser.add_argument('--output_dir', type=str, required=True, help='Output directory')
parser.add_argument('--num_templates', type=int, default=4, help='Number of engineered templates')
parser.add_argument('--num_recycles', type=int, default=3, help='Number of recycles')
parser.add_argument('--models', nargs='+', default=['model_2_ptm'], help='List of models to use. if fine'
'tuned, please use _ft at the end of your model.')
parser.add_argument('--alphafold_param_folder', type=str, default='AFfine/af_params/params_original/',
help='Path to AlphaFold parameters')
parser.add_argument('--fine_tuned_model_path', type=str,
default='AFfine/af_params/params_finetune/params/model_ft_mhc_20640.pkl',
help='Path to fine-tuned model')
parser.add_argument('--benchmark', action='store_true', help='Enable benchmarking')
parser.add_argument('--best_n_templates', type=int, default=4, help='Best N templates')
parser.add_argument('--n_homology_models', type=int, default=1, help='Number of homology models')
parser.add_argument('--max_ram', type=int, default=3, help='Maximum RAM GB per job (only for parallel mode)')
parser.add_argument('--max_cores', type=int, default=4, help='Maximum number of CPU cores (only for parallel mode)')
parser.add_argument('--dirty_mode', action='store_true')
parser.add_argument('--initial_guess', action='store_true', help='Activates Faster AF initial Guess mode instead of Homology modelling mode')
# Wrapper mode argument
parser.add_argument('--df', type=str, help='Recommended. Path to input TSV file (required for wrapper mode)'
'check the github documentation for more information.')
parser.add_argument('--multiple_anchors', action='store_true', help='If enabled, not the best anchor, but all predicted anchors will '
'be used for separate predictions and the best ones will be reported. default False.')
parser.add_argument('--top_k', type=int, default=3, help='if --multiple_anchors is True, number of top anchors to predict the structures for.')
parser.add_argument('--best_structures', action='store_true', help='If multiple anchors or multiple models are being used'
'Activating this flag will find the best structures from their core'
'predicted lddt and saves them in a --output_dir/best_structures'
'directory.')
# ProteinMPNN Arguments
parser.add_argument('--peptide_design', action='store_true', help='Enables peptide design.')
parser.add_argument('--only_pseudo_sequence_design', action='store_true', help='Enables MHC pseudo-sequence design.')
parser.add_argument('--mhc_design', action='store_true', help='Enables whole mhc design. we recommend to use only_pseudo_sequence_design mode.')
parser.add_argument('--num_sequences_peptide', type=int, default=10, help='Number of peptide sequences to be generated. Works only with --peptide_design')
parser.add_argument('--num_sequences_mhc', type=int, default=10, help='Number of mhc sequences to be generated. Works only with --only_pseudo_sequence_design or --mhc_design')
parser.add_argument('--sampling_temp', type=float, default=0.1, help='ProteinMPNN sampling temperature.')
parser.add_argument('--batch_size', type=int, default=1, help='ProteinMPNN batch size.')
parser.add_argument('--hot_spot_thr', type=float, default=6.0, help='Distance threshold to peptide, to define hot-spots on mhc.')
parser.add_argument('--binder_pred', action='store_true', help='Enables binder prediction from ProteinMPNN generated peptides.')
# Setting to Run only a part:
parser.add_argument('--no_alphafold', action='store_false', help='does not run alphafold.')
parser.add_argument('--only_protein_mpnn', action='store_true', help='Skips PANDORA and AF modeling, and runs ProteinMPNN for already available predictions.')
parser.add_argument('--no_pandora', action='store_false', help='does not run pandora')
parser.add_argument('--protein_mpnn_dryrun', action='store_true', help='Overwrites all proteinMPNN flags and just dry run. hotspots are saved.')
parser.add_argument('--return_all_outputs', action='store_true', help='If active, returns all alphafold outputs')
args = parser.parse_args()
# Validation for modeling mode
if args.mode == 'modeling':
if not args.mhc_seq and not args.mhc_allele:
raise ValueError("Either --mhc_seq or --mhc_allele must be provided for modeling mode.")
if not args.id:
args.id = args.peptide # Use peptide as ID if not provided
# Run wrapper mode
if args.mode == 'wrapper':
if not args.df:
raise ValueError("--df is required for wrapper mode")
df = pd.read_csv(args.df, sep='\t')
df['mhc_seq'] = [i.replace('-', '') for i in df.mhc_seq.tolist()] # remove gaps from df:
if args.multiple_anchors:
L1 = len(df)
ma = MultipleAnchors(args, args.dirty_mode)
df = ma.process()
L2 = len(df)
print(f'New DataFrame has {L2} rows, {L2 - L1} difference with original df')
# outputs for proteinmpnn later
tmp_pdb_dict = {}
for i, row in df.iterrows():
out_alphafold = os.path.join(args.output_dir, 'alphafold', row['id'])
tmp_pdb_dict[row['id']] = out_alphafold
runner = run_PMGen_wrapper(df=df, output_dir=args.output_dir, num_templates=args.num_templates,
num_recycles=args.num_recycles, models=args.models,
alphafold_param_folder=args.alphafold_param_folder,
fine_tuned_model_path=args.fine_tuned_model_path,
benchmark=args.benchmark, best_n_templates=args.best_n_templates,
n_homology_models=args.n_homology_models, pandora_force_run=args.no_pandora,
no_modelling=args.initial_guess)
if args.run == 'parallel' and not args.only_protein_mpnn:
runner.run_wrapper_parallel(max_ram=args.max_ram, max_cores=args.max_cores, run_alphafold=args.no_alphafold)
elif args.run == 'single' and not args.only_protein_mpnn:
runner.run_wrapper(run_alphafold=args.no_alphafold)
else:
print('--Warning!-- Only ProteinMPNN mode, Alphafold and PANDORA are skipped.')
# check outputs if they exist:
output_pdbs_dict = {}
for key, value in tmp_pdb_dict.items():
# {'id':[out1, out2], 'id2':[out1, out2], ...}
output_pdbs_dict[key] = [os.path.join(value, i) for i in os.listdir(value) if i.endswith('.pdb') and 'model_' in i and not i.endswith('.npy')]
if args.best_structures: # get best structure out of multiple models and multiple predicted anchors:
_ = get_best_structres(args.output_dir, df, args.multiple_anchors)
# Run modeling mode
elif args.mode == 'modeling':
sequences = []
if not args.mhc_seq:
for record in SeqIO.parse(args.mhc_fasta, "fasta"):
sequences.append(str(record.seq))
if args.mhc_type == 1:
sequence = sequences[0]
elif args.mhc_type == 2: sequence = sequences[0] + "/" + sequences[1]
else:
sequence = args.mhc_seq
runner = run_PMGen_modeling(peptide=args.peptide, mhc_seq=sequence, mhc_type=args.mhc_type,
id=args.id, output_dir=args.output_dir, anchors=args.anchors,
mhc_allele=args.mhc_allele, predict_anchor=args.predict_anchor,
num_templates=args.num_templates, num_recycles=args.num_recycles,
models=args.models, alphafold_param_folder=args.alphafold_param_folder,
fine_tuned_model_path=args.fine_tuned_model_path,
benchmark=args.benchmark, best_n_templates=args.best_n_templates,
n_homology_models=args.n_homology_models, pandora_force_run=args.no_pandora)
if not args.only_protein_mpnn:
runner.run_PMGen(run_alphafold=args.no_alphafold)
else:
print('--Warning!-- Only ProteinMPNN mode, Alphafold and PANDORA are skipped.')
output_pdbs_dict = {}
out_alphafold = os.path.join(args.output_dir, 'alphafold', args.id)
output_pdbs_dict[args.id] = [os.path.join(out_alphafold, i) for i in os.listdir(out_alphafold) if i.endswith('.pdb') and 'model_' in i and not i.endswith('.npy')]
# {'id':[output1, output2, ...]}
if not args.only_protein_mpnn:
print("Alphafold Runs completed.")
else:
print('Alphafold Runs Skipped!')
# get the pdb outputs for listing them and protmpnn
if args.peptide_design or args.only_pseudo_sequence_design or args.mhc_design or args.protein_mpnn_dryrun:
if args.protein_mpnn_dryrun:
args.peptide_design = False
args.only_pseudo_sequence_design = False
args.mhc_design = False
print("Running ProteinMPNN Dry-Run")
print("### Start ProteinMPNN runs ###")
print('files:\n', output_pdbs_dict)
protein_mpnn_wrapper(output_pdbs_dict, args, args.max_cores, mode=args.run)
if __name__ == "__main__":
main()