# !pip install kaleido  # to export plotly figures as png
# !pip install --upgrade nbformat  # to avoid plotly error

23+ import argparse
2324import json
2425import logging
2526import os
2930import pandas as pd
3031
3132# import libraries
32-
3333import alignment as align
3434import clustering as clus
3535import compute_statistics as comp_stat
4040import mapping as map
4141import preprocessing as prep
4242
# Repository root: one level above the directory containing this script.
repo_folder = Path(__file__).resolve().parents[1]

# Command-line interface for the assembly pipeline.
parser = argparse.ArgumentParser(description="Protein Assembly Script")
parser.add_argument(
    "--input_csv",
    type=str,
    required=True,  # main() wraps this in Path(); a missing flag would raise TypeError
    help="Input file",
)
parser.add_argument(
    "--folder_outputs", default="outputs", type=str, help="Outputs folder"
)
parser.add_argument("--training", action="store_true", help="Training mode")
# Log with timestamps so long pipeline runs can be traced.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)

# Repository-level layout: JSON_DIR holds the per-run sample metadata files.
# NOTE(review): BASE_DIR duplicates the module-level repo_folder computed
# above — consider reusing a single constant.
BASE_DIR = Path(__file__).resolve().parents[1]
JSON_DIR = BASE_DIR / "json"
5564def get_sample_metadata (run , chain = "" , json_path = JSON_DIR / "sample_metadata.json" ):
@@ -68,17 +77,19 @@ def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.jso
6877 raise ValueError (f"No metadata found for run '{ run } ' with chain '{ chain } '." )
6978
7079
71- def main ():
80+ def main (input_csv : str , folder_outputs : str = "outputs" , training : bool = False ):
7281 """Main function to run the assembly script."""
7382
74- logger . info ( "Starting protein assembly pipeline." )
83+ input_csv = Path ( input_csv )
7584
76- run = "ma3"
85+ logger . info ( "Starting protein assembly pipeline." )
7786
78- meta = get_sample_metadata (run , chain = "light" )
79- protein = meta ["protein" ]
80- chain = meta ["chain" ]
81- proteases = meta ["proteases" ]
87+ run = input_csv .stem
88+ if training :
89+ meta = get_sample_metadata (run , chain = "light" )
90+ protein = meta ["protein" ]
91+ # chain = meta["chain"]
92+ proteases = meta ["proteases" ]
8293
8394 ass_method = "dbg"
8495
@@ -91,34 +102,37 @@ def main():
91102
92103 logger .info ("Parameters loaded." )
93104
94- folder_outputs = f"../outputs/{ run } { chain } "
95- prep .create_directory (folder_outputs )
96- combination_folder_out = os .path .join (
97- folder_outputs ,
98- f"comb_{ ass_method } _c{ conf } _ks{ kmer_size } _ts{ size_threshold } _mo{ min_overlap } _mi{ min_identity } _mm{ max_mismatches } " ,
105+ folder_outputs = Path (folder_outputs ) / run
106+ folder_outputs .mkdir (parents = True , exist_ok = True )
107+
108+ combination_folder_out = (
109+ folder_outputs
110+ / f"comb_{ ass_method } _c{ conf } _ks{ kmer_size } _ts{ size_threshold } _mo{ min_overlap } _mi{ min_identity } _mm{ max_mismatches } "
99111 )
100112 prep .create_subdirectories_outputs (combination_folder_out )
101113
102114 logger .info (f"Output folders created at: { combination_folder_out } " )
103115
104116 # Data cleaning
105117 logger .info ("Starting data cleaning..." )
106-
107- protein_norm = prep .normalize_sequence (protein )
108- df = pd .read_csv (f"../inputs/{ run } .csv" )
109- df ["protease" ] = df ["experiment_name" ].apply (
110- lambda name : prep .extract_protease (name , proteases )
111- )
118+ if training :
119+ protein_norm = prep .normalize_sequence (protein )
120+ df = pd .read_csv (input_csv )
121+ if training :
122+ df ["protease" ] = df ["experiment_name" ].apply (
123+ lambda name : prep .extract_protease (name , proteases )
124+ )
112125 df = prep .clean_dataframe (df )
113126 df ["cleaned_preds" ] = df ["preds" ].apply (prep .remove_modifications )
114127 cleaned_psms = df ["cleaned_preds" ].tolist ()
115128 filtered_psms = prep .filter_contaminants (
116- cleaned_psms , run , "../ fasta/contaminants.fasta"
129+ cleaned_psms , run , repo_folder / " fasta/contaminants.fasta"
117130 )
118131 df = df [df ["cleaned_preds" ].isin (filtered_psms )]
119- df ["mapped" ] = df ["cleaned_preds" ].apply (
120- lambda x : "True" if x in protein_norm else "False"
121- )
132+ if training :
133+ df ["mapped" ] = df ["cleaned_preds" ].apply (
134+ lambda x : "True" if x in protein_norm else "False"
135+ )
122136 df = df [df ["conf" ] > conf ]
123137 df .reset_index (drop = True , inplace = True )
124138 final_psms = df ["cleaned_preds" ].tolist ()
@@ -145,16 +159,17 @@ def main():
145159 f"{ combination_folder_out } /contigs/{ ass_method } _contig_{ conf } _{ run } .fasta" ,
146160 "fasta" ,
147161 )
148- mapped_contigs = map .process_protein_contigs_scaffold (
149- assembled_contigs , protein_norm , max_mismatches , min_identity
150- )
151- df_contigs = map .create_dataframe_from_mapped_sequences (data = mapped_contigs )
152- comp_stat .compute_assembly_statistics (
153- df = df_contigs ,
154- sequence_type = "contigs" ,
155- output_folder = f"{ combination_folder_out } /statistics" ,
156- reference = protein_norm ,
157- )
162+ if training :
163+ mapped_contigs = map .process_protein_contigs_scaffold (
164+ assembled_contigs , protein_norm , max_mismatches , min_identity
165+ )
166+ df_contigs = map .create_dataframe_from_mapped_sequences (data = mapped_contigs )
167+ comp_stat .compute_assembly_statistics (
168+ df = df_contigs ,
169+ sequence_type = "contigs" ,
170+ output_folder = f"{ combination_folder_out } /statistics" ,
171+ reference = protein_norm ,
172+ )
158173 assembled_scaffolds = dbg .create_scaffolds (assembled_contigs , min_overlap )
159174 assembled_scaffolds = list (set (assembled_scaffolds ))
160175 assembled_scaffolds = sorted (assembled_scaffolds , key = len , reverse = True )
@@ -180,22 +195,23 @@ def main():
180195 f"{ combination_folder_out } /scaffolds/{ ass_method } _scaffold_{ conf } _{ run } .fasta" ,
181196 "fasta" ,
182197 )
183- mapped_scaffolds = map .process_protein_contigs_scaffold (
184- assembled_contigs = assembled_scaffolds ,
185- target_protein = protein_norm ,
186- max_mismatches = max_mismatches ,
187- min_identity = min_identity ,
188- )
198+ if training :
199+ mapped_scaffolds = map .process_protein_contigs_scaffold (
200+ assembled_contigs = assembled_scaffolds ,
201+ target_protein = protein_norm ,
202+ max_mismatches = max_mismatches ,
203+ min_identity = min_identity ,
204+ )
189205
190- df_scaffolds_mapped = map .create_dataframe_from_mapped_sequences (
191- data = mapped_scaffolds
192- )
193- comp_stat .compute_assembly_statistics (
194- df = df_scaffolds_mapped ,
195- sequence_type = "scaffolds" ,
196- output_folder = f"{ combination_folder_out } /statistics" ,
197- reference = protein_norm ,
198- )
206+ df_scaffolds_mapped = map .create_dataframe_from_mapped_sequences (
207+ data = mapped_scaffolds
208+ )
209+ comp_stat .compute_assembly_statistics (
210+ df = df_scaffolds_mapped ,
211+ sequence_type = "scaffolds" ,
212+ output_folder = f"{ combination_folder_out } /statistics" ,
213+ reference = protein_norm ,
214+ )
199215
200216 # Clustering
201217 scaffolds_folder_out = f"{ combination_folder_out } /scaffolds"
@@ -241,4 +257,9 @@ def main():
241257
242258
if __name__ == "__main__":
    # Parse CLI flags and forward them to the pipeline entry point;
    # the parser defines exactly input_csv, folder_outputs and training,
    # matching main()'s keyword parameters.
    cli_args = parser.parse_args()
    main(**vars(cli_args))
0 commit comments