2626)
2727
2828
29+ def _make_diamond_db (in_aa_reference_fp : Path , out_db_fp : Path ) -> None :
30+ """
31+ Creates a Diamond-formatted database file that can be used for protein
32+ sequence alignment with Diamond blastx.
33+
34+ Args:
35+ in_aa_reference_fp (Path): Path to the input amino acid reference FASTA file
36+ with their gene names as headers
37+ out_db_fp (Path): Path to the output Diamond database file.
38+ """
39+ try :
40+ logging .info ("Diamond makedb" )
41+ logging .info ("== Making Sequence DB ==" )
42+ result = subprocess .run (
43+ [
44+ "diamond" ,
45+ "makedb" ,
46+ "--in" ,
47+ str (in_aa_reference_fp ),
48+ "-d" ,
49+ str (out_db_fp ),
50+ ],
51+ stdout = subprocess .DEVNULL ,
52+ stderr = subprocess .DEVNULL ,
53+ check = False ,
54+ )
55+ if result .returncode != 0 :
56+ raise RuntimeError (
57+ f"Error occurred while making sequence DB with diamond makedb "
58+ f"- Error Code: { result .returncode } "
59+ )
60+ except Exception as e :
61+ if not isinstance (e , RuntimeError ):
62+ logging .error (
63+ f"An error occurred while making sequence DB - Error Code: { e } "
64+ )
65+ raise
66+
67+
2968def nuc_to_aa_alignment (
3069 in_nuc_alignment_fp : Path ,
31- in_aa_reference_fp : Path ,
70+ in_aa_db_fp : Path ,
3271 out_aa_alignment_fp : Path ,
3372) -> None :
3473 """
3574 Function to convert files and translate and align with Diamond / blastx.
3675
3776 Args:
3877 in_nuc_alignment_fp (Path): Path to the input nucleotide alignment file.
39- in_aa_reference_fp (Path): Path to the input amino acid reference file.
78+ in_aa_db_fp (Path): Path to the input amino acid diamond database file.
4079 out_aa_alignment_fp (Path): Path to the output amino acid alignment file.
4180
4281 Returns:
@@ -76,36 +115,6 @@ def nuc_to_aa_alignment(
76115
77116 logging .info (f"Using temporary directory for diamond: { temp_dir_path } " )
78117
79- # temporary file file for amino acid reference DB
80- db_ref_fp = temp_dir_path / Path (in_aa_reference_fp .stem + ".temp.db" )
81- try :
82- # ==== Make Sequence DB ====
83- logging .info ("Diamond makedb" )
84- logging .info ("== Making Sequence DB ==" )
85- result = subprocess .run (
86- [
87- "diamond" ,
88- "makedb" ,
89- "--in" ,
90- str (in_aa_reference_fp ),
91- "-d" ,
92- str (db_ref_fp ),
93- ],
94- stdout = subprocess .DEVNULL ,
95- stderr = subprocess .DEVNULL ,
96- check = False ,
97- )
98- if result .returncode != 0 :
99- raise RuntimeError (
100- f"Error occurred while making sequence DB with diamond makedb "
101- f"- Error Code: { result } "
102- )
103- except Exception as e :
104- logging .error (
105- f"An error occurred while making sequence DB - Error Code: { e } "
106- )
107- raise
108-
109118 try :
110119 # ==== Alignment ====
111120 logging .info ("Diamond blastx alignment" )
@@ -114,7 +123,7 @@ def nuc_to_aa_alignment(
114123 "diamond" ,
115124 "blastx" ,
116125 "-d" ,
117- str (db_ref_fp ),
126+ str (in_aa_db_fp ),
118127 "-q" ,
119128 str (fasta_nuc_for_aa_alignment ),
120129 "-o" ,
@@ -287,7 +296,10 @@ def enrich_read_with_aa_seq(
287296
288297
289298def parse_translate_align (
290- nuc_reference_fp : Path , aa_reference_fp : Path , nuc_alignment_fp : Path
299+ nuc_reference_fp : Path ,
300+ aa_reference_fp : Path ,
301+ nuc_alignment_fp : Path ,
302+ aa_db_fp : Path ,
291303) -> Dict [str , AlignedRead ]:
292304 """Parse nucleotides, translate and align amino acids the input files."""
293305 with tempfile .TemporaryDirectory () as temp_dir :
@@ -299,7 +311,7 @@ def parse_translate_align(
299311
300312 missing_files = [
301313 str (f )
302- for f in [nuc_reference_fp , aa_reference_fp , nuc_alignment_fp ]
314+ for f in [nuc_reference_fp , aa_reference_fp , nuc_alignment_fp , aa_db_fp ]
303315 if not f .exists ()
304316 ]
305317 if missing_files :
@@ -315,7 +327,7 @@ def parse_translate_align(
315327
316328 nuc_to_aa_alignment (
317329 in_nuc_alignment_fp = BAM_NUC_ALIGNMENT_FILE ,
318- in_aa_reference_fp = aa_reference_fp ,
330+ in_aa_db_fp = aa_db_fp ,
319331 out_aa_alignment_fp = AA_ALIGNMENT_FILE ,
320332 )
321333
@@ -373,14 +385,16 @@ def enrich_single_read(read: AlignedRead) -> AlignedRead:
373385 return enrich_single_read
374386
375387
376- def process_bam_files (bam_splits_fps , nuc_reference_fp , aa_reference_fp , metadata_fp ):
388+ def process_bam_files (
389+ bam_splits_fps , nuc_reference_fp , aa_reference_fp , aa_db_fp , metadata_fp
390+ ):
377391 """Generator to process BAM files and yield JSON strings."""
378392
379393 enrich_single_read = curry_read_with_metadata (metadata_fp )
380394
381395 for bam_split_fp in bam_splits_fps :
382396 for read in parse_translate_align (
383- nuc_reference_fp , aa_reference_fp , bam_split_fp
397+ nuc_reference_fp , aa_reference_fp , bam_split_fp , aa_db_fp
384398 ).values ():
385399 enriched_read = enrich_single_read (read )
386400 yield enriched_read .to_silo_json ()
@@ -437,6 +451,10 @@ def parse_translate_align_in_batches(
437451 with tempfile .TemporaryDirectory () as temp_dir :
438452 temp_dir_path = Path (temp_dir )
439453
454+ # Create Diamond DB once
455+ aa_db_fp = temp_dir_path / "diamond_db.dmnd"
456+ _make_diamond_db (aa_reference_fp , aa_db_fp )
457+
440458 bam_splits_fps = convert .split_bam (
441459 input_bam = nuc_alignment_fp , out_dir = temp_dir_path , chunk_size = chunk_size
442460 )
@@ -452,7 +470,11 @@ def parse_translate_align_in_batches(
452470 with open (output_fp , "wb" ) as f , cctx .stream_writer (f ) as compressor :
453471 buffer = []
454472 for json_str in process_bam_files (
455- bam_splits_fps , nuc_reference_fp , aa_reference_fp , metadata_fp
473+ bam_splits_fps ,
474+ nuc_reference_fp ,
475+ aa_reference_fp ,
476+ aa_db_fp ,
477+ metadata_fp ,
456478 ):
457479 buffer .append (json_str )
458480 if len (buffer ) >= write_chunk_size :
0 commit comments