3838 "Escherichia_coli" : {
3939 "fasta" : FASTA_DIR / "eschColi_K_12_MG1655-mature-tRNAs.fa" ,
4040 "cm" : CM_DIR / "TRNAinf-bact.cm" ,
41+ "sec_cm" : CM_DIR / "TRNAinf-bact-SeC.cm" ,
4142 },
4243 "Saccharomyces_cerevisiae" : {
4344 "fasta" : FASTA_DIR / "sacCer3-mature-tRNAs.fa" ,
4445 "cm" : CM_DIR / "TRNAinf-euk.cm" ,
46+ "sec_cm" : CM_DIR / "TRNAinf-euk-SeC.cm" ,
4547 },
4648 "Homo_sapiens" : {
4749 "fasta" : FASTA_DIR / "hg38-mature-tRNAs.fa" ,
4850 "cm" : CM_DIR / "TRNAinf-euk.cm" ,
51+ "sec_cm" : CM_DIR / "TRNAinf-euk-SeC.cm" ,
4952 },
5053 "T4_phage" : {
5154 "fasta" : FASTA_DIR / "phageT4-tRNAs.fa" ,
5255 "cm" : CM_DIR / "TRNAinf-bact.cm" ,
56+ "sec_cm" : CM_DIR / "TRNAinf-bact-SeC.cm" ,
5357 },
5458}
5559
60+ # Map GtRNAdb isotype names to MODOMICS subtype names
61+ ISOTYPE_TO_MODOMICS = {
62+ "fMet" : "Ini" ,
63+ "Ile2" : "Ile" ,
64+ "SeC" : "Sec" ,
65+ }
66+
5667
5768def main ():
5869 """Generate SVGs for all configured organisms."""
@@ -64,21 +75,37 @@ def main():
6475 org_dir = OUTPUT_DIR / org_name
6576 org_dir .mkdir (parents = True , exist_ok = True )
6677
67- # Step 1: Align sequences against tRNAscan-SE CM
78+ # Step 1: Split FASTA into standard and SeC sequences
6879 fasta_path = org_info ["fasta" ]
6980 if not fasta_path .exists ():
7081 print (f" WARNING: FASTA not found: { fasta_path } " )
7182 continue
7283
84+ std_fasta , sec_fasta = split_sec_fasta (fasta_path )
85+
86+ # Step 2a: Align standard sequences against tRNAscan-SE CM
7387 cm_path = org_info ["cm" ]
74- sto_path = run_cmalign (fasta_path , cm_path )
88+ sto_path = run_cmalign (std_fasta , cm_path )
7589 if sto_path is None :
7690 print (f" WARNING: cmalign failed for { org_name } " )
7791 continue
7892
79- # Step 2: Parse aligned sequences and structures
8093 trnas = parse_cmalign_stockholm (sto_path )
8194
95+ # Step 2b: Align SeC sequences with the SeC-specific CM
96+ if sec_fasta is not None :
97+ sec_cm = org_info .get ("sec_cm" )
98+ if sec_cm and sec_cm .exists ():
99+ sec_sto = run_cmalign (sec_fasta , sec_cm )
100+ if sec_sto is not None :
101+ sec_trnas = parse_cmalign_stockholm (sec_sto )
102+ trnas .update (sec_trnas )
103+ print (f" Added { len (sec_trnas )} SeC tRNA(s)" )
104+ else :
105+ print (" WARNING: cmalign failed for SeC sequences" )
106+ else :
107+ print (" WARNING: SeC CM not found, skipping SeC tRNAs" )
108+
82109 if not trnas :
83110 print (f" WARNING: No tRNA data extracted for { org_name } " )
84111 continue
@@ -97,7 +124,10 @@ def main():
97124 filtered = {
98125 k : v
99126 for k , v in trnas .items ()
100- if extract_isotype (k ) in modomics_isotypes
127+ if ISOTYPE_TO_MODOMICS .get (
128+ extract_isotype (k ), extract_isotype (k )
129+ )
130+ in modomics_isotypes
101131 }
102132 print (
103133 f" Filtered to { len (filtered )} /{ len (trnas )} tRNAs "
@@ -185,6 +215,57 @@ def run_cmalign(fasta_path: Path, cm_path: Path) -> Path | None:
185215 return None
186216
187217
218+ def split_sec_fasta (fasta_path : Path ) -> tuple [Path , Path | None ]:
219+ """Split a FASTA into standard and selenocysteine tRNA sequences.
220+
221+ SeC tRNAs need a different covariance model for structural alignment.
222+ Returns (standard_fasta_path, sec_fasta_path_or_None).
223+ """
224+ std_records = []
225+ sec_records = []
226+
227+ with open (fasta_path ) as f :
228+ header = None
229+ seq_lines = []
230+ for line in f :
231+ line = line .rstrip ()
232+ if line .startswith (">" ):
233+ if header is not None :
234+ record = header + "\n " + "\n " .join (seq_lines ) + "\n "
235+ if "tRNA-SeC-" in header or "tRNA-Sec-" in header :
236+ sec_records .append (record )
237+ else :
238+ std_records .append (record )
239+ header = line
240+ seq_lines = []
241+ else :
242+ seq_lines .append (line )
243+ if header is not None :
244+ record = header + "\n " + "\n " .join (seq_lines ) + "\n "
245+ if "tRNA-SeC-" in header or "tRNA-Sec-" in header :
246+ sec_records .append (record )
247+ else :
248+ std_records .append (record )
249+
250+ # Write standard sequences to temp file
251+ std_fd , std_path = tempfile .mkstemp (suffix = ".fa" )
252+ with open (std_path , "w" ) as f :
253+ f .writelines (std_records )
254+ std_path = Path (std_path )
255+
256+ # Write SeC sequences if any
257+ sec_path = None
258+ if sec_records :
259+ sec_fd , sec_path = tempfile .mkstemp (suffix = ".fa" )
260+ with open (sec_path , "w" ) as f :
261+ f .writelines (sec_records )
262+ sec_path = Path (sec_path )
263+ print (f" Split FASTA: { len (std_records )} standard, "
264+ f"{ len (sec_records )} SeC sequences" )
265+
266+ return std_path , sec_path
267+
268+
188269def parse_cmalign_stockholm (sto_path : Path ) -> dict :
189270 """Parse cmalign Stockholm output into per-tRNA sequences and structures.
190271
0 commit comments