Skip to content

Commit f72cedc

Browse files
Add missing special tRNA structures (fMet, Ile2, SeC) (#26)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5cdd313 commit f72cedc

273 files changed

Lines changed: 38063 additions & 33591 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

data-raw/structures/generate_svgs.py

Lines changed: 85 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,32 @@
3838
"Escherichia_coli": {
3939
"fasta": FASTA_DIR / "eschColi_K_12_MG1655-mature-tRNAs.fa",
4040
"cm": CM_DIR / "TRNAinf-bact.cm",
41+
"sec_cm": CM_DIR / "TRNAinf-bact-SeC.cm",
4142
},
4243
"Saccharomyces_cerevisiae": {
4344
"fasta": FASTA_DIR / "sacCer3-mature-tRNAs.fa",
4445
"cm": CM_DIR / "TRNAinf-euk.cm",
46+
"sec_cm": CM_DIR / "TRNAinf-euk-SeC.cm",
4547
},
4648
"Homo_sapiens": {
4749
"fasta": FASTA_DIR / "hg38-mature-tRNAs.fa",
4850
"cm": CM_DIR / "TRNAinf-euk.cm",
51+
"sec_cm": CM_DIR / "TRNAinf-euk-SeC.cm",
4952
},
5053
"T4_phage": {
5154
"fasta": FASTA_DIR / "phageT4-tRNAs.fa",
5255
"cm": CM_DIR / "TRNAinf-bact.cm",
56+
"sec_cm": CM_DIR / "TRNAinf-bact-SeC.cm",
5357
},
5458
}
5559

60+
# Map GtRNAdb isotype names to MODOMICS subtype names
61+
ISOTYPE_TO_MODOMICS = {
62+
"fMet": "Ini",
63+
"Ile2": "Ile",
64+
"SeC": "Sec",
65+
}
66+
5667

5768
def main():
5869
"""Generate SVGs for all configured organisms."""
@@ -64,21 +75,37 @@ def main():
6475
org_dir = OUTPUT_DIR / org_name
6576
org_dir.mkdir(parents=True, exist_ok=True)
6677

67-
# Step 1: Align sequences against tRNAscan-SE CM
78+
# Step 1: Split FASTA into standard and SeC sequences
6879
fasta_path = org_info["fasta"]
6980
if not fasta_path.exists():
7081
print(f" WARNING: FASTA not found: {fasta_path}")
7182
continue
7283

84+
std_fasta, sec_fasta = split_sec_fasta(fasta_path)
85+
86+
# Step 2a: Align standard sequences against tRNAscan-SE CM
7387
cm_path = org_info["cm"]
74-
sto_path = run_cmalign(fasta_path, cm_path)
88+
sto_path = run_cmalign(std_fasta, cm_path)
7589
if sto_path is None:
7690
print(f" WARNING: cmalign failed for {org_name}")
7791
continue
7892

79-
# Step 2: Parse aligned sequences and structures
8093
trnas = parse_cmalign_stockholm(sto_path)
8194

95+
# Step 2b: Align SeC sequences with the SeC-specific CM
96+
if sec_fasta is not None:
97+
sec_cm = org_info.get("sec_cm")
98+
if sec_cm and sec_cm.exists():
99+
sec_sto = run_cmalign(sec_fasta, sec_cm)
100+
if sec_sto is not None:
101+
sec_trnas = parse_cmalign_stockholm(sec_sto)
102+
trnas.update(sec_trnas)
103+
print(f" Added {len(sec_trnas)} SeC tRNA(s)")
104+
else:
105+
print(" WARNING: cmalign failed for SeC sequences")
106+
else:
107+
print(" WARNING: SeC CM not found, skipping SeC tRNAs")
108+
82109
if not trnas:
83110
print(f" WARNING: No tRNA data extracted for {org_name}")
84111
continue
@@ -97,7 +124,10 @@ def main():
97124
filtered = {
98125
k: v
99126
for k, v in trnas.items()
100-
if extract_isotype(k) in modomics_isotypes
127+
if ISOTYPE_TO_MODOMICS.get(
128+
extract_isotype(k), extract_isotype(k)
129+
)
130+
in modomics_isotypes
101131
}
102132
print(
103133
f" Filtered to {len(filtered)}/{len(trnas)} tRNAs "
@@ -185,6 +215,57 @@ def run_cmalign(fasta_path: Path, cm_path: Path) -> Path | None:
185215
return None
186216

187217

218+
def split_sec_fasta(fasta_path: Path) -> tuple[Path, Path | None]:
219+
"""Split a FASTA into standard and selenocysteine tRNA sequences.
220+
221+
SeC tRNAs need a different covariance model for structural alignment.
222+
Returns (standard_fasta_path, sec_fasta_path_or_None).
223+
"""
224+
std_records = []
225+
sec_records = []
226+
227+
with open(fasta_path) as f:
228+
header = None
229+
seq_lines = []
230+
for line in f:
231+
line = line.rstrip()
232+
if line.startswith(">"):
233+
if header is not None:
234+
record = header + "\n" + "\n".join(seq_lines) + "\n"
235+
if "tRNA-SeC-" in header or "tRNA-Sec-" in header:
236+
sec_records.append(record)
237+
else:
238+
std_records.append(record)
239+
header = line
240+
seq_lines = []
241+
else:
242+
seq_lines.append(line)
243+
if header is not None:
244+
record = header + "\n" + "\n".join(seq_lines) + "\n"
245+
if "tRNA-SeC-" in header or "tRNA-Sec-" in header:
246+
sec_records.append(record)
247+
else:
248+
std_records.append(record)
249+
250+
# Write standard sequences to temp file
251+
std_fd, std_path = tempfile.mkstemp(suffix=".fa")
252+
with open(std_path, "w") as f:
253+
f.writelines(std_records)
254+
std_path = Path(std_path)
255+
256+
# Write SeC sequences if any
257+
sec_path = None
258+
if sec_records:
259+
sec_fd, sec_path = tempfile.mkstemp(suffix=".fa")
260+
with open(sec_path, "w") as f:
261+
f.writelines(sec_records)
262+
sec_path = Path(sec_path)
263+
print(f" Split FASTA: {len(std_records)} standard, "
264+
f"{len(sec_records)} SeC sequences")
265+
266+
return std_path, sec_path
267+
268+
188269
def parse_cmalign_stockholm(sto_path: Path) -> dict:
189270
"""Parse cmalign Stockholm output into per-tRNA sequences and structures.
190271

0 commit comments

Comments
 (0)