Skip to content

Commit 649fd6e

Browse files
ewelsclaude
andcommitted
fix(io/bam): write header with a lenient validator (STAR tRNA names)
noodles-sam-0.64 rejects `(` `)` `[` `]` `{` `}` `<` `>` `,` `\` in @sq SN: values. yeast tRNA transcript IDs from Ensembl GTFs contain parentheses (tP(UGG)A, tA(UGC)A, etc.) so writing a transcriptome BAM errored with `invalid reference sequence name (<unknown>)` and truncated the output file. Replace BamWriter's noodles-driven header write with a local write_bam_header_lenient that: - Emits BAM\x01 + l_text + SAM-text block (built via render_sam_text_lenient — iterates @hd / @sq / @rg / @pg / @co directly from the noodles sam::Header, writing raw bytes so forbidden chars pass through) - Emits the binary reference list (n_ref + l_name/name/l_ref triples) using CString for NUL-termination Subsequent record writes still go through noodles (no validation there). Verified with 5 yeast reads → valid BAM with 4 transcriptome records + samtools view accepts the output. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 304a39c commit 649fd6e

1 file changed

Lines changed: 140 additions & 3 deletions

File tree

src/io/bam.rs

Lines changed: 140 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@ use crate::error::Error;
33
use crate::genome::Genome;
44
use crate::params::Parameters;
55
use crate::quant::transcriptome::TranscriptomeIndex;
6+
use byteorder::{LittleEndian, WriteBytesExt};
67
use noodles::bam;
78
use noodles::sam;
89
use noodles::sam::alignment::io::Write as SamWrite;
910
use noodles::sam::alignment::record_buf::RecordBuf;
11+
use std::ffi::CString;
1012
use std::fs::File;
11-
use std::io::BufWriter;
13+
use std::io::{BufWriter, Write};
1214
use std::path::Path;
1315

1416
/// Buffer for BAM records built by parallel threads
@@ -43,10 +45,21 @@ pub struct BamWriter {
4345

4446
impl BamWriter {
4547
/// Create a BAM writer at `output_path` with the given prepared header.
48+
///
49+
/// Uses a lenient header writer that bypasses noodles' SAM-spec
50+
/// reference-name validator. STAR accepts `(` / `)` in reference names
51+
/// (yeast tRNA transcripts like `tP(UGG)A`), which noodles' strict
52+
/// validator rejects. Writing the SAM text portion of the BAM header
53+
/// manually sidesteps that validation while preserving every other byte.
4654
fn with_header(output_path: &Path, header: sam::Header) -> Result<Self, Error> {
4755
let buf_writer = BufWriter::new(File::create(output_path)?);
48-
let mut writer = bam::io::Writer::new(buf_writer);
49-
writer.write_header(&header)?;
56+
// Wrap in BGZF *before* writing header bytes so the header lands in
57+
// BGZF blocks just like noodles would write it.
58+
let mut bgzf = noodles::bgzf::Writer::new(buf_writer);
59+
write_bam_header_lenient(&mut bgzf, &header)?;
60+
// Reconstruct a bam::io::Writer on top of the already-BGZF-framed
61+
// underlying stream so subsequent record writes use noodles.
62+
let writer = bam::io::Writer::from(bgzf);
5063
Ok(Self { writer, header })
5164
}
5265

@@ -96,6 +109,130 @@ impl BamWriter {
96109
}
97110
}
98111

112+
/// Write a BAM header that tolerates reference sequence names STAR emits
113+
/// (e.g. `tP(UGG)A`) but that noodles' SAM-spec validator rejects.
114+
///
115+
/// Replicates `noodles_bam::io::writer::header::write_header` byte-for-byte
116+
/// for compliant headers; the only divergence is that the SAM text block
117+
/// between `BAM\x01` and the binary reference list is produced via a local
118+
/// formatter instead of `sam::io::Writer::write_header`, so forbidden-char
119+
/// names pass through unchanged.
120+
///
121+
/// Binary reference list (after the text block) uses `CString::new` — the
122+
/// only constraint there is "no interior nul", which is enforced upstream
123+
/// via the usual UTF-8 input.
124+
fn write_bam_header_lenient<W: Write>(writer: &mut W, header: &sam::Header) -> Result<(), Error> {
125+
const MAGIC: &[u8; 4] = b"BAM\x01";
126+
127+
writer.write_all(MAGIC)?;
128+
129+
// Build the SAM text block byte-for-byte identical to
130+
// `sam::io::Writer::write_header` minus the name validator:
131+
// `@HD`, `@SQ` (one per reference), `@RG` (if any), `@PG` (if any),
132+
// `@CO` (if any), each line terminated by `\n`.
133+
let text = render_sam_text_lenient(header);
134+
let l_text = i32::try_from(text.len()).map_err(|_| {
135+
Error::Index(format!(
136+
"BAM SAM-text header exceeds i32::MAX bytes: {} bytes",
137+
text.len()
138+
))
139+
})?;
140+
writer.write_i32::<LittleEndian>(l_text)?;
141+
writer.write_all(&text)?;
142+
143+
// Binary reference list: n_ref then (l_name, name\0, l_ref) per ref.
144+
let refs = header.reference_sequences();
145+
let n_ref = i32::try_from(refs.len())
146+
.map_err(|_| Error::Index("BAM reference count exceeds i32::MAX".into()))?;
147+
writer.write_i32::<LittleEndian>(n_ref)?;
148+
for (name, rs) in refs {
149+
let c_name = CString::new(name.to_vec()).map_err(|e| {
150+
Error::Index(format!("reference name contains interior NUL byte: {}", e))
151+
})?;
152+
let name_bytes = c_name.as_bytes_with_nul();
153+
let l_name = u32::try_from(name_bytes.len()).map_err(|_| {
154+
Error::Index(format!(
155+
"reference name longer than u32::MAX: {} bytes",
156+
name_bytes.len()
157+
))
158+
})?;
159+
writer.write_u32::<LittleEndian>(l_name)?;
160+
writer.write_all(name_bytes)?;
161+
let l_ref = i32::try_from(usize::from(rs.length()))
162+
.map_err(|_| Error::Index("reference length exceeds i32::MAX".into()))?;
163+
writer.write_i32::<LittleEndian>(l_ref)?;
164+
}
165+
166+
Ok(())
167+
}
168+
169+
fn render_sam_text_lenient(header: &sam::Header) -> Vec<u8> {
170+
let mut buf: Vec<u8> = Vec::new();
171+
172+
// @HD line. noodles' Map<Header> in this version doesn't expose
173+
// sort_order/group_order via dedicated accessors; we serialize through
174+
// the generic `other_fields` map (which includes SO/GO when set).
175+
if let Some(hd) = header.header() {
176+
buf.extend_from_slice(b"@HD\tVN:");
177+
buf.extend_from_slice(hd.version().to_string().as_bytes());
178+
for (tag, value) in hd.other_fields() {
179+
buf.push(b'\t');
180+
buf.extend_from_slice(tag.as_ref());
181+
buf.push(b':');
182+
buf.extend_from_slice(value);
183+
}
184+
buf.push(b'\n');
185+
}
186+
187+
// @SQ lines. Use the raw bytes so forbidden characters pass through.
188+
for (name, rs) in header.reference_sequences() {
189+
buf.extend_from_slice(b"@SQ\tSN:");
190+
buf.extend_from_slice(name);
191+
buf.extend_from_slice(b"\tLN:");
192+
buf.extend_from_slice(usize::from(rs.length()).to_string().as_bytes());
193+
// Other optional @SQ fields (AH, AN, AS, DS, M5, SP, TP, UR) —
194+
// ruSTAR doesn't set any today, so skip.
195+
buf.push(b'\n');
196+
}
197+
198+
// @RG lines.
199+
for (id, rg) in header.read_groups() {
200+
buf.extend_from_slice(b"@RG\tID:");
201+
buf.extend_from_slice(id);
202+
for (tag, value) in rg.other_fields() {
203+
buf.push(b'\t');
204+
buf.extend_from_slice(tag.as_ref());
205+
buf.push(b':');
206+
buf.extend_from_slice(value);
207+
}
208+
buf.push(b'\n');
209+
}
210+
211+
// @PG lines — noodles' map doesn't guarantee insertion order; for
212+
// ruSTAR we emit a single @PG with id "ruSTAR". If more are added
213+
// later, pipe them in here.
214+
for (id, pg) in header.programs().as_ref() {
215+
buf.extend_from_slice(b"@PG\tID:");
216+
buf.extend_from_slice(id);
217+
for (tag, value) in pg.other_fields() {
218+
buf.push(b'\t');
219+
buf.extend_from_slice(tag.as_ref());
220+
buf.push(b':');
221+
buf.extend_from_slice(value);
222+
}
223+
buf.push(b'\n');
224+
}
225+
226+
// @CO lines (comments).
227+
for comment in header.comments() {
228+
buf.extend_from_slice(b"@CO\t");
229+
buf.extend_from_slice(comment);
230+
buf.push(b'\n');
231+
}
232+
233+
buf
234+
}
235+
99236
#[cfg(test)]
100237
mod tests {
101238
use super::*;

0 commit comments

Comments
 (0)