@@ -3,12 +3,14 @@ use crate::error::Error;
33use crate :: genome:: Genome ;
44use crate :: params:: Parameters ;
55use crate :: quant:: transcriptome:: TranscriptomeIndex ;
6+ use byteorder:: { LittleEndian , WriteBytesExt } ;
67use noodles:: bam;
78use noodles:: sam;
89use noodles:: sam:: alignment:: io:: Write as SamWrite ;
910use noodles:: sam:: alignment:: record_buf:: RecordBuf ;
11+ use std:: ffi:: CString ;
1012use std:: fs:: File ;
11- use std:: io:: BufWriter ;
13+ use std:: io:: { BufWriter , Write } ;
1214use std:: path:: Path ;
1315
1416/// Buffer for BAM records built by parallel threads
@@ -43,10 +45,21 @@ pub struct BamWriter {
4345
4446impl BamWriter {
4547 /// Create a BAM writer at `output_path` with the given prepared header.
48+ ///
49+ /// Uses a lenient header writer that bypasses noodles' SAM-spec
50+ /// reference-name validator. STAR accepts `(` / `)` in reference names
51+ /// (yeast tRNA transcripts like `tP(UGG)A`), which noodles' strict
52+ /// validator rejects. Writing the SAM text portion of the BAM header
53+ /// manually sidesteps that validation while preserving every other byte.
4654 fn with_header ( output_path : & Path , header : sam:: Header ) -> Result < Self , Error > {
4755 let buf_writer = BufWriter :: new ( File :: create ( output_path) ?) ;
48- let mut writer = bam:: io:: Writer :: new ( buf_writer) ;
49- writer. write_header ( & header) ?;
56+ // Wrap in BGZF *before* writing header bytes so the header lands in
57+ // BGZF blocks just like noodles would write it.
58+ let mut bgzf = noodles:: bgzf:: Writer :: new ( buf_writer) ;
59+ write_bam_header_lenient ( & mut bgzf, & header) ?;
60+ // Reconstruct a bam::io::Writer on top of the already-BGZF-framed
61+ // underlying stream so subsequent record writes use noodles.
62+ let writer = bam:: io:: Writer :: from ( bgzf) ;
5063 Ok ( Self { writer, header } )
5164 }
5265
@@ -96,6 +109,130 @@ impl BamWriter {
96109 }
97110}
98111
112+ /// Write a BAM header that tolerates reference sequence names STAR emits
113+ /// (e.g. `tP(UGG)A`) but that noodles' SAM-spec validator rejects.
114+ ///
115+ /// Replicates `noodles_bam::io::writer::header::write_header` byte-for-byte
116+ /// for compliant headers; the only divergence is that the SAM text block
117+ /// between `BAM\x01` and the binary reference list is produced via a local
118+ /// formatter instead of `sam::io::Writer::write_header`, so forbidden-char
119+ /// names pass through unchanged.
120+ ///
121+ /// Binary reference list (after the text block) uses `CString::new` — the
122+ /// only constraint there is "no interior nul", which is enforced upstream
123+ /// via the usual UTF-8 input.
124+ fn write_bam_header_lenient < W : Write > ( writer : & mut W , header : & sam:: Header ) -> Result < ( ) , Error > {
125+ const MAGIC : & [ u8 ; 4 ] = b"BAM\x01 " ;
126+
127+ writer. write_all ( MAGIC ) ?;
128+
129+ // Build the SAM text block byte-for-byte identical to
130+ // `sam::io::Writer::write_header` minus the name validator:
131+ // `@HD`, `@SQ` (one per reference), `@RG` (if any), `@PG` (if any),
132+ // `@CO` (if any), each line terminated by `\n`.
133+ let text = render_sam_text_lenient ( header) ;
134+ let l_text = i32:: try_from ( text. len ( ) ) . map_err ( |_| {
135+ Error :: Index ( format ! (
136+ "BAM SAM-text header exceeds i32::MAX bytes: {} bytes" ,
137+ text. len( )
138+ ) )
139+ } ) ?;
140+ writer. write_i32 :: < LittleEndian > ( l_text) ?;
141+ writer. write_all ( & text) ?;
142+
143+ // Binary reference list: n_ref then (l_name, name\0, l_ref) per ref.
144+ let refs = header. reference_sequences ( ) ;
145+ let n_ref = i32:: try_from ( refs. len ( ) )
146+ . map_err ( |_| Error :: Index ( "BAM reference count exceeds i32::MAX" . into ( ) ) ) ?;
147+ writer. write_i32 :: < LittleEndian > ( n_ref) ?;
148+ for ( name, rs) in refs {
149+ let c_name = CString :: new ( name. to_vec ( ) ) . map_err ( |e| {
150+ Error :: Index ( format ! ( "reference name contains interior NUL byte: {}" , e) )
151+ } ) ?;
152+ let name_bytes = c_name. as_bytes_with_nul ( ) ;
153+ let l_name = u32:: try_from ( name_bytes. len ( ) ) . map_err ( |_| {
154+ Error :: Index ( format ! (
155+ "reference name longer than u32::MAX: {} bytes" ,
156+ name_bytes. len( )
157+ ) )
158+ } ) ?;
159+ writer. write_u32 :: < LittleEndian > ( l_name) ?;
160+ writer. write_all ( name_bytes) ?;
161+ let l_ref = i32:: try_from ( usize:: from ( rs. length ( ) ) )
162+ . map_err ( |_| Error :: Index ( "reference length exceeds i32::MAX" . into ( ) ) ) ?;
163+ writer. write_i32 :: < LittleEndian > ( l_ref) ?;
164+ }
165+
166+ Ok ( ( ) )
167+ }
168+
169+ fn render_sam_text_lenient ( header : & sam:: Header ) -> Vec < u8 > {
170+ let mut buf: Vec < u8 > = Vec :: new ( ) ;
171+
172+ // @HD line. noodles' Map<Header> in this version doesn't expose
173+ // sort_order/group_order via dedicated accessors; we serialize through
174+ // the generic `other_fields` map (which includes SO/GO when set).
175+ if let Some ( hd) = header. header ( ) {
176+ buf. extend_from_slice ( b"@HD\t VN:" ) ;
177+ buf. extend_from_slice ( hd. version ( ) . to_string ( ) . as_bytes ( ) ) ;
178+ for ( tag, value) in hd. other_fields ( ) {
179+ buf. push ( b'\t' ) ;
180+ buf. extend_from_slice ( tag. as_ref ( ) ) ;
181+ buf. push ( b':' ) ;
182+ buf. extend_from_slice ( value) ;
183+ }
184+ buf. push ( b'\n' ) ;
185+ }
186+
187+ // @SQ lines. Use the raw bytes so forbidden characters pass through.
188+ for ( name, rs) in header. reference_sequences ( ) {
189+ buf. extend_from_slice ( b"@SQ\t SN:" ) ;
190+ buf. extend_from_slice ( name) ;
191+ buf. extend_from_slice ( b"\t LN:" ) ;
192+ buf. extend_from_slice ( usize:: from ( rs. length ( ) ) . to_string ( ) . as_bytes ( ) ) ;
193+ // Other optional @SQ fields (AH, AN, AS, DS, M5, SP, TP, UR) —
194+ // ruSTAR doesn't set any today, so skip.
195+ buf. push ( b'\n' ) ;
196+ }
197+
198+ // @RG lines.
199+ for ( id, rg) in header. read_groups ( ) {
200+ buf. extend_from_slice ( b"@RG\t ID:" ) ;
201+ buf. extend_from_slice ( id) ;
202+ for ( tag, value) in rg. other_fields ( ) {
203+ buf. push ( b'\t' ) ;
204+ buf. extend_from_slice ( tag. as_ref ( ) ) ;
205+ buf. push ( b':' ) ;
206+ buf. extend_from_slice ( value) ;
207+ }
208+ buf. push ( b'\n' ) ;
209+ }
210+
211+ // @PG lines — noodles' map doesn't guarantee insertion order; for
212+ // ruSTAR we emit a single @PG with id "ruSTAR". If more are added
213+ // later, pipe them in here.
214+ for ( id, pg) in header. programs ( ) . as_ref ( ) {
215+ buf. extend_from_slice ( b"@PG\t ID:" ) ;
216+ buf. extend_from_slice ( id) ;
217+ for ( tag, value) in pg. other_fields ( ) {
218+ buf. push ( b'\t' ) ;
219+ buf. extend_from_slice ( tag. as_ref ( ) ) ;
220+ buf. push ( b':' ) ;
221+ buf. extend_from_slice ( value) ;
222+ }
223+ buf. push ( b'\n' ) ;
224+ }
225+
226+ // @CO lines (comments).
227+ for comment in header. comments ( ) {
228+ buf. extend_from_slice ( b"@CO\t " ) ;
229+ buf. extend_from_slice ( comment) ;
230+ buf. push ( b'\n' ) ;
231+ }
232+
233+ buf
234+ }
235+
99236#[ cfg( test) ]
100237mod tests {
101238 use super :: * ;
0 commit comments