@@ -15,7 +15,10 @@ use noodles::sam::alignment::record::data::field::Tag;
1515use noodles:: sam:: alignment:: record_buf:: data:: field:: Value ;
1616use noodles:: sam:: alignment:: record_buf:: data:: field:: value:: Array ;
1717use noodles:: sam:: alignment:: record_buf:: { QualityScores , RecordBuf , Sequence } ;
18- use noodles:: sam:: header:: record:: value:: { Map , map:: Program } ;
18+ use noodles:: sam:: header:: record:: value:: {
19+ Map ,
20+ map:: { Program , ReadGroup , tag:: Other as HeaderOtherTag } ,
21+ } ;
1922use std:: collections:: HashSet ;
2023use std:: fmt:: Write as FmtWrite ;
2124use std:: fs:: File ;
@@ -104,9 +107,11 @@ impl SamWriter {
104107 if params. out_sam_strand_field != "intronMotif" {
105108 attrs. remove ( "XS" ) ;
106109 }
110+ let rg_ids = params. rg_ids ( ) ?;
111+ let rg_id = rg_ids. first ( ) . map ( String :: as_str) ;
107112
108113 for ( hit_index, transcript) in transcripts. iter ( ) . take ( max_output) . enumerate ( ) {
109- let record = transcript_to_record (
114+ let mut record = transcript_to_record (
110115 transcript,
111116 read_name,
112117 read_seq,
@@ -117,6 +122,7 @@ impl SamWriter {
117122 hit_index + 1 , // 1-based
118123 & attrs,
119124 ) ?;
125+ maybe_insert_rg_tag ( & mut record, & attrs, rg_id) ;
120126
121127 self . writer . write_alignment_record ( & self . header , & record) ?;
122128 }
@@ -136,7 +142,7 @@ impl SamWriter {
136142 read_seq : & [ u8 ] ,
137143 read_qual : & [ u8 ] ,
138144 ) -> Result < ( ) , Error > {
139- let record = Self :: build_unmapped_record ( read_name, read_seq, read_qual) ?;
145+ let record = Self :: build_unmapped_record ( read_name, read_seq, read_qual, None ) ?;
140146 self . writer . write_alignment_record ( & self . header , & record) ?;
141147 Ok ( ( ) )
142148 }
@@ -185,10 +191,12 @@ impl SamWriter {
185191 /// * `read_name` - Read identifier
186192 /// * `read_seq` - Read sequence (encoded)
187193 /// * `read_qual` - Quality scores
194+ /// * `rg_id` - Optional read group ID to emit as `RG:Z:<id>` tag
188195 pub fn build_unmapped_record (
189196 read_name : & str ,
190197 read_seq : & [ u8 ] ,
191198 read_qual : & [ u8 ] ,
199+ rg_id : Option < & str > ,
192200 ) -> Result < RecordBuf , Error > {
193201 let mut record = RecordBuf :: default ( ) ;
194202
@@ -206,6 +214,12 @@ impl SamWriter {
206214 // Quality scores
207215 * record. quality_scores_mut ( ) = QualityScores :: from ( read_qual. to_vec ( ) ) ;
208216
217+ if let Some ( id) = rg_id {
218+ record
219+ . data_mut ( )
220+ . insert ( Tag :: READ_GROUP , Value :: String ( BString :: from ( id) ) ) ;
221+ }
222+
209223 Ok ( record)
210224 }
211225
@@ -243,10 +257,12 @@ impl SamWriter {
243257 if params. out_sam_strand_field != "intronMotif" {
244258 attrs. remove ( "XS" ) ;
245259 }
260+ let rg_ids = params. rg_ids ( ) ?;
261+ let rg_id = rg_ids. first ( ) . map ( String :: as_str) ;
246262
247263 let mut records = Vec :: with_capacity ( max_output) ;
248264 for ( hit_index, transcript) in transcripts. iter ( ) . take ( max_output) . enumerate ( ) {
249- let record = transcript_to_record (
265+ let mut record = transcript_to_record (
250266 transcript,
251267 read_name,
252268 read_seq,
@@ -257,6 +273,7 @@ impl SamWriter {
257273 hit_index + 1 , // 1-based
258274 & attrs,
259275 ) ?;
276+ maybe_insert_rg_tag ( & mut record, & attrs, rg_id) ;
260277 records. push ( record) ;
261278 }
262279
@@ -291,7 +308,7 @@ impl SamWriter {
291308 if paired_alignments. is_empty ( ) {
292309 // Both mates unmapped
293310 return Self :: build_paired_unmapped_records (
294- read_name, mate1_seq, mate1_qual, mate2_seq, mate2_qual,
311+ read_name, mate1_seq, mate1_qual, mate2_seq, mate2_qual, params ,
295312 ) ;
296313 }
297314
@@ -307,6 +324,8 @@ impl SamWriter {
307324 if params. out_sam_strand_field != "intronMotif" {
308325 attrs. remove ( "XS" ) ;
309326 }
327+ let rg_ids = params. rg_ids ( ) ?;
328+ let rg_id = rg_ids. first ( ) . map ( String :: as_str) ;
310329
311330 let mut records = Vec :: with_capacity ( max_output * 2 ) ;
312331
@@ -317,7 +336,7 @@ impl SamWriter {
317336 let combined_score = paired_aln. combined_wt_score ;
318337
319338 // Create record for mate1 (this=mate1, mate=mate2)
320- let rec1 = build_paired_mate_record (
339+ let mut rec1 = build_paired_mate_record (
321340 read_name,
322341 mate1_seq,
323342 mate1_qual,
@@ -333,10 +352,11 @@ impl SamWriter {
333352 combined_score,
334353 & attrs,
335354 ) ?;
355+ maybe_insert_rg_tag ( & mut rec1, & attrs, rg_id) ;
336356 records. push ( rec1) ;
337357
338358 // Create record for mate2 (this=mate2, mate=mate1)
339- let rec2 = build_paired_mate_record (
359+ let mut rec2 = build_paired_mate_record (
340360 read_name,
341361 mate2_seq,
342362 mate2_qual,
@@ -352,6 +372,7 @@ impl SamWriter {
352372 combined_score,
353373 & attrs,
354374 ) ?;
375+ maybe_insert_rg_tag ( & mut rec2, & attrs, rg_id) ;
355376 records. push ( rec2) ;
356377 }
357378
@@ -389,6 +410,8 @@ impl SamWriter {
389410 if params. out_sam_strand_field != "intronMotif" {
390411 attrs. remove ( "XS" ) ;
391412 }
413+ let rg_ids = params. rg_ids ( ) ?;
414+ let rg_id = rg_ids. first ( ) . map ( String :: as_str) ;
392415
393416 // Compute mapped mate's per-chr position for co-location
394417 let chr_start = genome. chr_start [ mapped_transcript. chr_idx ] ;
@@ -491,6 +514,7 @@ impl SamWriter {
491514 ) ;
492515 data. insert ( Tag :: new ( b'M' , b'D' ) , Value :: String ( BString :: from ( md) ) ) ;
493516 }
517+ maybe_insert_rg_tag ( & mut mapped_rec, & attrs, rg_id) ;
494518
495519 // --- Build unmapped mate record ---
496520 let mut unmapped_rec = RecordBuf :: default ( ) ;
@@ -530,6 +554,7 @@ impl SamWriter {
530554 let unmapped_seq_bytes: Vec < u8 > = unmapped_seq. iter ( ) . map ( |& b| decode_base ( b) ) . collect ( ) ;
531555 * unmapped_rec. sequence_mut ( ) = Sequence :: from ( unmapped_seq_bytes) ;
532556 * unmapped_rec. quality_scores_mut ( ) = QualityScores :: from ( unmapped_qual. to_vec ( ) ) ;
557+ maybe_insert_rg_tag ( & mut unmapped_rec, & attrs, rg_id) ;
533558
534559 // Order: mate1 first, mate2 second
535560 if mate1_is_mapped {
@@ -550,8 +575,12 @@ impl SamWriter {
550575 mate1_qual : & [ u8 ] ,
551576 mate2_seq : & [ u8 ] ,
552577 mate2_qual : & [ u8 ] ,
578+ params : & Parameters ,
553579 ) -> Result < Vec < RecordBuf > , Error > {
554580 let mut records = Vec :: with_capacity ( 2 ) ;
581+ let attrs = params. sam_attribute_set ( ) ;
582+ let rg_ids = params. rg_ids ( ) ?;
583+ let rg_id = rg_ids. first ( ) . map ( String :: as_str) ;
555584
556585 // Mate1 record
557586 let mut rec1 = RecordBuf :: default ( ) ;
@@ -567,6 +596,7 @@ impl SamWriter {
567596 let seq1_bytes: Vec < u8 > = mate1_seq. iter ( ) . map ( |& b| decode_base ( b) ) . collect ( ) ;
568597 * rec1. sequence_mut ( ) = Sequence :: from ( seq1_bytes) ;
569598 * rec1. quality_scores_mut ( ) = QualityScores :: from ( mate1_qual. to_vec ( ) ) ;
599+ maybe_insert_rg_tag ( & mut rec1, & attrs, rg_id) ;
570600 records. push ( rec1) ;
571601
572602 // Mate2 record
@@ -583,14 +613,15 @@ impl SamWriter {
583613 let seq2_bytes: Vec < u8 > = mate2_seq. iter ( ) . map ( |& b| decode_base ( b) ) . collect ( ) ;
584614 * rec2. sequence_mut ( ) = Sequence :: from ( seq2_bytes) ;
585615 * rec2. quality_scores_mut ( ) = QualityScores :: from ( mate2_qual. to_vec ( ) ) ;
616+ maybe_insert_rg_tag ( & mut rec2, & attrs, rg_id) ;
586617 records. push ( rec2) ;
587618
588619 Ok ( records)
589620 }
590621}
591622
592623/// Build paired SAM header from genome
593- pub fn build_sam_header ( genome : & Genome , _params : & Parameters ) -> Result < sam:: Header , Error > {
624+ pub fn build_sam_header ( genome : & Genome , params : & Parameters ) -> Result < sam:: Header , Error > {
594625 let mut builder = sam:: Header :: builder ( ) ;
595626
596627 // @HD line (default version and unsorted)
@@ -610,12 +641,52 @@ pub fn build_sam_header(genome: &Genome, _params: &Parameters) -> Result<sam::He
610641 ) ;
611642 }
612643
644+ // @RG lines from --outSAMattrRGline. When multiple input files share the
645+ // same RG ID, only emit one @RG line.
646+ let rg_lines = params. parsed_rg_lines ( ) ?;
647+ let mut seen_ids: HashSet < String > = HashSet :: new ( ) ;
648+ for line in & rg_lines {
649+ let mut fields = line. split ( '\t' ) ;
650+ let id = fields
651+ . next ( )
652+ . and_then ( |f| f. strip_prefix ( "ID:" ) )
653+ . ok_or_else ( || Error :: Parameter ( format ! ( "malformed RG line '{}'" , line) ) ) ?;
654+ if !seen_ids. insert ( id. to_string ( ) ) {
655+ continue ;
656+ }
657+ let mut map = Map :: < ReadGroup > :: default ( ) ;
658+ for field in fields {
659+ if field. len ( ) < 3 || & field[ 2 ..3 ] != ":" {
660+ return Err ( Error :: Parameter ( format ! (
661+ "RG field '{}' is not TAG:value" ,
662+ field
663+ ) ) ) ;
664+ }
665+ let tag_bytes: [ u8 ; 2 ] = field. as_bytes ( ) [ ..2 ] . try_into ( ) . unwrap ( ) ;
666+ let other_tag: HeaderOtherTag < _ > =
667+ HeaderOtherTag :: try_from ( tag_bytes) . map_err ( |e| {
668+ Error :: Parameter ( format ! ( "invalid RG tag '{}': {}" , & field[ ..2 ] , e) )
669+ } ) ?;
670+ map. other_fields_mut ( ) . insert ( other_tag, field[ 3 ..] . into ( ) ) ;
671+ }
672+ builder = builder. add_read_group ( id, map) ;
673+ }
674+
613675 // @PG line
614676 builder = builder. add_program ( "ruSTAR" , Map :: < Program > :: default ( ) ) ;
615677
616678 Ok ( builder. build ( ) )
617679}
618680
681+ /// Insert `RG:Z:<id>` on the record when RG is in `attrs` and an ID is set.
682+ fn maybe_insert_rg_tag ( record : & mut RecordBuf , attrs : & HashSet < String > , rg_id : Option < & str > ) {
683+ if let ( true , Some ( id) ) = ( attrs. contains ( "RG" ) , rg_id) {
684+ record
685+ . data_mut ( )
686+ . insert ( Tag :: READ_GROUP , Value :: String ( BString :: from ( id) ) ) ;
687+ }
688+ }
689+
619690/// Convert Transcript to SAM record
620691#[ allow( clippy:: too_many_arguments) ]
621692fn transcript_to_record (
@@ -1129,6 +1200,99 @@ mod tests {
11291200 assert_eq ! ( header. reference_sequences( ) . len( ) , 1 ) ;
11301201 }
11311202
1203+ #[ test]
1204+ fn test_build_sam_header_with_rg ( ) {
1205+ let genome = make_test_genome ( ) ;
1206+ let params = Parameters :: parse_from ( vec ! [
1207+ "ruSTAR" ,
1208+ "--readFilesIn" ,
1209+ "test.fq" ,
1210+ "--outSAMattrRGline" ,
1211+ "ID:rg0" ,
1212+ "SM:sample0" ,
1213+ "LB:lib0" ,
1214+ ] ) ;
1215+ let header = build_sam_header ( & genome, & params) . unwrap ( ) ;
1216+ let rgs = header. read_groups ( ) ;
1217+ assert_eq ! ( rgs. len( ) , 1 ) ;
1218+ assert ! ( rgs. contains_key( & b"rg0" [ ..] ) ) ;
1219+ let map = rgs. get ( & b"rg0" [ ..] ) . unwrap ( ) ;
1220+ // SM and LB should be present as other_fields
1221+ let sm_tag = HeaderOtherTag :: < _ > :: try_from ( [ b'S' , b'M' ] ) . unwrap ( ) ;
1222+ let lb_tag = HeaderOtherTag :: < _ > :: try_from ( [ b'L' , b'B' ] ) . unwrap ( ) ;
1223+ let sm: & [ u8 ] = map. other_fields ( ) . get ( & sm_tag) . unwrap ( ) . as_ref ( ) ;
1224+ let lb: & [ u8 ] = map. other_fields ( ) . get ( & lb_tag) . unwrap ( ) . as_ref ( ) ;
1225+ assert_eq ! ( sm, b"sample0" ) ;
1226+ assert_eq ! ( lb, b"lib0" ) ;
1227+ }
1228+
1229+ #[ test]
1230+ fn test_sam_output_includes_rg_header_and_tag ( ) {
1231+ use crate :: align:: transcript:: Exon ;
1232+ use std:: io:: Read ;
1233+
1234+ let genome = make_test_genome ( ) ;
1235+ let params = Parameters :: parse_from ( vec ! [
1236+ "ruSTAR" ,
1237+ "--readFilesIn" ,
1238+ "test.fq" ,
1239+ "--outSAMattrRGline" ,
1240+ "ID:rg0" ,
1241+ "SM:sample0" ,
1242+ ] ) ;
1243+
1244+ let tmpfile = NamedTempFile :: new ( ) . unwrap ( ) ;
1245+ let mut writer = SamWriter :: create ( tmpfile. path ( ) , & genome, & params) . unwrap ( ) ;
1246+
1247+ let transcript = Transcript {
1248+ chr_idx : 0 ,
1249+ genome_start : 0 ,
1250+ genome_end : 4 ,
1251+ is_reverse : false ,
1252+ exons : vec ! [ Exon {
1253+ genome_start: 0 ,
1254+ genome_end: 4 ,
1255+ read_start: 0 ,
1256+ read_end: 4 ,
1257+ } ] ,
1258+ cigar : vec ! [ CigarOp :: Match ( 4 ) ] ,
1259+ score : 100 ,
1260+ n_mismatch : 0 ,
1261+ n_gap : 0 ,
1262+ n_junction : 0 ,
1263+ junction_motifs : vec ! [ ] ,
1264+ junction_annotated : vec ! [ ] ,
1265+ read_seq : vec ! [ 0 , 1 , 2 , 3 ] ,
1266+ } ;
1267+
1268+ writer
1269+ . write_alignment (
1270+ "read1" ,
1271+ & [ 0 , 1 , 2 , 3 ] ,
1272+ & [ 30 , 30 , 30 , 30 ] ,
1273+ & [ transcript] ,
1274+ & genome,
1275+ & params,
1276+ 1 ,
1277+ )
1278+ . unwrap ( ) ;
1279+ drop ( writer) ;
1280+
1281+ let mut contents = String :: new ( ) ;
1282+ std:: fs:: File :: open ( tmpfile. path ( ) )
1283+ . unwrap ( )
1284+ . read_to_string ( & mut contents)
1285+ . unwrap ( ) ;
1286+ assert ! (
1287+ contents. contains( "@RG\t ID:rg0\t SM:sample0" ) ,
1288+ "missing @RG header; got:\n {contents}"
1289+ ) ;
1290+ assert ! (
1291+ contents. contains( "RG:Z:rg0" ) ,
1292+ "missing RG:Z tag on record; got:\n {contents}"
1293+ ) ;
1294+ }
1295+
11321296 #[ test]
11331297 fn test_sam_writer_creation ( ) {
11341298 let genome = make_test_genome ( ) ;
@@ -1207,13 +1371,15 @@ mod tests {
12071371 let mate1_qual = vec ! [ 30 , 30 , 30 , 30 ] ;
12081372 let mate2_seq = vec ! [ 3 , 2 , 1 , 0 ] ; // TGCA
12091373 let mate2_qual = vec ! [ 30 , 30 , 30 , 30 ] ;
1374+ let params = Parameters :: parse_from ( vec ! [ "ruSTAR" , "--readFilesIn" , "t.fq" ] ) ;
12101375
12111376 let records = SamWriter :: build_paired_unmapped_records (
12121377 "read1" ,
12131378 & mate1_seq,
12141379 & mate1_qual,
12151380 & mate2_seq,
12161381 & mate2_qual,
1382+ & params,
12171383 )
12181384 . unwrap ( ) ;
12191385
0 commit comments