Skip to content

Commit 6740954

Browse files
ewelsclaude
andcommitted
Add --outSAMattrRGline flag and emit @rg header + RG:Z tags
Parses STAR's comma-separated multi-RG syntax (first field must be ID: on each block, `,` separates blocks, tokens are tab-joined into the SAM @rg body). Replicates a single RG line across N input files like STAR does, and errors on count mismatch or RG-in-outSAMattributes without a line. Auto-appends RG to sam_attribute_set() when a line is set (matches Parameters_samAttributes.cpp:201). SAM/BAM headers now emit one @rg per unique ID, and every record (mapped, unmapped, half-mapped, both- mates) carries RG:Z:<id> when attrs contain RG. Unblocks nf-core/rnaseq, which invokes STAR with `--outSAMattrRGline 'ID:X' 'SM:X'` and previously errored on the unknown flag. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 023e88e commit 6740954

4 files changed

Lines changed: 378 additions & 12 deletions

File tree

src/io/bam.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,10 @@ mod tests {
125125
let read_qual = vec![30, 30, 30, 30];
126126

127127
// Build record using SAM builder
128-
let record =
129-
crate::io::sam::SamWriter::build_unmapped_record(read_name, &read_seq, &read_qual)
130-
.unwrap();
128+
let record = crate::io::sam::SamWriter::build_unmapped_record(
129+
read_name, &read_seq, &read_qual, None,
130+
)
131+
.unwrap();
131132

132133
let result = writer.write_batch(&[record]);
133134
assert!(result.is_ok(), "Writing unmapped read should succeed");
@@ -204,12 +205,14 @@ mod tests {
204205
"read1",
205206
&[0, 1, 2, 3],
206207
&[30, 30, 30, 30],
208+
None,
207209
)
208210
.unwrap(),
209211
crate::io::sam::SamWriter::build_unmapped_record(
210212
"read2",
211213
&[0, 1, 2, 3],
212214
&[30, 30, 30, 30],
215+
None,
213216
)
214217
.unwrap(),
215218
];

src/io/sam.rs

Lines changed: 174 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@ use noodles::sam::alignment::record::data::field::Tag;
1515
use noodles::sam::alignment::record_buf::data::field::Value;
1616
use noodles::sam::alignment::record_buf::data::field::value::Array;
1717
use noodles::sam::alignment::record_buf::{QualityScores, RecordBuf, Sequence};
18-
use noodles::sam::header::record::value::{Map, map::Program};
18+
use noodles::sam::header::record::value::{
19+
Map,
20+
map::{Program, ReadGroup, tag::Other as HeaderOtherTag},
21+
};
1922
use std::collections::HashSet;
2023
use std::fmt::Write as FmtWrite;
2124
use std::fs::File;
@@ -104,9 +107,11 @@ impl SamWriter {
104107
if params.out_sam_strand_field != "intronMotif" {
105108
attrs.remove("XS");
106109
}
110+
let rg_ids = params.rg_ids()?;
111+
let rg_id = rg_ids.first().map(String::as_str);
107112

108113
for (hit_index, transcript) in transcripts.iter().take(max_output).enumerate() {
109-
let record = transcript_to_record(
114+
let mut record = transcript_to_record(
110115
transcript,
111116
read_name,
112117
read_seq,
@@ -117,6 +122,7 @@ impl SamWriter {
117122
hit_index + 1, // 1-based
118123
&attrs,
119124
)?;
125+
maybe_insert_rg_tag(&mut record, &attrs, rg_id);
120126

121127
self.writer.write_alignment_record(&self.header, &record)?;
122128
}
@@ -136,7 +142,7 @@ impl SamWriter {
136142
read_seq: &[u8],
137143
read_qual: &[u8],
138144
) -> Result<(), Error> {
139-
let record = Self::build_unmapped_record(read_name, read_seq, read_qual)?;
145+
let record = Self::build_unmapped_record(read_name, read_seq, read_qual, None)?;
140146
self.writer.write_alignment_record(&self.header, &record)?;
141147
Ok(())
142148
}
@@ -185,10 +191,12 @@ impl SamWriter {
185191
/// * `read_name` - Read identifier
186192
/// * `read_seq` - Read sequence (encoded)
187193
/// * `read_qual` - Quality scores
194+
/// * `rg_id` - Optional read group ID to emit as `RG:Z:<id>` tag
188195
pub fn build_unmapped_record(
189196
read_name: &str,
190197
read_seq: &[u8],
191198
read_qual: &[u8],
199+
rg_id: Option<&str>,
192200
) -> Result<RecordBuf, Error> {
193201
let mut record = RecordBuf::default();
194202

@@ -206,6 +214,12 @@ impl SamWriter {
206214
// Quality scores
207215
*record.quality_scores_mut() = QualityScores::from(read_qual.to_vec());
208216

217+
if let Some(id) = rg_id {
218+
record
219+
.data_mut()
220+
.insert(Tag::READ_GROUP, Value::String(BString::from(id)));
221+
}
222+
209223
Ok(record)
210224
}
211225

@@ -243,10 +257,12 @@ impl SamWriter {
243257
if params.out_sam_strand_field != "intronMotif" {
244258
attrs.remove("XS");
245259
}
260+
let rg_ids = params.rg_ids()?;
261+
let rg_id = rg_ids.first().map(String::as_str);
246262

247263
let mut records = Vec::with_capacity(max_output);
248264
for (hit_index, transcript) in transcripts.iter().take(max_output).enumerate() {
249-
let record = transcript_to_record(
265+
let mut record = transcript_to_record(
250266
transcript,
251267
read_name,
252268
read_seq,
@@ -257,6 +273,7 @@ impl SamWriter {
257273
hit_index + 1, // 1-based
258274
&attrs,
259275
)?;
276+
maybe_insert_rg_tag(&mut record, &attrs, rg_id);
260277
records.push(record);
261278
}
262279

@@ -291,7 +308,7 @@ impl SamWriter {
291308
if paired_alignments.is_empty() {
292309
// Both mates unmapped
293310
return Self::build_paired_unmapped_records(
294-
read_name, mate1_seq, mate1_qual, mate2_seq, mate2_qual,
311+
read_name, mate1_seq, mate1_qual, mate2_seq, mate2_qual, params,
295312
);
296313
}
297314

@@ -307,6 +324,8 @@ impl SamWriter {
307324
if params.out_sam_strand_field != "intronMotif" {
308325
attrs.remove("XS");
309326
}
327+
let rg_ids = params.rg_ids()?;
328+
let rg_id = rg_ids.first().map(String::as_str);
310329

311330
let mut records = Vec::with_capacity(max_output * 2);
312331

@@ -317,7 +336,7 @@ impl SamWriter {
317336
let combined_score = paired_aln.combined_wt_score;
318337

319338
// Create record for mate1 (this=mate1, mate=mate2)
320-
let rec1 = build_paired_mate_record(
339+
let mut rec1 = build_paired_mate_record(
321340
read_name,
322341
mate1_seq,
323342
mate1_qual,
@@ -333,10 +352,11 @@ impl SamWriter {
333352
combined_score,
334353
&attrs,
335354
)?;
355+
maybe_insert_rg_tag(&mut rec1, &attrs, rg_id);
336356
records.push(rec1);
337357

338358
// Create record for mate2 (this=mate2, mate=mate1)
339-
let rec2 = build_paired_mate_record(
359+
let mut rec2 = build_paired_mate_record(
340360
read_name,
341361
mate2_seq,
342362
mate2_qual,
@@ -352,6 +372,7 @@ impl SamWriter {
352372
combined_score,
353373
&attrs,
354374
)?;
375+
maybe_insert_rg_tag(&mut rec2, &attrs, rg_id);
355376
records.push(rec2);
356377
}
357378

@@ -389,6 +410,8 @@ impl SamWriter {
389410
if params.out_sam_strand_field != "intronMotif" {
390411
attrs.remove("XS");
391412
}
413+
let rg_ids = params.rg_ids()?;
414+
let rg_id = rg_ids.first().map(String::as_str);
392415

393416
// Compute mapped mate's per-chr position for co-location
394417
let chr_start = genome.chr_start[mapped_transcript.chr_idx];
@@ -491,6 +514,7 @@ impl SamWriter {
491514
);
492515
data.insert(Tag::new(b'M', b'D'), Value::String(BString::from(md)));
493516
}
517+
maybe_insert_rg_tag(&mut mapped_rec, &attrs, rg_id);
494518

495519
// --- Build unmapped mate record ---
496520
let mut unmapped_rec = RecordBuf::default();
@@ -530,6 +554,7 @@ impl SamWriter {
530554
let unmapped_seq_bytes: Vec<u8> = unmapped_seq.iter().map(|&b| decode_base(b)).collect();
531555
*unmapped_rec.sequence_mut() = Sequence::from(unmapped_seq_bytes);
532556
*unmapped_rec.quality_scores_mut() = QualityScores::from(unmapped_qual.to_vec());
557+
maybe_insert_rg_tag(&mut unmapped_rec, &attrs, rg_id);
533558

534559
// Order: mate1 first, mate2 second
535560
if mate1_is_mapped {
@@ -550,8 +575,12 @@ impl SamWriter {
550575
mate1_qual: &[u8],
551576
mate2_seq: &[u8],
552577
mate2_qual: &[u8],
578+
params: &Parameters,
553579
) -> Result<Vec<RecordBuf>, Error> {
554580
let mut records = Vec::with_capacity(2);
581+
let attrs = params.sam_attribute_set();
582+
let rg_ids = params.rg_ids()?;
583+
let rg_id = rg_ids.first().map(String::as_str);
555584

556585
// Mate1 record
557586
let mut rec1 = RecordBuf::default();
@@ -567,6 +596,7 @@ impl SamWriter {
567596
let seq1_bytes: Vec<u8> = mate1_seq.iter().map(|&b| decode_base(b)).collect();
568597
*rec1.sequence_mut() = Sequence::from(seq1_bytes);
569598
*rec1.quality_scores_mut() = QualityScores::from(mate1_qual.to_vec());
599+
maybe_insert_rg_tag(&mut rec1, &attrs, rg_id);
570600
records.push(rec1);
571601

572602
// Mate2 record
@@ -583,14 +613,15 @@ impl SamWriter {
583613
let seq2_bytes: Vec<u8> = mate2_seq.iter().map(|&b| decode_base(b)).collect();
584614
*rec2.sequence_mut() = Sequence::from(seq2_bytes);
585615
*rec2.quality_scores_mut() = QualityScores::from(mate2_qual.to_vec());
616+
maybe_insert_rg_tag(&mut rec2, &attrs, rg_id);
586617
records.push(rec2);
587618

588619
Ok(records)
589620
}
590621
}
591622

592623
/// Build paired SAM header from genome
593-
pub fn build_sam_header(genome: &Genome, _params: &Parameters) -> Result<sam::Header, Error> {
624+
pub fn build_sam_header(genome: &Genome, params: &Parameters) -> Result<sam::Header, Error> {
594625
let mut builder = sam::Header::builder();
595626

596627
// @HD line (default version and unsorted)
@@ -610,12 +641,52 @@ pub fn build_sam_header(genome: &Genome, _params: &Parameters) -> Result<sam::He
610641
);
611642
}
612643

644+
// @RG lines from --outSAMattrRGline. When multiple input files share the
645+
// same RG ID, only emit one @RG line.
646+
let rg_lines = params.parsed_rg_lines()?;
647+
let mut seen_ids: HashSet<String> = HashSet::new();
648+
for line in &rg_lines {
649+
let mut fields = line.split('\t');
650+
let id = fields
651+
.next()
652+
.and_then(|f| f.strip_prefix("ID:"))
653+
.ok_or_else(|| Error::Parameter(format!("malformed RG line '{}'", line)))?;
654+
if !seen_ids.insert(id.to_string()) {
655+
continue;
656+
}
657+
let mut map = Map::<ReadGroup>::default();
658+
for field in fields {
659+
if field.len() < 3 || &field[2..3] != ":" {
660+
return Err(Error::Parameter(format!(
661+
"RG field '{}' is not TAG:value",
662+
field
663+
)));
664+
}
665+
let tag_bytes: [u8; 2] = field.as_bytes()[..2].try_into().unwrap();
666+
let other_tag: HeaderOtherTag<_> =
667+
HeaderOtherTag::try_from(tag_bytes).map_err(|e| {
668+
Error::Parameter(format!("invalid RG tag '{}': {}", &field[..2], e))
669+
})?;
670+
map.other_fields_mut().insert(other_tag, field[3..].into());
671+
}
672+
builder = builder.add_read_group(id, map);
673+
}
674+
613675
// @PG line
614676
builder = builder.add_program("ruSTAR", Map::<Program>::default());
615677

616678
Ok(builder.build())
617679
}
618680

681+
/// Insert `RG:Z:<id>` on the record when RG is in `attrs` and an ID is set.
682+
fn maybe_insert_rg_tag(record: &mut RecordBuf, attrs: &HashSet<String>, rg_id: Option<&str>) {
683+
if let (true, Some(id)) = (attrs.contains("RG"), rg_id) {
684+
record
685+
.data_mut()
686+
.insert(Tag::READ_GROUP, Value::String(BString::from(id)));
687+
}
688+
}
689+
619690
/// Convert Transcript to SAM record
620691
#[allow(clippy::too_many_arguments)]
621692
fn transcript_to_record(
@@ -1129,6 +1200,99 @@ mod tests {
11291200
assert_eq!(header.reference_sequences().len(), 1);
11301201
}
11311202

1203+
#[test]
1204+
fn test_build_sam_header_with_rg() {
1205+
let genome = make_test_genome();
1206+
let params = Parameters::parse_from(vec![
1207+
"ruSTAR",
1208+
"--readFilesIn",
1209+
"test.fq",
1210+
"--outSAMattrRGline",
1211+
"ID:rg0",
1212+
"SM:sample0",
1213+
"LB:lib0",
1214+
]);
1215+
let header = build_sam_header(&genome, &params).unwrap();
1216+
let rgs = header.read_groups();
1217+
assert_eq!(rgs.len(), 1);
1218+
assert!(rgs.contains_key(&b"rg0"[..]));
1219+
let map = rgs.get(&b"rg0"[..]).unwrap();
1220+
// SM and LB should be present as other_fields
1221+
let sm_tag = HeaderOtherTag::<_>::try_from([b'S', b'M']).unwrap();
1222+
let lb_tag = HeaderOtherTag::<_>::try_from([b'L', b'B']).unwrap();
1223+
let sm: &[u8] = map.other_fields().get(&sm_tag).unwrap().as_ref();
1224+
let lb: &[u8] = map.other_fields().get(&lb_tag).unwrap().as_ref();
1225+
assert_eq!(sm, b"sample0");
1226+
assert_eq!(lb, b"lib0");
1227+
}
1228+
1229+
#[test]
1230+
fn test_sam_output_includes_rg_header_and_tag() {
1231+
use crate::align::transcript::Exon;
1232+
use std::io::Read;
1233+
1234+
let genome = make_test_genome();
1235+
let params = Parameters::parse_from(vec![
1236+
"ruSTAR",
1237+
"--readFilesIn",
1238+
"test.fq",
1239+
"--outSAMattrRGline",
1240+
"ID:rg0",
1241+
"SM:sample0",
1242+
]);
1243+
1244+
let tmpfile = NamedTempFile::new().unwrap();
1245+
let mut writer = SamWriter::create(tmpfile.path(), &genome, &params).unwrap();
1246+
1247+
let transcript = Transcript {
1248+
chr_idx: 0,
1249+
genome_start: 0,
1250+
genome_end: 4,
1251+
is_reverse: false,
1252+
exons: vec![Exon {
1253+
genome_start: 0,
1254+
genome_end: 4,
1255+
read_start: 0,
1256+
read_end: 4,
1257+
}],
1258+
cigar: vec![CigarOp::Match(4)],
1259+
score: 100,
1260+
n_mismatch: 0,
1261+
n_gap: 0,
1262+
n_junction: 0,
1263+
junction_motifs: vec![],
1264+
junction_annotated: vec![],
1265+
read_seq: vec![0, 1, 2, 3],
1266+
};
1267+
1268+
writer
1269+
.write_alignment(
1270+
"read1",
1271+
&[0, 1, 2, 3],
1272+
&[30, 30, 30, 30],
1273+
&[transcript],
1274+
&genome,
1275+
&params,
1276+
1,
1277+
)
1278+
.unwrap();
1279+
drop(writer);
1280+
1281+
let mut contents = String::new();
1282+
std::fs::File::open(tmpfile.path())
1283+
.unwrap()
1284+
.read_to_string(&mut contents)
1285+
.unwrap();
1286+
assert!(
1287+
contents.contains("@RG\tID:rg0\tSM:sample0"),
1288+
"missing @RG header; got:\n{contents}"
1289+
);
1290+
assert!(
1291+
contents.contains("RG:Z:rg0"),
1292+
"missing RG:Z tag on record; got:\n{contents}"
1293+
);
1294+
}
1295+
11321296
#[test]
11331297
fn test_sam_writer_creation() {
11341298
let genome = make_test_genome();
@@ -1207,13 +1371,15 @@ mod tests {
12071371
let mate1_qual = vec![30, 30, 30, 30];
12081372
let mate2_seq = vec![3, 2, 1, 0]; // TGCA
12091373
let mate2_qual = vec![30, 30, 30, 30];
1374+
let params = Parameters::parse_from(vec!["ruSTAR", "--readFilesIn", "t.fq"]);
12101375

12111376
let records = SamWriter::build_paired_unmapped_records(
12121377
"read1",
12131378
&mate1_seq,
12141379
&mate1_qual,
12151380
&mate2_seq,
12161381
&mate2_qual,
1382+
&params,
12171383
)
12181384
.unwrap();
12191385

0 commit comments

Comments
 (0)