Skip to content

Commit 27e3da1

Browse files
committed
fix(transcriptome-bam): write per-record RG:Z: tag matching @rg header
When --outSAMattrRGline is supplied, rustar writes the @rg header to both the genome and transcriptome BAMs, but only stamps the per-record RG:Z: tag on the genome BAM. The transcriptome BAM was emitting 0 RG:Z: records vs STAR's 1-per-record output. Add the RG tag stamp to the transcriptome record builder (paired-end and single-end paths) so every record carries RG:Z:<id> matching the @rg header, byte-symmetric with STAR. Fixes #32
1 parent 70be24d commit 27e3da1

1 file changed

Lines changed: 125 additions & 0 deletions

File tree

src/io/sam.rs

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,12 @@ impl SamWriter {
588588
let n_alignments = projected.len();
589589
let mut records = Vec::with_capacity(n_alignments);
590590

591+
// STAR pushes ATTR_RG onto both outSAMattrOrder and outSAMattrOrderQuant
592+
// when --outSAMattrRGline is set; mirror that here so transcriptome
593+
// records get the per-record RG:Z:<id> tag matching the @RG header.
594+
let rg_id_owned = params.primary_rg_id()?;
595+
let rg_id = rg_id_owned.as_deref();
596+
591597
for (hit_idx, t) in projected.iter().enumerate() {
592598
let mut record = RecordBuf::default();
593599
record.name_mut().replace(read_name.into());
@@ -653,6 +659,8 @@ impl SamWriter {
653659
data.insert(Tag::new(b'n', b'M'), Value::from(t.n_mismatch as i32));
654660
}
655661

662+
maybe_insert_rg_tag(&mut record, rg_id);
663+
656664
records.push(record);
657665
}
658666

@@ -1524,6 +1532,123 @@ mod tests {
15241532
assert!(!record.flags().is_secondary());
15251533
}
15261534

1535+
#[test]
1536+
fn test_build_transcriptome_records_stamps_rg_tag() {
1537+
// Regression test for #32: when --outSAMattrRGline is supplied, every
1538+
// transcriptome record must carry the RG:Z:<id> tag matching the @RG
1539+
// header (mirroring STAR's outSAMattrOrderQuant.push_back(ATTR_RG) at
1540+
// Parameters_samAttributes.cpp:201-205).
1541+
let params = Parameters::parse_from(vec![
1542+
"rustar-aligner",
1543+
"--readFilesIn",
1544+
"test.fq",
1545+
"--outSAMattrRGline",
1546+
"ID:rg0",
1547+
"SM:sample0",
1548+
]);
1549+
1550+
let projected = vec![
1551+
Transcript {
1552+
chr_idx: 0,
1553+
genome_start: 0,
1554+
genome_end: 4,
1555+
is_reverse: false,
1556+
exons: vec![],
1557+
cigar: vec![CigarOp::Match(4)],
1558+
score: 100,
1559+
n_mismatch: 0,
1560+
n_gap: 0,
1561+
n_junction: 0,
1562+
junction_motifs: vec![],
1563+
junction_annotated: vec![],
1564+
read_seq: vec![0, 1, 2, 3],
1565+
},
1566+
Transcript {
1567+
chr_idx: 0,
1568+
genome_start: 2,
1569+
genome_end: 6,
1570+
is_reverse: true,
1571+
exons: vec![],
1572+
cigar: vec![CigarOp::Match(4)],
1573+
score: 90,
1574+
n_mismatch: 1,
1575+
n_gap: 0,
1576+
n_junction: 0,
1577+
junction_motifs: vec![],
1578+
junction_annotated: vec![],
1579+
read_seq: vec![0, 1, 2, 3],
1580+
},
1581+
];
1582+
1583+
let records = SamWriter::build_transcriptome_records(
1584+
"read1",
1585+
&[0, 1, 2, 3],
1586+
&[30, 30, 30, 30],
1587+
&projected,
1588+
255,
1589+
&params,
1590+
0,
1591+
)
1592+
.expect("build_transcriptome_records");
1593+
1594+
assert_eq!(records.len(), 2, "expected one record per projected hit");
1595+
for (i, rec) in records.iter().enumerate() {
1596+
let rg = rec
1597+
.data()
1598+
.get(&Tag::READ_GROUP)
1599+
.unwrap_or_else(|| panic!("record {i} missing RG tag"));
1600+
match rg {
1601+
Value::String(s) => assert_eq!(
1602+
s.as_slice(),
1603+
b"rg0",
1604+
"record {i}: expected RG:Z:rg0, got {s:?}"
1605+
),
1606+
other => panic!("record {i}: RG tag is not a string: {other:?}"),
1607+
}
1608+
}
1609+
}
1610+
1611+
#[test]
1612+
fn test_build_transcriptome_records_no_rg_when_unset() {
1613+
// When --outSAMattrRGline is not supplied, no RG:Z: tag should be
1614+
// emitted on transcriptome records (matching the existing genome-BAM
1615+
// behavior gated on `rg_id.is_some()`).
1616+
let params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "test.fq"]);
1617+
1618+
let projected = vec![Transcript {
1619+
chr_idx: 0,
1620+
genome_start: 0,
1621+
genome_end: 4,
1622+
is_reverse: false,
1623+
exons: vec![],
1624+
cigar: vec![CigarOp::Match(4)],
1625+
score: 100,
1626+
n_mismatch: 0,
1627+
n_gap: 0,
1628+
n_junction: 0,
1629+
junction_motifs: vec![],
1630+
junction_annotated: vec![],
1631+
read_seq: vec![0, 1, 2, 3],
1632+
}];
1633+
1634+
let records = SamWriter::build_transcriptome_records(
1635+
"read1",
1636+
&[0, 1, 2, 3],
1637+
&[30, 30, 30, 30],
1638+
&projected,
1639+
255,
1640+
&params,
1641+
0,
1642+
)
1643+
.expect("build_transcriptome_records");
1644+
1645+
assert_eq!(records.len(), 1);
1646+
assert!(
1647+
records[0].data().get(&Tag::READ_GROUP).is_none(),
1648+
"RG tag should not be present when --outSAMattrRGline is unset"
1649+
);
1650+
}
1651+
15271652
#[test]
15281653
fn test_build_paired_unmapped_records() {
15291654
let mate1_seq = vec![0, 1, 2, 3]; // ACGT

0 commit comments

Comments
 (0)