Skip to content

Commit d4a382b

Browse files
committed
feat(bam): emit XS:A: strand tag when --outSAMstrandField intronMotif
The flag was accepted at the CLI parser but never produced any XS:A: tags on the output BAM. Downstream tools that need strand inference (StringTie, Cufflinks, rseqc infer_experiment.py) saw nothing. Mirror STAR's two coupling paths: --outSAMattributes XS auto-enables --outSAMstrandField intronMotif, and vice versa. For each output record with at least one canonical intron motif, map the motif to + or - per STAR's convention; non-canonical or mixed motifs omit the tag. Fixes #30
1 parent 70be24d commit d4a382b

2 files changed

Lines changed: 210 additions & 16 deletions

File tree

src/io/sam.rs

Lines changed: 127 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,7 @@ impl SamWriter {
103103
};
104104
let effective_n = n_alignments.max(n_for_mapq);
105105
let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique);
106-
let mut attrs = params.sam_attribute_set();
107-
if params.out_sam_strand_field != "intronMotif" {
108-
attrs.remove("XS");
109-
}
106+
let attrs = params.effective_sam_attribute_set();
110107
let rg_id_owned = params.primary_rg_id()?;
111108
let rg_id = rg_id_owned.as_deref();
112109

@@ -232,10 +229,7 @@ impl SamWriter {
232229
};
233230
let effective_n = n_alignments.max(n_for_mapq);
234231
let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique);
235-
let mut attrs = params.sam_attribute_set();
236-
if params.out_sam_strand_field != "intronMotif" {
237-
attrs.remove("XS");
238-
}
232+
let attrs = params.effective_sam_attribute_set();
239233
let rg_id_owned = params.primary_rg_id()?;
240234
let rg_id = rg_id_owned.as_deref();
241235

@@ -299,10 +293,7 @@ impl SamWriter {
299293
};
300294
let effective_n = n_alignments.max(n_for_mapq);
301295
let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique);
302-
let mut attrs = params.sam_attribute_set();
303-
if params.out_sam_strand_field != "intronMotif" {
304-
attrs.remove("XS");
305-
}
296+
let attrs = params.effective_sam_attribute_set();
306297
let rg_id_owned = params.primary_rg_id()?;
307298
let rg_id = rg_id_owned.as_deref();
308299

@@ -385,10 +376,7 @@ impl SamWriter {
385376
let n_alignments = 1usize;
386377
let effective_n = n_alignments.max(n_for_mapq);
387378
let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique);
388-
let mut attrs = params.sam_attribute_set();
389-
if params.out_sam_strand_field != "intronMotif" {
390-
attrs.remove("XS");
391-
}
379+
let attrs = params.effective_sam_attribute_set();
392380
let rg_id_owned = params.primary_rg_id()?;
393381
let rg_id = rg_id_owned.as_deref();
394382

@@ -3596,4 +3584,127 @@ mod tests {
35963584
assert!(records_m2[0].flags().is_unmapped()); // mate1 is unmapped
35973585
assert!(!records_m2[1].flags().is_unmapped()); // mate2 is mapped
35983586
}
3587+
3588+
fn spliced_gtag_transcript() -> Transcript {
3589+
Transcript {
3590+
chr_idx: 0,
3591+
genome_start: 0,
3592+
genome_end: 200,
3593+
is_reverse: false,
3594+
exons: vec![],
3595+
cigar: vec![
3596+
CigarOp::Match(25),
3597+
CigarOp::RefSkip(100),
3598+
CigarOp::Match(25),
3599+
],
3600+
score: 50,
3601+
n_mismatch: 0,
3602+
n_gap: 0,
3603+
n_junction: 1,
3604+
junction_motifs: vec![SpliceMotif::GtAg],
3605+
junction_annotated: vec![false],
3606+
read_seq: vec![0; 4],
3607+
}
3608+
}
3609+
3610+
#[test]
3611+
fn xs_tag_emitted_when_strand_field_intron_motif_only() {
3612+
let genome = make_test_genome();
3613+
let params = Parameters::parse_from(vec![
3614+
"rustar-aligner",
3615+
"--readFilesIn",
3616+
"r.fq",
3617+
"--outSAMstrandField",
3618+
"intronMotif",
3619+
]);
3620+
3621+
let transcripts = vec![spliced_gtag_transcript()];
3622+
let read_seq = vec![0, 1, 2, 3];
3623+
let read_qual = vec![30, 30, 30, 30];
3624+
3625+
let records = SamWriter::build_alignment_records(
3626+
"read1",
3627+
&read_seq,
3628+
&read_qual,
3629+
&transcripts,
3630+
&genome,
3631+
&params,
3632+
1,
3633+
)
3634+
.unwrap();
3635+
3636+
assert_eq!(records.len(), 1);
3637+
let data = records[0].data();
3638+
assert_eq!(
3639+
data.get(&Tag::new(b'X', b'S')),
3640+
Some(&Value::Character(b'+')),
3641+
"XS:A:+ should be emitted when --outSAMstrandField intronMotif is set"
3642+
);
3643+
}
3644+
3645+
#[test]
3646+
fn xs_tag_emitted_when_xs_in_attributes_only() {
3647+
let genome = make_test_genome();
3648+
let params = Parameters::parse_from(vec![
3649+
"rustar-aligner",
3650+
"--readFilesIn",
3651+
"r.fq",
3652+
"--outSAMattributes",
3653+
"NH",
3654+
"HI",
3655+
"XS",
3656+
]);
3657+
3658+
let transcripts = vec![spliced_gtag_transcript()];
3659+
let read_seq = vec![0, 1, 2, 3];
3660+
let read_qual = vec![30, 30, 30, 30];
3661+
3662+
let records = SamWriter::build_alignment_records(
3663+
"read1",
3664+
&read_seq,
3665+
&read_qual,
3666+
&transcripts,
3667+
&genome,
3668+
&params,
3669+
1,
3670+
)
3671+
.unwrap();
3672+
3673+
assert_eq!(records.len(), 1);
3674+
let data = records[0].data();
3675+
assert_eq!(
3676+
data.get(&Tag::new(b'X', b'S')),
3677+
Some(&Value::Character(b'+')),
3678+
"XS:A:+ should be emitted when XS is listed in --outSAMattributes"
3679+
);
3680+
}
3681+
3682+
#[test]
3683+
fn xs_tag_absent_when_neither_xs_nor_intron_motif_requested() {
3684+
let genome = make_test_genome();
3685+
let params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "r.fq"]);
3686+
3687+
let transcripts = vec![spliced_gtag_transcript()];
3688+
let read_seq = vec![0, 1, 2, 3];
3689+
let read_qual = vec![30, 30, 30, 30];
3690+
3691+
let records = SamWriter::build_alignment_records(
3692+
"read1",
3693+
&read_seq,
3694+
&read_qual,
3695+
&transcripts,
3696+
&genome,
3697+
&params,
3698+
1,
3699+
)
3700+
.unwrap();
3701+
3702+
assert_eq!(records.len(), 1);
3703+
let data = records[0].data();
3704+
assert_eq!(
3705+
data.get(&Tag::new(b'X', b'S')),
3706+
None,
3707+
"XS should not be emitted under default attributes and strand field"
3708+
);
3709+
}
35993710
}

src/params.rs

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,33 @@ impl Parameters {
777777
attrs
778778
}
779779

780+
/// Effective `--outSAMattributes` set after coupling with `--outSAMstrandField`.
781+
///
782+
/// Mirrors STAR's `Parameters_samAttributes.cpp:172-179` and `:213-216`:
783+
/// `XS` in the attribute set and `--outSAMstrandField intronMotif` imply
784+
/// each other. If either is set the returned set contains `XS`; otherwise
785+
/// `XS` is excluded even if it was listed in `--outSAMattributes` without
786+
/// a matching strand field (STAR forces the strand field on, so XS stays).
787+
pub fn effective_sam_attribute_set(&self) -> HashSet<String> {
788+
let mut attrs = self.sam_attribute_set();
789+
if self.effective_out_sam_strand_field() == "intronMotif" {
790+
attrs.insert("XS".to_string());
791+
}
792+
attrs
793+
}
794+
795+
/// Effective `--outSAMstrandField` value after coupling with `--outSAMattributes`.
796+
///
797+
/// `XS` in `--outSAMattributes` forces `intronMotif` even when the user
798+
/// left `--outSAMstrandField` at its default (`None`).
799+
pub fn effective_out_sam_strand_field(&self) -> &str {
800+
if self.out_sam_strand_field == "intronMotif" || self.sam_attribute_set().contains("XS") {
801+
"intronMotif"
802+
} else {
803+
self.out_sam_strand_field.as_str()
804+
}
805+
}
806+
780807
/// True if the user provided a non-default `--outSAMattrRGline`.
781808
pub fn rg_line_set(&self) -> bool {
782809
!self.out_sam_attr_rg_line.is_empty() && self.out_sam_attr_rg_line[0] != "-"
@@ -953,6 +980,19 @@ impl Parameters {
953980
}
954981
self.rg_ids()?;
955982

983+
let user_xs_in_attrs = self.sam_attribute_set().contains("XS");
984+
let user_strand_intron_motif = self.out_sam_strand_field == "intronMotif";
985+
if user_xs_in_attrs && !user_strand_intron_motif {
986+
log::info!(
987+
"--outSAMattributes contains XS, therefore rustar-aligner will use --outSAMstrandField intronMotif"
988+
);
989+
}
990+
if user_strand_intron_motif && !user_xs_in_attrs {
991+
log::info!(
992+
"--outSAMstrandField=intronMotif, therefore rustar-aligner will output XS attribute"
993+
);
994+
}
995+
956996
// quantMode TranscriptomeSAM requires transcript annotations —
957997
// either via --sjdbGTFfile or pre-generated transcriptInfo.tab
958998
// et al in --genomeDir (persisted at genomeGenerate time). At
@@ -1488,4 +1528,47 @@ mod tests {
14881528
]);
14891529
assert_eq!(p.align_sj_stitch_mismatch_nmax, vec![1, -1, 2, 3]);
14901530
}
1531+
1532+
#[test]
1533+
fn xs_strand_field_intron_motif_adds_xs_to_attrs() {
1534+
let p = parse(&[
1535+
"--readFilesIn",
1536+
"r.fq",
1537+
"--outSAMstrandField",
1538+
"intronMotif",
1539+
]);
1540+
let attrs = p.effective_sam_attribute_set();
1541+
assert!(attrs.contains("XS"));
1542+
assert_eq!(p.effective_out_sam_strand_field(), "intronMotif");
1543+
}
1544+
1545+
#[test]
1546+
fn xs_attr_forces_intron_motif_strand_field() {
1547+
let p = parse(&[
1548+
"--readFilesIn",
1549+
"r.fq",
1550+
"--outSAMattributes",
1551+
"NH",
1552+
"HI",
1553+
"XS",
1554+
]);
1555+
assert_eq!(p.effective_out_sam_strand_field(), "intronMotif");
1556+
let attrs = p.effective_sam_attribute_set();
1557+
assert!(attrs.contains("XS"));
1558+
}
1559+
1560+
#[test]
1561+
fn xs_coupling_dormant_without_user_request() {
1562+
let p = parse(&["--readFilesIn", "r.fq"]);
1563+
let attrs = p.effective_sam_attribute_set();
1564+
assert!(!attrs.contains("XS"));
1565+
assert_eq!(p.effective_out_sam_strand_field(), "None");
1566+
}
1567+
1568+
#[test]
1569+
fn xs_attr_via_all_preset_couples_strand_field() {
1570+
let p = parse(&["--readFilesIn", "r.fq", "--outSAMattributes", "All"]);
1571+
assert_eq!(p.effective_out_sam_strand_field(), "intronMotif");
1572+
assert!(p.effective_sam_attribute_set().contains("XS"));
1573+
}
14911574
}

0 commit comments

Comments
 (0)