Skip to content

Commit 02cc0e1

Browse files
committed
Merge PR #7: --quantMode TranscriptomeSAM output
- Transcriptome index built from GTF at genomeGenerate, persisted to disk - Genome alignment projection onto transcripts (align_to_transcripts) - Aligned.toTranscriptome.out.bam pipeline - fix(params): don't require --sjdbGTFfile at alignReads when index has it - fix(io/bam): lenient BAM header validator for STAR tRNA names - fix(genome): match STAR's genomeParameters.txt byte-for-byte - fix(quant): STAR-exact geneInfo/exonGeTrInfo/sjdbList tab formats - Exon gains i_frag: u8 field (mirrors STAR's EX_iFrag) - try_pair_transcripts simplified (single caller always supplies score)
2 parents 1bdbb9a + a26d636 commit 02cc0e1

15 files changed

Lines changed: 3877 additions & 116 deletions

File tree

src/align/read_align.rs

Lines changed: 127 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22
use crate::align::score::{AlignmentScorer, SpliceMotif};
33
use crate::align::seed::Seed;
44
use crate::align::stitch::{
5-
cluster_seeds, finalize_transcript, split_combined_wt, stitch_seeds_core,
6-
stitch_seeds_with_jdb_debug, PE_SPACER_BASE,
5+
PE_SPACER_BASE, cluster_seeds, finalize_transcript, split_combined_wt, stitch_seeds_core,
6+
stitch_seeds_with_jdb_debug,
77
};
8-
use crate::align::transcript::Transcript;
8+
use crate::align::transcript::{Exon, Transcript};
99
use crate::error::Error;
1010
use crate::index::GenomeIndex;
1111
use crate::params::{IntronMotifFilter, IntronStrandFilter, Parameters};
@@ -20,7 +20,7 @@ use std::hash::{DefaultHasher, Hash, Hasher};
2020
/// via rayon, so we instead fold the read name into the seed — this keeps tie
2121
/// breaks reproducible regardless of thread count while still honoring the
2222
/// user's `--runRNGseed` value.
23-
fn per_read_seed(run_rng_seed: u64, read_name: &str) -> u64 {
23+
pub(crate) fn per_read_seed(run_rng_seed: u64, read_name: &str) -> u64 {
2424
let mut hasher = DefaultHasher::new();
2525
read_name.hash(&mut hasher);
2626
run_rng_seed.wrapping_mul(hasher.finish().wrapping_add(1))
@@ -74,6 +74,52 @@ pub struct PairedAlignment {
7474
pub combined_n_match: u32,
7575
}
7676

77+
impl PairedAlignment {
78+
/// Build a STAR-style combined two-mate `Transcript` for transcriptome
79+
/// projection.
80+
///
81+
/// Matches STAR's single-`Transcript`-per-pair model: mate1 exons with
82+
/// `i_frag = 0`, then mate2 exons rewritten to `i_frag = 1`. Only
83+
/// meaningful for pairs on the same chromosome and strand — both are
84+
/// invariants of a `PairedAlignment` (checked in `try_pair_transcripts`).
85+
///
86+
/// The returned transcript's `cigar` is empty: transcriptome BAM
87+
/// emission generates per-mate CIGARs from the split exon list rather
88+
/// than consuming a combined one.
89+
pub fn combined_transcript_for_projection(&self) -> Transcript {
90+
let m1 = &self.mate1_transcript;
91+
let m2 = &self.mate2_transcript;
92+
93+
let mut exons: Vec<Exon> = Vec::with_capacity(m1.exons.len() + m2.exons.len());
94+
for e in &m1.exons {
95+
let mut ee = e.clone();
96+
ee.i_frag = 0;
97+
exons.push(ee);
98+
}
99+
for e in &m2.exons {
100+
let mut ee = e.clone();
101+
ee.i_frag = 1;
102+
exons.push(ee);
103+
}
104+
105+
Transcript {
106+
chr_idx: m1.chr_idx,
107+
genome_start: m1.genome_start.min(m2.genome_start),
108+
genome_end: m1.genome_end.max(m2.genome_end),
109+
is_reverse: m1.is_reverse,
110+
exons,
111+
cigar: Vec::new(),
112+
score: m1.score + m2.score,
113+
n_mismatch: m1.n_mismatch + m2.n_mismatch,
114+
n_gap: m1.n_gap + m2.n_gap,
115+
n_junction: m1.n_junction + m2.n_junction,
116+
junction_motifs: Vec::new(),
117+
junction_annotated: Vec::new(),
118+
read_seq: Vec::new(),
119+
}
120+
}
121+
}
122+
77123
/// Result of paired-end alignment, covering all mapping outcomes.
78124
#[derive(Debug, Clone)]
79125
pub enum PairedAlignmentResult {
@@ -634,8 +680,13 @@ pub fn align_paired_read(
634680
// PE_SPACER_BASE=11 stops MMP search at the boundary — seeds never span mates.
635681
// For 301bp: Nstart=7 (301/50+1). Position 129 gives ≤21bp seeds (spacer at 150)
636682
// → less likely unique for repetitive (rDNA) sequences → fewer N² cross-copy pairings.
637-
let mut combined_seeds =
638-
Seed::find_seeds(&combined_read, index, params.seed_map_min, params, debug_name)?;
683+
let mut combined_seeds = Seed::find_seeds(
684+
&combined_read,
685+
index,
686+
params.seed_map_min,
687+
params,
688+
debug_name,
689+
)?;
639690
// After find_seeds, ALL seeds (search_rc=false and search_rc=true) have read_pos
640691
// in combined_read coordinates. The RC conversion in search_direction_sparse:
641692
// seed.read_pos = combined_len - p - L (where p is position in RC(combined_read))
@@ -670,13 +721,7 @@ pub fn align_paired_read(
670721
)?;
671722

672723
for wt in &wts {
673-
match split_combined_wt(
674-
wt,
675-
len1,
676-
len2,
677-
stitch_is_reverse,
678-
scorer.align_intron_min,
679-
) {
724+
match split_combined_wt(wt, len1, len2, stitch_is_reverse, scorer.align_intron_min) {
680725
Some((m1_wt, m2_wt)) => {
681726
let (m1_read_slice, m1_orig_rev, m2_read_slice, m2_orig_rev) =
682727
if stitch_is_reverse {
@@ -706,8 +751,8 @@ pub fn align_paired_read(
706751
&scorer,
707752
&stitch_cluster,
708753
m1_orig_rev,
709-
m1_orig_rev, // no_left_ext = inner for reverse (orig_is_rev=true)
710-
!m1_orig_rev, // no_right_ext = inner for forward (orig_is_rev=false)
754+
m1_orig_rev, // no_left_ext = inner for reverse (orig_is_rev=true)
755+
!m1_orig_rev, // no_right_ext = inner for forward (orig_is_rev=false)
711756
) else {
712757
continue;
713758
};
@@ -718,8 +763,8 @@ pub fn align_paired_read(
718763
&scorer,
719764
&stitch_cluster,
720765
m2_orig_rev,
721-
m2_orig_rev, // no_left_ext = inner for reverse (orig_is_rev=true)
722-
!m2_orig_rev, // no_right_ext = inner for forward (orig_is_rev=false)
766+
m2_orig_rev, // no_left_ext = inner for reverse (orig_is_rev=true)
767+
!m2_orig_rev, // no_right_ext = inner for forward (orig_is_rev=false)
723768
) else {
724769
continue;
725770
};
@@ -736,9 +781,7 @@ pub fn align_paired_read(
736781

737782
let combined_span =
738783
t1.genome_end.max(t2.genome_end) - t1.genome_start.min(t2.genome_start);
739-
let combined_score_override = Some(
740-
wt.score + scorer.genomic_length_penalty(combined_span),
741-
);
784+
let combined_wt_score = wt.score + scorer.genomic_length_penalty(combined_span);
742785

743786
if let Some(pair) = try_pair_transcripts(
744787
&t1,
@@ -747,8 +790,7 @@ pub fn align_paired_read(
747790
len2,
748791
params,
749792
combined_score_threshold,
750-
&scorer,
751-
combined_score_override,
793+
combined_wt_score,
752794
) {
753795
joint_pairs.push(pair);
754796
}
@@ -803,7 +845,6 @@ pub fn align_paired_read(
803845
}
804846
}
805847

806-
807848
// --- Decision tree: dedup, score-filter, quality-filter, then half-mapped fallback ---
808849

809850
// Step 1: position dedup — remove exact (chr, mate1_pos, mate2_pos, strand, CIGAR) duplicates.
@@ -961,19 +1002,17 @@ pub fn align_paired_read(
9611002
}
9621003

9631004
// Half-mapped fallback: report the best-scoring single-mate transcript.
964-
// STAR applies quality filter to the COMBINED read (Lread-1 = len1+len2), so use the
965-
// same combined_score_threshold here. This matches STAR's behavior where a single-mate
966-
// alignment that falls below the combined threshold is not output at all.
967-
let thresh1 = combined_score_threshold.max(params.out_filter_score_min);
968-
let thresh2 = combined_score_threshold.max(params.out_filter_score_min);
1005+
// STAR applies the quality filter to the COMBINED read (Lread-1 = len1+len2), so we
1006+
// use the same threshold for each mate here.
1007+
let single_mate_threshold = combined_score_threshold.max(params.out_filter_score_min);
9691008

9701009
let best_m1 = single_mate1_transcripts
9711010
.into_iter()
972-
.filter(|t| t.score >= thresh1)
1011+
.filter(|t| t.score >= single_mate_threshold)
9731012
.max_by_key(|t| t.score);
9741013
let best_m2 = single_mate2_transcripts
9751014
.into_iter()
976-
.filter(|t| t.score >= thresh2)
1015+
.filter(|t| t.score >= single_mate_threshold)
9771016
.max_by_key(|t| t.score);
9781017

9791018
match (best_m1, best_m2) {
@@ -1031,8 +1070,7 @@ fn try_pair_transcripts(
10311070
len2: usize,
10321071
params: &Parameters,
10331072
combined_score_threshold: i32,
1034-
scorer: &AlignmentScorer,
1035-
combined_wt_score_override: Option<i32>,
1073+
combined_wt_score: i32,
10361074
) -> Option<PairedAlignment> {
10371075
// Must be same chromosome
10381076
if t1.chr_idx != t2.chr_idx {
@@ -1067,20 +1105,6 @@ fn try_pair_transcripts(
10671105
return None;
10681106
}
10691107

1070-
// Combined score: use override if provided (combined-read path supplies the original
1071-
// stitch_recurse score + combined-span penalty, which correctly includes mismatch
1072-
// penalties). Fallback formula (per-mate path) undoes per-mate span penalties and
1073-
// applies a single combined-span penalty.
1074-
let combined_span = right.genome_end - left.genome_start;
1075-
let combined_wt_score = combined_wt_score_override.unwrap_or_else(|| {
1076-
let t1_span = t1.genome_end - t1.genome_start;
1077-
let t2_span = t2.genome_end - t2.genome_start;
1078-
let p1 = scorer.genomic_length_penalty(t1_span);
1079-
let p2 = scorer.genomic_length_penalty(t2_span);
1080-
let combined_p = scorer.genomic_length_penalty(combined_span);
1081-
t1.score + t2.score - p1 - p2 + combined_p
1082-
});
1083-
10841108
// SCORE-GATE: reject pairs where score is below the absolute floor
10851109
if combined_wt_score + params.out_filter_multimap_score_range < combined_score_threshold {
10861110
return None;
@@ -1331,9 +1355,54 @@ mod tests {
13311355
suffix_array,
13321356
sa_index,
13331357
junction_db: crate::junction::SpliceJunctionDb::empty(),
1358+
transcriptome: None,
13341359
}
13351360
}
13361361

1362+
#[test]
1363+
fn combined_transcript_for_projection_rewrites_mate2_ifrag() {
1364+
use crate::align::transcript::CigarOp;
1365+
1366+
let make_tr = |gs: u64, ge: u64, rs: usize, re: usize| Transcript {
1367+
chr_idx: 0,
1368+
genome_start: gs,
1369+
genome_end: ge,
1370+
is_reverse: false,
1371+
exons: vec![Exon {
1372+
genome_start: gs,
1373+
genome_end: ge,
1374+
read_start: rs,
1375+
read_end: re,
1376+
i_frag: 0,
1377+
}],
1378+
cigar: vec![CigarOp::Match((ge - gs) as u32)],
1379+
score: 100,
1380+
n_mismatch: 0,
1381+
n_gap: 0,
1382+
n_junction: 0,
1383+
junction_motifs: vec![],
1384+
junction_annotated: vec![],
1385+
read_seq: vec![],
1386+
};
1387+
let pair = PairedAlignment {
1388+
mate1_transcript: make_tr(1000, 1100, 0, 100),
1389+
mate2_transcript: make_tr(1300, 1400, 0, 100),
1390+
mate1_region: (0, 100),
1391+
mate2_region: (0, 100),
1392+
is_proper_pair: true,
1393+
insert_size: 400,
1394+
combined_wt_score: 200,
1395+
combined_n_match: 200,
1396+
};
1397+
let combined = pair.combined_transcript_for_projection();
1398+
assert_eq!(combined.exons.len(), 2);
1399+
assert_eq!(combined.exons[0].i_frag, 0);
1400+
assert_eq!(combined.exons[1].i_frag, 1);
1401+
assert_eq!(combined.genome_start, 1000);
1402+
assert_eq!(combined.genome_end, 1400);
1403+
assert_eq!(combined.score, 200);
1404+
}
1405+
13371406
#[test]
13381407
fn test_align_read_no_seeds() {
13391408
let index = make_test_index();
@@ -1424,6 +1493,7 @@ mod tests {
14241493
genome_end: 1100,
14251494
read_start: 0,
14261495
read_end: 100,
1496+
i_frag: 0,
14271497
}],
14281498
cigar: vec![CigarOp::Match(100)],
14291499
score: 100,
@@ -1445,6 +1515,7 @@ mod tests {
14451515
genome_end: 1300,
14461516
read_start: 0,
14471517
read_end: 100,
1518+
i_frag: 0,
14481519
}],
14491520
cigar: vec![CigarOp::Match(100)],
14501521
score: 100,
@@ -1477,6 +1548,7 @@ mod tests {
14771548
genome_end: 1100,
14781549
read_start: 0,
14791550
read_end: 100,
1551+
i_frag: 0,
14801552
}],
14811553
cigar: vec![CigarOp::Match(100)],
14821554
score: 100,
@@ -1498,6 +1570,7 @@ mod tests {
14981570
genome_end: 1400,
14991571
read_start: 0,
15001572
read_end: 100,
1573+
i_frag: 0,
15011574
}],
15021575
cigar: vec![CigarOp::Match(100)],
15031576
score: 100,
@@ -1528,6 +1601,7 @@ mod tests {
15281601
genome_end: 1100,
15291602
read_start: 0,
15301603
read_end: 100,
1604+
i_frag: 0,
15311605
}],
15321606
cigar: vec![CigarOp::Match(100)],
15331607
score: 100,
@@ -1549,6 +1623,7 @@ mod tests {
15491623
genome_end: 1300,
15501624
read_start: 0,
15511625
read_end: 100,
1626+
i_frag: 0,
15521627
}],
15531628
cigar: vec![CigarOp::Match(100)],
15541629
score: 100,
@@ -1580,6 +1655,7 @@ mod tests {
15801655
genome_end: 1300,
15811656
read_start: 0,
15821657
read_end: 100,
1658+
i_frag: 0,
15831659
}],
15841660
cigar: vec![CigarOp::Match(100)],
15851661
score: 100,
@@ -1602,6 +1678,7 @@ mod tests {
16021678
genome_end: 1300,
16031679
read_start: 0,
16041680
read_end: 100,
1681+
i_frag: 0,
16051682
}],
16061683
cigar: vec![CigarOp::Match(100)],
16071684
score: 100,
@@ -1679,6 +1756,7 @@ mod tests {
16791756
genome_end: 1300,
16801757
read_start: 0,
16811758
read_end: 100,
1759+
i_frag: 0,
16821760
}],
16831761
cigar: vec![CigarOp::Match(100)],
16841762
score: 100,
@@ -1700,6 +1778,7 @@ mod tests {
17001778
genome_end: 1100,
17011779
read_start: 0,
17021780
read_end: 100,
1781+
i_frag: 0,
17031782
}],
17041783
cigar: vec![CigarOp::Match(100)],
17051784
score: 100,
@@ -1739,6 +1818,7 @@ mod tests {
17391818
genome_end: 1200,
17401819
read_start: 0,
17411820
read_end: 100,
1821+
i_frag: 0,
17421822
}],
17431823
cigar: vec![CigarOp::Match(100)],
17441824
score: 100,
@@ -1823,6 +1903,7 @@ mod tests {
18231903
genome_end: 1100,
18241904
read_start: 0,
18251905
read_end: 100,
1906+
i_frag: 0,
18261907
}],
18271908
cigar: vec![CigarOp::Match(100)],
18281909
score: 100,

0 commit comments

Comments
 (0)