Skip to content

Commit 8ff6d97

Browse files
committed
more MAPQ fixes
1 parent d6342fe commit 8ff6d97

5 files changed

Lines changed: 136 additions & 44 deletions

File tree

CLAUDE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ Always run `cargo clippy`, `cargo fmt --check`, and `cargo test` before consider
3232

3333
## Current Status
3434

35-
**268 tests passing, 0 clippy warnings.** SE: 99.7% position agreement (adjusted), 99.9% CIGAR (pos-agreeing), 2.2% splice rate (STAR: 2.2%), 66 shared junctions, **100.0% MAPQ agreement, MAPQ inflation: 0, deflation: 0**. 127 position disagreements (ALL verified as genuine ties). 1 CIGAR-only disagree (ERR12389696.13573895, insertion placement, seed-level tie). **1 truly actionable SE issue remains** (CIGAR insertion placement only). PE: **8390/8390 both-mapped (0 gap, exact STAR match)**, 0 half-mapped, **6 MAPQ inflations** (was 24), **2 MAPQ deflations** (new, spurious subset WT), **98.879% PE faithfulness** (Phase 16.46: per-position PE dedup removal). See [ROADMAP.md](ROADMAP.md) for detailed phase tracking and [docs/](docs/) for per-phase notes.
35+
**268 tests passing, 0 clippy warnings.** SE: 8796/8926 compare_sam.py (98.5%), 2.2% splice rate (STAR: 2.2%), 66 shared junctions, **100.0% MAPQ agreement, MAPQ inflation: 0, deflation: 0**. 127 position disagreements (ALL verified as genuine ties). 1 CIGAR-only disagree (ERR12389696.13573895, insertion placement, seed-level tie). **0 STAR-only / 0 ruSTAR-only SE reads**. PE: **8390/8390 both-mapped (0 gap, exact STAR match)**, 0 half-mapped, **4 MAPQ inflations** (rDNA/repeat secondary loci), **98.903% PE faithfulness** (Phase 16.50). See [ROADMAP.md](ROADMAP.md) for detailed phase tracking and [docs/](docs/) for per-phase notes.
3636

3737
## Source Layout
3838

ROADMAP.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ Paired-end (Phase 8) builds on threaded infrastructure. GTF/junctions (Phase 7)
5151
| 12 | Chimeric Detection || 170 | SE chimeric, Chimeric.out.junction |
5252
| [13](docs/phase13_accuracy.md) | Performance + Accuracy || 205 | 94.5% pos, 97.8% CIGAR, 2.1% splice |
5353
| [15](docs/phase15_sam_tags.md) | SAM Tags + PE Fix || 235 | NH/HI/AS/NM/nM/XS/jM/jI/MD, PE fix |
54-
| [16](docs/phase16_algorithm.md) | Algorithm Parity |* | 268 | SE: 99.7% pos, 2.2% splice, **MAPQ 100%**; PE: **8390/8390 (0 gap)**, 99.0% per-mate pos, 98.9% CIGAR, **6 MAPQ inflations**, 0 deflations; faithfulness: SE 98.57%, PE 98.89%, SJ 96.97% (Phase 16.48) |
54+
| [16](docs/phase16_algorithm.md) | Algorithm Parity |* | 268 | SE: **8796/8926 (0 STAR-only)**, 2.2% splice, **MAPQ 100%**; PE: **8390/8390 (0 gap)**, 99.0% per-mate pos, 98.9% CIGAR, **4 MAPQ inflations**, 0 deflations; faithfulness: SE 98.5%+, PE 98.903%, SJ 96.97% (Phase 16.50) |
5555
| [17](docs/phase17_features.md) | Features + Polish |* | 268 | Log.final.out, clippy cleanup, sorted BAM planned |
5656
| 14 | STARsolo | DEFERRED || Waiting for accuracy parity |
5757

src/align/read_align.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -564,7 +564,6 @@ fn build_combined_read(mate1: &[u8], mate2: &[u8]) -> Vec<u8> {
564564
v
565565
}
566566

567-
568567
/// Align paired-end reads using STAR's joint combined-read approach (Phase 16.11).
569568
///
570569
/// # Algorithm (STAR-faithful)

src/align/seed.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -390,8 +390,7 @@ fn find_seed_at_position(
390390
// matched_level directly as the MMP. This causes chains to advance by shorter
391391
// amounts, ensuring intermediate positions (missed if advancing by the true MMP)
392392
// are not skipped.
393-
let (match_length, narrowed_start, narrowed_end) = if bounds_tight
394-
&& matched_level < sa_nbases
393+
let (match_length, narrowed_start, narrowed_end) = if bounds_tight && matched_level < sa_nbases
395394
{
396395
// STAR short-circuit (maxMappableLength2strands.cpp):
397396
// "if (Lind < gSAindexNbases && iSA1noN && iSA2good) { maxL=Lind; }"

src/align/stitch.rs

Lines changed: 133 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,9 @@ fn score_region(
8181
break;
8282
}
8383
let read_base = read_seq[read_pos];
84-
let Some(genome_base) = index.genome.get_base(genome_start + i as u64 + genome_offset)
84+
let Some(genome_base) = index
85+
.genome
86+
.get_base(genome_start + i as u64 + genome_offset)
8587
else {
8688
break;
8789
};
@@ -580,8 +582,32 @@ pub fn cluster_seeds(
580582
// STAR resets nWA=0 after window creation, then re-assigns ALL seeds (including
581583
// anchors) through assignAlignToWindow. We match this by not pre-loading anchors
582584
// in Phase 2 and processing all seeds here through the same capacity logic.
583-
let mut too_many_anchors = false; // STAR: MARKER_TOO_MANY_ANCHORS_PER_WINDOW
584-
'outer: for (seed_idx, seed) in seeds.iter().enumerate() {
585+
//
586+
// STAR processes all pieces in a sorted pass (rStart asc, length desc for ties).
587+
// This sorting ensures that when two seeds overlap on the same diagonal, the longer
588+
// one enters the window first and blocks the shorter one via overlap detection —
589+
// preventing window capacity overflow from many short copies on the same diagonal.
590+
//
591+
// We replicate this by: (1) collecting all candidates per window, (2) pre-deduping
592+
// overlapping entries per diagonal (longest wins, shorter blocked before processing),
593+
// then (3) processing survivors in original discovery order. This achieves the same
594+
// overlap suppression without globally reordering seeds across windows.
595+
596+
// Per-window candidate entries collected before sorting.
597+
struct WinCandidate {
598+
seed_idx: usize,
599+
sa_pos: u64,
600+
forward_pos: u64,
601+
length: usize,
602+
n_loci: usize,
603+
is_anchor: bool,
604+
ps_rstart: usize, // positive-strand read start (sort key)
605+
}
606+
607+
let mut win_candidates: Vec<Vec<WinCandidate>> =
608+
(0..windows.len()).map(|_| Vec::new()).collect();
609+
610+
for (seed_idx, seed) in seeds.iter().enumerate() {
585611
let n_loci = seed.sa_end - seed.sa_start;
586612
if n_loci == 0 {
587613
continue;
@@ -608,6 +634,97 @@ pub fn cluster_seeds(
608634
_ => continue,
609635
};
610636

637+
let ps_rstart = if windows[win_idx].is_reverse {
638+
read_len - (length + seed.read_pos)
639+
} else {
640+
seed.read_pos
641+
};
642+
643+
win_candidates[win_idx].push(WinCandidate {
644+
seed_idx,
645+
sa_pos,
646+
forward_pos,
647+
length,
648+
n_loci,
649+
is_anchor: is_anchor_seed,
650+
ps_rstart,
651+
});
652+
}
653+
}
654+
655+
// Pre-dedup overlapping diagonals (order-independent, length wins):
656+
// Two candidate entries on the same diagonal with overlapping read ranges represent
657+
// the same alignment position — keep only the longest. This is what STAR achieves
658+
// via sorted-order overlap detection (longer seed enters first, shorter blocked).
659+
// Pre-computing this dedup ensures correct results regardless of discovery order,
660+
// fixing window capacity overflow without changing capacity-eviction behavior
661+
// for other reads (seeds on unique diagonals are unaffected).
662+
//
663+
// After pre-dedup, Phase 4 processes survivors in original discovery order.
664+
// The overlap detection in the main Phase 4 loop below is still present but will
665+
// never find an overlap (since pre-dedup already resolved all of them). It serves
666+
// as a safety net for any edge cases not covered by pre-dedup.
667+
let win_n = windows.len();
668+
let mut win_blocked: Vec<Vec<bool>> = (0..win_n)
669+
.map(|i| vec![false; win_candidates[i].len()])
670+
.collect();
671+
672+
for win_idx in 0..win_n {
673+
let candidates = &win_candidates[win_idx];
674+
if candidates.is_empty() {
675+
continue;
676+
}
677+
// Sort indices by length descending so that the longest entry per diagonal
678+
// is processed first (and blocks shorter overlapping entries on the same diagonal).
679+
let mut by_len: Vec<usize> = (0..candidates.len()).collect();
680+
by_len.sort_by(|&a, &b| candidates[b].length.cmp(&candidates[a].length));
681+
682+
// For each diagonal, track accepted [ps_rstart, ps_rend) ranges.
683+
let mut diag_ranges: HashMap<i64, Vec<(usize, usize)>> = HashMap::new();
684+
for &ci in &by_len {
685+
let cand = &candidates[ci];
686+
let diag = cand.forward_pos as i64 - cand.ps_rstart as i64;
687+
let ps_rend = cand.ps_rstart + cand.length;
688+
689+
let blocked = diag_ranges.get(&diag).is_some_and(|ranges| {
690+
ranges.iter().any(|&(rs, re)| {
691+
(cand.ps_rstart >= rs && cand.ps_rstart < re) || (ps_rend >= rs && ps_rend < re)
692+
})
693+
});
694+
695+
if blocked {
696+
win_blocked[win_idx][ci] = true;
697+
} else {
698+
diag_ranges
699+
.entry(diag)
700+
.or_default()
701+
.push((cand.ps_rstart, ps_rend));
702+
}
703+
}
704+
}
705+
706+
// Process each window's candidates in original discovery order,
707+
// skipping pre-dedup-blocked entries.
708+
let mut too_many_anchors = false; // STAR: MARKER_TOO_MANY_ANCHORS_PER_WINDOW
709+
'outer: for win_idx in 0..win_n {
710+
if !windows[win_idx].alive {
711+
continue;
712+
}
713+
for (ci, cand) in win_candidates[win_idx].iter().enumerate() {
714+
if win_blocked[win_idx][ci] {
715+
continue; // blocked by a longer overlapping entry on same diagonal
716+
}
717+
718+
let seed_idx = cand.seed_idx;
719+
let seed = &seeds[seed_idx];
720+
let length = cand.length;
721+
let forward_pos = cand.forward_pos;
722+
let sa_pos = cand.sa_pos;
723+
let n_loci = cand.n_loci;
724+
let is_anchor_seed = cand.is_anchor;
725+
let new_ps_rstart = cand.ps_rstart;
726+
let new_ps_rend = new_ps_rstart + length;
727+
611728
let window = &mut windows[win_idx];
612729

613730
// STAR's WALrec early rejection (assignAlignToWindow line 20):
@@ -617,51 +734,29 @@ pub fn cluster_seeds(
617734
continue;
618735
}
619736

620-
// STAR's overlap detection (assignAlignToWindow lines 28-84):
621-
// Check if new entry overlaps an existing entry on the same diagonal.
622-
// Same diagonal means (genome_pos - read_pos) is equal. If overlap
623-
// found, keep the longer entry. This deduplicates shorter seeds
624-
// subsumed by longer seeds at the same alignment position.
625-
//
626-
// CRITICAL: STAR uses positive-strand read coordinates for overlap
627-
// detection (stitchPieces.cpp line 151: aRstart = Lread - (aLength+aRstart)
628-
// for reverse-strand). Without this, reverse-strand seeds covering the
629-
// same genomic exon have different diagonals → no overlap → anchor
630-
// proliferation → window capacity overflow → splice-target seed eviction.
631-
let (new_ps_rstart, new_ps_rend) = if window.is_reverse {
632-
let ps = read_len - (length + seed.read_pos);
633-
(ps, ps + length)
634-
} else {
635-
(seed.read_pos, seed.read_pos + length)
636-
};
737+
// Safety-net overlap detection: after pre-dedup, no overlapping entries
738+
// should remain. This check handles any edge cases and matches STAR's
739+
// assignAlignToWindow overlap logic for correctness.
637740
let new_diag = forward_pos as i64 - new_ps_rstart as i64;
638741
let mut overlap_idx = None;
639742
for (i, wa) in window.alignments.iter().enumerate() {
640-
let (wa_ps_rstart, wa_ps_rend) = if window.is_reverse {
641-
let ps = read_len - (wa.length + wa.read_pos);
642-
(ps, ps + wa.length)
743+
let wa_ps_rstart = if window.is_reverse {
744+
read_len - (wa.length + wa.read_pos)
643745
} else {
644-
(wa.read_pos, wa.read_pos + wa.length)
746+
wa.read_pos
645747
};
748+
let wa_ps_rend = wa_ps_rstart + wa.length;
646749
let wa_diag = wa.genome_pos as i64 - wa_ps_rstart as i64;
647-
if new_diag == wa_diag {
648-
// Same diagonal — check for overlapping read ranges
649-
// in positive-strand coordinates (matches STAR's exact
650-
// overlap condition from assignAlignToWindow)
651-
if (new_ps_rstart >= wa_ps_rstart && new_ps_rstart < wa_ps_rend)
652-
|| (new_ps_rend >= wa_ps_rstart && new_ps_rend < wa_ps_rend)
653-
{
654-
overlap_idx = Some(i);
655-
break;
656-
}
750+
if new_diag == wa_diag
751+
&& ((new_ps_rstart >= wa_ps_rstart && new_ps_rstart < wa_ps_rend)
752+
|| (new_ps_rend >= wa_ps_rstart && new_ps_rend < wa_ps_rend))
753+
{
754+
overlap_idx = Some(i);
755+
break;
657756
}
658757
}
659758
if let Some(oi) = overlap_idx {
660759
if length > window.alignments[oi].length {
661-
// New entry is longer — remove old and re-insert at correct
662-
// sorted position (matches STAR assignAlignToWindow lines 26-62).
663-
// The new entry may have a different length → different ps_rstart
664-
// → different sort position.
665760
window.alignments.remove(oi);
666761
let insert_pos = window.alignments.partition_point(|wa| {
667762
let wa_ps = if window.is_reverse {
@@ -685,7 +780,6 @@ pub fn cluster_seeds(
685780
},
686781
);
687782
}
688-
// Either way (replaced or not), don't add as new entry
689783
continue;
690784
}
691785

0 commit comments

Comments
 (0)