@@ -81,7 +81,9 @@ fn score_region(
8181 break ;
8282 }
8383 let read_base = read_seq[ read_pos] ;
84- let Some ( genome_base) = index. genome . get_base ( genome_start + i as u64 + genome_offset)
84+ let Some ( genome_base) = index
85+ . genome
86+ . get_base ( genome_start + i as u64 + genome_offset)
8587 else {
8688 break ;
8789 } ;
@@ -580,8 +582,32 @@ pub fn cluster_seeds(
580582 // STAR resets nWA=0 after window creation, then re-assigns ALL seeds (including
581583 // anchors) through assignAlignToWindow. We match this by not pre-loading anchors
582584 // in Phase 2 and processing all seeds here through the same capacity logic.
583- let mut too_many_anchors = false ; // STAR: MARKER_TOO_MANY_ANCHORS_PER_WINDOW
584- ' outer: for ( seed_idx, seed) in seeds. iter ( ) . enumerate ( ) {
585+ //
586+ // STAR processes all pieces in a sorted pass (rStart asc, length desc for ties).
587+ // This sorting ensures that when two seeds overlap on the same diagonal, the longer
588+ // one enters the window first and blocks the shorter one via overlap detection —
589+ // preventing window capacity overflow from many short copies on the same diagonal.
590+ //
591+ // We replicate this by: (1) collecting all candidates per window, (2) pre-deduping
592+ // overlapping entries per diagonal (longest wins, shorter blocked before processing),
593+ // then (3) processing survivors in original discovery order. This achieves the same
594+ // overlap suppression without globally reordering seeds across windows.
595+
596+ // Per-window candidate entries collected before sorting.
597+ struct WinCandidate {
598+ seed_idx : usize ,
599+ sa_pos : u64 ,
600+ forward_pos : u64 ,
601+ length : usize ,
602+ n_loci : usize ,
603+ is_anchor : bool ,
604+ ps_rstart : usize , // positive-strand read start (sort key)
605+ }
606+
607+ let mut win_candidates: Vec < Vec < WinCandidate > > =
608+ ( 0 ..windows. len ( ) ) . map ( |_| Vec :: new ( ) ) . collect ( ) ;
609+
610+ for ( seed_idx, seed) in seeds. iter ( ) . enumerate ( ) {
585611 let n_loci = seed. sa_end - seed. sa_start ;
586612 if n_loci == 0 {
587613 continue ;
@@ -608,6 +634,97 @@ pub fn cluster_seeds(
608634 _ => continue ,
609635 } ;
610636
637+ let ps_rstart = if windows[ win_idx] . is_reverse {
638+ read_len - ( length + seed. read_pos )
639+ } else {
640+ seed. read_pos
641+ } ;
642+
643+ win_candidates[ win_idx] . push ( WinCandidate {
644+ seed_idx,
645+ sa_pos,
646+ forward_pos,
647+ length,
648+ n_loci,
649+ is_anchor : is_anchor_seed,
650+ ps_rstart,
651+ } ) ;
652+ }
653+ }
654+
655+ // Pre-dedup overlapping diagonals (order-independent, length wins):
656+ // Two candidate entries on the same diagonal with overlapping read ranges represent
657+ // the same alignment position — keep only the longest. This is what STAR achieves
658+ // via sorted-order overlap detection (longer seed enters first, shorter blocked).
659+ // Pre-computing this dedup ensures correct results regardless of discovery order,
660+ // fixing window capacity overflow without changing capacity-eviction behavior
661+ // for other reads (seeds on unique diagonals are unaffected).
662+ //
663+ // After pre-dedup, Phase 4 processes survivors in original discovery order.
664+ // The overlap detection in the main Phase 4 loop below is still present but will
665+ // never find an overlap (since pre-dedup already resolved all of them). It serves
666+ // as a safety net for any edge cases not covered by pre-dedup.
667+ let win_n = windows. len ( ) ;
668+ let mut win_blocked: Vec < Vec < bool > > = ( 0 ..win_n)
669+ . map ( |i| vec ! [ false ; win_candidates[ i] . len( ) ] )
670+ . collect ( ) ;
671+
672+ for win_idx in 0 ..win_n {
673+ let candidates = & win_candidates[ win_idx] ;
674+ if candidates. is_empty ( ) {
675+ continue ;
676+ }
677+ // Sort indices by length descending so that the longest entry per diagonal
678+ // is processed first (and blocks shorter overlapping entries on the same diagonal).
679+ let mut by_len: Vec < usize > = ( 0 ..candidates. len ( ) ) . collect ( ) ;
680+ by_len. sort_by ( |& a, & b| candidates[ b] . length . cmp ( & candidates[ a] . length ) ) ;
681+
682+ // For each diagonal, track accepted [ps_rstart, ps_rend) ranges.
683+ let mut diag_ranges: HashMap < i64 , Vec < ( usize , usize ) > > = HashMap :: new ( ) ;
684+ for & ci in & by_len {
685+ let cand = & candidates[ ci] ;
686+ let diag = cand. forward_pos as i64 - cand. ps_rstart as i64 ;
687+ let ps_rend = cand. ps_rstart + cand. length ;
688+
689+ let blocked = diag_ranges. get ( & diag) . is_some_and ( |ranges| {
690+ ranges. iter ( ) . any ( |& ( rs, re) | {
691+ ( cand. ps_rstart >= rs && cand. ps_rstart < re) || ( ps_rend >= rs && ps_rend < re)
692+ } )
693+ } ) ;
694+
695+ if blocked {
696+ win_blocked[ win_idx] [ ci] = true ;
697+ } else {
698+ diag_ranges
699+ . entry ( diag)
700+ . or_default ( )
701+ . push ( ( cand. ps_rstart , ps_rend) ) ;
702+ }
703+ }
704+ }
705+
706+ // Process each window's candidates in original discovery order,
707+ // skipping pre-dedup-blocked entries.
708+ let mut too_many_anchors = false ; // STAR: MARKER_TOO_MANY_ANCHORS_PER_WINDOW
709+ ' outer: for win_idx in 0 ..win_n {
710+ if !windows[ win_idx] . alive {
711+ continue ;
712+ }
713+ for ( ci, cand) in win_candidates[ win_idx] . iter ( ) . enumerate ( ) {
714+ if win_blocked[ win_idx] [ ci] {
715+ continue ; // blocked by a longer overlapping entry on same diagonal
716+ }
717+
718+ let seed_idx = cand. seed_idx ;
719+ let seed = & seeds[ seed_idx] ;
720+ let length = cand. length ;
721+ let forward_pos = cand. forward_pos ;
722+ let sa_pos = cand. sa_pos ;
723+ let n_loci = cand. n_loci ;
724+ let is_anchor_seed = cand. is_anchor ;
725+ let new_ps_rstart = cand. ps_rstart ;
726+ let new_ps_rend = new_ps_rstart + length;
727+
611728 let window = & mut windows[ win_idx] ;
612729
613730 // STAR's WALrec early rejection (assignAlignToWindow line 20):
@@ -617,51 +734,29 @@ pub fn cluster_seeds(
617734 continue ;
618735 }
619736
620- // STAR's overlap detection (assignAlignToWindow lines 28-84):
621- // Check if new entry overlaps an existing entry on the same diagonal.
622- // Same diagonal means (genome_pos - read_pos) is equal. If overlap
623- // found, keep the longer entry. This deduplicates shorter seeds
624- // subsumed by longer seeds at the same alignment position.
625- //
626- // CRITICAL: STAR uses positive-strand read coordinates for overlap
627- // detection (stitchPieces.cpp line 151: aRstart = Lread - (aLength+aRstart)
628- // for reverse-strand). Without this, reverse-strand seeds covering the
629- // same genomic exon have different diagonals → no overlap → anchor
630- // proliferation → window capacity overflow → splice-target seed eviction.
631- let ( new_ps_rstart, new_ps_rend) = if window. is_reverse {
632- let ps = read_len - ( length + seed. read_pos ) ;
633- ( ps, ps + length)
634- } else {
635- ( seed. read_pos , seed. read_pos + length)
636- } ;
737+ // Safety-net overlap detection: after pre-dedup, no overlapping entries
738+ // should remain. This check handles any edge cases and matches STAR's
739+ // assignAlignToWindow overlap logic for correctness.
637740 let new_diag = forward_pos as i64 - new_ps_rstart as i64 ;
638741 let mut overlap_idx = None ;
639742 for ( i, wa) in window. alignments . iter ( ) . enumerate ( ) {
640- let ( wa_ps_rstart, wa_ps_rend) = if window. is_reverse {
641- let ps = read_len - ( wa. length + wa. read_pos ) ;
642- ( ps, ps + wa. length )
743+ let wa_ps_rstart = if window. is_reverse {
744+ read_len - ( wa. length + wa. read_pos )
643745 } else {
644- ( wa. read_pos , wa . read_pos + wa . length )
746+ wa. read_pos
645747 } ;
748+ let wa_ps_rend = wa_ps_rstart + wa. length ;
646749 let wa_diag = wa. genome_pos as i64 - wa_ps_rstart as i64 ;
647- if new_diag == wa_diag {
648- // Same diagonal — check for overlapping read ranges
649- // in positive-strand coordinates (matches STAR's exact
650- // overlap condition from assignAlignToWindow)
651- if ( new_ps_rstart >= wa_ps_rstart && new_ps_rstart < wa_ps_rend)
652- || ( new_ps_rend >= wa_ps_rstart && new_ps_rend < wa_ps_rend)
653- {
654- overlap_idx = Some ( i) ;
655- break ;
656- }
750+ if new_diag == wa_diag
751+ && ( ( new_ps_rstart >= wa_ps_rstart && new_ps_rstart < wa_ps_rend)
752+ || ( new_ps_rend >= wa_ps_rstart && new_ps_rend < wa_ps_rend) )
753+ {
754+ overlap_idx = Some ( i) ;
755+ break ;
657756 }
658757 }
659758 if let Some ( oi) = overlap_idx {
660759 if length > window. alignments [ oi] . length {
661- // New entry is longer — remove old and re-insert at correct
662- // sorted position (matches STAR assignAlignToWindow lines 26-62).
663- // The new entry may have a different length → different ps_rstart
664- // → different sort position.
665760 window. alignments . remove ( oi) ;
666761 let insert_pos = window. alignments . partition_point ( |wa| {
667762 let wa_ps = if window. is_reverse {
@@ -685,7 +780,6 @@ pub fn cluster_seeds(
685780 } ,
686781 ) ;
687782 }
688- // Either way (replaced or not), don't add as new entry
689783 continue ;
690784 }
691785
0 commit comments