11/// Read alignment driver function
22use crate :: align:: score:: { AlignmentScorer , SpliceMotif } ;
33use crate :: align:: seed:: Seed ;
4- use crate :: align:: stitch:: {
5- cluster_seeds, stitch_seeds_with_jdb_debug,
6- } ;
4+ use crate :: align:: stitch:: { cluster_seeds, stitch_seeds_with_jdb_debug} ;
75use crate :: align:: transcript:: Transcript ;
86use crate :: error:: Error ;
97use crate :: index:: GenomeIndex ;
108use crate :: params:: { IntronMotifFilter , IntronStrandFilter , Parameters } ;
119use crate :: stats:: UnmappedReason ;
10+ use rand:: { SeedableRng , rngs:: StdRng , seq:: SliceRandom } ;
11+ use std:: hash:: { DefaultHasher , Hash , Hasher } ;
12+
13+ /// Derive a deterministic per-read RNG seed from `run_rng_seed` + the read name.
14+ ///
15+ /// STAR seeds `std::mt19937` once per chunk/thread (`runRNGseed*(iChunk+1)`),
16+ /// then advances the state sequentially per read. ruSTAR parallelises per-read
17+ /// via rayon, so we instead fold the read name into the seed — this keeps tie
18+ /// breaks reproducible regardless of thread count while still honoring the
19+ /// user's `--runRNGseed` value.
20+ fn per_read_seed ( run_rng_seed : u64 , read_name : & str ) -> u64 {
21+ let mut hasher = DefaultHasher :: new ( ) ;
22+ read_name. hash ( & mut hasher) ;
23+ run_rng_seed. wrapping_mul ( hasher. finish ( ) . wrapping_add ( 1 ) )
24+ }
25+
26+ /// Shuffle the prefix of `items` whose `score_fn` equals the first element's score.
27+ ///
28+ /// Mirrors STAR's `ReadAlign_multMapSelect` / `funPrimaryAlignMark`: best-scoring
29+ /// alignments are randomized so primary selection (index 0) is not biased by
30+ /// upstream sort order. Non-tied elements are left alone.
31+ fn shuffle_tied_prefix < T > ( items : & mut [ T ] , score_fn : impl Fn ( & T ) -> i32 , seed : u64 ) {
32+ let Some ( first) = items. first ( ) else {
33+ return ;
34+ } ;
35+ let best = score_fn ( first) ;
36+ let tied = items. iter ( ) . take_while ( |t| score_fn ( t) == best) . count ( ) ;
37+ if tied < 2 {
38+ return ;
39+ }
40+ items[ ..tied] . shuffle ( & mut StdRng :: seed_from_u64 ( seed) ) ;
41+ }
1242
1343/// Result of aligning a single read: (transcripts, chimeric_alignments, n_for_mapq, unmapped_reason)
1444pub type AlignReadResult = (
@@ -301,6 +331,13 @@ pub fn align_read(
301331 . then_with ( || a. is_reverse . cmp ( & b. is_reverse ) )
302332 } ) ;
303333
334+ // Randomize primary among best-scoring ties (ReadAlign_multMapSelect.cpp:71-79).
335+ shuffle_tied_prefix (
336+ & mut transcripts,
337+ |t| t. score ,
338+ per_read_seed ( params. run_rng_seed , read_name) ,
339+ ) ;
340+
304341 // Score-range filter: keep only alignments within outFilterMultimapScoreRange of the best.
305342 // (STAR's multMapSelect step — must run before quality filters.)
306343 if !transcripts. is_empty ( ) {
@@ -621,7 +658,10 @@ pub fn align_paired_read(
621658 // This correctly represents mate2 on - strand for FR pairs without explicit RC handling.
622659 let debug_name: & str = if debug_pe { read_name } else { "" } ;
623660 let mut mate1_transcripts: Vec < Transcript > = Vec :: new ( ) ;
624- for cluster in mate1_clusters. iter ( ) . take ( params. align_windows_per_read_nmax ) {
661+ for cluster in mate1_clusters
662+ . iter ( )
663+ . take ( params. align_windows_per_read_nmax )
664+ {
625665 let ts = stitch_seeds_with_jdb_debug (
626666 cluster,
627667 mate1_seq,
@@ -635,7 +675,10 @@ pub fn align_paired_read(
635675 }
636676
637677 let mut mate2_transcripts: Vec < Transcript > = Vec :: new ( ) ;
638- for cluster in mate2_clusters. iter ( ) . take ( params. align_windows_per_read_nmax ) {
678+ for cluster in mate2_clusters
679+ . iter ( )
680+ . take ( params. align_windows_per_read_nmax )
681+ {
639682 let ts = stitch_seeds_with_jdb_debug (
640683 cluster,
641684 mate2_seq,
@@ -831,6 +874,13 @@ pub fn align_paired_read(
831874 } )
832875 } ) ;
833876
877+ // Randomize primary among best-scoring pairs (STAR's funPrimaryAlignMark).
878+ shuffle_tied_prefix (
879+ & mut joint_pairs,
880+ |pa| pa. combined_wt_score ,
881+ per_read_seed ( params. run_rng_seed , read_name) ,
882+ ) ;
883+
834884 // Step 4: quality filter (mappedFilter).
835885 filter_paired_transcripts ( & mut joint_pairs, params) ;
836886
@@ -845,12 +895,10 @@ pub fn align_paired_read(
845895
846896 // Half-mapped fallback: report the best-scoring single-mate transcript.
847897 // Apply per-mate quality threshold (outFilterScoreMinOverLread * (len - 1)).
848- let thresh1 =
849- ( ( params. out_filter_score_min_over_lread * ( len1 as f64 - 1.0 ) ) as i32 )
850- . max ( params. out_filter_score_min ) ;
851- let thresh2 =
852- ( ( params. out_filter_score_min_over_lread * ( len2 as f64 - 1.0 ) ) as i32 )
853- . max ( params. out_filter_score_min ) ;
898+ let thresh1 = ( ( params. out_filter_score_min_over_lread * ( len1 as f64 - 1.0 ) ) as i32 )
899+ . max ( params. out_filter_score_min ) ;
900+ let thresh2 = ( ( params. out_filter_score_min_over_lread * ( len2 as f64 - 1.0 ) ) as i32 )
901+ . max ( params. out_filter_score_min ) ;
854902
855903 let best_m1 = mate1_transcripts
856904 . into_iter ( )
@@ -1056,9 +1104,7 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
10561104 // the read is unmapped.
10571105 //
10581106 // Step 1: find the best pair and check quality thresholds on it.
1059- let best_pa = paired_alns
1060- . iter ( )
1061- . max_by_key ( |pa| pa. combined_wt_score ) ;
1107+ let best_pa = paired_alns. iter ( ) . max_by_key ( |pa| pa. combined_wt_score ) ;
10621108 if let Some ( best) = best_pa {
10631109 let mate1_len = ( best. mate1_region . 1 - best. mate1_region . 0 ) as f64 ;
10641110 let mate2_len = ( best. mate2_region . 1 - best. mate2_region . 0 ) as f64 ;
@@ -1074,8 +1120,7 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
10741120 // per-mate spans are small (penalty ≈ −2 each → sum penalty −4 vs combined penalty −2).
10751121 let combined_score = best. combined_wt_score ;
10761122 if combined_score < params. out_filter_score_min
1077- || combined_score
1078- < ( params. out_filter_score_min_over_lread * combined_lread_m1) as i32
1123+ || combined_score < ( params. out_filter_score_min_over_lread * combined_lread_m1) as i32
10791124 {
10801125 paired_alns. clear ( ) ;
10811126 return ;
@@ -1089,8 +1134,7 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
10891134 // which directly mirrors STAR's joint transcript nMatch without extension inflation.
10901135 let combined_match = best. combined_n_match ;
10911136 if combined_match < params. out_filter_match_nmin
1092- || combined_match
1093- < ( params. out_filter_match_nmin_over_lread * combined_lread_m1) as u32
1137+ || combined_match < ( params. out_filter_match_nmin_over_lread * combined_lread_m1) as u32
10941138 {
10951139 paired_alns. clear ( ) ;
10961140 return ;
@@ -1765,4 +1809,55 @@ mod tests {
17651809 assert ! ( mate1_is_mapped) ;
17661810 }
17671811 }
1812+
1813+ #[ test]
1814+ fn shuffle_tied_prefix_is_deterministic ( ) {
1815+ // Same seed + same input → same permutation on reruns.
1816+ let items: Vec < ( i32 , u32 ) > = ( 0 ..8 ) . map ( |i| ( 100 , i) ) . collect ( ) ;
1817+ let mut a = items. clone ( ) ;
1818+ let mut b = items. clone ( ) ;
1819+ shuffle_tied_prefix ( & mut a, |t| t. 0 , 12345 ) ;
1820+ shuffle_tied_prefix ( & mut b, |t| t. 0 , 12345 ) ;
1821+ assert_eq ! ( a, b) ;
1822+ }
1823+
1824+ #[ test]
1825+ fn shuffle_tied_prefix_respects_ties ( ) {
1826+ // Only the top-score prefix gets shuffled; lower-scored tail is left alone.
1827+ let mut items = vec ! [ ( 100 , 0u32 ) , ( 100 , 1 ) , ( 100 , 2 ) , ( 50 , 3 ) , ( 40 , 4 ) ] ;
1828+ shuffle_tied_prefix ( & mut items, |t| t. 0 , 777 ) ;
1829+ // Last two elements (non-tied) stay in place.
1830+ assert_eq ! ( items[ 3 ] , ( 50 , 3 ) ) ;
1831+ assert_eq ! ( items[ 4 ] , ( 40 , 4 ) ) ;
1832+ // Tied prefix contains the original three items in some order.
1833+ let mut top: Vec < u32 > = items[ ..3 ] . iter ( ) . map ( |t| t. 1 ) . collect ( ) ;
1834+ top. sort ( ) ;
1835+ assert_eq ! ( top, vec![ 0 , 1 , 2 ] ) ;
1836+ }
1837+
1838+ #[ test]
1839+ fn shuffle_tied_prefix_different_seeds_can_diverge ( ) {
1840+ // Probabilistic: for a tied set of 8, at least two seeds should disagree
1841+ // on the chosen primary. (Exhaustive over a small seed range is fine.)
1842+ let base: Vec < ( i32 , u32 ) > = ( 0 ..8 ) . map ( |i| ( 100 , i) ) . collect ( ) ;
1843+ let mut firsts = std:: collections:: HashSet :: new ( ) ;
1844+ for seed in 0 ..32u64 {
1845+ let mut v = base. clone ( ) ;
1846+ shuffle_tied_prefix ( & mut v, |t| t. 0 , seed) ;
1847+ firsts. insert ( v[ 0 ] . 1 ) ;
1848+ }
1849+ assert ! (
1850+ firsts. len( ) >= 2 ,
1851+ "expected different seeds to pick different primaries, got {:?}" ,
1852+ firsts
1853+ ) ;
1854+ }
1855+
1856+ #[ test]
1857+ fn shuffle_tied_prefix_noop_when_no_ties ( ) {
1858+ let mut items = vec ! [ ( 100 , 0u32 ) , ( 90 , 1 ) , ( 80 , 2 ) ] ;
1859+ let before = items. clone ( ) ;
1860+ shuffle_tied_prefix ( & mut items, |t| t. 0 , 42 ) ;
1861+ assert_eq ! ( items, before) ;
1862+ }
17681863}
0 commit comments