@@ -10,6 +10,38 @@ use crate::error::Error;
1010use crate :: index:: GenomeIndex ;
1111use crate :: params:: { IntronMotifFilter , IntronStrandFilter , Parameters } ;
1212use crate :: stats:: UnmappedReason ;
13+ use rand:: { SeedableRng , rngs:: StdRng , seq:: SliceRandom } ;
14+ use std:: hash:: { DefaultHasher , Hash , Hasher } ;
15+
16+ /// Derive a deterministic per-read RNG seed from `run_rng_seed` + the read name.
17+ ///
18+ /// STAR seeds `std::mt19937` once per chunk/thread (`runRNGseed*(iChunk+1)`),
19+ /// then advances the state sequentially per read. ruSTAR parallelises per-read
20+ /// via rayon, so we instead fold the read name into the seed — this keeps tie
21+ /// breaks reproducible regardless of thread count while still honoring the
22+ /// user's `--runRNGseed` value.
23+ fn per_read_seed ( run_rng_seed : u64 , read_name : & str ) -> u64 {
24+ let mut hasher = DefaultHasher :: new ( ) ;
25+ read_name. hash ( & mut hasher) ;
26+ run_rng_seed. wrapping_mul ( hasher. finish ( ) . wrapping_add ( 1 ) )
27+ }
28+
29+ /// Shuffle the prefix of `items` whose `score_fn` equals the first element's score.
30+ ///
31+ /// Mirrors STAR's `ReadAlign_multMapSelect` / `funPrimaryAlignMark`: best-scoring
32+ /// alignments are randomized so primary selection (index 0) is not biased by
33+ /// upstream sort order. Non-tied elements are left alone.
34+ fn shuffle_tied_prefix < T > ( items : & mut [ T ] , score_fn : impl Fn ( & T ) -> i32 , seed : u64 ) {
35+ let Some ( first) = items. first ( ) else {
36+ return ;
37+ } ;
38+ let best = score_fn ( first) ;
39+ let tied = items. iter ( ) . take_while ( |t| score_fn ( t) == best) . count ( ) ;
40+ if tied < 2 {
41+ return ;
42+ }
43+ items[ ..tied] . shuffle ( & mut StdRng :: seed_from_u64 ( seed) ) ;
44+ }
1345
1446/// Result of aligning a single read: (transcripts, chimeric_alignments, n_for_mapq, unmapped_reason)
1547pub type AlignReadResult = (
@@ -302,6 +334,13 @@ pub fn align_read(
302334 . then_with ( || a. is_reverse . cmp ( & b. is_reverse ) )
303335 } ) ;
304336
337+ // Randomize primary among best-scoring ties (ReadAlign_multMapSelect.cpp:71-79).
338+ shuffle_tied_prefix (
339+ & mut transcripts,
340+ |t| t. score ,
341+ per_read_seed ( params. run_rng_seed , read_name) ,
342+ ) ;
343+
305344 // Score-range filter: keep only alignments within outFilterMultimapScoreRange of the best.
306345 // (STAR's multMapSelect step — must run before quality filters.)
307346 if !transcripts. is_empty ( ) {
@@ -902,6 +941,13 @@ pub fn align_paired_read(
902941 } )
903942 } ) ;
904943
944+ // Randomize primary among best-scoring pairs (STAR's funPrimaryAlignMark).
945+ shuffle_tied_prefix (
946+ & mut joint_pairs,
947+ |pa| pa. combined_wt_score ,
948+ per_read_seed ( params. run_rng_seed , read_name) ,
949+ ) ;
950+
905951 // Step 4: quality filter (mappedFilter).
906952 filter_paired_transcripts ( & mut joint_pairs, params) ;
907953
@@ -1122,9 +1168,13 @@ fn calculate_insert_size(mate1_trans: &Transcript, mate2_trans: &Transcript) ->
11221168/// Filter paired transcripts by quality thresholds.
11231169/// STAR's mappedFilter applies ALL quality checks to trBest (the highest-scoring transcript).
11241170fn filter_paired_transcripts ( paired_alns : & mut Vec < PairedAlignment > , params : & Parameters ) {
1125- let best_pa = paired_alns
1126- . iter ( )
1127- . max_by_key ( |pa| pa. combined_wt_score ) ;
1171+ // STAR's mappedFilter (ReadAlign_mappedFilter.cpp) applies ALL quality thresholds to trBest
1172+ // (the highest-scoring transcript), NOT to each individual transcript. If trBest passes,
1173+ // all transcripts in the score window are included (they affect NH/MAPQ). If trBest fails,
1174+ // the read is unmapped.
1175+ //
1176+ // Step 1: find the best pair and check quality thresholds on it.
1177+ let best_pa = paired_alns. iter ( ) . max_by_key ( |pa| pa. combined_wt_score ) ;
11281178 if let Some ( best) = best_pa {
11291179 let mate1_len = ( best. mate1_region . 1 - best. mate1_region . 0 ) as f64 ;
11301180 let mate2_len = ( best. mate2_region . 1 - best. mate2_region . 0 ) as f64 ;
@@ -1133,17 +1183,15 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
11331183 let combined_score = best. combined_wt_score ;
11341184
11351185 if combined_score < params. out_filter_score_min
1136- || combined_score
1137- < ( params. out_filter_score_min_over_lread * combined_lread_m1) as i32
1186+ || combined_score < ( params. out_filter_score_min_over_lread * combined_lread_m1) as i32
11381187 {
11391188 paired_alns. clear ( ) ;
11401189 return ;
11411190 }
11421191
11431192 let combined_match = best. combined_n_match ;
11441193 if combined_match < params. out_filter_match_nmin
1145- || combined_match
1146- < ( params. out_filter_match_nmin_over_lread * combined_lread_m1) as u32
1194+ || combined_match < ( params. out_filter_match_nmin_over_lread * combined_lread_m1) as u32
11471195 {
11481196 paired_alns. clear ( ) ;
11491197 return ;
@@ -1814,4 +1862,55 @@ mod tests {
18141862 assert ! ( mate1_is_mapped) ;
18151863 }
18161864 }
1865+
1866+ #[ test]
1867+ fn shuffle_tied_prefix_is_deterministic ( ) {
1868+ // Same seed + same input → same permutation on reruns.
1869+ let items: Vec < ( i32 , u32 ) > = ( 0 ..8 ) . map ( |i| ( 100 , i) ) . collect ( ) ;
1870+ let mut a = items. clone ( ) ;
1871+ let mut b = items. clone ( ) ;
1872+ shuffle_tied_prefix ( & mut a, |t| t. 0 , 12345 ) ;
1873+ shuffle_tied_prefix ( & mut b, |t| t. 0 , 12345 ) ;
1874+ assert_eq ! ( a, b) ;
1875+ }
1876+
1877+ #[ test]
1878+ fn shuffle_tied_prefix_respects_ties ( ) {
1879+ // Only the top-score prefix gets shuffled; lower-scored tail is left alone.
1880+ let mut items = vec ! [ ( 100 , 0u32 ) , ( 100 , 1 ) , ( 100 , 2 ) , ( 50 , 3 ) , ( 40 , 4 ) ] ;
1881+ shuffle_tied_prefix ( & mut items, |t| t. 0 , 777 ) ;
1882+ // Last two elements (non-tied) stay in place.
1883+ assert_eq ! ( items[ 3 ] , ( 50 , 3 ) ) ;
1884+ assert_eq ! ( items[ 4 ] , ( 40 , 4 ) ) ;
1885+ // Tied prefix contains the original three items in some order.
1886+ let mut top: Vec < u32 > = items[ ..3 ] . iter ( ) . map ( |t| t. 1 ) . collect ( ) ;
1887+ top. sort ( ) ;
1888+ assert_eq ! ( top, vec![ 0 , 1 , 2 ] ) ;
1889+ }
1890+
1891+ #[ test]
1892+ fn shuffle_tied_prefix_different_seeds_can_diverge ( ) {
1893+ // Probabilistic: for a tied set of 8, at least two seeds should disagree
1894+ // on the chosen primary. (Exhaustive over a small seed range is fine.)
1895+ let base: Vec < ( i32 , u32 ) > = ( 0 ..8 ) . map ( |i| ( 100 , i) ) . collect ( ) ;
1896+ let mut firsts = std:: collections:: HashSet :: new ( ) ;
1897+ for seed in 0 ..32u64 {
1898+ let mut v = base. clone ( ) ;
1899+ shuffle_tied_prefix ( & mut v, |t| t. 0 , seed) ;
1900+ firsts. insert ( v[ 0 ] . 1 ) ;
1901+ }
1902+ assert ! (
1903+ firsts. len( ) >= 2 ,
1904+ "expected different seeds to pick different primaries, got {:?}" ,
1905+ firsts
1906+ ) ;
1907+ }
1908+
1909+ #[ test]
1910+ fn shuffle_tied_prefix_noop_when_no_ties ( ) {
1911+ let mut items = vec ! [ ( 100 , 0u32 ) , ( 90 , 1 ) , ( 80 , 2 ) ] ;
1912+ let before = items. clone ( ) ;
1913+ shuffle_tied_prefix ( & mut items, |t| t. 0 , 42 ) ;
1914+ assert_eq ! ( items, before) ;
1915+ }
18171916}
0 commit comments