Skip to content

Commit e67864d

Browse files
authored
Merge pull request #5 from ewels/feat/run-rng-seed
feat: add `--runRNGseed` flag with seeded primary tie-break
2 parents e508928 + baeea81 commit e67864d

7 files changed

Lines changed: 275 additions & 45 deletions

File tree

Cargo.lock

Lines changed: 78 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ flate2 = "1"
4949
rayon = "1"
5050
dashmap = "6"
5151
chrono = "0.4"
52+
rand = "0.8"
5253

5354
[dev-dependencies]
5455
tempfile = "3"

src/align/read_align.rs

Lines changed: 113 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,44 @@
11
/// Read alignment driver function
22
use crate::align::score::{AlignmentScorer, SpliceMotif};
33
use crate::align::seed::Seed;
4-
use crate::align::stitch::{
5-
cluster_seeds, stitch_seeds_with_jdb_debug,
6-
};
4+
use crate::align::stitch::{cluster_seeds, stitch_seeds_with_jdb_debug};
75
use crate::align::transcript::Transcript;
86
use crate::error::Error;
97
use crate::index::GenomeIndex;
108
use crate::params::{IntronMotifFilter, IntronStrandFilter, Parameters};
119
use crate::stats::UnmappedReason;
10+
use rand::{SeedableRng, rngs::StdRng, seq::SliceRandom};
11+
use std::hash::{DefaultHasher, Hash, Hasher};
12+
13+
/// Derive a deterministic per-read RNG seed from `run_rng_seed` + the read name.
14+
///
15+
/// STAR seeds `std::mt19937` once per chunk/thread (`runRNGseed*(iChunk+1)`),
16+
/// then advances the state sequentially per read. ruSTAR parallelises per-read
17+
/// via rayon, so we instead fold the read name into the seed — this keeps tie
18+
/// breaks reproducible regardless of thread count while still honoring the
19+
/// user's `--runRNGseed` value.
20+
fn per_read_seed(run_rng_seed: u64, read_name: &str) -> u64 {
21+
let mut hasher = DefaultHasher::new();
22+
read_name.hash(&mut hasher);
23+
run_rng_seed.wrapping_mul(hasher.finish().wrapping_add(1))
24+
}
25+
26+
/// Shuffle the prefix of `items` whose `score_fn` equals the first element's score.
27+
///
28+
/// Mirrors STAR's `ReadAlign_multMapSelect` / `funPrimaryAlignMark`: best-scoring
29+
/// alignments are randomized so primary selection (index 0) is not biased by
30+
/// upstream sort order. Non-tied elements are left alone.
31+
fn shuffle_tied_prefix<T>(items: &mut [T], score_fn: impl Fn(&T) -> i32, seed: u64) {
32+
let Some(first) = items.first() else {
33+
return;
34+
};
35+
let best = score_fn(first);
36+
let tied = items.iter().take_while(|t| score_fn(t) == best).count();
37+
if tied < 2 {
38+
return;
39+
}
40+
items[..tied].shuffle(&mut StdRng::seed_from_u64(seed));
41+
}
1242

1343
/// Result of aligning a single read: (transcripts, chimeric_alignments, n_for_mapq, unmapped_reason)
1444
pub type AlignReadResult = (
@@ -301,6 +331,13 @@ pub fn align_read(
301331
.then_with(|| a.is_reverse.cmp(&b.is_reverse))
302332
});
303333

334+
// Randomize primary among best-scoring ties (ReadAlign_multMapSelect.cpp:71-79).
335+
shuffle_tied_prefix(
336+
&mut transcripts,
337+
|t| t.score,
338+
per_read_seed(params.run_rng_seed, read_name),
339+
);
340+
304341
// Score-range filter: keep only alignments within outFilterMultimapScoreRange of the best.
305342
// (STAR's multMapSelect step — must run before quality filters.)
306343
if !transcripts.is_empty() {
@@ -621,7 +658,10 @@ pub fn align_paired_read(
621658
// This correctly represents mate2 on - strand for FR pairs without explicit RC handling.
622659
let debug_name: &str = if debug_pe { read_name } else { "" };
623660
let mut mate1_transcripts: Vec<Transcript> = Vec::new();
624-
for cluster in mate1_clusters.iter().take(params.align_windows_per_read_nmax) {
661+
for cluster in mate1_clusters
662+
.iter()
663+
.take(params.align_windows_per_read_nmax)
664+
{
625665
let ts = stitch_seeds_with_jdb_debug(
626666
cluster,
627667
mate1_seq,
@@ -635,7 +675,10 @@ pub fn align_paired_read(
635675
}
636676

637677
let mut mate2_transcripts: Vec<Transcript> = Vec::new();
638-
for cluster in mate2_clusters.iter().take(params.align_windows_per_read_nmax) {
678+
for cluster in mate2_clusters
679+
.iter()
680+
.take(params.align_windows_per_read_nmax)
681+
{
639682
let ts = stitch_seeds_with_jdb_debug(
640683
cluster,
641684
mate2_seq,
@@ -831,6 +874,13 @@ pub fn align_paired_read(
831874
})
832875
});
833876

877+
// Randomize primary among best-scoring pairs (STAR's funPrimaryAlignMark).
878+
shuffle_tied_prefix(
879+
&mut joint_pairs,
880+
|pa| pa.combined_wt_score,
881+
per_read_seed(params.run_rng_seed, read_name),
882+
);
883+
834884
// Step 4: quality filter (mappedFilter).
835885
filter_paired_transcripts(&mut joint_pairs, params);
836886

@@ -845,12 +895,10 @@ pub fn align_paired_read(
845895

846896
// Half-mapped fallback: report the best-scoring single-mate transcript.
847897
// Apply per-mate quality threshold (outFilterScoreMinOverLread * (len - 1)).
848-
let thresh1 =
849-
((params.out_filter_score_min_over_lread * (len1 as f64 - 1.0)) as i32)
850-
.max(params.out_filter_score_min);
851-
let thresh2 =
852-
((params.out_filter_score_min_over_lread * (len2 as f64 - 1.0)) as i32)
853-
.max(params.out_filter_score_min);
898+
let thresh1 = ((params.out_filter_score_min_over_lread * (len1 as f64 - 1.0)) as i32)
899+
.max(params.out_filter_score_min);
900+
let thresh2 = ((params.out_filter_score_min_over_lread * (len2 as f64 - 1.0)) as i32)
901+
.max(params.out_filter_score_min);
854902

855903
let best_m1 = mate1_transcripts
856904
.into_iter()
@@ -1056,9 +1104,7 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
10561104
// the read is unmapped.
10571105
//
10581106
// Step 1: find the best pair and check quality thresholds on it.
1059-
let best_pa = paired_alns
1060-
.iter()
1061-
.max_by_key(|pa| pa.combined_wt_score);
1107+
let best_pa = paired_alns.iter().max_by_key(|pa| pa.combined_wt_score);
10621108
if let Some(best) = best_pa {
10631109
let mate1_len = (best.mate1_region.1 - best.mate1_region.0) as f64;
10641110
let mate2_len = (best.mate2_region.1 - best.mate2_region.0) as f64;
@@ -1074,8 +1120,7 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
10741120
// per-mate spans are small (penalty ≈ −2 each → sum penalty −4 vs combined penalty −2).
10751121
let combined_score = best.combined_wt_score;
10761122
if combined_score < params.out_filter_score_min
1077-
|| combined_score
1078-
< (params.out_filter_score_min_over_lread * combined_lread_m1) as i32
1123+
|| combined_score < (params.out_filter_score_min_over_lread * combined_lread_m1) as i32
10791124
{
10801125
paired_alns.clear();
10811126
return;
@@ -1089,8 +1134,7 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
10891134
// which directly mirrors STAR's joint transcript nMatch without extension inflation.
10901135
let combined_match = best.combined_n_match;
10911136
if combined_match < params.out_filter_match_nmin
1092-
|| combined_match
1093-
< (params.out_filter_match_nmin_over_lread * combined_lread_m1) as u32
1137+
|| combined_match < (params.out_filter_match_nmin_over_lread * combined_lread_m1) as u32
10941138
{
10951139
paired_alns.clear();
10961140
return;
@@ -1765,4 +1809,55 @@ mod tests {
17651809
assert!(mate1_is_mapped);
17661810
}
17671811
}
1812+
1813+
#[test]
1814+
fn shuffle_tied_prefix_is_deterministic() {
1815+
// Same seed + same input → same permutation on reruns.
1816+
let items: Vec<(i32, u32)> = (0..8).map(|i| (100, i)).collect();
1817+
let mut a = items.clone();
1818+
let mut b = items.clone();
1819+
shuffle_tied_prefix(&mut a, |t| t.0, 12345);
1820+
shuffle_tied_prefix(&mut b, |t| t.0, 12345);
1821+
assert_eq!(a, b);
1822+
}
1823+
1824+
#[test]
1825+
fn shuffle_tied_prefix_respects_ties() {
1826+
// Only the top-score prefix gets shuffled; lower-scored tail is left alone.
1827+
let mut items = vec![(100, 0u32), (100, 1), (100, 2), (50, 3), (40, 4)];
1828+
shuffle_tied_prefix(&mut items, |t| t.0, 777);
1829+
// Last two elements (non-tied) stay in place.
1830+
assert_eq!(items[3], (50, 3));
1831+
assert_eq!(items[4], (40, 4));
1832+
// Tied prefix contains the original three items in some order.
1833+
let mut top: Vec<u32> = items[..3].iter().map(|t| t.1).collect();
1834+
top.sort();
1835+
assert_eq!(top, vec![0, 1, 2]);
1836+
}
1837+
1838+
#[test]
1839+
fn shuffle_tied_prefix_different_seeds_can_diverge() {
1840+
// Probabilistic: for a tied set of 8, at least two seeds should disagree
1841+
// on the chosen primary. (Exhaustive over a small seed range is fine.)
1842+
let base: Vec<(i32, u32)> = (0..8).map(|i| (100, i)).collect();
1843+
let mut firsts = std::collections::HashSet::new();
1844+
for seed in 0..32u64 {
1845+
let mut v = base.clone();
1846+
shuffle_tied_prefix(&mut v, |t| t.0, seed);
1847+
firsts.insert(v[0].1);
1848+
}
1849+
assert!(
1850+
firsts.len() >= 2,
1851+
"expected different seeds to pick different primaries, got {:?}",
1852+
firsts
1853+
);
1854+
}
1855+
1856+
#[test]
1857+
fn shuffle_tied_prefix_noop_when_no_ties() {
1858+
let mut items = vec![(100, 0u32), (90, 1), (80, 2)];
1859+
let before = items.clone();
1860+
shuffle_tied_prefix(&mut items, |t| t.0, 42);
1861+
assert_eq!(items, before);
1862+
}
17681863
}

src/align/stitch.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2407,7 +2407,6 @@ pub(crate) fn stitch_seeds_with_jdb_debug(
24072407
Ok(transcripts)
24082408
}
24092409

2410-
24112410
/// Shared core: preprocessing + recursive stitcher, returns working transcripts + context.
24122411
#[allow(clippy::too_many_arguments)]
24132412
pub(crate) fn stitch_seeds_core(
@@ -2750,8 +2749,6 @@ pub(crate) fn stitch_seeds_core(
27502749
))
27512750
}
27522751

2753-
2754-
27552752
#[cfg(test)]
27562753
mod tests {
27572754
use super::*;

0 commit comments

Comments
 (0)