Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ flate2 = "1"
rayon = "1"
dashmap = "6"
chrono = "0.4"
rand = "0.8"

[dev-dependencies]
tempfile = "3"
Expand Down
142 changes: 116 additions & 26 deletions src/align/read_align.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,44 @@
/// Read alignment driver function
use crate::align::score::{AlignmentScorer, SpliceMotif};
use crate::align::seed::Seed;
use crate::align::stitch::{
cluster_seeds, stitch_seeds_with_jdb_debug,
};
use crate::align::stitch::{cluster_seeds, stitch_seeds_with_jdb_debug};
use crate::align::transcript::Transcript;
use crate::error::Error;
use crate::index::GenomeIndex;
use crate::params::{IntronMotifFilter, IntronStrandFilter, Parameters};
use crate::stats::UnmappedReason;
use rand::{SeedableRng, rngs::StdRng, seq::SliceRandom};
use std::hash::{DefaultHasher, Hash, Hasher};

/// Derive a deterministic per-read RNG seed from `run_rng_seed` + the read name.
///
/// STAR seeds `std::mt19937` once per chunk/thread (`runRNGseed*(iChunk+1)`),
/// then advances the state sequentially per read. ruSTAR parallelises per-read
/// via rayon, so we instead fold the read name into the seed — this keeps tie
/// breaks reproducible regardless of thread count while still honoring the
/// user's `--runRNGseed` value.
fn per_read_seed(run_rng_seed: u64, read_name: &str) -> u64 {
let mut hasher = DefaultHasher::new();
read_name.hash(&mut hasher);
run_rng_seed.wrapping_mul(hasher.finish().wrapping_add(1))
}

/// Shuffle the prefix of `items` whose `score_fn` equals the first element's score.
///
/// Mirrors STAR's `ReadAlign_multMapSelect` / `funPrimaryAlignMark`: best-scoring
/// alignments are randomized so primary selection (index 0) is not biased by
/// upstream sort order. Non-tied elements are left alone.
fn shuffle_tied_prefix<T>(items: &mut [T], score_fn: impl Fn(&T) -> i32, seed: u64) {
let Some(first) = items.first() else {
return;
};
let best = score_fn(first);
let tied = items.iter().take_while(|t| score_fn(t) == best).count();
if tied < 2 {
return;
}
items[..tied].shuffle(&mut StdRng::seed_from_u64(seed));
}

/// Result of aligning a single read: (transcripts, chimeric_alignments, n_for_mapq, unmapped_reason)
pub type AlignReadResult = (
Expand Down Expand Up @@ -301,6 +331,13 @@ pub fn align_read(
.then_with(|| a.is_reverse.cmp(&b.is_reverse))
});

// Randomize primary among best-scoring ties (ReadAlign_multMapSelect.cpp:71-79).
shuffle_tied_prefix(
&mut transcripts,
|t| t.score,
per_read_seed(params.run_rng_seed, read_name),
);

// Score-range filter: keep only alignments within outFilterMultimapScoreRange of the best.
// (STAR's multMapSelect step — must run before quality filters.)
if !transcripts.is_empty() {
Expand Down Expand Up @@ -621,7 +658,10 @@ pub fn align_paired_read(
// This correctly represents mate2 on - strand for FR pairs without explicit RC handling.
let debug_name: &str = if debug_pe { read_name } else { "" };
let mut mate1_transcripts: Vec<Transcript> = Vec::new();
for cluster in mate1_clusters.iter().take(params.align_windows_per_read_nmax) {
for cluster in mate1_clusters
.iter()
.take(params.align_windows_per_read_nmax)
{
let ts = stitch_seeds_with_jdb_debug(
cluster,
mate1_seq,
Expand All @@ -635,7 +675,10 @@ pub fn align_paired_read(
}

let mut mate2_transcripts: Vec<Transcript> = Vec::new();
for cluster in mate2_clusters.iter().take(params.align_windows_per_read_nmax) {
for cluster in mate2_clusters
.iter()
.take(params.align_windows_per_read_nmax)
{
let ts = stitch_seeds_with_jdb_debug(
cluster,
mate2_seq,
Expand All @@ -656,14 +699,9 @@ pub fn align_paired_read(
let mut joint_pairs: Vec<PairedAlignment> = Vec::new();
for t1 in &mate1_transcripts {
for t2 in &mate2_transcripts {
if let Some(pair) = try_pair_transcripts(
t1,
t2,
len1,
len2,
params,
combined_score_threshold,
) {
if let Some(pair) =
try_pair_transcripts(t1, t2, len1, len2, params, combined_score_threshold)
{
joint_pairs.push(pair);
}
}
Expand Down Expand Up @@ -826,6 +864,13 @@ pub fn align_paired_read(
})
});

// Randomize primary among best-scoring pairs (STAR's funPrimaryAlignMark).
shuffle_tied_prefix(
&mut joint_pairs,
|pa| pa.combined_wt_score,
per_read_seed(params.run_rng_seed, read_name),
);

// Step 4: quality filter (mappedFilter).
filter_paired_transcripts(&mut joint_pairs, params);

Expand All @@ -840,12 +885,10 @@ pub fn align_paired_read(

// Half-mapped fallback: report the best-scoring single-mate transcript.
// Apply per-mate quality threshold (outFilterScoreMinOverLread * (len - 1)).
let thresh1 =
((params.out_filter_score_min_over_lread * (len1 as f64 - 1.0)) as i32)
.max(params.out_filter_score_min);
let thresh2 =
((params.out_filter_score_min_over_lread * (len2 as f64 - 1.0)) as i32)
.max(params.out_filter_score_min);
let thresh1 = ((params.out_filter_score_min_over_lread * (len1 as f64 - 1.0)) as i32)
.max(params.out_filter_score_min);
let thresh2 = ((params.out_filter_score_min_over_lread * (len2 as f64 - 1.0)) as i32)
.max(params.out_filter_score_min);

let best_m1 = mate1_transcripts
.into_iter()
Expand Down Expand Up @@ -1041,9 +1084,7 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
// the read is unmapped.
//
// Step 1: find the best pair and check quality thresholds on it.
let best_pa = paired_alns
.iter()
.max_by_key(|pa| pa.combined_wt_score);
let best_pa = paired_alns.iter().max_by_key(|pa| pa.combined_wt_score);
if let Some(best) = best_pa {
let mate1_len = (best.mate1_region.1 - best.mate1_region.0) as f64;
let mate2_len = (best.mate2_region.1 - best.mate2_region.0) as f64;
Expand All @@ -1059,8 +1100,7 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
// per-mate spans are small (penalty ≈ −2 each → sum penalty −4 vs combined penalty −2).
let combined_score = best.combined_wt_score;
if combined_score < params.out_filter_score_min
|| combined_score
< (params.out_filter_score_min_over_lread * combined_lread_m1) as i32
|| combined_score < (params.out_filter_score_min_over_lread * combined_lread_m1) as i32
{
paired_alns.clear();
return;
Expand All @@ -1074,8 +1114,7 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
// which directly mirrors STAR's joint transcript nMatch without extension inflation.
let combined_match = best.combined_n_match;
if combined_match < params.out_filter_match_nmin
|| combined_match
< (params.out_filter_match_nmin_over_lread * combined_lread_m1) as u32
|| combined_match < (params.out_filter_match_nmin_over_lread * combined_lread_m1) as u32
{
paired_alns.clear();
return;
Expand Down Expand Up @@ -1750,4 +1789,55 @@ mod tests {
assert!(mate1_is_mapped);
}
}

#[test]
fn shuffle_tied_prefix_is_deterministic() {
// Same seed + same input → same permutation on reruns.
let items: Vec<(i32, u32)> = (0..8).map(|i| (100, i)).collect();
let mut a = items.clone();
let mut b = items.clone();
shuffle_tied_prefix(&mut a, |t| t.0, 12345);
shuffle_tied_prefix(&mut b, |t| t.0, 12345);
assert_eq!(a, b);
}

#[test]
fn shuffle_tied_prefix_respects_ties() {
// Only the top-score prefix gets shuffled; lower-scored tail is left alone.
let mut items = vec![(100, 0u32), (100, 1), (100, 2), (50, 3), (40, 4)];
shuffle_tied_prefix(&mut items, |t| t.0, 777);
// Last two elements (non-tied) stay in place.
assert_eq!(items[3], (50, 3));
assert_eq!(items[4], (40, 4));
// Tied prefix contains the original three items in some order.
let mut top: Vec<u32> = items[..3].iter().map(|t| t.1).collect();
top.sort();
assert_eq!(top, vec![0, 1, 2]);
}

#[test]
fn shuffle_tied_prefix_different_seeds_can_diverge() {
// Probabilistic: for a tied set of 8, at least two seeds should disagree
// on the chosen primary. (Exhaustive over a small seed range is fine.)
let base: Vec<(i32, u32)> = (0..8).map(|i| (100, i)).collect();
let mut firsts = std::collections::HashSet::new();
for seed in 0..32u64 {
let mut v = base.clone();
shuffle_tied_prefix(&mut v, |t| t.0, seed);
firsts.insert(v[0].1);
}
assert!(
firsts.len() >= 2,
"expected different seeds to pick different primaries, got {:?}",
firsts
);
}

#[test]
fn shuffle_tied_prefix_noop_when_no_ties() {
let mut items = vec![(100, 0u32), (90, 1), (80, 2)];
let before = items.clone();
shuffle_tied_prefix(&mut items, |t| t.0, 42);
assert_eq!(items, before);
}
}
3 changes: 0 additions & 3 deletions src/align/stitch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2407,7 +2407,6 @@ pub(crate) fn stitch_seeds_with_jdb_debug(
Ok(transcripts)
}


/// Shared core: preprocessing + recursive stitcher, returns working transcripts + context.
#[allow(clippy::too_many_arguments)]
fn stitch_seeds_core(
Expand Down Expand Up @@ -2750,8 +2749,6 @@ fn stitch_seeds_core(
))
}



#[cfg(test)]
mod tests {
use super::*;
Expand Down
Loading
Loading