Skip to content

Commit 1bdbb9a

Browse files
committed
Merge origin/main (PRs #5 run-rng-seed, scverse#6 sam-rg-line) into dev
- shuffle_tied_prefix: deterministic per-read RNG for tied primary selection - --runRNGseed param (default 777, matches STAR) - --outSAMattrRGline: @rg header + RG:Z tags - Various code simplifications and formatting
2 parents 5d454b0 + 0763745 commit 1bdbb9a

9 files changed

Lines changed: 637 additions & 74 deletions

File tree

Cargo.lock

Lines changed: 78 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ flate2 = "1"
4949
rayon = "1"
5050
dashmap = "6"
5151
chrono = "0.4"
52+
rand = "0.8"
5253

5354
[dev-dependencies]
5455
tempfile = "3"

src/align/read_align.rs

Lines changed: 106 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,38 @@ use crate::error::Error;
1010
use crate::index::GenomeIndex;
1111
use crate::params::{IntronMotifFilter, IntronStrandFilter, Parameters};
1212
use crate::stats::UnmappedReason;
13+
use rand::{SeedableRng, rngs::StdRng, seq::SliceRandom};
14+
use std::hash::{DefaultHasher, Hash, Hasher};
15+
16+
/// Derive a deterministic per-read RNG seed from `run_rng_seed` + the read name.
17+
///
18+
/// STAR seeds `std::mt19937` once per chunk/thread (`runRNGseed*(iChunk+1)`),
19+
/// then advances the state sequentially per read. ruSTAR parallelises per-read
20+
/// via rayon, so we instead fold the read name into the seed — this keeps tie
21+
/// breaks reproducible regardless of thread count while still honoring the
22+
/// user's `--runRNGseed` value.
23+
fn per_read_seed(run_rng_seed: u64, read_name: &str) -> u64 {
24+
let mut hasher = DefaultHasher::new();
25+
read_name.hash(&mut hasher);
26+
run_rng_seed.wrapping_mul(hasher.finish().wrapping_add(1))
27+
}
28+
29+
/// Shuffle the prefix of `items` whose `score_fn` equals the first element's score.
30+
///
31+
/// Mirrors STAR's `ReadAlign_multMapSelect` / `funPrimaryAlignMark`: best-scoring
32+
/// alignments are randomized so primary selection (index 0) is not biased by
33+
/// upstream sort order. Non-tied elements are left alone.
34+
fn shuffle_tied_prefix<T>(items: &mut [T], score_fn: impl Fn(&T) -> i32, seed: u64) {
35+
let Some(first) = items.first() else {
36+
return;
37+
};
38+
let best = score_fn(first);
39+
let tied = items.iter().take_while(|t| score_fn(t) == best).count();
40+
if tied < 2 {
41+
return;
42+
}
43+
items[..tied].shuffle(&mut StdRng::seed_from_u64(seed));
44+
}
1345

1446
/// Result of aligning a single read: (transcripts, chimeric_alignments, n_for_mapq, unmapped_reason)
1547
pub type AlignReadResult = (
@@ -302,6 +334,13 @@ pub fn align_read(
302334
.then_with(|| a.is_reverse.cmp(&b.is_reverse))
303335
});
304336

337+
// Randomize primary among best-scoring ties (ReadAlign_multMapSelect.cpp:71-79).
338+
shuffle_tied_prefix(
339+
&mut transcripts,
340+
|t| t.score,
341+
per_read_seed(params.run_rng_seed, read_name),
342+
);
343+
305344
// Score-range filter: keep only alignments within outFilterMultimapScoreRange of the best.
306345
// (STAR's multMapSelect step — must run before quality filters.)
307346
if !transcripts.is_empty() {
@@ -902,6 +941,13 @@ pub fn align_paired_read(
902941
})
903942
});
904943

944+
// Randomize primary among best-scoring pairs (STAR's funPrimaryAlignMark).
945+
shuffle_tied_prefix(
946+
&mut joint_pairs,
947+
|pa| pa.combined_wt_score,
948+
per_read_seed(params.run_rng_seed, read_name),
949+
);
950+
905951
// Step 4: quality filter (mappedFilter).
906952
filter_paired_transcripts(&mut joint_pairs, params);
907953

@@ -1122,9 +1168,13 @@ fn calculate_insert_size(mate1_trans: &Transcript, mate2_trans: &Transcript) ->
11221168
/// Filter paired transcripts by quality thresholds.
11231169
/// STAR's mappedFilter applies ALL quality checks to trBest (the highest-scoring transcript).
11241170
fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Parameters) {
1125-
let best_pa = paired_alns
1126-
.iter()
1127-
.max_by_key(|pa| pa.combined_wt_score);
1171+
// STAR's mappedFilter (ReadAlign_mappedFilter.cpp) applies ALL quality thresholds to trBest
1172+
// (the highest-scoring transcript), NOT to each individual transcript. If trBest passes,
1173+
// all transcripts in the score window are included (they affect NH/MAPQ). If trBest fails,
1174+
// the read is unmapped.
1175+
//
1176+
// Step 1: find the best pair and check quality thresholds on it.
1177+
let best_pa = paired_alns.iter().max_by_key(|pa| pa.combined_wt_score);
11281178
if let Some(best) = best_pa {
11291179
let mate1_len = (best.mate1_region.1 - best.mate1_region.0) as f64;
11301180
let mate2_len = (best.mate2_region.1 - best.mate2_region.0) as f64;
@@ -1133,17 +1183,15 @@ fn filter_paired_transcripts(paired_alns: &mut Vec<PairedAlignment>, params: &Pa
11331183
let combined_score = best.combined_wt_score;
11341184

11351185
if combined_score < params.out_filter_score_min
1136-
|| combined_score
1137-
< (params.out_filter_score_min_over_lread * combined_lread_m1) as i32
1186+
|| combined_score < (params.out_filter_score_min_over_lread * combined_lread_m1) as i32
11381187
{
11391188
paired_alns.clear();
11401189
return;
11411190
}
11421191

11431192
let combined_match = best.combined_n_match;
11441193
if combined_match < params.out_filter_match_nmin
1145-
|| combined_match
1146-
< (params.out_filter_match_nmin_over_lread * combined_lread_m1) as u32
1194+
|| combined_match < (params.out_filter_match_nmin_over_lread * combined_lread_m1) as u32
11471195
{
11481196
paired_alns.clear();
11491197
return;
@@ -1814,4 +1862,55 @@ mod tests {
18141862
assert!(mate1_is_mapped);
18151863
}
18161864
}
1865+
1866+
#[test]
1867+
fn shuffle_tied_prefix_is_deterministic() {
1868+
// Same seed + same input → same permutation on reruns.
1869+
let items: Vec<(i32, u32)> = (0..8).map(|i| (100, i)).collect();
1870+
let mut a = items.clone();
1871+
let mut b = items.clone();
1872+
shuffle_tied_prefix(&mut a, |t| t.0, 12345);
1873+
shuffle_tied_prefix(&mut b, |t| t.0, 12345);
1874+
assert_eq!(a, b);
1875+
}
1876+
1877+
#[test]
1878+
fn shuffle_tied_prefix_respects_ties() {
1879+
// Only the top-score prefix gets shuffled; lower-scored tail is left alone.
1880+
let mut items = vec![(100, 0u32), (100, 1), (100, 2), (50, 3), (40, 4)];
1881+
shuffle_tied_prefix(&mut items, |t| t.0, 777);
1882+
// Last two elements (non-tied) stay in place.
1883+
assert_eq!(items[3], (50, 3));
1884+
assert_eq!(items[4], (40, 4));
1885+
// Tied prefix contains the original three items in some order.
1886+
let mut top: Vec<u32> = items[..3].iter().map(|t| t.1).collect();
1887+
top.sort();
1888+
assert_eq!(top, vec![0, 1, 2]);
1889+
}
1890+
1891+
#[test]
1892+
fn shuffle_tied_prefix_different_seeds_can_diverge() {
1893+
// Probabilistic: for a tied set of 8, at least two seeds should disagree
1894+
// on the chosen primary. (Exhaustive over a small seed range is fine.)
1895+
let base: Vec<(i32, u32)> = (0..8).map(|i| (100, i)).collect();
1896+
let mut firsts = std::collections::HashSet::new();
1897+
for seed in 0..32u64 {
1898+
let mut v = base.clone();
1899+
shuffle_tied_prefix(&mut v, |t| t.0, seed);
1900+
firsts.insert(v[0].1);
1901+
}
1902+
assert!(
1903+
firsts.len() >= 2,
1904+
"expected different seeds to pick different primaries, got {:?}",
1905+
firsts
1906+
);
1907+
}
1908+
1909+
#[test]
1910+
fn shuffle_tied_prefix_noop_when_no_ties() {
1911+
let mut items = vec![(100, 0u32), (90, 1), (80, 2)];
1912+
let before = items.clone();
1913+
shuffle_tied_prefix(&mut items, |t| t.0, 42);
1914+
assert_eq!(items, before);
1915+
}
18171916
}

src/align/stitch.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2567,7 +2567,6 @@ pub(crate) fn stitch_seeds_with_jdb_debug(
25672567
Ok(transcripts)
25682568
}
25692569

2570-
25712570
/// Shared core: preprocessing + recursive stitcher, returns working transcripts + context.
25722571
#[allow(clippy::too_many_arguments)]
25732572
pub(crate) fn stitch_seeds_core(
@@ -2914,8 +2913,6 @@ pub(crate) fn stitch_seeds_core(
29142913
))
29152914
}
29162915

2917-
2918-
29192916
#[cfg(test)]
29202917
mod tests {
29212918
use super::*;

src/io/bam.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,10 @@ mod tests {
125125
let read_qual = vec![30, 30, 30, 30];
126126

127127
// Build record using SAM builder
128-
let record =
129-
crate::io::sam::SamWriter::build_unmapped_record(read_name, &read_seq, &read_qual)
130-
.unwrap();
128+
let record = crate::io::sam::SamWriter::build_unmapped_record(
129+
read_name, &read_seq, &read_qual, None,
130+
)
131+
.unwrap();
131132

132133
let result = writer.write_batch(&[record]);
133134
assert!(result.is_ok(), "Writing unmapped read should succeed");
@@ -204,12 +205,14 @@ mod tests {
204205
"read1",
205206
&[0, 1, 2, 3],
206207
&[30, 30, 30, 30],
208+
None,
207209
)
208210
.unwrap(),
209211
crate::io::sam::SamWriter::build_unmapped_record(
210212
"read2",
211213
&[0, 1, 2, 3],
212214
&[30, 30, 30, 30],
215+
None,
213216
)
214217
.unwrap(),
215218
];

0 commit comments

Comments
 (0)