Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions crates/salmon-quant/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -362,15 +362,19 @@ pub fn quantify(opts: &QuantOptions) -> Result<QuantResult> {
// bias models are weighted by abundance-aware posteriors (salmon's online
// phase), not score-only weights. The offline EM still gives the final
// point estimate.
let online = (opts.seq_bias || opts.gc_bias || opts.pos_bias).then(|| {
// The online estimate runs unconditionally: besides weighting the observed
// bias models (when bias correction is on), it provides the abundance-aware
// posterior used to train the fragment-length distribution (salmon's
// `r < exp(aln.logProb)` acceptance). The offline EM does not read it.
let online = {
let ref_lens: Vec<u64> = (0..num_refs).map(|t| salmon.ref_len(t)).collect();
salmon_infer::OnlineInference::new(
Some(salmon_infer::OnlineInference::new(
&ref_lens,
0.05,
opts.forgetting_factor,
opts.num_aux_model_samples,
)
});
))
};

// ---- parallel mapping pass (borrows the accumulators) -------------------
{
Expand Down
147 changes: 79 additions & 68 deletions crates/salmon-quant/src/processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,24 @@ const LOG_EPSILON: f64 = -23.998_158_637_57; // (0.375e-10f64).ln()
/// posterior only after this many fragments have been assigned.
pub(crate) const NUM_PRE_BURNIN: u64 = 5000;

thread_local! {
/// Per-thread PRNG state for stochastic FLD-sample acceptance (mirrors
/// salmon's per-thread RNG used when training the fragment-length model).
static FLD_RNG: std::cell::Cell<u64> = const { std::cell::Cell::new(0x2545_F491_4F6C_DD1D) };
}

/// Draw a pseudo-random value in `[0, 1)` from the per-thread xorshift state.
fn fld_rng_u01() -> f64 {
FLD_RNG.with(|s| {
let mut x = s.get();
x ^= x << 13;
x ^= x >> 7;
x ^= x << 17;
s.set(x);
((x >> 11) as f64) * (1.0 / ((1u64 << 53) as f64))
})
}

/// Collect the orientation-aware 5'/3' sequence-bias contexts for one mapping,
/// weighted by `weight` (the fragment-transcript posterior). A forward read's 5'
/// context feeds the forward model; a reverse read's 5' context
Expand Down Expand Up @@ -326,50 +344,53 @@ fn record(
// also advances the online masses by `logForgettingMass + log(posterior)`;
// without it they fall back to the normalized aux (score) weights.
let collecting = seqbias.is_some() || gcbias.is_some() || posbias.is_some();
let bias_w: Vec<f64> = if collecting {
if let Some(online) = sh.online {
// Per-mapping log auxiliary probability (salmon's
// `auxProb + startPosProb`, abundance-independent):
// logFragCov = ln(score weight)
// startPosProb = proper pair -> -ln(refLen - flen + 1) (flen<=refLen,
// else LOG_EPSILON); otherwise -ln(refLen)
// logFragProb = proper pair (after pre-burn-in) -> live FLD pmf(flen);
// unexpected orphan in a paired library -> LOG_EPSILON.
// The FLD term discriminates isoforms/paralogs whose implied insert
// size differs (e.g. alternative splicing), which the length norm alone
// cannot capture.
let use_aux = online.num_assigned() >= sh.pre_burnin;
let mm: Vec<(u32, f64)> = compat
.iter()
.map(|(m, w)| {
let rl = sh.salmon.ref_len(m.tid as usize).max(1) as f64;
let proper = m.status == MateStatus::PairedEndPaired && m.fragment_len > 0;
let flen = m.fragment_len as f64;
let start_pos_prob = if proper {
if flen <= rl {
-((rl - flen + 1.0).ln())
} else {
LOG_EPSILON
}
// Abundance-aware online posterior, computed once per fragment within the
// model-training window (`o.collecting()`), advancing the online masses.
// Mirrors salmon's `aln.logProb = transcriptLogCount + auxProb + startPosProb`
// normalized over the fragment's mappings, where
// logFragCov = ln(score weight)
// startPosProb = proper pair -> -ln(refLen - flen + 1) (flen<=refLen, else
// LOG_EPSILON); otherwise -ln(refLen)
// logFragProb = proper pair (after pre-burn-in) -> live FLD pmf(flen);
// unexpected orphan in a paired library -> LOG_EPSILON.
// Used for abundance-aware bias collection and abundance-aware FLD training.
let online_post: Option<Vec<f64>> = sh.online.filter(|o| o.collecting()).map(|online| {
let use_aux = online.num_assigned() >= sh.pre_burnin;
let mm: Vec<(u32, f64)> = compat
.iter()
.map(|(m, w)| {
let rl = sh.salmon.ref_len(m.tid as usize).max(1) as f64;
let proper = m.status == MateStatus::PairedEndPaired && m.fragment_len > 0;
let flen = m.fragment_len as f64;
let start_pos_prob = if proper {
if flen <= rl {
-((rl - flen + 1.0).ln())
} else {
-(rl.ln())
};
let log_frag_prob = if proper {
if use_aux {
sh.fld.pmf(m.fragment_len as usize)
} else {
0.0
}
} else if sh.paired_lib {
LOG_EPSILON // unexpected orphan
LOG_EPSILON
}
} else {
-(rl.ln())
};
let log_frag_prob = if proper {
if use_aux {
sh.fld.pmf(m.fragment_len as usize)
} else {
0.0
};
let log_cov = if *w > 0.0 { w.ln() } else { f64::NEG_INFINITY };
(m.tid, log_cov + start_pos_prob + log_frag_prob)
})
.collect();
online.assign_fragment(&mm, log_fm)
}
} else if sh.paired_lib {
LOG_EPSILON // unexpected orphan
} else {
0.0
};
let log_cov = if *w > 0.0 { w.ln() } else { f64::NEG_INFINITY };
(m.tid, log_cov + start_pos_prob + log_frag_prob)
})
.collect();
online.assign_fragment(&mm, log_fm)
});
let bias_w: Vec<f64> = if collecting {
if let Some(post) = &online_post {
post.clone()
} else {
let wsum: f64 = compat.iter().map(|(_, w)| *w).sum();
compat
Expand Down Expand Up @@ -424,36 +445,26 @@ fn record(
})
.collect();

// Observe a fragment length from the best concordant compatible pair,
// weighted by that pair's posterior confidence among the concordant pairs.
// salmon trains its FLD stochastically (it accepts an alignment with
// probability proportional to exp(aln.logProb)), so ambiguous multimappers
// contribute little; adding every best pair at full weight overdisperses the
// FLD on paralog/near-duplicate-rich inputs. Down-weighting by confidence is
// the deterministic analog.
let mut conc_sum = 0.0_f64;
let mut best_concordant: Option<&ScoredMapping> = None;
for m in maps.iter() {
if m.status == MateStatus::PairedEndPaired
&& m.fragment_len > 0
&& sh
.expected_format
.is_none_or(|exp| is_compatible(exp, m.format, m.is_fw, m.status))
{
conc_sum += m.weight;
if best_concordant.is_none_or(|b| m.weight > b.weight) {
best_concordant = Some(m);
// Abundance-aware FLD training: accept each concordant compatible pair's
// fragment length with probability = its abundance-aware online posterior
// (salmon's `if (r < exp(aln.logProb)) fragLengthDist.addVal(...)`, where
// aln.logProb includes transcriptLogCount). For reads shared between
// near-duplicates this preferentially samples the dominant transcript's
// implied length, concentrating the FLD as salmon does (vs adding every best
// pair at full weight, which overdisperses it). Frozen after the training
// window (`online_post` is `None`).
if let Some(post) = &online_post {
for (i, (m, _)) in compat.iter().enumerate() {
let conc = m.status == MateStatus::PairedEndPaired
&& m.fragment_len > 0
&& sh
.expected_format
.is_none_or(|exp| is_compatible(exp, m.format, m.is_fw, m.status));
if conc && fld_rng_u01() < post[i] {
sh.fld.add_val(m.fragment_len as usize, 0.0);
}
}
}
if let Some(best) = best_concordant {
let conf = if conc_sum > 0.0 {
best.weight / conc_sum
} else {
1.0
};
sh.fld.add_val(best.fragment_len as usize, conf.ln());
}

// Build the equivalence class: sorted, de-duplicated transcript ids + weights.
pairs.sort_by_key(|p| p.0);
Expand Down
Loading