fix(quant): normalize eq-class weights in log space (conserve mapped mass)

rob-p · claude · rob-p · commit 2bccea8b8ed3 · 2026-06-19T15:22:49.000-04:00
The per-fragment eq-class weights were normalized in linear space (w/Σw, guarded by `wsum > 0`). When a fragment's implied lengths all have ~0 FLD probability (logFragProb at the no-mass sentinel), every linear weight `w*exp(logFragProb)` underflows to exactly 0, so Σw == 0, the `wsum > 0` guard leaves the weights all zero, the eq-class denom is 0, and the VBEM silently drops that class's count — losing mapped mass. (The EM's degenerate-class branch is a no-op; C++ salmon's is too, so C++ relies on never producing a zero denom.) Adopt C++'s normalization: compute each mapping's log weight (ln(score) + logFragProb) and subtract the per-fragment log-sum-exp (C++ `exp(auxProb - auxDenom)`). This is mathematically identical to w/Σw for the non-degenerate case (per-class scaling is EM-invariant) but stays well-defined under total underflow, yielding relative weights instead of all-zero — so no class is dropped. On SRR1039508 (full) the mapped-mass loss drops from 190.1 fragments to 0.1 (matching C++); the change vs the prior linear path is within run-to-run wobble (log-Pearson 0.99956, no lower than the 0.99951 run-to-run baseline) and leaves the nonzero transcript count unchanged. Reverts the earlier special-case underflow guard in favor of this general normalization. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01B7JMur5DmDpECddErpi2JS
diff --git a/crates/salmon-quant/src/processor.rs b/crates/salmon-quant/src/processor.rs
@@ -15,6 +15,7 @@ use paraseq::parallel::{PairedParallelProcessor, ParallelProcessor};
 use paraseq::Record;
 use piscem_rs::mapping::hit_searcher::{HitSearcher, SkippingStrategy};
 
+use salmon_core::math::{log_add, LOG_0, LOG_1};
 use salmon_core::{is_compatible, LibraryFormat, MateStatus};
 use salmon_eqclass::{range_factorize_bins, EquivalenceClassBuilder, TranscriptGroup};
 use salmon_index::SalmonIndex;
@@ -450,24 +451,46 @@ fn record(
     } else {
         None
     };
-    let mut pairs: Vec<(u32, f64)> = compat
+    // Per-mapping FLD log-probability (the un-normalized log-pmf at the implied
+    // fragment length; `LOG_1` = 0 when the auxiliary model is inactive or the
+    // mapping is not a proper pair with a positive fragment length).
+    let log_fps: Vec<f64> = compat
         .iter()
-        .map(|(m, w)| {
-            let log_frag_prob =
-                if use_aux && m.status == MateStatus::PairedEndPaired && m.fragment_len > 0 {
-                    match fld_snap.as_deref() {
-                        Some(snap) if !snap.is_empty() => {
-                            snap[(m.fragment_len as usize).min(snap.len() - 1)]
-                        }
-                        // pre-first-refresh fallback (only the early pre-burn-in batches)
-                        _ => sh.fld.pmf(m.fragment_len as usize),
+        .map(|(m, _)| {
+            if use_aux && m.status == MateStatus::PairedEndPaired && m.fragment_len > 0 {
+                match fld_snap.as_deref() {
+                    Some(snap) if !snap.is_empty() => {
+                        snap[(m.fragment_len as usize).min(snap.len() - 1)]
                     }
-                } else {
-                    0.0
-                };
-            (m.tid, *w * log_frag_prob.exp())
+                    // pre-first-refresh fallback (only the early pre-burn-in batches)
+                    _ => sh.fld.pmf(m.fragment_len as usize),
+                }
+            } else {
+                LOG_1
+            }
         })
         .collect();
+    // Per-fragment conditional probabilities, normalized in *log* space — the
+    // log weight of each mapping is `ln(score weight) + logFragProb`, and we
+    // subtract the log-sum-exp over the fragment's mappings (C++'s
+    // `exp(auxProb - auxDenom)`). This is the same normalization salmon's
+    // C++ does and is mathematically identical to the linear `w/Σw`, but doing it
+    // in log space keeps it well-defined when every FLD weight underflows: a
+    // fragment whose implied lengths all have ~0 FLD probability (logFragProb at
+    // the no-mass sentinel) would, in linear space, give all-zero weights — the
+    // `wsum > 0` normalization below then leaves them zero, the eq-class denom is
+    // 0, and the VBEM silently drops the fragment's count (lost mapped mass; the
+    // EM's degenerate-class branch is a no-op, as in C++). In log space the same
+    // fragment yields well-defined relative weights, so no mass is lost.
+    let log_denom = compat
+        .iter()
+        .zip(&log_fps)
+        .fold(LOG_0, |acc, ((_, w), &lfp)| log_add(acc, w.ln() + lfp));
+    let mut pairs: Vec<(u32, f64)> = compat
+        .iter()
+        .zip(&log_fps)
+        .map(|((m, w), &lfp)| (m.tid, (w.ln() + lfp - log_denom).exp()))
+        .collect();
 
     // Abundance-aware FLD training: accept each concordant compatible pair's
     // fragment length with probability = its abundance-aware online posterior