Skip to content

Commit 5682ef0

Browse files
rob-pclaude
andcommitted
fix(map): penalize indels hidden between read-overlapping MEMs
`anchored_align_score` treated any pair of adjacent chain MEMs that overlapped in the read as a pure overlap (`score -= max_overlap * match`), which silently absorbed an indel whenever the two MEMs sat on different diagonals (read-overlap != ref-overlap). Raw k-mer anchors always overlap in the read, so a small reference-side indel between two otherwise-exact blocks was scored as a perfect match — inflating the alignment score and keeping transcripts the read does not actually match well. Concrete case: ERR188044.600028 vs ENST00000677249.1 has a 3 bp insertion relative to the transcript. Two MEMs overlapped by 1 base in the read but 4 in the reference (a 3 bp diagonal jump). The old branch scored it 152 (perfect); the true optimum is 134. The inflated score cleared the `minAlnProb` keep filter, so Rust retained the transcript while C++ (PuffAligner gap DP) and Rust's own `--fullLengthAlignment` path both scored 134 and correctly dropped it. Fix: unify the gap/overlap handling. Compute the per-axis overlap, trim both MEM ends back to a common boundary, and DP-align the residual: * same diagonal -> residual empty -> gap DP returns 0 (old behavior) * gap-separated -> DP the gap (old `if` behavior) * shifted diagonals -> residual exposes the indel, charged an affine gap This matches the full-length DP and PuffAligner; uni-MEM extension would not fix it (the indel-at-boundary geometry survives extension). Validation on a 1M-read GEUVADIS subset (matched dedup indices, C++ --orphanChainSubThresh 0.0): C++/Rust target-set concordance rises 97.53% -> 99.14% (mean per-read Jaccard 0.99628). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 9604e21 commit 5682ef0

1 file changed

Lines changed: 45 additions & 11 deletions

File tree

crates/salmon-map/src/align.rs

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -220,17 +220,26 @@ fn anchored_align_score(
220220
score += mem.len * m;
221221
if i + 1 < mems.len() {
222222
let nxt = &mems[i + 1];
223-
let (rg_s, rg_e) = (mem.read_end(), nxt.read_start);
224-
let (tg_s, tg_e) = (mem.ref_end(), nxt.ref_start);
225-
if rg_e >= rg_s && tg_e >= tg_s {
226-
let qg = &query[rg_s as usize..rg_e as usize];
227-
let tg = &ref_seq[tg_s as usize..tg_e as usize];
228-
score += cached_gap_score(qg, tg, cfg);
229-
} else {
230-
// Overlapping MEMs: remove the double-counted exact bases.
231-
let ov = (rg_s - rg_e).max(tg_s - tg_e).max(0);
232-
score -= ov * m;
233-
}
223+
// Per-axis overlap of this MEM with the next (negative ⇒ a true gap).
224+
let read_ov = mem.read_end() - nxt.read_start;
225+
let ref_ov = mem.ref_end() - nxt.ref_start;
226+
// Trim the overlapping exact bases back to a common boundary on BOTH
227+
// axes, then DP-align whatever residual remains between the MEMs:
228+
// * same diagonal (read_ov == ref_ov): residual is empty on both
229+
// axes, so this degenerates to "remove the double-counted bases"
230+
// (the cached gap DP returns 0 for empty/empty);
231+
// * gap-separated (both overlaps <= 0): ov is 0 and we DP the gap;
232+
// * different diagonals (read_ov != ref_ov): trimming to the larger
233+
// overlap exposes the implied indel as a one-sided residual, which
234+
// the gap DP penalizes — instead of the indel being silently
235+
// absorbed as a plain overlap (which inflated the score).
236+
let ov = read_ov.max(ref_ov).max(0);
237+
score -= ov * m;
238+
let q_lo = (mem.read_end() - ov).max(mem.read_start).min(nxt.read_start) as usize;
239+
let t_lo = (mem.ref_end() - ov).max(mem.ref_start).min(nxt.ref_start) as usize;
240+
let qg = &query[q_lo..nxt.read_start as usize];
241+
let tg = &ref_seq[t_lo..nxt.ref_start as usize];
242+
score += cached_gap_score(qg, tg, cfg);
234243
}
235244
}
236245

@@ -627,6 +636,31 @@ mod tests {
627636
assert!(a.valid);
628637
}
629638

639+
#[test]
640+
fn overlapping_mems_on_shifted_diagonals_incur_gap_penalty() {
641+
// Two MEMs that overlap by 1 base in the read but by 4 in the reference
642+
// (diagonals 3 apart) — a hidden 3 bp indel. This is the exact geometry
643+
// observed for ERR188044.600028 vs ENST00000677249.1. Pre-fix the
644+
// overlap branch scored it as a plain overlap (sum*match - max_overlap),
645+
// silently absorbing the indel and inflating the score; now the residual
646+
// is DP-aligned and the diagonal jump costs an affine gap.
647+
let cfg = AlignConfig::default();
648+
let query = gen_seq(76, 5);
649+
let reference = gen_seq(900, 9);
650+
// mem0: read[0..45]@ref790 (diag 745); mem1: read[44..76]@ref831 (diag 787).
651+
// read_ov = 45-44 = 1 ; ref_ov = (790+45)-831 = 4 -> implied 3 bp indel.
652+
let mems = vec![Mem::new(0, 790, 45), Mem::new(44, 831, 32)];
653+
let m = cfg.match_score as i32;
654+
let score = anchored_align_score(&query, &reference, &mems, &cfg);
655+
// Old (buggy) behavior absorbed the indel as a 4-base overlap only:
656+
let absorbed = (45 + 32) * m - 4 * m; // = 146
657+
// Fixed: 73 matched bases minus a 3 bp affine gap (open + 3*extend).
658+
let gap = cfg.gap_open_pen as i32 + 3 * cfg.gap_extend_pen as i32;
659+
let expected = (45 + 32 - 4) * m - gap; // 146 - 12 = 134
660+
assert_eq!(score, expected, "expected indel-penalized score");
661+
assert!(score < absorbed, "indel was not penalized: {score} >= {absorbed}");
662+
}
663+
630664
#[test]
631665
fn single_mismatch_reduces_score_but_stays_valid() {
632666
let reference = gen_seq(300, 11);

0 commit comments

Comments
 (0)