Skip to content

Commit ee6e630

Browse files
rob-pclaude
andcommitted
perf(map): merge same-diagonal uni-MEMs; drop per-read map + per-group set
The `--uniMEMs` seeding path extended every raw k-mer hit independently, producing several heavily-overlapping uni-MEMs on one diagonal (one per surviving seed) and deduping only *exactly equal* ones via a per-group `HashSet`. The chainer's minimap2-style gain (`dq.min(dr).min(len)`) counts a successor's new bases by its start offset, which equals true coverage only for fixed-length anchors; for these variable-length overlapping uni-MEMs it undercounts the end extension, so the chain scored below the single longest anchor and was split — deflating chain coverage and tripping the downstream `consensus_filter`. Net effect: `--uniMEMs` disagreed with the default k-mer mode on 1.3% of reads and ran ~18% slower. Fix: collapse same-diagonal overlapping/abutting uni-MEMs into maximal anchors (`merge_same_diagonal`) — what pufferfish's one-uni-MEM-per-diagonal seeding produces directly — so the chain sees a single maximal anchor. This also replaces the exact-dedup `HashSet`, and the grouping moves from a per-read `HashMap` to reused sort-based thread-local scratch (mirroring `collect`'s `PROJ_SCRATCH`), removing the allocator traffic. Result on 1M ERR188044 pairs vs the default sparse k-mer mode: * target-set agreement 98.69% -> 99.90% (residual = threshold-marginal reads where anchor granularity tips a borderline map/no-map); * map phase 2.87s -> 2.42s, now on par with (marginally faster than) sparse. Only affects the opt-in `--uniMEMs` path; the default seeding is unchanged. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 9604e21 commit ee6e630

1 file changed

Lines changed: 129 additions & 54 deletions

File tree

crates/salmon-map/src/extend.rs

Lines changed: 129 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
//! MEM vs. unitig-constrained uni-MEM) account for read-placement differences
2424
//! vs. C++ salmon? (It does not — see `docs/mapping-parity-differences.md`.)
2525
26-
use std::collections::{HashMap, HashSet};
26+
use std::collections::HashSet;
2727

2828
use piscem_rs::index::contig_table::EntryEncoding;
2929
use piscem_rs::index::reference_index::ReferenceIndex;
@@ -174,6 +174,7 @@ pub fn collect_read_unimems<'idx, R: RefProvider>(
174174

175175
/// A projected seed that also carries the reference span of the **unitig** it
176176
/// came from, so extension can be clamped to the unitig (true uni-MEM).
177+
#[derive(Clone, Copy)]
177178
struct SeedWithContig {
178179
mem: Mem,
179180
/// reference-forward start of the unitig
@@ -182,45 +183,47 @@ struct SeedWithContig {
182183
u_hi: i32,
183184
}
184185

185-
/// Project raw k-mer hits, keeping each seed's **unitig reference span**.
186+
thread_local! {
187+
/// Reused per-thread scratch for [`candidates_from_raw_hits_true_unimems`]:
188+
/// a flat `(tid, is_fw, seed)` buffer (sorted to group by target/orientation
189+
/// instead of a per-read `HashMap`), a per-group buffer of extended uni-MEMs,
190+
/// and the merged-anchor output buffer. Mirrors `collect`'s `PROJ_SCRATCH`,
191+
/// eliminating the per-read map + per-group `HashSet` allocations.
192+
static UNIMEM_SCRATCH: std::cell::RefCell<(Vec<(u32, bool, SeedWithContig)>, Vec<Mem>, Vec<Mem>)> =
193+
const { std::cell::RefCell::new((Vec::new(), Vec::new(), Vec::new())) };
194+
}
195+
196+
/// Collapse same-diagonal overlapping/abutting uni-MEMs into maximal anchors.
186197
///
187-
/// Like [`project_raw_hits`] but records, per seed, the reference-forward
188-
/// `[base_pos, base_pos + contig_len)` window of the unitig the k-mer landed on
189-
/// (`base_pos = EntryEncoding::pos`, contiguous in the reference regardless of
190-
/// the contig's orientation). That window is what bounds a true uni-MEM.
191-
fn project_raw_hits_with_contig(
192-
raw_hits: &[(i32, ProjectedHits<'_>)],
193-
encoding: &EntryEncoding,
194-
read_len: i32,
195-
k: i32,
196-
max_hit_occ: usize,
197-
) -> HashMap<(u32, bool), Vec<SeedWithContig>> {
198-
let mut groups: HashMap<(u32, bool), Vec<SeedWithContig>> = HashMap::new();
199-
for (read_pos, phit) in raw_hits {
200-
if phit.num_hits() > max_hit_occ {
201-
continue;
202-
}
203-
let contig_len = phit.contig_len() as i32;
204-
for entry in phit.ref_range().iter() {
205-
let tid = encoding.transcript_id(entry);
206-
let base_pos = encoding.pos(entry) as i32;
207-
let rp = phit.decode_hit(entry, encoding);
208-
let read_start = if rp.is_fw {
209-
*read_pos
210-
} else {
211-
read_len - (*read_pos + k)
212-
};
213-
groups
214-
.entry((tid, rp.is_fw))
215-
.or_default()
216-
.push(SeedWithContig {
217-
mem: Mem::new(read_start, rp.pos as i32, k),
218-
u_lo: base_pos,
219-
u_hi: base_pos + contig_len,
220-
});
198+
/// Extending each raw k-mer hit independently yields several heavily-overlapping
199+
/// uni-MEMs on one diagonal (one per surviving seed). The minimap2-style chaining
200+
/// gain counts a successor's new bases by its *start* offset, which undercounts a
201+
/// longer anchor's end extension — so the chainer scores the overlapping group
202+
/// below the single longest anchor and splits it, deflating chain coverage. We
203+
/// instead merge them here (as pufferfish's one-uni-MEM-per-diagonal seeding
204+
/// does), so the chain sees a single maximal anchor. Anchors on the same diagonal
205+
/// separated by a real gap (a SNP run) are left distinct for the chainer; only
206+
/// overlapping/touching runs are merged. `ext` is consumed (sorted) and the
207+
/// result written to `out`.
208+
fn merge_same_diagonal(ext: &mut [Mem], out: &mut Vec<Mem>) {
209+
out.clear();
210+
if ext.is_empty() {
211+
return;
212+
}
213+
// diagonal = ref_start - read_start; collinear (gap-free) anchors share it.
214+
ext.sort_unstable_by_key(|m| (m.ref_start - m.read_start, m.read_start));
215+
let mut cur = ext[0];
216+
for &m in &ext[1..] {
217+
let same_diag = (m.ref_start - m.read_start) == (cur.ref_start - cur.read_start);
218+
if same_diag && m.read_start <= cur.read_end() {
219+
let new_end = cur.read_end().max(m.read_end());
220+
cur = Mem::new(cur.read_start, cur.ref_start, new_end - cur.read_start);
221+
} else {
222+
out.push(cur);
223+
cur = m;
221224
}
222225
}
223-
groups
226+
out.push(cur);
224227
}
225228

226229
/// Project then extend each seed into a **true uni-MEM** (extension clamped to
@@ -238,30 +241,66 @@ pub fn candidates_from_raw_hits_true_unimems<R: RefProvider>(
238241
cfg: &MemCollectorConfig,
239242
) -> Vec<MappingCandidate> {
240243
let read_len = read.len() as i32;
241-
let groups = project_raw_hits_with_contig(raw_hits, encoding, read_len, k, cfg.max_hit_occ);
242244
let rc = revcomp(read);
243245
let mut chain_cfg = cfg.chain.clone();
244246
chain_cfg.seed_len = k;
245247

246-
let mut candidates = Vec::new();
247-
for ((tid, is_fw), seeds) in groups {
248-
let ref_seq = refs.ref_seq(tid);
249-
let query: &[u8] = if is_fw { read } else { &rc };
250-
251-
let mut seen = HashSet::new();
252-
let mut unimems = Vec::with_capacity(seeds.len());
253-
for s in seeds {
254-
let e = extend_mem_within(query, ref_seq, s.mem, s.u_lo, s.u_hi);
255-
if seen.insert((e.read_start, e.ref_start, e.len)) {
256-
unimems.push(e);
248+
UNIMEM_SCRATCH.with(|cell| {
249+
let (flat, ext, merged) = &mut *cell.borrow_mut();
250+
// Flatten projected seeds (carrying their unitig span) into a single
251+
// buffer, grouped by (tid, is_fw) via a sort — no per-read hash map.
252+
flat.clear();
253+
for (read_pos, phit) in raw_hits {
254+
if phit.num_hits() > cfg.max_hit_occ {
255+
continue;
256+
}
257+
let contig_len = phit.contig_len() as i32;
258+
for entry in phit.ref_range().iter() {
259+
let tid = encoding.transcript_id(entry);
260+
let base_pos = encoding.pos(entry) as i32;
261+
let rp = phit.decode_hit(entry, encoding);
262+
let read_start = if rp.is_fw {
263+
*read_pos
264+
} else {
265+
read_len - (*read_pos + k)
266+
};
267+
flat.push((
268+
tid,
269+
rp.is_fw,
270+
SeedWithContig {
271+
mem: Mem::new(read_start, rp.pos as i32, k),
272+
u_lo: base_pos,
273+
u_hi: base_pos + contig_len,
274+
},
275+
));
257276
}
258277
}
259-
260-
for chain in chain_mems(&unimems, is_fw, &chain_cfg) {
261-
candidates.push(MappingCandidate { tid, is_fw, chain });
278+
flat.sort_unstable_by_key(|&(tid, fw, _)| (tid, fw));
279+
280+
let mut candidates = Vec::new();
281+
let mut i = 0;
282+
while i < flat.len() {
283+
let (tid, is_fw, _) = flat[i];
284+
let ref_seq = refs.ref_seq(tid);
285+
let query: &[u8] = if is_fw { read } else { &rc };
286+
// Extend each seed within its unitig, then merge same-diagonal
287+
// overlapping uni-MEMs into maximal anchors (which also dedups exact
288+
// duplicates, replacing the per-group HashSet).
289+
ext.clear();
290+
let mut j = i;
291+
while j < flat.len() && flat[j].0 == tid && flat[j].1 == is_fw {
292+
let s = &flat[j].2;
293+
ext.push(extend_mem_within(query, ref_seq, s.mem, s.u_lo, s.u_hi));
294+
j += 1;
295+
}
296+
merge_same_diagonal(ext, merged);
297+
for chain in chain_mems(merged, is_fw, &chain_cfg) {
298+
candidates.push(MappingCandidate { tid, is_fw, chain });
299+
}
300+
i = j;
262301
}
263-
}
264-
candidates
302+
candidates
303+
})
265304
}
266305

267306
/// Collect true uni-MEM mapping candidates (unitig-constrained extension) for
@@ -295,6 +334,42 @@ mod tests {
295334
use piscem_rs::index::contig_table::{ContigTable, ContigTableBuilder};
296335
use piscem_rs::mapping::projected_hits::ProjectedHits;
297336

337+
#[test]
338+
fn merge_collapses_same_diagonal_overlaps() {
339+
// The read-100000/ENST00000518938.1 geometry: three overlapping uni-MEMs
340+
// on diagonal 76 (one per surviving seed). They must collapse to a single
341+
// maximal anchor read[0,52) so the chain coverage matches the equivalent
342+
// fixed-k-mer chain (52), instead of the chainer splitting them.
343+
let mut ext = vec![
344+
Mem::new(0, 76, 34),
345+
Mem::new(4, 80, 34),
346+
Mem::new(8, 84, 44),
347+
];
348+
let mut out = Vec::new();
349+
merge_same_diagonal(&mut ext, &mut out);
350+
assert_eq!(out.len(), 1, "same-diagonal overlaps must merge into one");
351+
assert_eq!(
352+
(out[0].read_start, out[0].ref_start, out[0].len),
353+
(0, 76, 52)
354+
);
355+
}
356+
357+
#[test]
358+
fn merge_keeps_distinct_diagonals_and_real_gaps() {
359+
// Same diagonal but a real read gap (a SNP run) stays distinct; a
360+
// different diagonal stays distinct — only overlapping/touching runs merge.
361+
let mut ext = vec![
362+
Mem::new(0, 100, 20), // diag 100, read[0,20)
363+
Mem::new(10, 110, 20), // diag 100, overlaps prev -> merges to read[0,30)
364+
Mem::new(40, 140, 10), // diag 100, gap after read 30 -> distinct
365+
Mem::new(50, 200, 20), // diag 150 -> distinct
366+
];
367+
let mut out = Vec::new();
368+
merge_same_diagonal(&mut ext, &mut out);
369+
assert_eq!(out.len(), 3);
370+
assert_eq!((out[0].read_start, out[0].len), (0, 30));
371+
}
372+
298373
/// Minimal in-memory reference store for the extension tests.
299374
struct TestRefs {
300375
seqs: Vec<Vec<u8>>,

0 commit comments

Comments
 (0)