2323//! MEM vs. unitig-constrained uni-MEM) account for read-placement differences
2424//! vs. C++ salmon? (It does not — see `docs/mapping-parity-differences.md`.)
2525
26- use std:: collections:: { HashMap , HashSet } ;
26+ use std:: collections:: HashSet ;
2727
2828use piscem_rs:: index:: contig_table:: EntryEncoding ;
2929use piscem_rs:: index:: reference_index:: ReferenceIndex ;
@@ -174,6 +174,7 @@ pub fn collect_read_unimems<'idx, R: RefProvider>(
174174
175175/// A projected seed that also carries the reference span of the **unitig** it
176176/// came from, so extension can be clamped to the unitig (true uni-MEM).
177+ #[ derive( Clone , Copy ) ]
177178struct SeedWithContig {
178179 mem : Mem ,
179180 /// reference-forward start of the unitig
@@ -182,45 +183,47 @@ struct SeedWithContig {
182183 u_hi : i32 ,
183184}
184185
185- /// Project raw k-mer hits, keeping each seed's **unitig reference span**.
186+ thread_local ! {
187+ /// Reused per-thread scratch for [`candidates_from_raw_hits_true_unimems`]:
188+ /// a flat `(tid, is_fw, seed)` buffer (sorted to group by target/orientation
189+ /// instead of a per-read `HashMap`), a per-group buffer of extended uni-MEMs,
190+ /// and the merged-anchor output buffer. Mirrors `collect`'s `PROJ_SCRATCH`,
191+ /// eliminating the per-read map + per-group `HashSet` allocations.
192+ static UNIMEM_SCRATCH : std:: cell:: RefCell <( Vec <( u32 , bool , SeedWithContig ) >, Vec <Mem >, Vec <Mem >) > =
193+ const { std:: cell:: RefCell :: new( ( Vec :: new( ) , Vec :: new( ) , Vec :: new( ) ) ) } ;
194+ }
195+
196+ /// Collapse same-diagonal overlapping/abutting uni-MEMs into maximal anchors.
186197///
187- /// Like [`project_raw_hits`] but records, per seed, the reference-forward
188- /// `[base_pos, base_pos + contig_len)` window of the unitig the k-mer landed on
189- /// (`base_pos = EntryEncoding::pos`, contiguous in the reference regardless of
190- /// the contig's orientation). That window is what bounds a true uni-MEM.
191- fn project_raw_hits_with_contig (
192- raw_hits : & [ ( i32 , ProjectedHits < ' _ > ) ] ,
193- encoding : & EntryEncoding ,
194- read_len : i32 ,
195- k : i32 ,
196- max_hit_occ : usize ,
197- ) -> HashMap < ( u32 , bool ) , Vec < SeedWithContig > > {
198- let mut groups: HashMap < ( u32 , bool ) , Vec < SeedWithContig > > = HashMap :: new ( ) ;
199- for ( read_pos, phit) in raw_hits {
200- if phit. num_hits ( ) > max_hit_occ {
201- continue ;
202- }
203- let contig_len = phit. contig_len ( ) as i32 ;
204- for entry in phit. ref_range ( ) . iter ( ) {
205- let tid = encoding. transcript_id ( entry) ;
206- let base_pos = encoding. pos ( entry) as i32 ;
207- let rp = phit. decode_hit ( entry, encoding) ;
208- let read_start = if rp. is_fw {
209- * read_pos
210- } else {
211- read_len - ( * read_pos + k)
212- } ;
213- groups
214- . entry ( ( tid, rp. is_fw ) )
215- . or_default ( )
216- . push ( SeedWithContig {
217- mem : Mem :: new ( read_start, rp. pos as i32 , k) ,
218- u_lo : base_pos,
219- u_hi : base_pos + contig_len,
220- } ) ;
198+ /// Extending each raw k-mer hit independently yields several heavily-overlapping
199+ /// uni-MEMs on one diagonal (one per surviving seed). The minimap2-style chaining
200+ /// gain counts a successor's new bases by its *start* offset, which undercounts a
201+ /// longer anchor's end extension — so the chainer scores the overlapping group
202+ /// below the single longest anchor and splits it, deflating chain coverage. We
203+ /// instead merge them here (as pufferfish's one-uni-MEM-per-diagonal seeding
204+ /// does), so the chain sees a single maximal anchor. Anchors on the same diagonal
205+ /// separated by a real gap (a SNP run) are left distinct for the chainer; only
206+ /// overlapping/touching runs are merged. `ext` is consumed (sorted) and the
207+ /// result written to `out`.
208+ fn merge_same_diagonal ( ext : & mut [ Mem ] , out : & mut Vec < Mem > ) {
209+ out. clear ( ) ;
210+ if ext. is_empty ( ) {
211+ return ;
212+ }
213+ // diagonal = ref_start - read_start; collinear (gap-free) anchors share it.
214+ ext. sort_unstable_by_key ( |m| ( m. ref_start - m. read_start , m. read_start ) ) ;
215+ let mut cur = ext[ 0 ] ;
216+ for & m in & ext[ 1 ..] {
217+ let same_diag = ( m. ref_start - m. read_start ) == ( cur. ref_start - cur. read_start ) ;
218+ if same_diag && m. read_start <= cur. read_end ( ) {
219+ let new_end = cur. read_end ( ) . max ( m. read_end ( ) ) ;
220+ cur = Mem :: new ( cur. read_start , cur. ref_start , new_end - cur. read_start ) ;
221+ } else {
222+ out. push ( cur) ;
223+ cur = m;
221224 }
222225 }
223- groups
226+ out . push ( cur ) ;
224227}
225228
226229/// Project then extend each seed into a **true uni-MEM** (extension clamped to
@@ -238,30 +241,66 @@ pub fn candidates_from_raw_hits_true_unimems<R: RefProvider>(
238241 cfg : & MemCollectorConfig ,
239242) -> Vec < MappingCandidate > {
240243 let read_len = read. len ( ) as i32 ;
241- let groups = project_raw_hits_with_contig ( raw_hits, encoding, read_len, k, cfg. max_hit_occ ) ;
242244 let rc = revcomp ( read) ;
243245 let mut chain_cfg = cfg. chain . clone ( ) ;
244246 chain_cfg. seed_len = k;
245247
246- let mut candidates = Vec :: new ( ) ;
247- for ( ( tid, is_fw) , seeds) in groups {
248- let ref_seq = refs. ref_seq ( tid) ;
249- let query: & [ u8 ] = if is_fw { read } else { & rc } ;
250-
251- let mut seen = HashSet :: new ( ) ;
252- let mut unimems = Vec :: with_capacity ( seeds. len ( ) ) ;
253- for s in seeds {
254- let e = extend_mem_within ( query, ref_seq, s. mem , s. u_lo , s. u_hi ) ;
255- if seen. insert ( ( e. read_start , e. ref_start , e. len ) ) {
256- unimems. push ( e) ;
248+ UNIMEM_SCRATCH . with ( |cell| {
249+ let ( flat, ext, merged) = & mut * cell. borrow_mut ( ) ;
250+ // Flatten projected seeds (carrying their unitig span) into a single
251+ // buffer, grouped by (tid, is_fw) via a sort — no per-read hash map.
252+ flat. clear ( ) ;
253+ for ( read_pos, phit) in raw_hits {
254+ if phit. num_hits ( ) > cfg. max_hit_occ {
255+ continue ;
256+ }
257+ let contig_len = phit. contig_len ( ) as i32 ;
258+ for entry in phit. ref_range ( ) . iter ( ) {
259+ let tid = encoding. transcript_id ( entry) ;
260+ let base_pos = encoding. pos ( entry) as i32 ;
261+ let rp = phit. decode_hit ( entry, encoding) ;
262+ let read_start = if rp. is_fw {
263+ * read_pos
264+ } else {
265+ read_len - ( * read_pos + k)
266+ } ;
267+ flat. push ( (
268+ tid,
269+ rp. is_fw ,
270+ SeedWithContig {
271+ mem : Mem :: new ( read_start, rp. pos as i32 , k) ,
272+ u_lo : base_pos,
273+ u_hi : base_pos + contig_len,
274+ } ,
275+ ) ) ;
257276 }
258277 }
259-
260- for chain in chain_mems ( & unimems, is_fw, & chain_cfg) {
261- candidates. push ( MappingCandidate { tid, is_fw, chain } ) ;
278+ flat. sort_unstable_by_key ( |& ( tid, fw, _) | ( tid, fw) ) ;
279+
280+ let mut candidates = Vec :: new ( ) ;
281+ let mut i = 0 ;
282+ while i < flat. len ( ) {
283+ let ( tid, is_fw, _) = flat[ i] ;
284+ let ref_seq = refs. ref_seq ( tid) ;
285+ let query: & [ u8 ] = if is_fw { read } else { & rc } ;
286+ // Extend each seed within its unitig, then merge same-diagonal
287+ // overlapping uni-MEMs into maximal anchors (which also dedups exact
288+ // duplicates, replacing the per-group HashSet).
289+ ext. clear ( ) ;
290+ let mut j = i;
291+ while j < flat. len ( ) && flat[ j] . 0 == tid && flat[ j] . 1 == is_fw {
292+ let s = & flat[ j] . 2 ;
293+ ext. push ( extend_mem_within ( query, ref_seq, s. mem , s. u_lo , s. u_hi ) ) ;
294+ j += 1 ;
295+ }
296+ merge_same_diagonal ( ext, merged) ;
297+ for chain in chain_mems ( merged, is_fw, & chain_cfg) {
298+ candidates. push ( MappingCandidate { tid, is_fw, chain } ) ;
299+ }
300+ i = j;
262301 }
263- }
264- candidates
302+ candidates
303+ } )
265304}
266305
267306/// Collect true uni-MEM mapping candidates (unitig-constrained extension) for
@@ -295,6 +334,42 @@ mod tests {
295334 use piscem_rs:: index:: contig_table:: { ContigTable , ContigTableBuilder } ;
296335 use piscem_rs:: mapping:: projected_hits:: ProjectedHits ;
297336
337+ #[ test]
338+ fn merge_collapses_same_diagonal_overlaps ( ) {
339+ // The read-100000/ENST00000518938.1 geometry: three overlapping uni-MEMs
340+ // on diagonal 76 (one per surviving seed). They must collapse to a single
341+ // maximal anchor read[0,52) so the chain coverage matches the equivalent
342+ // fixed-k-mer chain (52), instead of the chainer splitting them.
343+ let mut ext = vec ! [
344+ Mem :: new( 0 , 76 , 34 ) ,
345+ Mem :: new( 4 , 80 , 34 ) ,
346+ Mem :: new( 8 , 84 , 44 ) ,
347+ ] ;
348+ let mut out = Vec :: new ( ) ;
349+ merge_same_diagonal ( & mut ext, & mut out) ;
350+ assert_eq ! ( out. len( ) , 1 , "same-diagonal overlaps must merge into one" ) ;
351+ assert_eq ! (
352+ ( out[ 0 ] . read_start, out[ 0 ] . ref_start, out[ 0 ] . len) ,
353+ ( 0 , 76 , 52 )
354+ ) ;
355+ }
356+
357+ #[ test]
358+ fn merge_keeps_distinct_diagonals_and_real_gaps ( ) {
359+ // Same diagonal but a real read gap (a SNP run) stays distinct; a
360+ // different diagonal stays distinct — only overlapping/touching runs merge.
361+ let mut ext = vec ! [
362+ Mem :: new( 0 , 100 , 20 ) , // diag 100, read[0,20)
363+ Mem :: new( 10 , 110 , 20 ) , // diag 100, overlaps prev -> merges to read[0,30)
364+ Mem :: new( 40 , 140 , 10 ) , // diag 100, gap after read 30 -> distinct
365+ Mem :: new( 50 , 200 , 20 ) , // diag 150 -> distinct
366+ ] ;
367+ let mut out = Vec :: new ( ) ;
368+ merge_same_diagonal ( & mut ext, & mut out) ;
369+ assert_eq ! ( out. len( ) , 3 ) ;
370+ assert_eq ! ( ( out[ 0 ] . read_start, out[ 0 ] . len) , ( 0 , 30 ) ) ;
371+ }
372+
298373 /// Minimal in-memory reference store for the extension tests.
299374 struct TestRefs {
300375 seqs : Vec < Vec < u8 > > ,
0 commit comments