Skip to content

Commit 166cf5f

Browse files
ewelsclaude
andcommitted
feat(junction): sort + cross-strand dedup for prepared junctions
Adds PreparedJunction::stored_start/end/original_start/original_end so callers can request either STAR's on-disk coordinate form (canonical → original; non-canonical → shifted) or the canonical-for-extraction form used when reading Gsj flanking bytes. sort_and_dedup() ports STAR's second-pass sort + cross-strand dedup from sjdbPrepare.cpp:141-192. ruSTAR's SpliceJunctionDb already deduplicates on (chr, start, end, strand), so the first-pass intra-strand dedup branches never fire for our inputs — kept second-pass cross-strand logic: - Undefined vs defined strand → keep defined - Both non-canonical → collapse to strand 0 (undefined) - One canonical → keep canonical - Both canonical → keep the one whose strand matches 2 - motif%2 6 new tests exercise each branch of merge_cross_strand plus the sort ordering; all pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 81ece99 commit 166cf5f

1 file changed

Lines changed: 211 additions & 1 deletion

File tree

src/junction/sjdb_insert.rs

Lines changed: 211 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ pub struct PreparedJunction {
6868
pub start_pos: u64,
6969
/// Shift-adjusted 0-based genome position of the last intron base.
7070
pub end_pos: u64,
71-
/// STAR motif code (`MOTIF_*`).
71+
/// STAR motif code (0 = non-canonical, 1-6 = canonical variants).
7272
pub motif: u8,
7373
/// Repeat length to the left of the (pre-shift) donor.
7474
pub shift_left: u8,
@@ -78,6 +78,40 @@ pub struct PreparedJunction {
7878
pub strand: u8,
7979
}
8080

81+
impl PreparedJunction {
82+
/// Value STAR writes into `mapGen.sjdbStart` after its post-dedup
83+
/// sort: ORIGINAL pre-shift start for canonical motifs, shifted
84+
/// start for non-canonical (matches `sjdbPrepare.cpp:127,174`).
85+
pub fn stored_start(&self) -> u64 {
86+
if self.motif == 0 {
87+
self.start_pos
88+
} else {
89+
self.start_pos + self.shift_left as u64
90+
}
91+
}
92+
93+
/// Companion to `stored_start` — the `mapGen.sjdbEnd` value.
94+
pub fn stored_end(&self) -> u64 {
95+
if self.motif == 0 {
96+
self.end_pos
97+
} else {
98+
self.end_pos + self.shift_left as u64
99+
}
100+
}
101+
102+
/// Original (pre-shift) 0-based position of the first intron base.
103+
/// Used for Gsj flanking-sequence extraction (STAR uses this regardless
104+
/// of motif).
105+
pub fn original_start(&self) -> u64 {
106+
self.start_pos + self.shift_left as u64
107+
}
108+
109+
/// Companion to `original_start`.
110+
pub fn original_end(&self) -> u64 {
111+
self.end_pos + self.shift_left as u64
112+
}
113+
}
114+
81115
/// Convert a splice-junction database entry to a fully-prepared entry
82116
/// carrying motif, shifts, strand, and shift-adjusted coordinates.
83117
///
@@ -118,6 +152,91 @@ pub fn prepare_junction(
118152
}
119153
}
120154

155+
/// Sort a prepared junction list into STAR's post-dedup order and apply
156+
/// the cross-strand deduplication that STAR does after its second sort
157+
/// (`sjdbPrepare.cpp:141-192`).
158+
///
159+
/// STAR's first-pass (intra-strand) dedup collapses duplicate sjdb
160+
/// entries from the same source at the same `(start, end, strand)`.
161+
/// ruSTAR's `SpliceJunctionDb` already deduplicates on that key at the
162+
/// HashMap level, so those first-pass branches never trigger here; the
163+
/// second-pass cross-strand collision dedup does.
164+
///
165+
/// Dedup rules when two surviving junctions share `(stored_start,
166+
/// stored_end)` but have different strand assignments:
167+
///
168+
/// - Undefined strand vs defined strand → keep the defined-strand one.
169+
/// - Both non-canonical → collapse to a single entry with strand = 0
170+
/// (undefined).
171+
/// - One canonical + one not → keep the canonical one.
172+
/// - Both canonical but on correct vs wrong strand relative to motif —
173+
/// keep the one whose strand matches `2 - motif % 2`.
174+
pub fn sort_and_dedup(mut junctions: Vec<PreparedJunction>) -> Vec<PreparedJunction> {
175+
junctions.sort_by(|a, b| {
176+
a.stored_start()
177+
.cmp(&b.stored_start())
178+
.then_with(|| a.stored_end().cmp(&b.stored_end()))
179+
});
180+
181+
let mut out: Vec<PreparedJunction> = Vec::with_capacity(junctions.len());
182+
for j in junctions {
183+
match out.last() {
184+
Some(last)
185+
if last.stored_start() == j.stored_start()
186+
&& last.stored_end() == j.stored_end() =>
187+
{
188+
if let Some(winner) = merge_cross_strand(last, &j) {
189+
*out.last_mut().unwrap() = winner;
190+
}
191+
// else: keep `last` unchanged.
192+
}
193+
_ => out.push(j),
194+
}
195+
}
196+
out
197+
}
198+
199+
/// Decide what `(stored_start, stored_end)` duplicate to keep.
200+
/// Returns `Some(new)` to replace the stored entry, `None` to keep it.
201+
/// For the "both non-canonical on opposite strands" case we keep the
202+
/// existing entry but force its strand to 0 in-place (handled by the
203+
/// caller via a special-case — represented here as returning a cloned
204+
/// `old` with strand=0).
205+
fn merge_cross_strand(
206+
old: &PreparedJunction,
207+
new: &PreparedJunction,
208+
) -> Option<PreparedJunction> {
209+
// Strand 0 = undefined.
210+
if old.strand > 0 && new.strand == 0 {
211+
return None; // keep old
212+
}
213+
if old.strand == 0 && new.strand > 0 {
214+
return Some(new.clone()); // replace
215+
}
216+
// Both non-canonical → collapse to undefined strand on the old one.
217+
if old.motif == 0 && new.motif == 0 {
218+
let mut merged = old.clone();
219+
merged.strand = 0;
220+
return Some(merged);
221+
}
222+
// One canonical, one not: prefer canonical.
223+
if old.motif > 0 && new.motif == 0 {
224+
return None;
225+
}
226+
if old.motif == 0 && new.motif > 0 {
227+
return Some(new.clone());
228+
}
229+
// Both canonical with defined strands. Keep the one on the correct
230+
// strand for its motif (2 - motif % 2). If the old one is on the
231+
// correct strand, skip the new one; otherwise replace.
232+
let old_expected = 2 - (old.motif % 2);
233+
if old.strand == old_expected {
234+
None
235+
} else {
236+
Some(new.clone())
237+
}
238+
}
239+
121240
#[cfg(test)]
122241
mod tests {
123242
use super::*;
@@ -277,4 +396,95 @@ mod tests {
277396
assert_eq!(pj.start_pos, 101);
278397
assert_eq!(pj.end_pos, 198);
279398
}
399+
400+
fn pj(
401+
chr_idx: usize,
402+
start: u64,
403+
end: u64,
404+
motif: u8,
405+
shift_left: u8,
406+
strand: u8,
407+
) -> PreparedJunction {
408+
PreparedJunction {
409+
chr_idx,
410+
start_pos: start,
411+
end_pos: end,
412+
motif,
413+
shift_left,
414+
shift_right: 0,
415+
strand,
416+
}
417+
}
418+
419+
#[test]
420+
fn stored_and_original_coords_differ_for_canonical_only() {
421+
let canon = pj(0, 100, 200, 1, 3, 1);
422+
// Canonical stores ORIGINAL (shifted + shift_left).
423+
assert_eq!(canon.stored_start(), 103);
424+
assert_eq!(canon.stored_end(), 203);
425+
assert_eq!(canon.original_start(), 103);
426+
assert_eq!(canon.original_end(), 203);
427+
428+
let noncanon = pj(0, 100, 200, 0, 3, 0);
429+
// Non-canonical stores SHIFTED.
430+
assert_eq!(noncanon.stored_start(), 100);
431+
assert_eq!(noncanon.stored_end(), 200);
432+
// original_* still recovers pre-shift coords.
433+
assert_eq!(noncanon.original_start(), 103);
434+
assert_eq!(noncanon.original_end(), 203);
435+
}
436+
437+
#[test]
438+
fn sort_orders_by_stored_coords() {
439+
let a = pj(0, 100, 200, 1, 0, 1); // stored 100..200
440+
let b = pj(0, 50, 150, 1, 0, 1); // stored 50..150
441+
let c = pj(0, 80, 180, 1, 0, 1); // stored 80..180
442+
let out = sort_and_dedup(vec![a.clone(), b.clone(), c.clone()]);
443+
assert_eq!(out, vec![b, c, a]);
444+
}
445+
446+
#[test]
447+
fn dedup_prefers_defined_strand_over_undefined() {
448+
let defined = pj(0, 100, 200, 1, 0, 1);
449+
let undefined = pj(0, 100, 200, 0, 0, 0);
450+
let out = sort_and_dedup(vec![defined.clone(), undefined]);
451+
assert_eq!(out.len(), 1);
452+
assert_eq!(out[0], defined);
453+
}
454+
455+
#[test]
456+
fn dedup_collapses_two_non_canonical_to_undefined_strand() {
457+
// Two non-canonical at same stored coords, opposite strands → one
458+
// entry with strand = 0.
459+
let a = pj(0, 100, 200, 0, 0, 1);
460+
let b = pj(0, 100, 200, 0, 0, 2);
461+
let out = sort_and_dedup(vec![a, b]);
462+
assert_eq!(out.len(), 1);
463+
assert_eq!(out[0].strand, 0);
464+
}
465+
466+
#[test]
467+
fn dedup_prefers_canonical_over_non_canonical() {
468+
// Motif=1 (canonical) beats motif=0 (non) at same stored coords.
469+
let canon = pj(0, 100, 200, 1, 0, 1);
470+
let noncan = pj(0, 100, 200, 0, 0, 2);
471+
let out = sort_and_dedup(vec![noncan, canon.clone()]);
472+
assert_eq!(out.len(), 1);
473+
assert_eq!(out[0], canon);
474+
}
475+
476+
#[test]
477+
fn dedup_prefers_strand_matching_motif() {
478+
// motif=1 (GT/AG +) stored on wrong strand (2) vs a competing
479+
// motif=2 (CT/AC -) on its correct strand. Same stored coords.
480+
// STAR keeps the one whose strand matches `2 - motif%2`.
481+
// For motif=1: expected strand = 2 - 1%2 = 1.
482+
// For motif=2: expected strand = 2 - 2%2 = 2.
483+
let old_wrong = pj(0, 100, 200, 1, 0, 2); // motif 1 wants strand 1, has 2
484+
let new_right = pj(0, 100, 200, 2, 0, 2); // motif 2 wants strand 2, has 2
485+
let out = sort_and_dedup(vec![old_wrong, new_right.clone()]);
486+
assert_eq!(out.len(), 1);
487+
// `old_wrong` is on wrong strand for its motif — STAR replaces.
488+
assert_eq!(out[0], new_right);
489+
}
280490
}

0 commit comments

Comments
 (0)