@@ -68,7 +68,7 @@ pub struct PreparedJunction {
6868 pub start_pos : u64 ,
6969 /// Shift-adjusted 0-based genome position of the last intron base.
7070 pub end_pos : u64 ,
71- /// STAR motif code (`MOTIF_*` ).
71+ /// STAR motif code (0 = non-canonical, 1-6 = canonical variants ).
7272 pub motif : u8 ,
7373 /// Repeat length to the left of the (pre-shift) donor.
7474 pub shift_left : u8 ,
@@ -78,6 +78,40 @@ pub struct PreparedJunction {
7878 pub strand : u8 ,
7979}
8080
81+ impl PreparedJunction {
82+ /// Value STAR writes into `mapGen.sjdbStart` after its post-dedup
83+ /// sort: ORIGINAL pre-shift start for canonical motifs, shifted
84+ /// start for non-canonical (matches `sjdbPrepare.cpp:127,174`).
85+ pub fn stored_start ( & self ) -> u64 {
86+ if self . motif == 0 {
87+ self . start_pos
88+ } else {
89+ self . start_pos + self . shift_left as u64
90+ }
91+ }
92+
93+ /// Companion to `stored_start` — the `mapGen.sjdbEnd` value.
94+ pub fn stored_end ( & self ) -> u64 {
95+ if self . motif == 0 {
96+ self . end_pos
97+ } else {
98+ self . end_pos + self . shift_left as u64
99+ }
100+ }
101+
102+ /// Original (pre-shift) 0-based position of the first intron base.
103+ /// Used for Gsj flanking-sequence extraction (STAR uses this regardless
104+ /// of motif).
105+ pub fn original_start ( & self ) -> u64 {
106+ self . start_pos + self . shift_left as u64
107+ }
108+
109+ /// Companion to `original_start`.
110+ pub fn original_end ( & self ) -> u64 {
111+ self . end_pos + self . shift_left as u64
112+ }
113+ }
114+
81115/// Convert a splice-junction database entry to a fully-prepared entry
82116/// carrying motif, shifts, strand, and shift-adjusted coordinates.
83117///
@@ -118,6 +152,91 @@ pub fn prepare_junction(
118152 }
119153}
120154
155+ /// Sort a prepared junction list into STAR's post-dedup order and apply
156+ /// the cross-strand deduplication that STAR does after its second sort
157+ /// (`sjdbPrepare.cpp:141-192`).
158+ ///
159+ /// STAR's first-pass (intra-strand) dedup collapses duplicate sjdb
160+ /// entries from the same source at the same `(start, end, strand)`.
161+ /// ruSTAR's `SpliceJunctionDb` already deduplicates on that key at the
162+ /// HashMap level, so those first-pass branches never trigger here; the
163+ /// second-pass cross-strand collision dedup does.
164+ ///
165+ /// Dedup rules when two surviving junctions share `(stored_start,
166+ /// stored_end)` but have different strand assignments:
167+ ///
168+ /// - Undefined strand vs defined strand → keep the defined-strand one.
169+ /// - Both non-canonical → collapse to a single entry with strand = 0
170+ /// (undefined).
171+ /// - One canonical + one not → keep the canonical one.
172+ /// - Both canonical but on correct vs wrong strand relative to motif —
173+ /// keep the one whose strand matches `2 - motif % 2`.
174+ pub fn sort_and_dedup ( mut junctions : Vec < PreparedJunction > ) -> Vec < PreparedJunction > {
175+ junctions. sort_by ( |a, b| {
176+ a. stored_start ( )
177+ . cmp ( & b. stored_start ( ) )
178+ . then_with ( || a. stored_end ( ) . cmp ( & b. stored_end ( ) ) )
179+ } ) ;
180+
181+ let mut out: Vec < PreparedJunction > = Vec :: with_capacity ( junctions. len ( ) ) ;
182+ for j in junctions {
183+ match out. last ( ) {
184+ Some ( last)
185+ if last. stored_start ( ) == j. stored_start ( )
186+ && last. stored_end ( ) == j. stored_end ( ) =>
187+ {
188+ if let Some ( winner) = merge_cross_strand ( last, & j) {
189+ * out. last_mut ( ) . unwrap ( ) = winner;
190+ }
191+ // else: keep `last` unchanged.
192+ }
193+ _ => out. push ( j) ,
194+ }
195+ }
196+ out
197+ }
198+
199+ /// Decide what `(stored_start, stored_end)` duplicate to keep.
200+ /// Returns `Some(new)` to replace the stored entry, `None` to keep it.
201+ /// For the "both non-canonical on opposite strands" case we keep the
202+ /// existing entry but force its strand to 0 in-place (handled by the
203+ /// caller via a special-case — represented here as returning a cloned
204+ /// `old` with strand=0).
205+ fn merge_cross_strand (
206+ old : & PreparedJunction ,
207+ new : & PreparedJunction ,
208+ ) -> Option < PreparedJunction > {
209+ // Strand 0 = undefined.
210+ if old. strand > 0 && new. strand == 0 {
211+ return None ; // keep old
212+ }
213+ if old. strand == 0 && new. strand > 0 {
214+ return Some ( new. clone ( ) ) ; // replace
215+ }
216+ // Both non-canonical → collapse to undefined strand on the old one.
217+ if old. motif == 0 && new. motif == 0 {
218+ let mut merged = old. clone ( ) ;
219+ merged. strand = 0 ;
220+ return Some ( merged) ;
221+ }
222+ // One canonical, one not: prefer canonical.
223+ if old. motif > 0 && new. motif == 0 {
224+ return None ;
225+ }
226+ if old. motif == 0 && new. motif > 0 {
227+ return Some ( new. clone ( ) ) ;
228+ }
229+ // Both canonical with defined strands. Keep the one on the correct
230+ // strand for its motif (2 - motif % 2). If the old one is on the
231+ // correct strand, skip the new one; otherwise replace.
232+ let old_expected = 2 - ( old. motif % 2 ) ;
233+ if old. strand == old_expected {
234+ None
235+ } else {
236+ Some ( new. clone ( ) )
237+ }
238+ }
239+
121240#[ cfg( test) ]
122241mod tests {
123242 use super :: * ;
@@ -277,4 +396,95 @@ mod tests {
277396 assert_eq ! ( pj. start_pos, 101 ) ;
278397 assert_eq ! ( pj. end_pos, 198 ) ;
279398 }
399+
400+ fn pj (
401+ chr_idx : usize ,
402+ start : u64 ,
403+ end : u64 ,
404+ motif : u8 ,
405+ shift_left : u8 ,
406+ strand : u8 ,
407+ ) -> PreparedJunction {
408+ PreparedJunction {
409+ chr_idx,
410+ start_pos : start,
411+ end_pos : end,
412+ motif,
413+ shift_left,
414+ shift_right : 0 ,
415+ strand,
416+ }
417+ }
418+
419+ #[ test]
420+ fn stored_and_original_coords_differ_for_canonical_only ( ) {
421+ let canon = pj ( 0 , 100 , 200 , 1 , 3 , 1 ) ;
422+ // Canonical stores ORIGINAL (shifted + shift_left).
423+ assert_eq ! ( canon. stored_start( ) , 103 ) ;
424+ assert_eq ! ( canon. stored_end( ) , 203 ) ;
425+ assert_eq ! ( canon. original_start( ) , 103 ) ;
426+ assert_eq ! ( canon. original_end( ) , 203 ) ;
427+
428+ let noncanon = pj ( 0 , 100 , 200 , 0 , 3 , 0 ) ;
429+ // Non-canonical stores SHIFTED.
430+ assert_eq ! ( noncanon. stored_start( ) , 100 ) ;
431+ assert_eq ! ( noncanon. stored_end( ) , 200 ) ;
432+ // original_* still recovers pre-shift coords.
433+ assert_eq ! ( noncanon. original_start( ) , 103 ) ;
434+ assert_eq ! ( noncanon. original_end( ) , 203 ) ;
435+ }
436+
437+ #[ test]
438+ fn sort_orders_by_stored_coords ( ) {
439+ let a = pj ( 0 , 100 , 200 , 1 , 0 , 1 ) ; // stored 100..200
440+ let b = pj ( 0 , 50 , 150 , 1 , 0 , 1 ) ; // stored 50..150
441+ let c = pj ( 0 , 80 , 180 , 1 , 0 , 1 ) ; // stored 80..180
442+ let out = sort_and_dedup ( vec ! [ a. clone( ) , b. clone( ) , c. clone( ) ] ) ;
443+ assert_eq ! ( out, vec![ b, c, a] ) ;
444+ }
445+
446+ #[ test]
447+ fn dedup_prefers_defined_strand_over_undefined ( ) {
448+ let defined = pj ( 0 , 100 , 200 , 1 , 0 , 1 ) ;
449+ let undefined = pj ( 0 , 100 , 200 , 0 , 0 , 0 ) ;
450+ let out = sort_and_dedup ( vec ! [ defined. clone( ) , undefined] ) ;
451+ assert_eq ! ( out. len( ) , 1 ) ;
452+ assert_eq ! ( out[ 0 ] , defined) ;
453+ }
454+
455+ #[ test]
456+ fn dedup_collapses_two_non_canonical_to_undefined_strand ( ) {
457+ // Two non-canonical at same stored coords, opposite strands → one
458+ // entry with strand = 0.
459+ let a = pj ( 0 , 100 , 200 , 0 , 0 , 1 ) ;
460+ let b = pj ( 0 , 100 , 200 , 0 , 0 , 2 ) ;
461+ let out = sort_and_dedup ( vec ! [ a, b] ) ;
462+ assert_eq ! ( out. len( ) , 1 ) ;
463+ assert_eq ! ( out[ 0 ] . strand, 0 ) ;
464+ }
465+
466+ #[ test]
467+ fn dedup_prefers_canonical_over_non_canonical ( ) {
468+ // Motif=1 (canonical) beats motif=0 (non) at same stored coords.
469+ let canon = pj ( 0 , 100 , 200 , 1 , 0 , 1 ) ;
470+ let noncan = pj ( 0 , 100 , 200 , 0 , 0 , 2 ) ;
471+ let out = sort_and_dedup ( vec ! [ noncan, canon. clone( ) ] ) ;
472+ assert_eq ! ( out. len( ) , 1 ) ;
473+ assert_eq ! ( out[ 0 ] , canon) ;
474+ }
475+
476+ #[ test]
477+ fn dedup_prefers_strand_matching_motif ( ) {
478+ // motif=1 (GT/AG +) stored on wrong strand (2) vs a competing
479+ // motif=2 (CT/AC -) on its correct strand. Same stored coords.
480+ // STAR keeps the one whose strand matches `2 - motif%2`.
481+ // For motif=1: expected strand = 2 - 1%2 = 1.
482+ // For motif=2: expected strand = 2 - 2%2 = 2.
483+ let old_wrong = pj ( 0 , 100 , 200 , 1 , 0 , 2 ) ; // motif 1 wants strand 1, has 2
484+ let new_right = pj ( 0 , 100 , 200 , 2 , 0 , 2 ) ; // motif 2 wants strand 2, has 2
485+ let out = sort_and_dedup ( vec ! [ old_wrong, new_right. clone( ) ] ) ;
486+ assert_eq ! ( out. len( ) , 1 ) ;
487+ // `old_wrong` is on wrong strand for its motif — STAR replaces.
488+ assert_eq ! ( out[ 0 ] , new_right) ;
489+ }
280490}
0 commit comments