@@ -107,6 +107,84 @@ pub fn compute_shifts(genome: &[u8], s: u64, e: u64, n_genome_real: u64) -> (u8,
107107 ( jj_l, jj_r)
108108}
109109
110+ /// A single junction with all metadata STAR needs to write the sjdb
111+ /// files and extend the genome. Positions are 0-based absolute genome
112+ /// coordinates; `start_pos` / `end_pos` are the FIRST and LAST bases of
113+ /// the intron (inclusive), already shifted left by `shift_left` so they
114+ /// sit at the canonical (left-most) representation of the motif.
115+ #[ derive( Debug , Clone , PartialEq , Eq ) ]
116+ pub struct PreparedJunction {
117+ /// Chromosome index the junction belongs to.
118+ pub chr_idx : usize ,
119+ /// Shift-adjusted 0-based genome position of the first intron base.
120+ pub start_pos : u64 ,
121+ /// Shift-adjusted 0-based genome position of the last intron base.
122+ pub end_pos : u64 ,
123+ /// STAR motif code (`MOTIF_*`).
124+ pub motif : u8 ,
125+ /// Repeat length to the left of the (pre-shift) donor.
126+ pub shift_left : u8 ,
127+ /// Repeat length to the right of the (pre-shift) acceptor.
128+ pub shift_right : u8 ,
129+ /// STAR strand code (0 = unknown/dot, 1 = +, 2 = -).
130+ pub strand : u8 ,
131+ }
132+
133+ /// Convert a splice-junction database entry (as stored in
134+ /// `SpliceJunctionDb`) to a fully-prepared entry carrying motif, shifts,
135+ /// strand, and shift-adjusted coordinates.
136+ ///
137+ /// - `chr_idx`, `intron_start`, `intron_end` are ruSTAR's 0-based absolute
138+ /// genome coordinates for the first/last bases of the intron.
139+ /// - `db_strand` is the strand as provided by the GTF / db
140+ /// (0 = unknown, 1 = +, 2 = -). If `0` and the motif encodes a strand,
141+ /// STAR picks the strand from the motif
142+ /// (`sjdbPrepare.cpp:184-188`: `2 - motif % 2`).
143+ /// - `genome` is the forward-strand genome byte slice
144+ /// (`Genome::sequence[..n_genome]`).
145+ /// - `n_genome_real` is the padded-but-without-sjdb genome length,
146+ /// matching STAR's `mapGen.nGenomeReal`.
147+ pub fn prepare_junction (
148+ chr_idx : usize ,
149+ intron_start : u64 ,
150+ intron_end : u64 ,
151+ db_strand : u8 ,
152+ genome : & [ u8 ] ,
153+ n_genome_real : u64 ,
154+ ) -> PreparedJunction {
155+ let motif = classify_motif ( genome, intron_start, intron_end) ;
156+ let ( shift_left, shift_right) = compute_shifts ( genome, intron_start, intron_end, n_genome_real) ;
157+
158+ // STAR's sjdbPrepare.cpp:71-72: after computing shifts, subtract
159+ // shift_left from both start and end to put the junction at its
160+ // canonical (left-most) representation.
161+ let shifted_start = intron_start - shift_left as u64 ;
162+ let shifted_end = intron_end - shift_left as u64 ;
163+
164+ let strand = match db_strand {
165+ 1 | 2 => db_strand,
166+ _ => {
167+ // Unknown db strand: derive from motif (STAR sjdbPrepare.cpp:184-188).
168+ // `2 - motif % 2` maps 1/3/5 → 1 (+), 2/4/6 → 2 (-).
169+ if motif == MOTIF_NON_CANONICAL {
170+ 0
171+ } else {
172+ 2 - ( motif % 2 )
173+ }
174+ }
175+ } ;
176+
177+ PreparedJunction {
178+ chr_idx,
179+ start_pos : shifted_start,
180+ end_pos : shifted_end,
181+ motif,
182+ shift_left,
183+ shift_right,
184+ strand,
185+ }
186+ }
187+
110188#[ cfg( test) ]
111189mod tests {
112190 use super :: * ;
@@ -261,6 +339,74 @@ mod tests {
261339 assert_eq ! ( r, 255 ) ;
262340 }
263341
342+ #[ test]
343+ fn prepare_gt_ag_forward_no_repeat ( ) {
344+ let mut g = vec ! [ 4u8 ; 400 ] ; // start with N
345+ g[ 100 ] = 0 ;
346+ g[ 101 ] = 0 ; // exon1 tail
347+ g[ 102 ] = 2 ; // G: intron start
348+ g[ 103 ] = 3 ; // T
349+ g[ 198 ] = 0 ; // A
350+ g[ 199 ] = 2 ; // G: intron end
351+ g[ 200 ] = 1 ;
352+ g[ 201 ] = 1 ;
353+ let pj = prepare_junction ( 0 , 102 , 199 , 1 , & g, 400 ) ;
354+ assert_eq ! ( pj. motif, MOTIF_GT_AG ) ;
355+ assert_eq ! ( pj. shift_left, 0 ) ;
356+ assert_eq ! ( pj. shift_right, 0 ) ;
357+ assert_eq ! ( pj. start_pos, 102 ) ;
358+ assert_eq ! ( pj. end_pos, 199 ) ;
359+ assert_eq ! ( pj. strand, 1 ) ;
360+ }
361+
362+ #[ test]
363+ fn prepare_dot_strand_derived_from_motif ( ) {
364+ // Non-canonical motif with db_strand=0 → strand 0.
365+ let g = vec ! [ 0u8 ; 300 ] ;
366+ let pj = prepare_junction ( 0 , 100 , 200 , 0 , & g, 300 ) ;
367+ assert_eq ! ( pj. motif, MOTIF_NON_CANONICAL ) ;
368+ assert_eq ! ( pj. strand, 0 ) ;
369+
370+ // GT/AG motif with db_strand=0 → strand derived = 1 (+).
371+ let mut g = vec ! [ 4u8 ; 300 ] ;
372+ g[ 100 ] = 2 ;
373+ g[ 101 ] = 3 ;
374+ g[ 199 ] = 0 ;
375+ g[ 200 ] = 2 ;
376+ let pj = prepare_junction ( 0 , 100 , 200 , 0 , & g, 300 ) ;
377+ assert_eq ! ( pj. motif, MOTIF_GT_AG ) ;
378+ assert_eq ! ( pj. strand, 1 ) ;
379+
380+ // CT/AC motif with db_strand=0 → strand = 2 (-).
381+ let mut g = vec ! [ 4u8 ; 300 ] ;
382+ g[ 100 ] = 1 ;
383+ g[ 101 ] = 3 ;
384+ g[ 199 ] = 0 ;
385+ g[ 200 ] = 1 ;
386+ let pj = prepare_junction ( 0 , 100 , 200 , 0 , & g, 300 ) ;
387+ assert_eq ! ( pj. motif, MOTIF_CT_AC ) ;
388+ assert_eq ! ( pj. strand, 2 ) ;
389+ }
390+
391+ #[ test]
392+ fn prepare_shift_left_applied_to_coords ( ) {
393+ // Construct: one base of repeat to the left of the junction so
394+ // shift_left = 1. Confirm start/end decrement by 1.
395+ let mut g = vec ! [ 4u8 ; 400 ] ;
396+ g[ 102 ] = 2 ; // G
397+ g[ 103 ] = 3 ; // T
398+ g[ 198 ] = 0 ; // A
399+ g[ 199 ] = 2 ; // G
400+ // Put identical 'G' at 101 and 199 so shift_left = 1.
401+ g[ 101 ] = 2 ;
402+ // Put different base at 100 to stop the scan.
403+ g[ 100 ] = 3 ;
404+ let pj = prepare_junction ( 0 , 102 , 199 , 1 , & g, 400 ) ;
405+ assert_eq ! ( pj. shift_left, 1 ) ;
406+ assert_eq ! ( pj. start_pos, 101 ) ;
407+ assert_eq ! ( pj. end_pos, 198 ) ;
408+ }
409+
264410 #[ test]
265411 fn shifts_stop_at_n_base ( ) {
266412 let mut g = vec ! [ 0u8 ; 300 ] ;
0 commit comments