Skip to content

Commit 644f6a9

Browse files
ewelsclaude
andcommitted
feat(junction): PreparedJunction + prepare_junction pipeline
New `PreparedJunction` struct carrying STAR's per-junction fields (chr_idx, shift-adjusted start/end, motif, shift_left, shift_right, strand) and a `prepare_junction` free function that pulls them together from raw db coordinates. Follows STAR's sjdbPrepare.cpp semantics: - Compute motif via classify_motif - Compute shifts via compute_shifts - Subtract shift_left from both coordinates (lines 71-72) to sit at the canonical left-most representation - Derive strand from motif when db_strand is unknown (lines 184-188: 2 - motif % 2, or 0 if motif is non-canonical) Three tests cover: canonical + strand, strand derivation fallback, shift-left applied to coords. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6d1ea6c commit 644f6a9

1 file changed

Lines changed: 146 additions & 0 deletions

File tree

src/junction/sjdb_insert.rs

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,84 @@ pub fn compute_shifts(genome: &[u8], s: u64, e: u64, n_genome_real: u64) -> (u8,
107107
(jj_l, jj_r)
108108
}
109109

110+
/// A single junction with all metadata STAR needs to write the sjdb
111+
/// files and extend the genome. Positions are 0-based absolute genome
112+
/// coordinates; `start_pos` / `end_pos` are the FIRST and LAST bases of
113+
/// the intron (inclusive), already shifted left by `shift_left` so they
114+
/// sit at the canonical (left-most) representation of the motif.
115+
#[derive(Debug, Clone, PartialEq, Eq)]
116+
pub struct PreparedJunction {
117+
/// Chromosome index the junction belongs to.
118+
pub chr_idx: usize,
119+
/// Shift-adjusted 0-based genome position of the first intron base.
120+
pub start_pos: u64,
121+
/// Shift-adjusted 0-based genome position of the last intron base.
122+
pub end_pos: u64,
123+
/// STAR motif code (`MOTIF_*`).
124+
pub motif: u8,
125+
/// Repeat length to the left of the (pre-shift) donor.
126+
pub shift_left: u8,
127+
/// Repeat length to the right of the (pre-shift) acceptor.
128+
pub shift_right: u8,
129+
/// STAR strand code (0 = unknown/dot, 1 = +, 2 = -).
130+
pub strand: u8,
131+
}
132+
133+
/// Convert a splice-junction database entry (as stored in
134+
/// `SpliceJunctionDb`) to a fully-prepared entry carrying motif, shifts,
135+
/// strand, and shift-adjusted coordinates.
136+
///
137+
/// - `chr_idx`, `intron_start`, `intron_end` are ruSTAR's 0-based absolute
138+
/// genome coordinates for the first/last bases of the intron.
139+
/// - `db_strand` is the strand as provided by the GTF / db
140+
/// (0 = unknown, 1 = +, 2 = -). If `0` and the motif encodes a strand,
141+
/// STAR picks the strand from the motif
142+
/// (`sjdbPrepare.cpp:184-188`: `2 - motif % 2`).
143+
/// - `genome` is the forward-strand genome byte slice
144+
/// (`Genome::sequence[..n_genome]`).
145+
/// - `n_genome_real` is the padded-but-without-sjdb genome length,
146+
/// matching STAR's `mapGen.nGenomeReal`.
147+
pub fn prepare_junction(
148+
chr_idx: usize,
149+
intron_start: u64,
150+
intron_end: u64,
151+
db_strand: u8,
152+
genome: &[u8],
153+
n_genome_real: u64,
154+
) -> PreparedJunction {
155+
let motif = classify_motif(genome, intron_start, intron_end);
156+
let (shift_left, shift_right) = compute_shifts(genome, intron_start, intron_end, n_genome_real);
157+
158+
// STAR's sjdbPrepare.cpp:71-72: after computing shifts, subtract
159+
// shift_left from both start and end to put the junction at its
160+
// canonical (left-most) representation.
161+
let shifted_start = intron_start - shift_left as u64;
162+
let shifted_end = intron_end - shift_left as u64;
163+
164+
let strand = match db_strand {
165+
1 | 2 => db_strand,
166+
_ => {
167+
// Unknown db strand: derive from motif (STAR sjdbPrepare.cpp:184-188).
168+
// `2 - motif % 2` maps 1/3/5 → 1 (+), 2/4/6 → 2 (-).
169+
if motif == MOTIF_NON_CANONICAL {
170+
0
171+
} else {
172+
2 - (motif % 2)
173+
}
174+
}
175+
};
176+
177+
PreparedJunction {
178+
chr_idx,
179+
start_pos: shifted_start,
180+
end_pos: shifted_end,
181+
motif,
182+
shift_left,
183+
shift_right,
184+
strand,
185+
}
186+
}
187+
110188
#[cfg(test)]
111189
mod tests {
112190
use super::*;
@@ -261,6 +339,74 @@ mod tests {
261339
assert_eq!(r, 255);
262340
}
263341

342+
#[test]
343+
fn prepare_gt_ag_forward_no_repeat() {
344+
let mut g = vec![4u8; 400]; // start with N
345+
g[100] = 0;
346+
g[101] = 0; // exon1 tail
347+
g[102] = 2; // G: intron start
348+
g[103] = 3; // T
349+
g[198] = 0; // A
350+
g[199] = 2; // G: intron end
351+
g[200] = 1;
352+
g[201] = 1;
353+
let pj = prepare_junction(0, 102, 199, 1, &g, 400);
354+
assert_eq!(pj.motif, MOTIF_GT_AG);
355+
assert_eq!(pj.shift_left, 0);
356+
assert_eq!(pj.shift_right, 0);
357+
assert_eq!(pj.start_pos, 102);
358+
assert_eq!(pj.end_pos, 199);
359+
assert_eq!(pj.strand, 1);
360+
}
361+
362+
#[test]
363+
fn prepare_dot_strand_derived_from_motif() {
364+
// Non-canonical motif with db_strand=0 → strand 0.
365+
let g = vec![0u8; 300];
366+
let pj = prepare_junction(0, 100, 200, 0, &g, 300);
367+
assert_eq!(pj.motif, MOTIF_NON_CANONICAL);
368+
assert_eq!(pj.strand, 0);
369+
370+
// GT/AG motif with db_strand=0 → strand derived = 1 (+).
371+
let mut g = vec![4u8; 300];
372+
g[100] = 2;
373+
g[101] = 3;
374+
g[199] = 0;
375+
g[200] = 2;
376+
let pj = prepare_junction(0, 100, 200, 0, &g, 300);
377+
assert_eq!(pj.motif, MOTIF_GT_AG);
378+
assert_eq!(pj.strand, 1);
379+
380+
// CT/AC motif with db_strand=0 → strand = 2 (-).
381+
let mut g = vec![4u8; 300];
382+
g[100] = 1;
383+
g[101] = 3;
384+
g[199] = 0;
385+
g[200] = 1;
386+
let pj = prepare_junction(0, 100, 200, 0, &g, 300);
387+
assert_eq!(pj.motif, MOTIF_CT_AC);
388+
assert_eq!(pj.strand, 2);
389+
}
390+
391+
#[test]
392+
fn prepare_shift_left_applied_to_coords() {
393+
// Construct: one base of repeat to the left of the junction so
394+
// shift_left = 1. Confirm start/end decrement by 1.
395+
let mut g = vec![4u8; 400];
396+
g[102] = 2; // G
397+
g[103] = 3; // T
398+
g[198] = 0; // A
399+
g[199] = 2; // G
400+
// Put identical 'G' at 101 and 199 so shift_left = 1.
401+
g[101] = 2;
402+
// Put different base at 100 to stop the scan.
403+
g[100] = 3;
404+
let pj = prepare_junction(0, 102, 199, 1, &g, 400);
405+
assert_eq!(pj.shift_left, 1);
406+
assert_eq!(pj.start_pos, 101);
407+
assert_eq!(pj.end_pos, 198);
408+
}
409+
264410
#[test]
265411
fn shifts_stop_at_n_base() {
266412
let mut g = vec![0u8; 300];

0 commit comments

Comments
 (0)