Skip to content

Commit 19c378b

Browse files
ewelsclaude
andcommitted
feat(junction): build_gsj() — concatenated SJ flanking-sequence buffer
New build_gsj() in src/junction/sjdb_insert.rs reproduces STAR's sjdbPrepare.cpp:203-215 in-memory buffer: per junction, sjdb_overhang donor bases + sjdb_overhang acceptor bases + 1 spacer byte (value 5, STAR's GENOME_spacingChar). Extraction uses ORIGINAL (pre-shift) coordinates regardless of motif, relying on the new PreparedJunction::original_{start,end} helpers. 5 tests: canonical single junction, non-canonical junction (verifies shift reversal), multi-junction concatenation with independent spacers, plus two bounds-error cases (donor underflow, acceptor overrun). Next commit will wire the Gsj bytes onto the Genome binary + extend the SA. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 166cf5f commit 19c378b

1 file changed

Lines changed: 169 additions & 4 deletions

File tree

src/junction/sjdb_insert.rs

Lines changed: 169 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,14 @@
1313
//! base suffix array has been built.
1414
1515
use crate::align::score::detect_splice_motif;
16+
use crate::error::Error;
1617
use crate::genome::Genome;
1718
use crate::junction::encode_motif;
1819

20+
/// STAR's inter-SJ spacer byte in the Gsj buffer (same value STAR uses
21+
/// for inter-chromosome padding — `IncludeDefine.h::GENOME_spacingChar`).
22+
const GSJ_SPACING: u8 = 5;
23+
1924
/// Compute STAR's `(sjdbShiftLeft, sjdbShiftRight)` for an intron whose
2025
/// 0-based donor/acceptor positions are `s` and `e`.
2126
///
@@ -202,10 +207,7 @@ pub fn sort_and_dedup(mut junctions: Vec<PreparedJunction>) -> Vec<PreparedJunct
202207
/// existing entry but force its strand to 0 in-place (handled by the
203208
/// caller via a special-case — represented here as returning a cloned
204209
/// `old` with strand=0).
205-
fn merge_cross_strand(
206-
old: &PreparedJunction,
207-
new: &PreparedJunction,
208-
) -> Option<PreparedJunction> {
210+
fn merge_cross_strand(old: &PreparedJunction, new: &PreparedJunction) -> Option<PreparedJunction> {
209211
// Strand 0 = undefined.
210212
if old.strand > 0 && new.strand == 0 {
211213
return None; // keep old
@@ -237,6 +239,62 @@ fn merge_cross_strand(
237239
}
238240
}
239241

242+
/// Build the Gsj buffer — the concatenated splice-junction flanking
243+
/// sequences STAR appends to the Genome binary at `genomeGenerate`.
244+
///
245+
/// Each junction contributes exactly `2 * sjdb_overhang + 1` bytes:
246+
/// `sjdb_overhang` donor-side bases (from the genome position
247+
/// `original_start - sjdb_overhang`), `sjdb_overhang` acceptor-side
248+
/// bases (from `original_end + 1`), and one `GSJ_SPACING` spacer byte
249+
/// at the end. Matches `sjdbPrepare.cpp:203-215`.
250+
///
251+
/// Callers supply junctions in sorted/deduplicated order (via
252+
/// [`sort_and_dedup`]). `genome` is read through
253+
/// `Genome::sequence[..n_genome_real]`.
254+
pub fn build_gsj(
255+
junctions: &[PreparedJunction],
256+
genome: &Genome,
257+
n_genome_real: u64,
258+
sjdb_overhang: u32,
259+
) -> Result<Vec<u8>, Error> {
260+
let overhang = sjdb_overhang as usize;
261+
let sjdb_length = 2 * overhang + 1;
262+
let forward = &genome.sequence[..n_genome_real as usize];
263+
let mut gsj = vec![GSJ_SPACING; junctions.len() * sjdb_length];
264+
265+
for (i, pj) in junctions.iter().enumerate() {
266+
let donor_start = pj
267+
.original_start()
268+
.checked_sub(overhang as u64)
269+
.ok_or_else(|| sjdb_bounds_err(pj, overhang, "donor underflows"))?;
270+
let acceptor_start = pj.original_end() + 1;
271+
272+
let d0 = donor_start as usize;
273+
let a0 = acceptor_start as usize;
274+
if d0 + overhang > forward.len() || a0 + overhang > forward.len() {
275+
return Err(sjdb_bounds_err(pj, overhang, "flank overruns genome"));
276+
}
277+
278+
let base = i * sjdb_length;
279+
gsj[base..base + overhang].copy_from_slice(&forward[d0..d0 + overhang]);
280+
gsj[base + overhang..base + 2 * overhang].copy_from_slice(&forward[a0..a0 + overhang]);
281+
// base + 2*overhang = trailing spacer, already initialized.
282+
}
283+
284+
Ok(gsj)
285+
}
286+
287+
fn sjdb_bounds_err(pj: &PreparedJunction, overhang: usize, reason: &str) -> Error {
288+
Error::Index(format!(
289+
"sjdb flanking window ({} bp overhang) {} for junction at original coords {}..{} on chr {}",
290+
overhang,
291+
reason,
292+
pj.original_start(),
293+
pj.original_end(),
294+
pj.chr_idx,
295+
))
296+
}
297+
240298
#[cfg(test)]
241299
mod tests {
242300
use super::*;
@@ -473,6 +531,113 @@ mod tests {
473531
assert_eq!(out[0], canon);
474532
}
475533

534+
#[test]
535+
fn build_gsj_canonical_no_shift() {
536+
// Single canonical junction at original [100..200] with 3bp overhang.
537+
// Donor bytes = forward[97..100] (3 bytes), acceptor bytes =
538+
// forward[201..204] (3 bytes), then 1 spacer.
539+
let mut f = vec![4u8; 300];
540+
// Overhang-donor region: 97,98,99 = 0,1,2 (A,C,G)
541+
f[97] = 0;
542+
f[98] = 1;
543+
f[99] = 2;
544+
// Intron body filled with noise (irrelevant)
545+
f[100] = 2; // GT
546+
f[101] = 3;
547+
f[199] = 0; // AG
548+
f[200] = 2;
549+
// Acceptor-flank bytes: 201,202,203 = 3,0,1 (T,A,C)
550+
f[201] = 3;
551+
f[202] = 0;
552+
f[203] = 1;
553+
let g = make_test_genome(f);
554+
let junction = pj(0, 100, 200, 1, 0, 1);
555+
let gsj = build_gsj(&[junction], &g, g.n_genome, 3).unwrap();
556+
assert_eq!(gsj, vec![0, 1, 2, 3, 0, 1, GSJ_SPACING]);
557+
}
558+
559+
#[test]
560+
fn build_gsj_noncanonical_reverts_shift_for_extraction() {
561+
// Non-canonical junction whose STORED coords sit 2bp left of
562+
// the ORIGINAL. Flank extraction must use ORIGINAL (the
563+
// `original_start()` / `original_end()` helpers) so the same
564+
// bytes land in Gsj regardless of motif type.
565+
let mut f = vec![4u8; 300];
566+
f[98] = 0; // original_start - overhang = 102 - 4 = 98
567+
f[99] = 1;
568+
f[100] = 2;
569+
f[101] = 3;
570+
// intron garbage at 102..=199
571+
f[200] = 2;
572+
f[201] = 3;
573+
f[202] = 0;
574+
f[203] = 1;
575+
let g = make_test_genome(f);
576+
// start_pos/end_pos are SHIFTED (stored form for motif==0).
577+
// With shift_left=2, original_start = 102, original_end = 199.
578+
let junction = pj(0, 100, 197, 0, 2, 0);
579+
let gsj = build_gsj(&[junction], &g, g.n_genome, 4).unwrap();
580+
// Donor: forward[98..102] = [0,1,2,3]
581+
// Acceptor: forward[200..204] = [2,3,0,1]
582+
assert_eq!(gsj, vec![0, 1, 2, 3, 2, 3, 0, 1, GSJ_SPACING]);
583+
}
584+
585+
#[test]
586+
fn build_gsj_multiple_junctions_concatenate() {
587+
let mut f = vec![4u8; 400];
588+
// Junction A: canonical at [100..200], overhang 2 → donor
589+
// bytes 98,99; acceptor 201,202.
590+
f[98] = 0;
591+
f[99] = 0;
592+
f[201] = 3;
593+
f[202] = 3;
594+
// Junction B: canonical at [300..350], overhang 2 → donor
595+
// bytes 298,299; acceptor 351,352.
596+
f[298] = 1;
597+
f[299] = 1;
598+
f[351] = 2;
599+
f[352] = 2;
600+
let g = make_test_genome(f);
601+
let a = pj(0, 100, 200, 1, 0, 1);
602+
let b = pj(0, 300, 350, 1, 0, 1);
603+
let gsj = build_gsj(&[a, b], &g, g.n_genome, 2).unwrap();
604+
// sjdb_length = 5 (2 donor + 2 acceptor + 1 spacer) per junction.
605+
assert_eq!(
606+
gsj,
607+
vec![
608+
0,
609+
0,
610+
3,
611+
3,
612+
GSJ_SPACING, // junction A
613+
1,
614+
1,
615+
2,
616+
2,
617+
GSJ_SPACING, // junction B
618+
]
619+
);
620+
}
621+
622+
#[test]
623+
fn build_gsj_errors_when_flank_underflows() {
624+
// Junction too close to the left edge → donor_start underflows.
625+
let g = make_test_genome(vec![0u8; 300]);
626+
let bad = pj(0, 3, 200, 1, 0, 1); // original_start=3; overhang 10 underflows
627+
let err = build_gsj(&[bad], &g, g.n_genome, 10).unwrap_err();
628+
assert!(format!("{}", err).contains("underflows"));
629+
}
630+
631+
#[test]
632+
fn build_gsj_errors_when_flank_overruns_genome() {
633+
// Acceptor overruns the right edge.
634+
let g = make_test_genome(vec![0u8; 300]);
635+
// original_end = 295, overhang = 10 → acceptor region [296..306), exceeds n_genome=300.
636+
let bad = pj(0, 100, 295, 1, 0, 1);
637+
let err = build_gsj(&[bad], &g, g.n_genome, 10).unwrap_err();
638+
assert!(format!("{}", err).contains("overruns"));
639+
}
640+
476641
#[test]
477642
fn dedup_prefers_strand_matching_motif() {
478643
// motif=1 (GT/AG +) stored on wrong strand (2) vs a competing

0 commit comments

Comments
 (0)