|
13 | 13 | //! base suffix array has been built. |
14 | 14 |
|
15 | 15 | use crate::align::score::detect_splice_motif; |
| 16 | +use crate::error::Error; |
16 | 17 | use crate::genome::Genome; |
17 | 18 | use crate::junction::encode_motif; |
18 | 19 |
|
| 20 | +/// STAR's inter-SJ spacer byte in the Gsj buffer (same value STAR uses |
| 21 | +/// for inter-chromosome padding — `IncludeDefine.h::GENOME_spacingChar`). |
| 22 | +const GSJ_SPACING: u8 = 5; |
| 23 | + |
19 | 24 | /// Compute STAR's `(sjdbShiftLeft, sjdbShiftRight)` for an intron whose |
20 | 25 | /// 0-based donor/acceptor positions are `s` and `e`. |
21 | 26 | /// |
@@ -202,10 +207,7 @@ pub fn sort_and_dedup(mut junctions: Vec<PreparedJunction>) -> Vec<PreparedJunct |
202 | 207 | /// existing entry but force its strand to 0 in-place (handled by the |
203 | 208 | /// caller via a special-case — represented here as returning a cloned |
204 | 209 | /// `old` with strand=0). |
205 | | -fn merge_cross_strand( |
206 | | - old: &PreparedJunction, |
207 | | - new: &PreparedJunction, |
208 | | -) -> Option<PreparedJunction> { |
| 210 | +fn merge_cross_strand(old: &PreparedJunction, new: &PreparedJunction) -> Option<PreparedJunction> { |
209 | 211 | // Strand 0 = undefined. |
210 | 212 | if old.strand > 0 && new.strand == 0 { |
211 | 213 | return None; // keep old |
@@ -237,6 +239,62 @@ fn merge_cross_strand( |
237 | 239 | } |
238 | 240 | } |
239 | 241 |
|
| 242 | +/// Build the Gsj buffer — the concatenated splice-junction flanking |
| 243 | +/// sequences STAR appends to the Genome binary at `genomeGenerate`. |
| 244 | +/// |
| 245 | +/// Each junction contributes exactly `2 * sjdb_overhang + 1` bytes: |
| 246 | +/// `sjdb_overhang` donor-side bases (from the genome position |
| 247 | +/// `original_start - sjdb_overhang`), `sjdb_overhang` acceptor-side |
| 248 | +/// bases (from `original_end + 1`), and one `GSJ_SPACING` spacer byte |
| 249 | +/// at the end. Matches `sjdbPrepare.cpp:203-215`. |
| 250 | +/// |
| 251 | +/// Callers supply junctions in sorted/deduplicated order (via |
| 252 | +/// [`sort_and_dedup`]). `genome` is read through |
| 253 | +/// `Genome::sequence[..n_genome_real]`. |
| 254 | +pub fn build_gsj( |
| 255 | + junctions: &[PreparedJunction], |
| 256 | + genome: &Genome, |
| 257 | + n_genome_real: u64, |
| 258 | + sjdb_overhang: u32, |
| 259 | +) -> Result<Vec<u8>, Error> { |
| 260 | + let overhang = sjdb_overhang as usize; |
| 261 | + let sjdb_length = 2 * overhang + 1; |
| 262 | + let forward = &genome.sequence[..n_genome_real as usize]; |
| 263 | + let mut gsj = vec![GSJ_SPACING; junctions.len() * sjdb_length]; |
| 264 | + |
| 265 | + for (i, pj) in junctions.iter().enumerate() { |
| 266 | + let donor_start = pj |
| 267 | + .original_start() |
| 268 | + .checked_sub(overhang as u64) |
| 269 | + .ok_or_else(|| sjdb_bounds_err(pj, overhang, "donor underflows"))?; |
| 270 | + let acceptor_start = pj.original_end() + 1; |
| 271 | + |
| 272 | + let d0 = donor_start as usize; |
| 273 | + let a0 = acceptor_start as usize; |
| 274 | + if d0 + overhang > forward.len() || a0 + overhang > forward.len() { |
| 275 | + return Err(sjdb_bounds_err(pj, overhang, "flank overruns genome")); |
| 276 | + } |
| 277 | + |
| 278 | + let base = i * sjdb_length; |
| 279 | + gsj[base..base + overhang].copy_from_slice(&forward[d0..d0 + overhang]); |
| 280 | + gsj[base + overhang..base + 2 * overhang].copy_from_slice(&forward[a0..a0 + overhang]); |
| 281 | + // base + 2*overhang = trailing spacer, already initialized. |
| 282 | + } |
| 283 | + |
| 284 | + Ok(gsj) |
| 285 | +} |
| 286 | + |
| 287 | +fn sjdb_bounds_err(pj: &PreparedJunction, overhang: usize, reason: &str) -> Error { |
| 288 | + Error::Index(format!( |
| 289 | + "sjdb flanking window ({} bp overhang) {} for junction at original coords {}..{} on chr {}", |
| 290 | + overhang, |
| 291 | + reason, |
| 292 | + pj.original_start(), |
| 293 | + pj.original_end(), |
| 294 | + pj.chr_idx, |
| 295 | + )) |
| 296 | +} |
| 297 | + |
240 | 298 | #[cfg(test)] |
241 | 299 | mod tests { |
242 | 300 | use super::*; |
@@ -473,6 +531,113 @@ mod tests { |
473 | 531 | assert_eq!(out[0], canon); |
474 | 532 | } |
475 | 533 |
|
| 534 | + #[test] |
| 535 | + fn build_gsj_canonical_no_shift() { |
| 536 | + // Single canonical junction at original [100..200] with 3bp overhang. |
| 537 | + // Donor bytes = forward[97..100] (3 bytes), acceptor bytes = |
| 538 | + // forward[201..204] (3 bytes), then 1 spacer. |
| 539 | + let mut f = vec![4u8; 300]; |
| 540 | + // Overhang-donor region: 97,98,99 = 0,1,2 (A,C,G) |
| 541 | + f[97] = 0; |
| 542 | + f[98] = 1; |
| 543 | + f[99] = 2; |
| 544 | + // Intron body filled with noise (irrelevant) |
| 545 | + f[100] = 2; // GT |
| 546 | + f[101] = 3; |
| 547 | + f[199] = 0; // AG |
| 548 | + f[200] = 2; |
| 549 | + // Acceptor-flank bytes: 201,202,203 = 3,0,1 (T,A,C) |
| 550 | + f[201] = 3; |
| 551 | + f[202] = 0; |
| 552 | + f[203] = 1; |
| 553 | + let g = make_test_genome(f); |
| 554 | + let junction = pj(0, 100, 200, 1, 0, 1); |
| 555 | + let gsj = build_gsj(&[junction], &g, g.n_genome, 3).unwrap(); |
| 556 | + assert_eq!(gsj, vec![0, 1, 2, 3, 0, 1, GSJ_SPACING]); |
| 557 | + } |
| 558 | + |
| 559 | + #[test] |
| 560 | + fn build_gsj_noncanonical_reverts_shift_for_extraction() { |
| 561 | + // Non-canonical junction whose STORED coords sit 2bp left of |
| 562 | + // the ORIGINAL. Flank extraction must use ORIGINAL (the |
| 563 | + // `original_start()` / `original_end()` helpers) so the same |
| 564 | + // bytes land in Gsj regardless of motif type. |
| 565 | + let mut f = vec![4u8; 300]; |
| 566 | + f[98] = 0; // original_start - overhang = 102 - 4 = 98 |
| 567 | + f[99] = 1; |
| 568 | + f[100] = 2; |
| 569 | + f[101] = 3; |
| 570 | + // intron garbage at 102..=199 |
| 571 | + f[200] = 2; |
| 572 | + f[201] = 3; |
| 573 | + f[202] = 0; |
| 574 | + f[203] = 1; |
| 575 | + let g = make_test_genome(f); |
| 576 | + // start_pos/end_pos are SHIFTED (stored form for motif==0). |
| 577 | + // With shift_left=2, original_start = 102, original_end = 199. |
| 578 | + let junction = pj(0, 100, 197, 0, 2, 0); |
| 579 | + let gsj = build_gsj(&[junction], &g, g.n_genome, 4).unwrap(); |
| 580 | + // Donor: forward[98..102] = [0,1,2,3] |
| 581 | + // Acceptor: forward[200..204] = [2,3,0,1] |
| 582 | + assert_eq!(gsj, vec![0, 1, 2, 3, 2, 3, 0, 1, GSJ_SPACING]); |
| 583 | + } |
| 584 | + |
| 585 | + #[test] |
| 586 | + fn build_gsj_multiple_junctions_concatenate() { |
| 587 | + let mut f = vec![4u8; 400]; |
| 588 | + // Junction A: canonical at [100..200], overhang 2 → donor |
| 589 | + // bytes 98,99; acceptor 201,202. |
| 590 | + f[98] = 0; |
| 591 | + f[99] = 0; |
| 592 | + f[201] = 3; |
| 593 | + f[202] = 3; |
| 594 | + // Junction B: canonical at [300..350], overhang 2 → donor |
| 595 | + // bytes 298,299; acceptor 351,352. |
| 596 | + f[298] = 1; |
| 597 | + f[299] = 1; |
| 598 | + f[351] = 2; |
| 599 | + f[352] = 2; |
| 600 | + let g = make_test_genome(f); |
| 601 | + let a = pj(0, 100, 200, 1, 0, 1); |
| 602 | + let b = pj(0, 300, 350, 1, 0, 1); |
| 603 | + let gsj = build_gsj(&[a, b], &g, g.n_genome, 2).unwrap(); |
| 604 | + // sjdb_length = 5 (2 donor + 2 acceptor + 1 spacer) per junction. |
| 605 | + assert_eq!( |
| 606 | + gsj, |
| 607 | + vec![ |
| 608 | + 0, |
| 609 | + 0, |
| 610 | + 3, |
| 611 | + 3, |
| 612 | + GSJ_SPACING, // junction A |
| 613 | + 1, |
| 614 | + 1, |
| 615 | + 2, |
| 616 | + 2, |
| 617 | + GSJ_SPACING, // junction B |
| 618 | + ] |
| 619 | + ); |
| 620 | + } |
| 621 | + |
| 622 | + #[test] |
| 623 | + fn build_gsj_errors_when_flank_underflows() { |
| 624 | + // Junction too close to the left edge → donor_start underflows. |
| 625 | + let g = make_test_genome(vec![0u8; 300]); |
| 626 | + let bad = pj(0, 3, 200, 1, 0, 1); // original_start=3; overhang 10 underflows |
| 627 | + let err = build_gsj(&[bad], &g, g.n_genome, 10).unwrap_err(); |
| 628 | + assert!(format!("{}", err).contains("underflows")); |
| 629 | + } |
| 630 | + |
| 631 | + #[test] |
| 632 | + fn build_gsj_errors_when_flank_overruns_genome() { |
| 633 | + // Acceptor overruns the right edge. |
| 634 | + let g = make_test_genome(vec![0u8; 300]); |
| 635 | + // original_end = 295, overhang = 10 → acceptor region [296..306), exceeds n_genome=300. |
| 636 | + let bad = pj(0, 100, 295, 1, 0, 1); |
| 637 | + let err = build_gsj(&[bad], &g, g.n_genome, 10).unwrap_err(); |
| 638 | + assert!(format!("{}", err).contains("overruns")); |
| 639 | + } |
| 640 | + |
476 | 641 | #[test] |
477 | 642 | fn dedup_prefers_strand_matching_motif() { |
478 | 643 | // motif=1 (GT/AG +) stored on wrong strand (2) vs a competing |
|
0 commit comments