Skip to content

Commit c3a0ebf

Browse files
committed
Merge PR #8: SJ insertion into Genome + SA at genomeGenerate
- PreparedJunction pipeline: motif detection, shift_left/shift_right - build_gsj(): concatenated SJ flanking-sequence buffer - sort + cross-strand dedup for prepared junctions - Genome::append_sjdb(): extend forward genome + rebuild RC - sjdbInfo.txt + sjdbList.out.tab writers - Wire sjdb insertion into GenomeIndex::build - Extended byte-parity diff harness
2 parents 02cc0e1 + 148f9a4 commit c3a0ebf

9 files changed

Lines changed: 1096 additions & 89 deletions

File tree

src/align/read_align.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,6 +1356,7 @@ mod tests {
13561356
sa_index,
13571357
junction_db: crate::junction::SpliceJunctionDb::empty(),
13581358
transcriptome: None,
1359+
prepared_junctions: Vec::new(),
13591360
}
13601361
}
13611362

src/align/score.rs

Lines changed: 28 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -502,64 +502,15 @@ impl AlignmentScorer {
502502
)
503503
}
504504

505-
/// Detect splice junction motif
506-
///
507-
/// # Arguments
508-
/// - `donor_pos`: Position of the donor site (first base after exon)
509-
/// - `intron_len`: Length of the intron
510-
/// - `genome`: Genome reference
511-
///
512-
/// # Returns
513-
/// The detected splice motif
505+
/// Detect splice junction motif (thin wrapper over the free function
506+
/// so `AlignmentScorer` callers keep working).
514507
pub fn detect_splice_motif(
515508
&self,
516509
donor_pos: u64,
517510
intron_len: u32,
518511
genome: &Genome,
519512
) -> SpliceMotif {
520-
// Read 2bp donor and 2bp acceptor from the FORWARD genome
521-
// Donor: donor_pos, donor_pos+1
522-
// Acceptor: donor_pos+intron_len-2, donor_pos+intron_len-1
523-
// Always read forward strand — motif pattern determines the strand
524-
let d1 = genome.get_base(donor_pos);
525-
let d2 = genome.get_base(donor_pos + 1);
526-
let a1 = genome.get_base(donor_pos + intron_len as u64 - 2);
527-
let a2 = genome.get_base(donor_pos + intron_len as u64 - 1);
528-
529-
// Check if all bases are valid
530-
// A=0, C=1, G=2, T=3
531-
match (d1, d2, a1, a2) {
532-
(Some(d1), Some(d2), Some(a1), Some(a2)) => {
533-
// Forward-strand motifs
534-
// GT-AG: (2,3,0,2)
535-
if d1 == 2 && d2 == 3 && a1 == 0 && a2 == 2 {
536-
return SpliceMotif::GtAg;
537-
}
538-
// GC-AG: (2,1,0,2)
539-
if d1 == 2 && d2 == 1 && a1 == 0 && a2 == 2 {
540-
return SpliceMotif::GcAg;
541-
}
542-
// AT-AC: (0,3,0,1)
543-
if d1 == 0 && d2 == 3 && a1 == 0 && a2 == 1 {
544-
return SpliceMotif::AtAc;
545-
}
546-
// Reverse-strand motifs (reverse complement on forward genome)
547-
// CT-AC: (1,3,0,1) — reverse complement of GT-AG
548-
if d1 == 1 && d2 == 3 && a1 == 0 && a2 == 1 {
549-
return SpliceMotif::CtAc;
550-
}
551-
// CT-GC: (1,3,2,1) — reverse complement of GC-AG
552-
if d1 == 1 && d2 == 3 && a1 == 2 && a2 == 1 {
553-
return SpliceMotif::CtGc;
554-
}
555-
// GT-AT: (2,3,0,3) — reverse complement of AT-AC
556-
if d1 == 2 && d2 == 3 && a1 == 0 && a2 == 3 {
557-
return SpliceMotif::GtAt;
558-
}
559-
SpliceMotif::NonCanonical
560-
}
561-
_ => SpliceMotif::NonCanonical,
562-
}
513+
detect_splice_motif(donor_pos, intron_len, genome)
563514
}
564515

565516
/// Score a splice junction based on motif
@@ -573,6 +524,31 @@ impl AlignmentScorer {
573524
}
574525
}
575526

527+
/// Detect splice junction motif from forward-strand bases at the intron
528+
/// boundaries. Stateless — exposed as a free function so both alignment
529+
/// scoring and `genomeGenerate` splice-junction insertion can share one
530+
/// truth table.
531+
///
532+
/// `donor_pos` is the 0-based position of the intron's first base on the
533+
/// forward strand; `intron_len` is the intron length in bases.
534+
pub fn detect_splice_motif(donor_pos: u64, intron_len: u32, genome: &Genome) -> SpliceMotif {
535+
let d1 = genome.get_base(donor_pos);
536+
let d2 = genome.get_base(donor_pos + 1);
537+
let a1 = genome.get_base(donor_pos + intron_len as u64 - 2);
538+
let a2 = genome.get_base(donor_pos + intron_len as u64 - 1);
539+
540+
// Base encoding: A=0, C=1, G=2, T=3.
541+
match (d1, d2, a1, a2) {
542+
(Some(2), Some(3), Some(0), Some(2)) => SpliceMotif::GtAg,
543+
(Some(2), Some(1), Some(0), Some(2)) => SpliceMotif::GcAg,
544+
(Some(0), Some(3), Some(0), Some(1)) => SpliceMotif::AtAc,
545+
(Some(1), Some(3), Some(0), Some(1)) => SpliceMotif::CtAc,
546+
(Some(1), Some(3), Some(2), Some(1)) => SpliceMotif::CtGc,
547+
(Some(2), Some(3), Some(0), Some(3)) => SpliceMotif::GtAt,
548+
_ => SpliceMotif::NonCanonical,
549+
}
550+
}
551+
576552
/// Splice junction motif types
577553
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
578554
pub enum SpliceMotif {

src/align/stitch.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2976,6 +2976,7 @@ mod tests {
29762976
sa_index,
29772977
junction_db: crate::junction::SpliceJunctionDb::empty(),
29782978
transcriptome: None,
2979+
prepared_junctions: Vec::new(),
29792980
}
29802981
}
29812982

@@ -3095,6 +3096,7 @@ mod tests {
30953096
sa_index,
30963097
junction_db: crate::junction::SpliceJunctionDb::empty(),
30973098
transcriptome: None,
3099+
prepared_junctions: Vec::new(),
30983100
}
30993101
}
31003102

src/genome/mod.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,37 @@ impl Genome {
118118
})
119119
}
120120

121+
/// Append a splice-junction flanking-sequence buffer (`Gsj`) to the
122+
/// forward genome and rebuild the reverse complement over the extended
123+
/// forward range. Matches STAR's in-memory layout after
124+
/// `sjdbBuildIndex.cpp:293` (`memcpy(G+chrStart[nChrReal], Gsj, nGsj)`):
125+
/// the Gsj bytes live immediately after the forward real-genome bytes
126+
/// (`chr_start[n_chr_real]` stays pinned at the pre-sjdb forward total,
127+
/// matching STAR's `chrStart.txt` — verified against STAR docker output).
128+
///
129+
/// `n_genome` grows to include Gsj. `chr_start` / `chr_length` /
130+
/// `chr_name` / `n_chr_real` are NOT updated — Gsj lives outside the
131+
/// chromosome accounting.
132+
pub fn append_sjdb(&mut self, gsj: &[u8]) {
133+
let old_n = self.n_genome;
134+
let new_n = old_n + gsj.len() as u64;
135+
136+
let mut new_seq = vec![GENOME_SPACING_CHAR; (new_n * 2) as usize];
137+
new_seq[..old_n as usize].copy_from_slice(&self.sequence[..old_n as usize]);
138+
new_seq[old_n as usize..new_n as usize].copy_from_slice(gsj);
139+
140+
// Rebuild RC over the extended forward range (STAR stores Gsj_RC
141+
// implicitly — ruSTAR keeps the explicit `[fwd | RC]` layout).
142+
for i in 0..new_n as usize {
143+
let base = new_seq[i];
144+
let complement = if base < 4 { 3 - base } else { base };
145+
new_seq[2 * new_n as usize - 1 - i] = complement;
146+
}
147+
148+
self.sequence = new_seq;
149+
self.n_genome = new_n;
150+
}
151+
121152
/// Access a base from the genome (forward or reverse strand).
122153
///
123154
/// # Arguments
@@ -452,4 +483,49 @@ mod tests {
452483
assert_eq!(genome.position_to_chr(10), Some((1, 2)));
453484
assert_eq!(genome.position_to_chr(11), None); // padding
454485
}
486+
487+
#[test]
488+
fn append_sjdb_extends_forward_and_rebuilds_rc() {
489+
// Forward "ACGT" + spacer padding (bin 3 → 8 bytes padded).
490+
let mut file = NamedTempFile::new().unwrap();
491+
writeln!(file, ">chr1").unwrap();
492+
writeln!(file, "ACGT").unwrap();
493+
let params = make_params(vec![file.path().to_path_buf()], 3);
494+
let mut genome = Genome::from_fasta(&params).unwrap();
495+
assert_eq!(genome.n_genome, 8);
496+
497+
// Gsj bytes: donor "AC" + acceptor "GT" + spacer 5 (five total).
498+
let gsj: Vec<u8> = vec![0, 1, 2, 3, 5];
499+
let gsj_len = gsj.len();
500+
genome.append_sjdb(&gsj);
501+
502+
// n_genome grows by gsj_len; chr_start pinned at pre-sjdb boundary.
503+
assert_eq!(genome.n_genome, 8 + gsj_len as u64);
504+
assert_eq!(genome.chr_start, vec![0, 8]);
505+
assert_eq!(genome.n_chr_real, 1);
506+
507+
// Forward is [real 0..8 | gsj 8..13].
508+
assert_eq!(&genome.sequence[..4], &[0, 1, 2, 3]);
509+
assert_eq!(&genome.sequence[8..13], gsj.as_slice());
510+
511+
// RC over the extended forward range. sequence[2n-1-i] = complement(sequence[i]).
512+
let new_n = genome.n_genome as usize;
513+
assert_eq!(genome.sequence[2 * new_n - 1 - 8], 3); // complement of A at fwd[8]=0
514+
assert_eq!(genome.sequence[2 * new_n - 1 - 12], 5); // spacer stays 5
515+
assert_eq!(genome.sequence.len(), 2 * new_n);
516+
}
517+
518+
#[test]
519+
fn append_sjdb_with_empty_gsj_is_noop() {
520+
let mut file = NamedTempFile::new().unwrap();
521+
writeln!(file, ">chr1").unwrap();
522+
writeln!(file, "ACGT").unwrap();
523+
let params = make_params(vec![file.path().to_path_buf()], 3);
524+
let mut genome = Genome::from_fasta(&params).unwrap();
525+
let before = genome.sequence.clone();
526+
let before_n = genome.n_genome;
527+
genome.append_sjdb(&[]);
528+
assert_eq!(genome.n_genome, before_n);
529+
assert_eq!(genome.sequence, before);
530+
}
455531
}

src/index/io.rs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,33 @@ impl GenomeIndex {
9090
sa_index,
9191
junction_db,
9292
transcriptome,
93+
prepared_junctions: Vec::new(),
9394
})
9495
}
9596
}
9697

98+
/// Read `genomeFileSizes\t<n_genome> <sa_size>` from genomeParameters.txt
99+
/// and return the first field (total genome byte count, including Gsj if
100+
/// sjdb was baked in). Returns `Ok(None)` if the file or line is absent,
101+
/// leaving the caller to fall back to the chr_start boundary.
102+
fn read_genome_file_size(genome_dir: &Path) -> Result<Option<u64>, Error> {
103+
let path = genome_dir.join("genomeParameters.txt");
104+
let contents = match std::fs::read_to_string(&path) {
105+
Ok(s) => s,
106+
Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
107+
Err(e) => return Err(Error::io(e, &path)),
108+
};
109+
for line in contents.lines() {
110+
if let Some(rest) = line.strip_prefix("genomeFileSizes\t")
111+
&& let Some(first) = rest.split_whitespace().next()
112+
&& let Ok(v) = first.parse::<u64>()
113+
{
114+
return Ok(Some(v));
115+
}
116+
}
117+
Ok(None)
118+
}
119+
97120
/// Load genome from disk.
98121
fn load_genome(genome_dir: &Path, _params: &Parameters) -> Result<Genome, Error> {
99122
// Read chromosome metadata
@@ -119,7 +142,14 @@ fn load_genome(genome_dir: &Path, _params: &Parameters) -> Result<Genome, Error>
119142
.collect();
120143

121144
let n_chr_real = chr_name.len();
122-
let n_genome = chr_start[n_chr_real]; // Last entry is total size
145+
146+
// `chr_start[n_chr_real]` is the forward boundary of REAL chromosomes
147+
// only — it stays pinned at the pre-sjdb value in STAR (`chrStart.txt`).
148+
// When sjdb has been baked into the index, the total genome size
149+
// (real + Gsj) lives in `genomeParameters.txt` under `genomeFileSizes`.
150+
// Prefer that value; fall back to the chr_start boundary for indices
151+
// built without a GTF.
152+
let n_genome = read_genome_file_size(genome_dir)?.unwrap_or(chr_start[n_chr_real]);
123153

124154
// Load Genome sequence file
125155
let genome_path = genome_dir.join("Genome");

0 commit comments

Comments
 (0)