Skip to content

Commit abf856e

Browse files
pinin4fjordsclaude
andcommitted
fix: decode Gsj-region SA hits into split splice seeds
Pass-1 alignment was silently dropping SA hits that fall in the appended splice-junction flanking buffer (Gsj). Those hits encode candidate splice events for annotated junctions: when a read seed straddles the donor/acceptor boundary of a Gsj slot, the matching genome bytes live in two non-adjacent real-genome positions. STAR decodes these via g1 >= P.nGenomeReal and feeds the (donor, acceptor) pair into the seed pipeline; rustar-aligner was discarding them at position_to_chr (which returns None for positions past the last real chromosome). Plumb n_genome_real onto Genome (pinned at the pre-sjdb forward total) and reload prepared junctions + sjdbOverhang from sjdbInfo.txt on the index-load path. cluster_seeds now expands each SA hit through decode_gsj_hit: real-genome hits pass through unchanged, single-flank Gsj hits are skipped (the equivalent real-genome SA entry already exists in the same range), and boundary-crossing Gsj hits split into two virtual seeds with the donor-side read run and the acceptor-side read run pointing at their respective real-genome positions. The existing stitch DP then chains them via its splice branch. Verified on the yeast SRR6357072 reproducer from the issue: Number of splices: Total jumps from 366 to 631 (target ~720) and GT/AG from 266 to 531 with non-canonical unchanged at 93. The remaining gap is gated on PR #45's annotated-junction lookup landing. Fixes #47 Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 70be24d commit abf856e

16 files changed

Lines changed: 588 additions & 132 deletions

File tree

src/align/read_align.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1505,6 +1505,7 @@ mod tests {
15051505
let genome = Genome {
15061506
sequence,
15071507
n_genome,
1508+
n_genome_real: n_genome,
15081509
n_chr_real: 1,
15091510
chr_name: vec!["chr1".to_string()],
15101511
chr_length: vec![10],
@@ -1535,6 +1536,7 @@ mod tests {
15351536
junction_db: crate::junction::SpliceJunctionDb::empty(),
15361537
transcriptome: None,
15371538
prepared_junctions: Vec::new(),
1539+
sjdb_overhang: 0,
15381540
}
15391541
}
15401542

src/align/score.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -637,6 +637,7 @@ mod tests {
637637
Genome {
638638
sequence,
639639
n_genome,
640+
n_genome_real: n_genome,
640641
n_chr_real: 1,
641642
chr_name: vec!["chr1".to_string()],
642643
chr_length: vec![seq.len() as u64],

src/align/stitch.rs

Lines changed: 195 additions & 126 deletions
Large diffs are not rendered by default.

src/chimeric/detect.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,7 @@ mod tests {
10001000
Genome {
10011001
sequence,
10021002
n_genome,
1003+
n_genome_real: n_genome,
10031004
n_chr_real: 2,
10041005
chr_name: vec!["chr0".to_string(), "chr1".to_string()],
10051006
chr_length: vec![chr_len, chr_len],
@@ -1027,6 +1028,7 @@ mod tests {
10271028
junction_db: SpliceJunctionDb::empty(),
10281029
transcriptome: None,
10291030
prepared_junctions: Vec::new(),
1031+
sjdb_overhang: 0,
10301032
}
10311033
}
10321034

src/chimeric/output.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ mod tests {
443443
Genome {
444444
sequence: vec![0u8; 2048],
445445
n_genome: 1024,
446+
n_genome_real: 1024,
446447
n_chr_real: 2,
447448
chr_name: vec!["chr9".to_string(), "chr22".to_string()],
448449
chr_length: vec![512, 512],

src/chimeric/score.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ mod tests {
173173
Genome {
174174
sequence: seq,
175175
n_genome: 100,
176+
n_genome_real: 100,
176177
n_chr_real: 1,
177178
chr_name: vec!["chr1".to_string()],
178179
chr_start: vec![0],

src/genome/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ pub struct Genome {
2525
/// Total length of the forward (padded) genome.
2626
pub n_genome: u64,
2727

28+
/// Length of the real-genome forward strand, excluding any appended
29+
/// Gsj splice-junction buffer. Equals `n_genome` before `append_sjdb`
30+
/// and stays pinned at that value afterwards; matches STAR's
31+
/// `mapGen.nGenomeReal`.
32+
pub n_genome_real: u64,
33+
2834
/// Number of real chromosomes (not including scaffold/contigs if excluded).
2935
pub n_chr_real: usize,
3036

@@ -111,6 +117,7 @@ impl Genome {
111117
Ok(Genome {
112118
sequence,
113119
n_genome,
120+
n_genome_real: n_genome,
114121
n_chr_real,
115122
chr_name,
116123
chr_length,

src/index/io.rs

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use crate::index::packed_array::PackedArray;
1111
use crate::index::sa_index::SaIndex;
1212
use crate::index::suffix_array::SuffixArray;
1313
use crate::junction::SpliceJunctionDb;
14+
use crate::junction::sjdb_insert;
1415
use crate::params::Parameters;
1516
use crate::quant::transcriptome::TranscriptomeIndex;
1617

@@ -99,13 +100,32 @@ impl GenomeIndex {
99100
);
100101
}
101102

103+
// Reload prepared junctions + sjdbOverhang from sjdbInfo.txt when
104+
// present. STAR appends a Gsj flanking-sequence buffer to the
105+
// genome at build time; align-time code needs the parsed junctions
106+
// to decode SA hits that land inside that buffer back to real
107+
// `(donor, acceptor)` genome positions.
108+
let sjdb_info_path = genome_dir.join("sjdbInfo.txt");
109+
let (prepared_junctions, sjdb_overhang) = if sjdb_info_path.exists() {
110+
let tab = sjdb_insert::read_sjdb_info_tab(&sjdb_info_path, &genome)?;
111+
log::info!(
112+
"Loaded sjdbInfo.txt: {} junctions, sjdbOverhang={}",
113+
tab.junctions.len(),
114+
tab.sjdb_overhang,
115+
);
116+
(tab.junctions, tab.sjdb_overhang)
117+
} else {
118+
(Vec::new(), 0)
119+
};
120+
102121
Ok(GenomeIndex {
103122
genome,
104123
suffix_array,
105124
sa_index,
106125
junction_db,
107126
transcriptome,
108-
prepared_junctions: Vec::new(),
127+
prepared_junctions,
128+
sjdb_overhang,
109129
})
110130
}
111131
}
@@ -164,7 +184,8 @@ fn load_genome(genome_dir: &Path, _params: &Parameters) -> Result<Genome, Error>
164184
// (real + Gsj) lives in `genomeParameters.txt` under `genomeFileSizes`.
165185
// Prefer that value; fall back to the chr_start boundary for indices
166186
// built without a GTF.
167-
let n_genome = read_genome_file_size(genome_dir)?.unwrap_or(chr_start[n_chr_real]);
187+
let n_genome_real = chr_start[n_chr_real];
188+
let n_genome = read_genome_file_size(genome_dir)?.unwrap_or(n_genome_real);
168189

169190
// Load Genome sequence file
170191
let genome_path = genome_dir.join("Genome");
@@ -192,6 +213,7 @@ fn load_genome(genome_dir: &Path, _params: &Parameters) -> Result<Genome, Error>
192213
Ok(Genome {
193214
sequence,
194215
n_genome,
216+
n_genome_real,
195217
n_chr_real,
196218
chr_name,
197219
chr_length,

src/index/mod.rs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,16 @@ pub struct GenomeIndex {
2727
/// `transcriptInfo.tab` + friends at `genomeGenerate`, reloaded at
2828
/// `alignReads` from the same files.
2929
pub transcriptome: Option<TranscriptomeIndex>,
30-
/// Prepared splice junctions (sorted/deduped) used only on the build
31-
/// path to write `sjdbInfo.txt` + `sjdbList.out.tab`. Empty on the
32-
/// load path — those files have already been written and are not
33-
/// needed at align time.
30+
/// Prepared splice junctions in their post-dedup order (the same
31+
/// order they occupy in the Gsj buffer appended to the genome).
32+
/// Populated on both build and load paths when sjdb is present;
33+
/// empty for indices built without a GTF. Used at align time to
34+
/// decode Gsj-region SA hits back to real-genome `(donor, acceptor)`
35+
/// pairs.
3436
pub prepared_junctions: Vec<PreparedJunction>,
37+
/// `sjdbOverhang` recorded in `sjdbInfo.txt`. Zero when no sjdb
38+
/// junctions are present.
39+
pub sjdb_overhang: u32,
3540
}
3641

3742
impl GenomeIndex {
@@ -143,13 +148,20 @@ impl GenomeIndex {
143148
sa_index.data.len()
144149
);
145150

151+
let sjdb_overhang = if prepared_junctions.is_empty() {
152+
0
153+
} else {
154+
params.sjdb_overhang
155+
};
156+
146157
Ok(GenomeIndex {
147158
genome,
148159
suffix_array,
149160
sa_index,
150161
junction_db,
151162
transcriptome,
152163
prepared_junctions,
164+
sjdb_overhang,
153165
})
154166
}
155167

src/io/bam.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,7 @@ mod tests {
469469
Genome {
470470
sequence: vec![0, 1, 2, 3, 0, 1, 2, 3], // ACGTACGT
471471
n_genome: 8,
472+
n_genome_real: 8,
472473
n_chr_real: 1,
473474
chr_name: vec!["chr1".to_string()],
474475
chr_length: vec![8],

0 commit comments

Comments
 (0)