Skip to content

Commit 112f485

Browse files
committed
update score and stiching of seeds to match STAR
1 parent 8ff6d97 commit 112f485

3 files changed

Lines changed: 156 additions & 0 deletions

File tree

src/align/score.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ pub struct AlignmentScorer {
4747
pub align_spliced_mate_map_lmin: u32,
4848
/// Min mapped length of spliced mates as fraction of read length (default 0.66)
4949
pub align_spliced_mate_map_lmin_over_lmate: f64,
50+
/// Minimum alignment score relative to read length (outFilterScoreMinOverLread, default 0.66)
51+
pub out_filter_score_min_over_lread: f64,
5052
}
5153

5254
impl AlignmentScorer {
@@ -73,6 +75,7 @@ impl AlignmentScorer {
7375
score_stitch_sj_shift: 1,
7476
align_spliced_mate_map_lmin: 0,
7577
align_spliced_mate_map_lmin_over_lmate: 0.66,
78+
out_filter_score_min_over_lread: 0.66,
7679
}
7780
}
7881

@@ -110,6 +113,7 @@ impl AlignmentScorer {
110113
score_stitch_sj_shift: params.score_stitch_sj_shift,
111114
align_spliced_mate_map_lmin: params.align_spliced_mate_map_lmin,
112115
align_spliced_mate_map_lmin_over_lmate: params.align_spliced_mate_map_lmin_over_lmate,
116+
out_filter_score_min_over_lread: params.out_filter_score_min_over_lread,
113117
}
114118
}
115119

@@ -704,6 +708,7 @@ mod tests {
704708
score_stitch_sj_shift: 1,
705709
align_spliced_mate_map_lmin: 0,
706710
align_spliced_mate_map_lmin_over_lmate: 0.66,
711+
out_filter_score_min_over_lread: 0.66,
707712
};
708713

709714
// Intron from position 2, length 12 (spans positions 2-13 inclusive)
@@ -747,6 +752,7 @@ mod tests {
747752
score_stitch_sj_shift: 1,
748753
align_spliced_mate_map_lmin: 0,
749754
align_spliced_mate_map_lmin_over_lmate: 0.66,
755+
out_filter_score_min_over_lread: 0.66,
750756
};
751757

752758
let motif = scorer.detect_splice_motif(2, 12, &genome);
@@ -789,6 +795,7 @@ mod tests {
789795
score_stitch_sj_shift: 1,
790796
align_spliced_mate_map_lmin: 0,
791797
align_spliced_mate_map_lmin_over_lmate: 0.66,
798+
out_filter_score_min_over_lread: 0.66,
792799
};
793800

794801
let motif = scorer.detect_splice_motif(2, 12, &genome);
@@ -829,6 +836,7 @@ mod tests {
829836
score_stitch_sj_shift: 1,
830837
align_spliced_mate_map_lmin: 0,
831838
align_spliced_mate_map_lmin_over_lmate: 0.66,
839+
out_filter_score_min_over_lread: 0.66,
832840
};
833841

834842
let motif = scorer.detect_splice_motif(2, 12, &genome);
@@ -862,6 +870,7 @@ mod tests {
862870
score_stitch_sj_shift: 1,
863871
align_spliced_mate_map_lmin: 0,
864872
align_spliced_mate_map_lmin_over_lmate: 0.66,
873+
out_filter_score_min_over_lread: 0.66,
865874
};
866875

867876
let (score, gap_type) = scorer.score_gap(0, 5, 0, &genome);
@@ -893,6 +902,7 @@ mod tests {
893902
score_stitch_sj_shift: 1,
894903
align_spliced_mate_map_lmin: 0,
895904
align_spliced_mate_map_lmin_over_lmate: 0.66,
905+
out_filter_score_min_over_lread: 0.66,
896906
};
897907

898908
// Small gap (< align_intron_min) is deletion
@@ -932,6 +942,7 @@ mod tests {
932942
score_stitch_sj_shift: 1,
933943
align_spliced_mate_map_lmin: 0,
934944
align_spliced_mate_map_lmin_over_lmate: 0.66,
945+
out_filter_score_min_over_lread: 0.66,
935946
};
936947

937948
// Gap starting at position 2 (GT), length 26 (>= 21) is splice junction
@@ -969,6 +980,7 @@ mod tests {
969980
score_stitch_sj_shift: 1,
970981
align_spliced_mate_map_lmin: 0,
971982
align_spliced_mate_map_lmin_over_lmate: 0.66,
983+
out_filter_score_min_over_lread: 0.66,
972984
};
973985

974986
// Annotated junction should get bonus
@@ -1010,6 +1022,7 @@ mod tests {
10101022
score_stitch_sj_shift: 1,
10111023
align_spliced_mate_map_lmin: 0,
10121024
align_spliced_mate_map_lmin_over_lmate: 0.66,
1025+
out_filter_score_min_over_lread: 0.66,
10131026
};
10141027

10151028
// CT-AC motif: (1,3,0,1) — reverse complement of GT-AG
@@ -1114,6 +1127,7 @@ mod tests {
11141127
score_stitch_sj_shift: 1,
11151128
align_spliced_mate_map_lmin: 0,
11161129
align_spliced_mate_map_lmin_over_lmate: 0.66,
1130+
out_filter_score_min_over_lread: 0.66,
11171131
};
11181132

11191133
// Gap of exactly 589824 starting at position 100 should be splice junction
@@ -1192,6 +1206,7 @@ mod tests {
11921206
score_stitch_sj_shift: 1,
11931207
align_spliced_mate_map_lmin: 0,
11941208
align_spliced_mate_map_lmin_over_lmate: 0.66,
1209+
out_filter_score_min_over_lread: 0.66,
11951210
};
11961211

11971212
// Gap of 1001 (> 1000 max) should be deletion, not splice junction
@@ -1242,6 +1257,7 @@ mod tests {
12421257
score_stitch_sj_shift: 1,
12431258
align_spliced_mate_map_lmin: 0,
12441259
align_spliced_mate_map_lmin_over_lmate: 0.66,
1260+
out_filter_score_min_over_lread: 0.66,
12451261
}
12461262
}
12471263

src/align/stitch.rs

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,10 @@ pub struct WindowAlignment {
325325
pub is_anchor: bool,
326326
/// Mate ID: 0=mate1, 1=mate2, 2=SE (STAR: PC[iP][PC_iFrag] / WA[iW][iS][WA_iFrag])
327327
pub mate_id: u8,
328+
/// Pre-computed extension score estimate (STAR: scoreSeedBest[iS] base case).
329+
/// = length + left_ext_score + right_ext_score (in stitch coords, forward strand).
330+
/// Computed in stitch_seeds_core after dedup/sort. Default: length as i32.
331+
pub pre_ext_score: i32,
328332
}
329333

330334
/// A cluster of seeds mapping to the same genomic region
@@ -777,6 +781,7 @@ pub fn cluster_seeds(
777781
n_rep: n_loci,
778782
is_anchor: is_anchor_seed,
779783
mate_id: seed.mate_id,
784+
pre_ext_score: length as i32,
780785
},
781786
);
782787
}
@@ -844,6 +849,7 @@ pub fn cluster_seeds(
844849
n_rep: n_loci,
845850
is_anchor: is_anchor_seed,
846851
mate_id: seed.mate_id,
852+
pre_ext_score: length as i32,
847853
},
848854
);
849855
}
@@ -2599,6 +2605,135 @@ fn stitch_seeds_core(
25992605
// Find last anchor index for the anchor constraint
26002606
let last_anchor_idx = wa_entries.iter().rposition(|wa| wa.is_anchor);
26012607

2608+
// --- STAR stitchWindowSeeds.cpp: scoreSeedBest pre-extension ---
2609+
//
2610+
// Phase 1: Pre-extend each seed left and right (base case of scoreSeedBest DP).
2611+
// Uses stitch_read (forward strand, forward genome coords after coord conversion above).
2612+
// EXTEND_ORDER=1: for forward reads, left extension first (5' of read), then right.
2613+
// for reverse reads, right extension first (5' of read in RC), then left.
2614+
// This matches stitch_recurse's `do_left_first = !original_is_reverse`.
2615+
// Mismatch budget for second ext carries over from first ext (STAR: scoreSeedBestMM[iS1]).
2616+
{
2617+
let zero_ext = ExtendResult { extend_len: 0, max_score: 0, n_mismatch: 0 };
2618+
let do_left_first = !stitch_is_reverse;
2619+
for wa in &mut wa_entries {
2620+
let right_start = wa.read_pos + wa.length;
2621+
2622+
let (first_ext, second_ext) = if do_left_first {
2623+
// Forward cluster: left ext first, then right
2624+
let left = if wa.read_pos > 0 {
2625+
extend_alignment(
2626+
stitch_read,
2627+
wa.read_pos,
2628+
wa.sa_pos,
2629+
-1,
2630+
wa.read_pos,
2631+
0,
2632+
wa.length,
2633+
scorer.n_mm_max,
2634+
scorer.p_mm_max,
2635+
index,
2636+
false,
2637+
)
2638+
} else {
2639+
zero_ext.clone()
2640+
};
2641+
let right_len_prev = wa.length + left.extend_len;
2642+
let right = if right_start < stitch_read.len() {
2643+
extend_alignment(
2644+
stitch_read,
2645+
right_start,
2646+
wa.sa_pos + wa.length as u64,
2647+
1,
2648+
stitch_read.len() - right_start,
2649+
left.n_mismatch,
2650+
right_len_prev,
2651+
scorer.n_mm_max,
2652+
scorer.p_mm_max,
2653+
index,
2654+
false,
2655+
)
2656+
} else {
2657+
zero_ext.clone()
2658+
};
2659+
(left, right)
2660+
} else {
2661+
// Reverse cluster: right ext first (5' of RC read), then left
2662+
let right = if right_start < stitch_read.len() {
2663+
extend_alignment(
2664+
stitch_read,
2665+
right_start,
2666+
wa.sa_pos + wa.length as u64,
2667+
1,
2668+
stitch_read.len() - right_start,
2669+
0,
2670+
wa.length,
2671+
scorer.n_mm_max,
2672+
scorer.p_mm_max,
2673+
index,
2674+
false,
2675+
)
2676+
} else {
2677+
zero_ext.clone()
2678+
};
2679+
let left_len_prev = wa.length + right.extend_len;
2680+
let left = if wa.read_pos > 0 {
2681+
extend_alignment(
2682+
stitch_read,
2683+
wa.read_pos,
2684+
wa.sa_pos,
2685+
-1,
2686+
wa.read_pos,
2687+
right.n_mismatch,
2688+
left_len_prev,
2689+
scorer.n_mm_max,
2690+
scorer.p_mm_max,
2691+
index,
2692+
false,
2693+
)
2694+
} else {
2695+
zero_ext.clone()
2696+
};
2697+
(left, right)
2698+
};
2699+
2700+
wa.pre_ext_score = wa.length as i32 + first_ext.max_score + second_ext.max_score;
2701+
}
2702+
}
2703+
2704+
// Phase 2: DP chain over pre-extended scores (STAR: scoreSeedBest chain accumulation).
2705+
// dp[i] = best chain score ending at wa_entries[i].
2706+
// For intra-read gaps: if read_gap != genome_gap this is a potential splice — 0 penalty.
2707+
// For exact-match gaps (read_gap == genome_gap): score the gap as mismatches only
2708+
// (lenient: use 0 since exact-match gaps are handled by the actual DP later).
2709+
// Both cases produce an overestimate — conservative upper bound matching STAR's approach.
2710+
let best_pre_score = {
2711+
let mut dp: Vec<i32> = wa_entries.iter().map(|wa| wa.pre_ext_score).collect();
2712+
for i in 1..wa_entries.len() {
2713+
for j in 0..i {
2714+
let rj_end = wa_entries[j].read_pos + wa_entries[j].length;
2715+
let gj_end = wa_entries[j].sa_pos + wa_entries[j].length as u64;
2716+
// Must not overlap in read or genome
2717+
if wa_entries[i].read_pos < rj_end || wa_entries[i].sa_pos < gj_end {
2718+
continue;
2719+
}
2720+
// Gap penalty: 0 for potential splices (read_gap != genome_gap) or short gaps
2721+
// This prevents underestimating spliced reads whose per-seed pre_ext_score is
2722+
// limited by the splice junction (extension stops at the intron boundary).
2723+
let chain = dp[j] + wa_entries[i].pre_ext_score;
2724+
if chain > dp[i] {
2725+
dp[i] = chain;
2726+
}
2727+
}
2728+
}
2729+
dp.into_iter().max().unwrap_or(0)
2730+
};
2731+
2732+
// Note: scoreSeedBest (best_pre_score) is used by STAR for seed ordering within
2733+
// stitchWindowAligns, NOT as a hard pre-filter gate. We keep pre_ext_score on each WA
2734+
// entry for future use in seed ordering (Phase B), but do not filter here.
2735+
let _ = best_pre_score; // suppress unused warning
2736+
26022737
// Run recursive include/exclude stitcher
26032738
let mut working_transcripts: Vec<WorkingTranscript> = Vec::new();
26042739
let mut recursion_count: u32 = 0;
@@ -2923,6 +3058,7 @@ mod tests {
29233058
n_rep: 1,
29243059
is_anchor: true,
29253060
mate_id: 2,
3061+
pre_ext_score: 5,
29263062
},
29273063
WindowAlignment {
29283064
seed_idx: 1,
@@ -2933,6 +3069,7 @@ mod tests {
29333069
n_rep: 1,
29343070
is_anchor: true,
29353071
mate_id: 2,
3072+
pre_ext_score: 5,
29363073
},
29373074
];
29383075

@@ -3176,6 +3313,7 @@ mod tests {
31763313
score_stitch_sj_shift: 1,
31773314
align_spliced_mate_map_lmin: 0,
31783315
align_spliced_mate_map_lmin_over_lmate: 0.66,
3316+
out_filter_score_min_over_lread: 0.66,
31793317
};
31803318

31813319
// Left overhang (prev.length) = 3, below min of 5
@@ -3218,6 +3356,7 @@ mod tests {
32183356
score_stitch_sj_shift: 1,
32193357
align_spliced_mate_map_lmin: 0,
32203358
align_spliced_mate_map_lmin_over_lmate: 0.66,
3359+
out_filter_score_min_over_lread: 0.66,
32213360
};
32223361

32233362
// Both overhangs >= 5

src/chimeric/detect.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,7 @@ mod tests {
258258
n_rep: 1,
259259
is_anchor: true,
260260
mate_id: 2,
261+
pre_ext_score: (genome_end - genome_start) as i32,
261262
}],
262263
chr_idx,
263264
genome_start,

0 commit comments

Comments
 (0)