22use crate :: align:: score:: { AlignmentScorer , SpliceMotif } ;
33use crate :: align:: seed:: Seed ;
44use crate :: align:: stitch:: {
5- cluster_seeds, finalize_transcript, split_combined_wt, stitch_seeds_core,
6- stitch_seeds_with_jdb_debug, PE_SPACER_BASE ,
5+ PE_SPACER_BASE , cluster_seeds, finalize_transcript, split_combined_wt, stitch_seeds_core,
6+ stitch_seeds_with_jdb_debug,
77} ;
8- use crate :: align:: transcript:: Transcript ;
8+ use crate :: align:: transcript:: { Exon , Transcript } ;
99use crate :: error:: Error ;
1010use crate :: index:: GenomeIndex ;
1111use crate :: params:: { IntronMotifFilter , IntronStrandFilter , Parameters } ;
@@ -20,7 +20,7 @@ use std::hash::{DefaultHasher, Hash, Hasher};
2020/// via rayon, so we instead fold the read name into the seed — this keeps tie
2121/// breaks reproducible regardless of thread count while still honoring the
2222/// user's `--runRNGseed` value.
23- fn per_read_seed ( run_rng_seed : u64 , read_name : & str ) -> u64 {
23+ pub ( crate ) fn per_read_seed ( run_rng_seed : u64 , read_name : & str ) -> u64 {
2424 let mut hasher = DefaultHasher :: new ( ) ;
2525 read_name. hash ( & mut hasher) ;
2626 run_rng_seed. wrapping_mul ( hasher. finish ( ) . wrapping_add ( 1 ) )
@@ -74,6 +74,52 @@ pub struct PairedAlignment {
7474 pub combined_n_match : u32 ,
7575}
7676
77+ impl PairedAlignment {
78+ /// Build a STAR-style combined two-mate `Transcript` for transcriptome
79+ /// projection.
80+ ///
81+ /// Matches STAR's single-`Transcript`-per-pair model: mate1 exons with
82+ /// `i_frag = 0`, then mate2 exons rewritten to `i_frag = 1`. Only
83+ /// meaningful for pairs on the same chromosome and strand — both are
84+ /// invariants of a `PairedAlignment` (checked in `try_pair_transcripts`).
85+ ///
86+ /// The returned transcript's `cigar` is empty: transcriptome BAM
87+ /// emission generates per-mate CIGARs from the split exon list rather
88+ /// than consuming a combined one.
89+ pub fn combined_transcript_for_projection ( & self ) -> Transcript {
90+ let m1 = & self . mate1_transcript ;
91+ let m2 = & self . mate2_transcript ;
92+
93+ let mut exons: Vec < Exon > = Vec :: with_capacity ( m1. exons . len ( ) + m2. exons . len ( ) ) ;
94+ for e in & m1. exons {
95+ let mut ee = e. clone ( ) ;
96+ ee. i_frag = 0 ;
97+ exons. push ( ee) ;
98+ }
99+ for e in & m2. exons {
100+ let mut ee = e. clone ( ) ;
101+ ee. i_frag = 1 ;
102+ exons. push ( ee) ;
103+ }
104+
105+ Transcript {
106+ chr_idx : m1. chr_idx ,
107+ genome_start : m1. genome_start . min ( m2. genome_start ) ,
108+ genome_end : m1. genome_end . max ( m2. genome_end ) ,
109+ is_reverse : m1. is_reverse ,
110+ exons,
111+ cigar : Vec :: new ( ) ,
112+ score : m1. score + m2. score ,
113+ n_mismatch : m1. n_mismatch + m2. n_mismatch ,
114+ n_gap : m1. n_gap + m2. n_gap ,
115+ n_junction : m1. n_junction + m2. n_junction ,
116+ junction_motifs : Vec :: new ( ) ,
117+ junction_annotated : Vec :: new ( ) ,
118+ read_seq : Vec :: new ( ) ,
119+ }
120+ }
121+ }
122+
77123/// Result of paired-end alignment, covering all mapping outcomes.
78124#[ derive( Debug , Clone ) ]
79125pub enum PairedAlignmentResult {
@@ -634,8 +680,13 @@ pub fn align_paired_read(
634680 // PE_SPACER_BASE=11 stops MMP search at the boundary — seeds never span mates.
635681 // For 301bp: Nstart=7 (301/50+1). Position 129 gives ≤21bp seeds (spacer at 150)
636682 // → less likely unique for repetitive (rDNA) sequences → fewer N² cross-copy pairings.
637- let mut combined_seeds =
638- Seed :: find_seeds ( & combined_read, index, params. seed_map_min , params, debug_name) ?;
683+ let mut combined_seeds = Seed :: find_seeds (
684+ & combined_read,
685+ index,
686+ params. seed_map_min ,
687+ params,
688+ debug_name,
689+ ) ?;
639690 // After find_seeds, ALL seeds (search_rc=false and search_rc=true) have read_pos
640691 // in combined_read coordinates. The RC conversion in search_direction_sparse:
641692 // seed.read_pos = combined_len - p - L (where p is position in RC(combined_read))
@@ -670,13 +721,7 @@ pub fn align_paired_read(
670721 ) ?;
671722
672723 for wt in & wts {
673- match split_combined_wt (
674- wt,
675- len1,
676- len2,
677- stitch_is_reverse,
678- scorer. align_intron_min ,
679- ) {
724+ match split_combined_wt ( wt, len1, len2, stitch_is_reverse, scorer. align_intron_min ) {
680725 Some ( ( m1_wt, m2_wt) ) => {
681726 let ( m1_read_slice, m1_orig_rev, m2_read_slice, m2_orig_rev) =
682727 if stitch_is_reverse {
@@ -706,8 +751,8 @@ pub fn align_paired_read(
706751 & scorer,
707752 & stitch_cluster,
708753 m1_orig_rev,
709- m1_orig_rev, // no_left_ext = inner for reverse (orig_is_rev=true)
710- !m1_orig_rev, // no_right_ext = inner for forward (orig_is_rev=false)
754+ m1_orig_rev, // no_left_ext = inner for reverse (orig_is_rev=true)
755+ !m1_orig_rev, // no_right_ext = inner for forward (orig_is_rev=false)
711756 ) else {
712757 continue ;
713758 } ;
@@ -718,8 +763,8 @@ pub fn align_paired_read(
718763 & scorer,
719764 & stitch_cluster,
720765 m2_orig_rev,
721- m2_orig_rev, // no_left_ext = inner for reverse (orig_is_rev=true)
722- !m2_orig_rev, // no_right_ext = inner for forward (orig_is_rev=false)
766+ m2_orig_rev, // no_left_ext = inner for reverse (orig_is_rev=true)
767+ !m2_orig_rev, // no_right_ext = inner for forward (orig_is_rev=false)
723768 ) else {
724769 continue ;
725770 } ;
@@ -736,9 +781,7 @@ pub fn align_paired_read(
736781
737782 let combined_span =
738783 t1. genome_end . max ( t2. genome_end ) - t1. genome_start . min ( t2. genome_start ) ;
739- let combined_score_override = Some (
740- wt. score + scorer. genomic_length_penalty ( combined_span) ,
741- ) ;
784+ let combined_wt_score = wt. score + scorer. genomic_length_penalty ( combined_span) ;
742785
743786 if let Some ( pair) = try_pair_transcripts (
744787 & t1,
@@ -747,8 +790,7 @@ pub fn align_paired_read(
747790 len2,
748791 params,
749792 combined_score_threshold,
750- & scorer,
751- combined_score_override,
793+ combined_wt_score,
752794 ) {
753795 joint_pairs. push ( pair) ;
754796 }
@@ -803,7 +845,6 @@ pub fn align_paired_read(
803845 }
804846 }
805847
806-
807848 // --- Decision tree: dedup, score-filter, quality-filter, then half-mapped fallback ---
808849
809850 // Step 1: position dedup — remove exact (chr, mate1_pos, mate2_pos, strand, CIGAR) duplicates.
@@ -961,19 +1002,17 @@ pub fn align_paired_read(
9611002 }
9621003
9631004 // Half-mapped fallback: report the best-scoring single-mate transcript.
964- // STAR applies quality filter to the COMBINED read (Lread-1 = len1+len2), so use the
965- // same combined_score_threshold here. This matches STAR's behavior where a single-mate
966- // alignment that falls below the combined threshold is not output at all.
967- let thresh1 = combined_score_threshold. max ( params. out_filter_score_min ) ;
968- let thresh2 = combined_score_threshold. max ( params. out_filter_score_min ) ;
1005+ // STAR applies the quality filter to the COMBINED read (Lread-1 = len1+len2), so we
1006+ // use the same threshold for each mate here.
1007+ let single_mate_threshold = combined_score_threshold. max ( params. out_filter_score_min ) ;
9691008
9701009 let best_m1 = single_mate1_transcripts
9711010 . into_iter ( )
972- . filter ( |t| t. score >= thresh1 )
1011+ . filter ( |t| t. score >= single_mate_threshold )
9731012 . max_by_key ( |t| t. score ) ;
9741013 let best_m2 = single_mate2_transcripts
9751014 . into_iter ( )
976- . filter ( |t| t. score >= thresh2 )
1015+ . filter ( |t| t. score >= single_mate_threshold )
9771016 . max_by_key ( |t| t. score ) ;
9781017
9791018 match ( best_m1, best_m2) {
@@ -1031,8 +1070,7 @@ fn try_pair_transcripts(
10311070 len2 : usize ,
10321071 params : & Parameters ,
10331072 combined_score_threshold : i32 ,
1034- scorer : & AlignmentScorer ,
1035- combined_wt_score_override : Option < i32 > ,
1073+ combined_wt_score : i32 ,
10361074) -> Option < PairedAlignment > {
10371075 // Must be same chromosome
10381076 if t1. chr_idx != t2. chr_idx {
@@ -1067,20 +1105,6 @@ fn try_pair_transcripts(
10671105 return None ;
10681106 }
10691107
1070- // Combined score: use override if provided (combined-read path supplies the original
1071- // stitch_recurse score + combined-span penalty, which correctly includes mismatch
1072- // penalties). Fallback formula (per-mate path) undoes per-mate span penalties and
1073- // applies a single combined-span penalty.
1074- let combined_span = right. genome_end - left. genome_start ;
1075- let combined_wt_score = combined_wt_score_override. unwrap_or_else ( || {
1076- let t1_span = t1. genome_end - t1. genome_start ;
1077- let t2_span = t2. genome_end - t2. genome_start ;
1078- let p1 = scorer. genomic_length_penalty ( t1_span) ;
1079- let p2 = scorer. genomic_length_penalty ( t2_span) ;
1080- let combined_p = scorer. genomic_length_penalty ( combined_span) ;
1081- t1. score + t2. score - p1 - p2 + combined_p
1082- } ) ;
1083-
10841108 // SCORE-GATE: reject pairs where score is below the absolute floor
10851109 if combined_wt_score + params. out_filter_multimap_score_range < combined_score_threshold {
10861110 return None ;
@@ -1331,9 +1355,54 @@ mod tests {
13311355 suffix_array,
13321356 sa_index,
13331357 junction_db : crate :: junction:: SpliceJunctionDb :: empty ( ) ,
1358+ transcriptome : None ,
13341359 }
13351360 }
13361361
1362+ #[ test]
1363+ fn combined_transcript_for_projection_rewrites_mate2_ifrag ( ) {
1364+ use crate :: align:: transcript:: CigarOp ;
1365+
1366+ let make_tr = |gs : u64 , ge : u64 , rs : usize , re : usize | Transcript {
1367+ chr_idx : 0 ,
1368+ genome_start : gs,
1369+ genome_end : ge,
1370+ is_reverse : false ,
1371+ exons : vec ! [ Exon {
1372+ genome_start: gs,
1373+ genome_end: ge,
1374+ read_start: rs,
1375+ read_end: re,
1376+ i_frag: 0 ,
1377+ } ] ,
1378+ cigar : vec ! [ CigarOp :: Match ( ( ge - gs) as u32 ) ] ,
1379+ score : 100 ,
1380+ n_mismatch : 0 ,
1381+ n_gap : 0 ,
1382+ n_junction : 0 ,
1383+ junction_motifs : vec ! [ ] ,
1384+ junction_annotated : vec ! [ ] ,
1385+ read_seq : vec ! [ ] ,
1386+ } ;
1387+ let pair = PairedAlignment {
1388+ mate1_transcript : make_tr ( 1000 , 1100 , 0 , 100 ) ,
1389+ mate2_transcript : make_tr ( 1300 , 1400 , 0 , 100 ) ,
1390+ mate1_region : ( 0 , 100 ) ,
1391+ mate2_region : ( 0 , 100 ) ,
1392+ is_proper_pair : true ,
1393+ insert_size : 400 ,
1394+ combined_wt_score : 200 ,
1395+ combined_n_match : 200 ,
1396+ } ;
1397+ let combined = pair. combined_transcript_for_projection ( ) ;
1398+ assert_eq ! ( combined. exons. len( ) , 2 ) ;
1399+ assert_eq ! ( combined. exons[ 0 ] . i_frag, 0 ) ;
1400+ assert_eq ! ( combined. exons[ 1 ] . i_frag, 1 ) ;
1401+ assert_eq ! ( combined. genome_start, 1000 ) ;
1402+ assert_eq ! ( combined. genome_end, 1400 ) ;
1403+ assert_eq ! ( combined. score, 200 ) ;
1404+ }
1405+
13371406 #[ test]
13381407 fn test_align_read_no_seeds ( ) {
13391408 let index = make_test_index ( ) ;
@@ -1424,6 +1493,7 @@ mod tests {
14241493 genome_end: 1100 ,
14251494 read_start: 0 ,
14261495 read_end: 100 ,
1496+ i_frag: 0 ,
14271497 } ] ,
14281498 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
14291499 score : 100 ,
@@ -1445,6 +1515,7 @@ mod tests {
14451515 genome_end: 1300 ,
14461516 read_start: 0 ,
14471517 read_end: 100 ,
1518+ i_frag: 0 ,
14481519 } ] ,
14491520 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
14501521 score : 100 ,
@@ -1477,6 +1548,7 @@ mod tests {
14771548 genome_end: 1100 ,
14781549 read_start: 0 ,
14791550 read_end: 100 ,
1551+ i_frag: 0 ,
14801552 } ] ,
14811553 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
14821554 score : 100 ,
@@ -1498,6 +1570,7 @@ mod tests {
14981570 genome_end: 1400 ,
14991571 read_start: 0 ,
15001572 read_end: 100 ,
1573+ i_frag: 0 ,
15011574 } ] ,
15021575 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
15031576 score : 100 ,
@@ -1528,6 +1601,7 @@ mod tests {
15281601 genome_end: 1100 ,
15291602 read_start: 0 ,
15301603 read_end: 100 ,
1604+ i_frag: 0 ,
15311605 } ] ,
15321606 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
15331607 score : 100 ,
@@ -1549,6 +1623,7 @@ mod tests {
15491623 genome_end: 1300 ,
15501624 read_start: 0 ,
15511625 read_end: 100 ,
1626+ i_frag: 0 ,
15521627 } ] ,
15531628 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
15541629 score : 100 ,
@@ -1580,6 +1655,7 @@ mod tests {
15801655 genome_end: 1300 ,
15811656 read_start: 0 ,
15821657 read_end: 100 ,
1658+ i_frag: 0 ,
15831659 } ] ,
15841660 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
15851661 score : 100 ,
@@ -1602,6 +1678,7 @@ mod tests {
16021678 genome_end: 1300 ,
16031679 read_start: 0 ,
16041680 read_end: 100 ,
1681+ i_frag: 0 ,
16051682 } ] ,
16061683 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
16071684 score : 100 ,
@@ -1679,6 +1756,7 @@ mod tests {
16791756 genome_end: 1300 ,
16801757 read_start: 0 ,
16811758 read_end: 100 ,
1759+ i_frag: 0 ,
16821760 } ] ,
16831761 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
16841762 score : 100 ,
@@ -1700,6 +1778,7 @@ mod tests {
17001778 genome_end: 1100 ,
17011779 read_start: 0 ,
17021780 read_end: 100 ,
1781+ i_frag: 0 ,
17031782 } ] ,
17041783 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
17051784 score : 100 ,
@@ -1739,6 +1818,7 @@ mod tests {
17391818 genome_end: 1200 ,
17401819 read_start: 0 ,
17411820 read_end: 100 ,
1821+ i_frag: 0 ,
17421822 } ] ,
17431823 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
17441824 score : 100 ,
@@ -1823,6 +1903,7 @@ mod tests {
18231903 genome_end: 1100 ,
18241904 read_start: 0 ,
18251905 read_end: 100 ,
1906+ i_frag: 0 ,
18261907 } ] ,
18271908 cigar : vec ! [ CigarOp :: Match ( 100 ) ] ,
18281909 score : 100 ,
0 commit comments