Skip to content

Commit f6df086

Browse files
authored
chore: upstream cigar (#57)
1 parent e0a10fe commit f6df086

13 files changed

Lines changed: 580 additions & 588 deletions

File tree

src/align/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ pub use read_align::{
1010
};
1111
pub use seed::Seed;
1212
pub use stitch::{SeedCluster, WindowAlignment, stitch_seeds};
13-
pub use transcript::{CigarOp, Exon, Transcript};
13+
pub use transcript::{Exon, Transcript};

src/align/read_align.rs

Lines changed: 60 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ use crate::align::stitch::{
55
PE_SPACER_BASE, cluster_seeds, finalize_transcript, split_combined_wt, stitch_seeds_core,
66
stitch_seeds_with_jdb_debug,
77
};
8-
use crate::align::transcript::{Exon, Transcript};
8+
use crate::align::transcript::{Exon, KindExt as _, Transcript};
99
use crate::error::Error;
1010
use crate::index::GenomeIndex;
1111
use crate::params::{IntronMotifFilter, IntronStrandFilter, Parameters};
1212
use crate::stats::UnmappedReason;
13+
use noodles::sam::alignment::record::cigar;
1314
use rand::{SeedableRng, rngs::StdRng, seq::SliceRandom};
15+
use std::cmp::Ordering;
1416
use std::hash::{DefaultHasher, Hash, Hasher};
1517

1618
/// Derive a deterministic per-read RNG seed from `run_rng_seed` + the read name.
@@ -323,7 +325,6 @@ pub fn align_read(
323325
} else {
324326
"unknown"
325327
};
326-
let cigar_str: String = t.cigar.iter().map(|op| format!("{}", op)).collect();
327328
eprintln!(
328329
" transcript[{}]: chr={}:{}-{} ({}) score={} mm={} junctions={} cigar={}",
329330
ti,
@@ -334,7 +335,7 @@ pub fn align_read(
334335
t.score,
335336
t.n_mismatch,
336337
t.n_junction,
337-
cigar_str
338+
t.cigar_string()
338339
);
339340
}
340341
}
@@ -351,20 +352,9 @@ pub fn align_read(
351352

352353
// Deduplicate transcripts with identical genomic coordinates AND CIGAR.
353354
transcripts.sort_by(|a, b| {
354-
(
355-
a.chr_idx,
356-
a.genome_start,
357-
a.genome_end,
358-
a.is_reverse,
359-
&a.cigar,
360-
)
361-
.cmp(&(
362-
b.chr_idx,
363-
b.genome_start,
364-
b.genome_end,
365-
b.is_reverse,
366-
&b.cigar,
367-
))
355+
(a.chr_idx, a.genome_start, a.genome_end, a.is_reverse)
356+
.cmp(&(b.chr_idx, b.genome_start, b.genome_end, b.is_reverse))
357+
.then_with(|| cmp_cigar(&a.cigar, &b.cigar))
368358
.then_with(|| b.score.cmp(&a.score))
369359
});
370360
transcripts.dedup_by(|a, b| {
@@ -467,13 +457,13 @@ pub fn align_read(
467457

468458
// Absolute matched bases
469459
let n_matched = t.n_matched();
470-
if n_matched < params.out_filter_match_nmin {
460+
if n_matched < params.out_filter_match_nmin as usize {
471461
*filter_reasons.entry("match_min").or_insert(0) += 1;
472462
return false;
473463
}
474464

475465
// Relative matched bases: STAR casts to uint (u32)
476-
if n_matched < (params.out_filter_match_nmin_over_lread * lread_m1) as u32 {
466+
if (n_matched as f64) < params.out_filter_match_nmin_over_lread * lread_m1 {
477467
*filter_reasons.entry("match_min_relative").or_insert(0) += 1;
478468
return false;
479469
}
@@ -519,7 +509,6 @@ pub fn align_read(
519509
match motif.implied_strand() {
520510
Some('+') => has_plus = true,
521511
Some('-') => has_minus = true,
522-
None => {}
523512
_ => {}
524513
}
525514
}
@@ -633,7 +622,6 @@ pub fn align_read(
633622
} else {
634623
"unknown"
635624
};
636-
let cigar_str: String = t.cigar.iter().map(|op| format!("{}", op)).collect();
637625
eprintln!(
638626
" FINAL[{}]: chr={}:{}-{} ({}) score={} mm={} junctions={} cigar={}",
639627
i,
@@ -644,7 +632,7 @@ pub fn align_read(
644632
t.score,
645633
t.n_mismatch,
646634
t.n_junction,
647-
cigar_str
635+
t.cigar_string()
648636
);
649637
}
650638
}
@@ -989,8 +977,8 @@ pub fn align_paired_read(
989977
}
990978
b.combined_wt_score
991979
.cmp(&a.combined_wt_score)
992-
.then_with(|| a.mate1_transcript.cigar.cmp(&b.mate1_transcript.cigar))
993-
.then_with(|| a.mate2_transcript.cigar.cmp(&b.mate2_transcript.cigar))
980+
.then_with(|| cmp_cigar(&a.mate1_transcript.cigar, &b.mate1_transcript.cigar))
981+
.then_with(|| cmp_cigar(&a.mate2_transcript.cigar, &b.mate2_transcript.cigar))
994982
});
995983
joint_pairs.dedup_by(|a, b| {
996984
a.mate1_transcript.chr_idx == b.mate1_transcript.chr_idx
@@ -1234,6 +1222,18 @@ pub fn align_paired_read(
12341222
}
12351223
}
12361224

1225+
/// Comparator for deduplicating structs containing CIGAR ops via sort→dedup
1226+
fn cmp_cigar(a: &[cigar::Op], b: &[cigar::Op]) -> Ordering {
1227+
a.len().cmp(&b.len()).then_with(|| {
1228+
(a.iter().zip(b.iter()))
1229+
.find_map(|(a, b)| {
1230+
let ord = (a.char(), a.len()).cmp(&(b.char(), b.len()));
1231+
(ord != Ordering::Equal).then_some(ord)
1232+
})
1233+
.unwrap_or(Ordering::Equal)
1234+
})
1235+
}
1236+
12371237
/// Attempt to pair two per-mate transcripts into a PairedAlignment.
12381238
///
12391239
/// Returns `None` if the mates are incompatible (same strand, different chr, too far, etc.).
@@ -1416,18 +1416,18 @@ fn extract_junctions_from_cigar(t: &Transcript) -> Vec<(u64, u64)> {
14161416
let mut junctions = Vec::new();
14171417
let mut genome_pos = t.genome_start;
14181418
for op in &t.cigar {
1419-
use crate::align::transcript::CigarOp;
1420-
match op {
1421-
CigarOp::Match(n) | CigarOp::Equal(n) | CigarOp::Diff(n) | CigarOp::Del(n) => {
1422-
genome_pos += *n as u64;
1419+
use noodles::sam::alignment::record::cigar::op::Kind;
1420+
match op.kind() {
1421+
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => {
1422+
genome_pos += op.len() as u64;
14231423
}
1424-
CigarOp::RefSkip(n) => {
1424+
Kind::Skip => {
14251425
let donor = genome_pos;
1426-
let acceptor = genome_pos + *n as u64;
1426+
let acceptor = genome_pos + op.len() as u64;
14271427
junctions.push((donor, acceptor));
14281428
genome_pos = acceptor;
14291429
}
1430-
CigarOp::Ins(_) | CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {}
1430+
Kind::Insertion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {}
14311431
}
14321432
}
14331433
junctions
@@ -1479,6 +1479,7 @@ mod tests {
14791479
use crate::index::sa_index::SaIndex;
14801480
use crate::index::suffix_array::SuffixArray;
14811481
use clap::Parser;
1482+
use noodles::sam::alignment::record::cigar;
14821483

14831484
fn make_test_params() -> Parameters {
14841485
// Parse empty args to get default parameters
@@ -1537,8 +1538,7 @@ mod tests {
15371538

15381539
#[test]
15391540
fn combined_transcript_for_projection_rewrites_mate2_ifrag() {
1540-
use crate::align::transcript::CigarOp;
1541-
1541+
use cigar::op::{Kind, Op};
15421542
let make_tr = |gs: u64, ge: u64, rs: usize, re: usize| Transcript {
15431543
chr_idx: 0,
15441544
genome_start: gs,
@@ -1551,7 +1551,7 @@ mod tests {
15511551
read_end: re,
15521552
i_frag: 0,
15531553
}],
1554-
cigar: vec![CigarOp::Match((ge - gs) as u32)],
1554+
cigar: vec![Op::new(Kind::Match, (ge - gs) as usize)],
15551555
score: 100,
15561556
n_mismatch: 0,
15571557
n_gap: 0,
@@ -1654,7 +1654,8 @@ mod tests {
16541654

16551655
#[test]
16561656
fn test_check_proper_pair_distance() {
1657-
use crate::align::transcript::{CigarOp, Exon};
1657+
use crate::align::transcript::Exon;
1658+
use cigar::op::{Kind, Op};
16581659

16591660
let params = make_test_params();
16601661

@@ -1671,7 +1672,7 @@ mod tests {
16711672
read_end: 100,
16721673
i_frag: 0,
16731674
}],
1674-
cigar: vec![CigarOp::Match(100)],
1675+
cigar: vec![Op::new(Kind::Match, 100)],
16751676
score: 100,
16761677
n_mismatch: 0,
16771678
n_gap: 0,
@@ -1693,7 +1694,7 @@ mod tests {
16931694
read_end: 100,
16941695
i_frag: 0,
16951696
}],
1696-
cigar: vec![CigarOp::Match(100)],
1697+
cigar: vec![Op::new(Kind::Match, 100)],
16971698
score: 100,
16981699
n_mismatch: 0,
16991700
n_gap: 0,
@@ -1709,7 +1710,8 @@ mod tests {
17091710

17101711
#[test]
17111712
fn test_check_proper_pair_too_far() {
1712-
use crate::align::transcript::{CigarOp, Exon};
1713+
use crate::align::transcript::Exon;
1714+
use cigar::op::{Kind, Op};
17131715

17141716
let mut params = make_test_params();
17151717
params.align_mates_gap_max = 100;
@@ -1726,7 +1728,7 @@ mod tests {
17261728
read_end: 100,
17271729
i_frag: 0,
17281730
}],
1729-
cigar: vec![CigarOp::Match(100)],
1731+
cigar: vec![Op::new(Kind::Match, 100)],
17301732
score: 100,
17311733
n_mismatch: 0,
17321734
n_gap: 0,
@@ -1748,7 +1750,7 @@ mod tests {
17481750
read_end: 100,
17491751
i_frag: 0,
17501752
}],
1751-
cigar: vec![CigarOp::Match(100)],
1753+
cigar: vec![Op::new(Kind::Match, 100)],
17521754
score: 100,
17531755
n_mismatch: 0,
17541756
n_gap: 0,
@@ -1764,7 +1766,8 @@ mod tests {
17641766

17651767
#[test]
17661768
fn test_calculate_insert_size_positive() {
1767-
use crate::align::transcript::{CigarOp, Exon};
1769+
use crate::align::transcript::Exon;
1770+
use cigar::op::{Kind, Op};
17681771

17691772
// Mate1 is leftmost
17701773
let t1 = Transcript {
@@ -1779,7 +1782,7 @@ mod tests {
17791782
read_end: 100,
17801783
i_frag: 0,
17811784
}],
1782-
cigar: vec![CigarOp::Match(100)],
1785+
cigar: vec![Op::new(Kind::Match, 100)],
17831786
score: 100,
17841787
n_mismatch: 0,
17851788
n_gap: 0,
@@ -1801,7 +1804,7 @@ mod tests {
18011804
read_end: 100,
18021805
i_frag: 0,
18031806
}],
1804-
cigar: vec![CigarOp::Match(100)],
1807+
cigar: vec![Op::new(Kind::Match, 100)],
18051808
score: 100,
18061809
n_mismatch: 0,
18071810
n_gap: 0,
@@ -1817,8 +1820,9 @@ mod tests {
18171820

18181821
#[test]
18191822
fn test_strand_consistency_filter() {
1820-
use crate::align::transcript::{CigarOp, Exon, Transcript};
1823+
use crate::align::transcript::{Exon, Transcript};
18211824
use crate::params::IntronStrandFilter;
1825+
use cigar::op::{Kind, Op};
18221826

18231827
// Create a transcript with conflicting strand motifs (mixed + and - within one transcript)
18241828
let t_inconsistent = Transcript {
@@ -1833,7 +1837,7 @@ mod tests {
18331837
read_end: 100,
18341838
i_frag: 0,
18351839
}],
1836-
cigar: vec![CigarOp::Match(100)],
1840+
cigar: vec![Op::new(Kind::Match, 100)],
18371841
score: 100,
18381842
n_mismatch: 0,
18391843
n_gap: 0,
@@ -1856,7 +1860,7 @@ mod tests {
18561860
read_end: 100,
18571861
i_frag: 0,
18581862
}],
1859-
cigar: vec![CigarOp::Match(100)],
1863+
cigar: vec![Op::new(Kind::Match, 100)],
18601864
score: 100,
18611865
n_mismatch: 0,
18621866
n_gap: 0,
@@ -1917,7 +1921,8 @@ mod tests {
19171921

19181922
#[test]
19191923
fn test_calculate_insert_size_negative() {
1920-
use crate::align::transcript::{CigarOp, Exon};
1924+
use crate::align::transcript::Exon;
1925+
use cigar::op::{Kind, Op};
19211926

19221927
// RF pair: mate1 reverse (right side), mate2 forward (left side).
19231928
// STAR reverse cluster: tlen = mate1.genome_end - mate2.genome_start = 1300 - 1000 = 300.
@@ -1934,7 +1939,7 @@ mod tests {
19341939
read_end: 100,
19351940
i_frag: 0,
19361941
}],
1937-
cigar: vec![CigarOp::Match(100)],
1942+
cigar: vec![Op::new(Kind::Match, 100)],
19381943
score: 100,
19391944
n_mismatch: 0,
19401945
n_gap: 0,
@@ -1956,7 +1961,7 @@ mod tests {
19561961
read_end: 100,
19571962
i_frag: 0,
19581963
}],
1959-
cigar: vec![CigarOp::Match(100)],
1964+
cigar: vec![Op::new(Kind::Match, 100)],
19601965
score: 100,
19611966
n_mismatch: 0,
19621967
n_gap: 0,
@@ -1973,7 +1978,8 @@ mod tests {
19731978
#[test]
19741979
fn test_noncanonical_unannotated_filter() {
19751980
use crate::align::score::SpliceMotif;
1976-
use crate::align::transcript::{CigarOp, Exon, Transcript};
1981+
use crate::align::transcript::{Exon, Transcript};
1982+
use cigar::op::{Kind, Op};
19771983

19781984
// Helper: check if a transcript would be filtered by RemoveNoncanonicalUnannotated
19791985
// (mirrors the logic in the retain closure)
@@ -1996,7 +2002,7 @@ mod tests {
19962002
read_end: 100,
19972003
i_frag: 0,
19982004
}],
1999-
cigar: vec![CigarOp::Match(100)],
2005+
cigar: vec![Op::new(Kind::Match, 100)],
20002006
score: 100,
20012007
n_mismatch: 0,
20022008
n_gap: 0,
@@ -2067,7 +2073,8 @@ mod tests {
20672073

20682074
#[test]
20692075
fn test_paired_alignment_result_enum_variants() {
2070-
use crate::align::transcript::{CigarOp, Exon};
2076+
use crate::align::transcript::Exon;
2077+
use cigar::op::{Kind, Op};
20712078

20722079
let transcript = Transcript {
20732080
chr_idx: 0,
@@ -2081,7 +2088,7 @@ mod tests {
20812088
read_end: 100,
20822089
i_frag: 0,
20832090
}],
2084-
cigar: vec![CigarOp::Match(100)],
2091+
cigar: vec![Op::new(Kind::Match, 100)],
20852092
score: 100,
20862093
n_mismatch: 0,
20872094
n_gap: 0,

0 commit comments

Comments
 (0)