Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/align/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ pub use read_align::{
};
pub use seed::Seed;
pub use stitch::{SeedCluster, WindowAlignment, stitch_seeds};
pub use transcript::{CigarOp, Exon, Transcript};
pub use transcript::{Exon, Transcript};
113 changes: 60 additions & 53 deletions src/align/read_align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ use crate::align::stitch::{
PE_SPACER_BASE, cluster_seeds, finalize_transcript, split_combined_wt, stitch_seeds_core,
stitch_seeds_with_jdb_debug,
};
use crate::align::transcript::{Exon, Transcript};
use crate::align::transcript::{Exon, KindExt as _, Transcript};
use crate::error::Error;
use crate::index::GenomeIndex;
use crate::params::{IntronMotifFilter, IntronStrandFilter, Parameters};
use crate::stats::UnmappedReason;
use noodles::sam::alignment::record::cigar;
use rand::{SeedableRng, rngs::StdRng, seq::SliceRandom};
use std::cmp::Ordering;
use std::hash::{DefaultHasher, Hash, Hasher};

/// Derive a deterministic per-read RNG seed from `run_rng_seed` + the read name.
Expand Down Expand Up @@ -323,7 +325,6 @@ pub fn align_read(
} else {
"unknown"
};
let cigar_str: String = t.cigar.iter().map(|op| format!("{}", op)).collect();
eprintln!(
" transcript[{}]: chr={}:{}-{} ({}) score={} mm={} junctions={} cigar={}",
ti,
Expand All @@ -334,7 +335,7 @@ pub fn align_read(
t.score,
t.n_mismatch,
t.n_junction,
cigar_str
t.cigar_string()
);
}
}
Expand All @@ -351,20 +352,9 @@ pub fn align_read(

// Deduplicate transcripts with identical genomic coordinates AND CIGAR.
transcripts.sort_by(|a, b| {
(
a.chr_idx,
a.genome_start,
a.genome_end,
a.is_reverse,
&a.cigar,
)
.cmp(&(
b.chr_idx,
b.genome_start,
b.genome_end,
b.is_reverse,
&b.cigar,
))
(a.chr_idx, a.genome_start, a.genome_end, a.is_reverse)
.cmp(&(b.chr_idx, b.genome_start, b.genome_end, b.is_reverse))
.then_with(|| cmp_cigar(&a.cigar, &b.cigar))
.then_with(|| b.score.cmp(&a.score))
});
transcripts.dedup_by(|a, b| {
Expand Down Expand Up @@ -467,13 +457,13 @@ pub fn align_read(

// Absolute matched bases
let n_matched = t.n_matched();
if n_matched < params.out_filter_match_nmin {
if n_matched < params.out_filter_match_nmin as usize {
*filter_reasons.entry("match_min").or_insert(0) += 1;
return false;
}

// Relative matched bases: STAR casts to uint (u32)
if n_matched < (params.out_filter_match_nmin_over_lread * lread_m1) as u32 {
if (n_matched as f64) < params.out_filter_match_nmin_over_lread * lread_m1 {
*filter_reasons.entry("match_min_relative").or_insert(0) += 1;
return false;
}
Expand Down Expand Up @@ -519,7 +509,6 @@ pub fn align_read(
match motif.implied_strand() {
Some('+') => has_plus = true,
Some('-') => has_minus = true,
None => {}
_ => {}
}
}
Expand Down Expand Up @@ -633,7 +622,6 @@ pub fn align_read(
} else {
"unknown"
};
let cigar_str: String = t.cigar.iter().map(|op| format!("{}", op)).collect();
eprintln!(
" FINAL[{}]: chr={}:{}-{} ({}) score={} mm={} junctions={} cigar={}",
i,
Expand All @@ -644,7 +632,7 @@ pub fn align_read(
t.score,
t.n_mismatch,
t.n_junction,
cigar_str
t.cigar_string()
);
}
}
Expand Down Expand Up @@ -989,8 +977,8 @@ pub fn align_paired_read(
}
b.combined_wt_score
.cmp(&a.combined_wt_score)
.then_with(|| a.mate1_transcript.cigar.cmp(&b.mate1_transcript.cigar))
.then_with(|| a.mate2_transcript.cigar.cmp(&b.mate2_transcript.cigar))
.then_with(|| cmp_cigar(&a.mate1_transcript.cigar, &b.mate1_transcript.cigar))
.then_with(|| cmp_cigar(&a.mate2_transcript.cigar, &b.mate2_transcript.cigar))
});
joint_pairs.dedup_by(|a, b| {
a.mate1_transcript.chr_idx == b.mate1_transcript.chr_idx
Expand Down Expand Up @@ -1234,6 +1222,18 @@ pub fn align_paired_read(
}
}

/// Comparator for deduplicating structs containing CIGAR ops via sort→dedup
fn cmp_cigar(a: &[cigar::Op], b: &[cigar::Op]) -> Ordering {
a.len().cmp(&b.len()).then_with(|| {
(a.iter().zip(b.iter()))
.find_map(|(a, b)| {
let ord = (a.char(), a.len()).cmp(&(b.char(), b.len()));
(ord != Ordering::Equal).then_some(ord)
})
.unwrap_or(Ordering::Equal)
})
}

/// Attempt to pair two per-mate transcripts into a PairedAlignment.
///
/// Returns `None` if the mates are incompatible (same strand, different chr, too far, etc.).
Expand Down Expand Up @@ -1416,18 +1416,18 @@ fn extract_junctions_from_cigar(t: &Transcript) -> Vec<(u64, u64)> {
let mut junctions = Vec::new();
let mut genome_pos = t.genome_start;
for op in &t.cigar {
use crate::align::transcript::CigarOp;
match op {
CigarOp::Match(n) | CigarOp::Equal(n) | CigarOp::Diff(n) | CigarOp::Del(n) => {
genome_pos += *n as u64;
use noodles::sam::alignment::record::cigar::op::Kind;
match op.kind() {
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Deletion => {
genome_pos += op.len() as u64;
}
CigarOp::RefSkip(n) => {
Kind::Skip => {
let donor = genome_pos;
let acceptor = genome_pos + *n as u64;
let acceptor = genome_pos + op.len() as u64;
junctions.push((donor, acceptor));
genome_pos = acceptor;
}
CigarOp::Ins(_) | CigarOp::SoftClip(_) | CigarOp::HardClip(_) => {}
Kind::Insertion | Kind::SoftClip | Kind::HardClip | Kind::Pad => {}
Comment thread
flying-sheep marked this conversation as resolved.
}
}
junctions
Expand Down Expand Up @@ -1479,6 +1479,7 @@ mod tests {
use crate::index::sa_index::SaIndex;
use crate::index::suffix_array::SuffixArray;
use clap::Parser;
use noodles::sam::alignment::record::cigar;

fn make_test_params() -> Parameters {
// Parse empty args to get default parameters
Expand Down Expand Up @@ -1537,8 +1538,7 @@ mod tests {

#[test]
fn combined_transcript_for_projection_rewrites_mate2_ifrag() {
use crate::align::transcript::CigarOp;

use cigar::op::{Kind, Op};
let make_tr = |gs: u64, ge: u64, rs: usize, re: usize| Transcript {
chr_idx: 0,
genome_start: gs,
Expand All @@ -1551,7 +1551,7 @@ mod tests {
read_end: re,
i_frag: 0,
}],
cigar: vec![CigarOp::Match((ge - gs) as u32)],
cigar: vec![Op::new(Kind::Match, (ge - gs) as usize)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand Down Expand Up @@ -1654,7 +1654,8 @@ mod tests {

#[test]
fn test_check_proper_pair_distance() {
use crate::align::transcript::{CigarOp, Exon};
use crate::align::transcript::Exon;
use cigar::op::{Kind, Op};

let params = make_test_params();

Expand All @@ -1671,7 +1672,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand All @@ -1693,7 +1694,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand All @@ -1709,7 +1710,8 @@ mod tests {

#[test]
fn test_check_proper_pair_too_far() {
use crate::align::transcript::{CigarOp, Exon};
use crate::align::transcript::Exon;
use cigar::op::{Kind, Op};

let mut params = make_test_params();
params.align_mates_gap_max = 100;
Expand All @@ -1726,7 +1728,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand All @@ -1748,7 +1750,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand All @@ -1764,7 +1766,8 @@ mod tests {

#[test]
fn test_calculate_insert_size_positive() {
use crate::align::transcript::{CigarOp, Exon};
use crate::align::transcript::Exon;
use cigar::op::{Kind, Op};

// Mate1 is leftmost
let t1 = Transcript {
Expand All @@ -1779,7 +1782,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand All @@ -1801,7 +1804,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand All @@ -1817,8 +1820,9 @@ mod tests {

#[test]
fn test_strand_consistency_filter() {
use crate::align::transcript::{CigarOp, Exon, Transcript};
use crate::align::transcript::{Exon, Transcript};
use crate::params::IntronStrandFilter;
use cigar::op::{Kind, Op};

// Create a transcript with conflicting strand motifs (mixed + and - within one transcript)
let t_inconsistent = Transcript {
Expand All @@ -1833,7 +1837,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand All @@ -1856,7 +1860,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand Down Expand Up @@ -1917,7 +1921,8 @@ mod tests {

#[test]
fn test_calculate_insert_size_negative() {
use crate::align::transcript::{CigarOp, Exon};
use crate::align::transcript::Exon;
use cigar::op::{Kind, Op};

// RF pair: mate1 reverse (right side), mate2 forward (left side).
// STAR reverse cluster: tlen = mate1.genome_end - mate2.genome_start = 1300 - 1000 = 300.
Expand All @@ -1934,7 +1939,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand All @@ -1956,7 +1961,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand All @@ -1973,7 +1978,8 @@ mod tests {
#[test]
fn test_noncanonical_unannotated_filter() {
use crate::align::score::SpliceMotif;
use crate::align::transcript::{CigarOp, Exon, Transcript};
use crate::align::transcript::{Exon, Transcript};
use cigar::op::{Kind, Op};

// Helper: check if a transcript would be filtered by RemoveNoncanonicalUnannotated
// (mirrors the logic in the retain closure)
Expand All @@ -1996,7 +2002,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand Down Expand Up @@ -2067,7 +2073,8 @@ mod tests {

#[test]
fn test_paired_alignment_result_enum_variants() {
use crate::align::transcript::{CigarOp, Exon};
use crate::align::transcript::Exon;
use cigar::op::{Kind, Op};

let transcript = Transcript {
chr_idx: 0,
Expand All @@ -2081,7 +2088,7 @@ mod tests {
read_end: 100,
i_frag: 0,
}],
cigar: vec![CigarOp::Match(100)],
cigar: vec![Op::new(Kind::Match, 100)],
score: 100,
n_mismatch: 0,
n_gap: 0,
Expand Down
Loading
Loading