Skip to content

Commit 7770e7b

Browse files
committed
Merge feat/transcriptome-sam: add --quantMode TranscriptomeSAM output
# Conflicts: # src/io/sam.rs # src/params.rs
2 parents 76295a7 + f420aae commit 7770e7b

8 files changed

Lines changed: 2114 additions & 26 deletions

File tree

src/align/read_align.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use std::hash::{DefaultHasher, Hash, Hasher};
1717
/// via rayon, so we instead fold the read name into the seed — this keeps tie
1818
/// breaks reproducible regardless of thread count while still honoring the
1919
/// user's `--runRNGseed` value.
20-
fn per_read_seed(run_rng_seed: u64, read_name: &str) -> u64 {
20+
pub(crate) fn per_read_seed(run_rng_seed: u64, read_name: &str) -> u64 {
2121
let mut hasher = DefaultHasher::new();
2222
read_name.hash(&mut hasher);
2323
run_rng_seed.wrapping_mul(hasher.finish().wrapping_add(1))

src/io/bam.rs

Lines changed: 66 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
use crate::error::Error;
33
use crate::genome::Genome;
44
use crate::params::Parameters;
5+
use crate::quant::transcriptome::TranscriptomeIndex;
56
use noodles::bam;
67
use noodles::sam;
78
use noodles::sam::alignment::io::Write as SamWrite;
@@ -41,26 +42,41 @@ pub struct BamWriter {
4142
}
4243

4344
impl BamWriter {
44-
/// Create a new BAM writer with header from genome index
45-
///
46-
/// # Arguments
47-
/// * `output_path` - Path to output BAM file
48-
/// * `genome` - Genome index with chromosome information
49-
/// * `params` - Parameters (for @PG header)
50-
pub fn create(output_path: &Path, genome: &Genome, params: &Parameters) -> Result<Self, Error> {
51-
let file = File::create(output_path)?;
52-
let buf_writer = BufWriter::new(file);
53-
54-
// Reuse SAM header building logic
55-
let header = crate::io::sam::build_sam_header(genome, params)?;
56-
57-
// Create BAM writer (Writer::new handles BGZF compression internally)
45+
/// Create a BAM writer at `output_path` with the given prepared header.
46+
fn with_header(output_path: &Path, header: sam::Header) -> Result<Self, Error> {
47+
let buf_writer = BufWriter::new(File::create(output_path)?);
5848
let mut writer = bam::io::Writer::new(buf_writer);
5949
writer.write_header(&header)?;
60-
6150
Ok(Self { writer, header })
6251
}
6352

53+
/// Create a new BAM writer with header from genome index.
54+
pub fn create(output_path: &Path, genome: &Genome, params: &Parameters) -> Result<Self, Error> {
55+
Self::with_header(
56+
output_path,
57+
crate::io::sam::build_sam_header(genome, params)?,
58+
)
59+
}
60+
61+
/// Create a BAM writer whose @SQ header lists the transcripts (not
62+
/// chromosomes) from `tr_idx`. Used for
63+
/// `Aligned.toTranscriptome.out.bam`.
64+
pub fn create_transcriptome(
65+
output_path: &Path,
66+
tr_idx: &TranscriptomeIndex,
67+
params: &Parameters,
68+
) -> Result<Self, Error> {
69+
let refs = tr_idx
70+
.tr_ids
71+
.iter()
72+
.zip(tr_idx.tr_length.iter())
73+
.map(|(id, len)| (id.as_str(), *len as usize));
74+
Self::with_header(
75+
output_path,
76+
crate::io::sam::build_sam_header_from_refs(refs, params)?,
77+
)
78+
}
79+
6480
/// Write batch of buffered records (for parallel processing)
6581
///
6682
/// # Arguments
@@ -191,6 +207,41 @@ mod tests {
191207
assert!(result.is_ok(), "Finishing BAM file should succeed");
192208
}
193209

210+
#[test]
211+
fn test_bam_transcriptome_writer_creation() {
212+
use crate::junction::gtf::GtfRecord;
213+
use crate::quant::transcriptome::TranscriptomeIndex;
214+
use std::collections::HashMap;
215+
216+
let genome = create_test_genome();
217+
// Stretch genome / exon to fit the tiny test chromosome.
218+
let mut attrs = HashMap::new();
219+
attrs.insert("gene_id".to_string(), "G1".to_string());
220+
attrs.insert("transcript_id".to_string(), "T1".to_string());
221+
let exons = vec![GtfRecord {
222+
seqname: "chr1".to_string(),
223+
feature: "exon".to_string(),
224+
start: 1,
225+
end: 8,
226+
strand: '+',
227+
attributes: attrs,
228+
}];
229+
let tr_idx = TranscriptomeIndex::from_gtf_exons(&exons, &genome).unwrap();
230+
assert_eq!(tr_idx.n_transcripts(), 1);
231+
232+
let params = create_test_params();
233+
let temp_file = NamedTempFile::new().unwrap();
234+
let writer = BamWriter::create_transcriptome(temp_file.path(), &tr_idx, &params);
235+
assert!(
236+
writer.is_ok(),
237+
"transcriptome BAM writer creation should succeed"
238+
);
239+
240+
// Header should contain exactly 1 @SQ entry (matching n_transcripts).
241+
let writer = writer.unwrap();
242+
assert_eq!(writer.header.reference_sequences().len(), 1);
243+
}
244+
194245
#[test]
195246
fn test_bam_batch_write() {
196247
let genome = create_test_genome();

src/io/sam.rs

Lines changed: 131 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,118 @@ impl SamWriter {
547547
Ok(records)
548548
}
549549

550+
/// Build transcriptome-space SAM records for `--quantMode TranscriptomeSAM`.
551+
///
552+
/// Each projected `Transcript` is converted to a record where:
553+
/// * `chr_idx` is the transcript index (matches the transcriptome
554+
/// header's @SQ order),
555+
/// * `genome_start` is the 0-based transcript-space position (→ POS =
556+
/// t-space_pos + 1),
557+
/// * splice-aware tags (`jM`, `jI`, `XS`) are not emitted (splices
558+
/// collapse in t-space and have no meaning there),
559+
/// * standard tags (`NH`, `HI`, `AS`, `NM`/`nM`, `MD`) are emitted per
560+
/// the `--outSAMattributes` set.
561+
///
562+
/// `primary_hit_idx` (0-based) is the projected alignment selected as
563+
/// primary (randomly among ties per STAR's `rngUniformReal0to1`). All
564+
/// other records get the SECONDARY flag (0x100).
565+
#[allow(clippy::too_many_arguments)]
566+
pub fn build_transcriptome_records(
567+
read_name: &str,
568+
read_seq: &[u8],
569+
read_qual: &[u8],
570+
projected: &[Transcript],
571+
mapq: u8,
572+
params: &Parameters,
573+
primary_hit_idx: usize,
574+
) -> Result<Vec<RecordBuf>, Error> {
575+
if projected.is_empty() {
576+
return Ok(Vec::new());
577+
}
578+
let mut attrs = params.sam_attribute_set();
579+
// Splice tags are meaningless in t-space.
580+
attrs.remove("jM");
581+
attrs.remove("jI");
582+
attrs.remove("XS");
583+
// MD-tag would require the transcript's t-space reference which we do
584+
// not precompute; drop it to keep this writer simple. STAR also does
585+
// not emit MD for transcriptome SAM.
586+
attrs.remove("MD");
587+
588+
let n_alignments = projected.len();
589+
let mut records = Vec::with_capacity(n_alignments);
590+
591+
for (hit_idx, t) in projected.iter().enumerate() {
592+
let mut record = RecordBuf::default();
593+
record.name_mut().replace(read_name.into());
594+
595+
// FLAGS: SECONDARY if not the primary; REVERSE if is_reverse.
596+
let mut flags = sam::alignment::record::Flags::empty();
597+
if t.is_reverse {
598+
flags |= sam::alignment::record::Flags::REVERSE_COMPLEMENTED;
599+
}
600+
if hit_idx != primary_hit_idx {
601+
flags |= sam::alignment::record::Flags::SECONDARY;
602+
}
603+
*record.flags_mut() = flags;
604+
605+
// RNAME = transcript index (maps to transcriptome header).
606+
*record.reference_sequence_id_mut() = Some(t.chr_idx);
607+
608+
// POS = t-space position + 1 (1-based).
609+
let pos = (t.genome_start + 1) as usize;
610+
*record.alignment_start_mut() = Some(pos.try_into().map_err(|e| {
611+
Error::Alignment(format!("invalid t-space position {}: {}", pos, e))
612+
})?);
613+
614+
// MAPQ
615+
*record.mapping_quality_mut() = MappingQuality::new(mapq);
616+
617+
// CIGAR (already has N ops stripped by align_to_transcripts)
618+
*record.cigar_mut() = convert_cigar(&t.cigar)?;
619+
620+
// SEQ / QUAL — STAR writes the original-orientation sequence when
621+
// FLAG 0x10 is unset (forward alignment in t-space) and RC'd seq
622+
// when 0x10 is set. We follow SAM spec: SEQ matches the CIGAR's
623+
// read orientation, so we mirror `transcript_to_record`.
624+
if t.is_reverse {
625+
let seq_bytes: Vec<u8> = read_seq
626+
.iter()
627+
.rev()
628+
.map(|&b| decode_base(complement_base(b)))
629+
.collect();
630+
*record.sequence_mut() = Sequence::from(seq_bytes);
631+
let mut qual = read_qual.to_vec();
632+
qual.reverse();
633+
*record.quality_scores_mut() = QualityScores::from(qual);
634+
} else {
635+
let seq_bytes: Vec<u8> = read_seq.iter().map(|&b| decode_base(b)).collect();
636+
*record.sequence_mut() = Sequence::from(seq_bytes);
637+
*record.quality_scores_mut() = QualityScores::from(read_qual.to_vec());
638+
}
639+
640+
// Optional tags
641+
let data = record.data_mut();
642+
if attrs.contains("NH") {
643+
data.insert(Tag::ALIGNMENT_HIT_COUNT, Value::from(n_alignments as i32));
644+
}
645+
if attrs.contains("HI") {
646+
// HI is 1-based; primary = 1, secondaries > 1 in emission order.
647+
data.insert(Tag::HIT_INDEX, Value::from((hit_idx + 1) as i32));
648+
}
649+
if attrs.contains("AS") {
650+
data.insert(Tag::ALIGNMENT_SCORE, Value::from(t.score));
651+
}
652+
if attrs.contains("NM") || attrs.contains("nM") {
653+
data.insert(Tag::new(b'n', b'M'), Value::from(t.n_mismatch as i32));
654+
}
655+
656+
records.push(record);
657+
}
658+
659+
Ok(records)
660+
}
661+
550662
/// Build unmapped paired records (both mates unmapped)
551663
pub fn build_paired_unmapped_records(
552664
read_name: &str,
@@ -600,21 +712,33 @@ impl SamWriter {
600712

601713
/// Build paired SAM header from genome
602714
pub fn build_sam_header(genome: &Genome, params: &Parameters) -> Result<sam::Header, Error> {
715+
build_sam_header_from_refs(
716+
(0..genome.n_chr_real)
717+
.map(|i| (genome.chr_name[i].as_str(), genome.chr_length[i] as usize)),
718+
params,
719+
)
720+
}
721+
722+
/// Build a SAM header from an iterator of (name, length) reference pairs.
723+
///
724+
/// Used both for the genome header (chromosomes) and the transcriptome header
725+
/// (one @SQ per transcript, length = transcript-space length).
726+
pub fn build_sam_header_from_refs<'a, I>(refs: I, params: &Parameters) -> Result<sam::Header, Error>
727+
where
728+
I: IntoIterator<Item = (&'a str, usize)>,
729+
{
603730
let mut builder = sam::Header::builder();
604731

605732
// @HD line (default version and unsorted)
606733
builder = builder.set_header(Default::default());
607734

608-
// @SQ lines for each chromosome
609-
for i in 0..genome.n_chr_real {
610-
let name = &genome.chr_name[i];
611-
let length = genome.chr_length[i] as usize;
612-
735+
// @SQ lines for each reference
736+
for (name, length) in refs {
613737
let length_nz = NonZeroUsize::new(length)
614-
.ok_or_else(|| Error::Index(format!("chromosome {} has zero length", name)))?;
738+
.ok_or_else(|| Error::Index(format!("reference {} has zero length", name)))?;
615739

616740
builder = builder.add_reference_sequence(
617-
name.as_str(),
741+
name,
618742
Map::<sam::header::record::value::map::ReferenceSequence>::new(length_nz),
619743
);
620744
}

0 commit comments

Comments
 (0)