Skip to content

Commit b501420

Browse files
committed
fix(bam): subtract Phred+33 offset from QUAL before writing to BAM
The BAM binary QUAL field stores raw Phred values (0-93) per SAM spec §4.2.3. rustar was passing FASTQ ASCII bytes (Phred+33 encoded, e.g. 'B' = ASCII 66 = Phred 33) straight to noodles' QualityScores::from, which expects raw Phred values. Every QUAL byte in a rustar BAM was therefore +33 above what the spec mandates; samtools view rendered the SAM text with another +33 on top, producing biologically impossible quality strings like 'cccgg...'. Subtract 33 from each FASTQ ASCII byte at every BAM-write call site before constructing the QualityScores. Use saturating_sub so a malformed FASTQ byte < 33 clamps to 0 rather than underflowing. samtools stats average_quality on the nf-core/rnaseq test profile now reads 35.3 (matching STAR) rather than the previous 68.3. Fixes #34
1 parent 70be24d commit b501420

2 files changed

Lines changed: 52 additions & 14 deletions

File tree

src/io/fastq.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ pub struct EncodedRead {
5353
pub name: String,
5454
/// Base sequence encoded as 0=A, 1=C, 2=G, 3=T, 4=N
5555
pub sequence: Vec<u8>,
56-
/// Quality scores (raw FASTQ quality values)
56+
/// FASTQ ASCII quality bytes (Phred+33 encoded) - subtract 33 before
57+
/// writing to BAM binary QUAL.
5758
pub quality: Vec<u8>,
5859
}
5960

src/io/sam.rs

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -194,8 +194,8 @@ impl SamWriter {
194194
let seq_bytes: Vec<u8> = read_seq.iter().map(|&b| decode_base(b)).collect();
195195
*record.sequence_mut() = Sequence::from(seq_bytes);
196196

197-
// Quality scores
198-
*record.quality_scores_mut() = QualityScores::from(read_qual.to_vec());
197+
// Quality scores: convert FASTQ ASCII (Phred+33) to raw Phred for BAM
198+
*record.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(read_qual));
199199

200200
maybe_insert_rg_tag(&mut record, rg_id);
201201

@@ -442,13 +442,14 @@ impl SamWriter {
442442
.map(|&b| decode_base(complement_base(b)))
443443
.collect();
444444
*mapped_rec.sequence_mut() = Sequence::from(seq_bytes);
445-
let mut qual = mapped_qual.to_vec();
445+
let mut qual = fastq_qual_to_phred(mapped_qual);
446446
qual.reverse();
447447
*mapped_rec.quality_scores_mut() = QualityScores::from(qual);
448448
} else {
449449
let seq_bytes: Vec<u8> = mapped_seq.iter().map(|&b| decode_base(b)).collect();
450450
*mapped_rec.sequence_mut() = Sequence::from(seq_bytes);
451-
*mapped_rec.quality_scores_mut() = QualityScores::from(mapped_qual.to_vec());
451+
*mapped_rec.quality_scores_mut() =
452+
QualityScores::from(fastq_qual_to_phred(mapped_qual));
452453
}
453454

454455
// Optional tags on mapped mate
@@ -532,7 +533,8 @@ impl SamWriter {
532533
// SEQ/QUAL: forward orientation (no RC for unmapped)
533534
let unmapped_seq_bytes: Vec<u8> = unmapped_seq.iter().map(|&b| decode_base(b)).collect();
534535
*unmapped_rec.sequence_mut() = Sequence::from(unmapped_seq_bytes);
535-
*unmapped_rec.quality_scores_mut() = QualityScores::from(unmapped_qual.to_vec());
536+
*unmapped_rec.quality_scores_mut() =
537+
QualityScores::from(fastq_qual_to_phred(unmapped_qual));
536538
maybe_insert_rg_tag(&mut unmapped_rec, rg_id);
537539

538540
// Order: mate1 first, mate2 second
@@ -628,13 +630,13 @@ impl SamWriter {
628630
.map(|&b| decode_base(complement_base(b)))
629631
.collect();
630632
*record.sequence_mut() = Sequence::from(seq_bytes);
631-
let mut qual = read_qual.to_vec();
633+
let mut qual = fastq_qual_to_phred(read_qual);
632634
qual.reverse();
633635
*record.quality_scores_mut() = QualityScores::from(qual);
634636
} else {
635637
let seq_bytes: Vec<u8> = read_seq.iter().map(|&b| decode_base(b)).collect();
636638
*record.sequence_mut() = Sequence::from(seq_bytes);
637-
*record.quality_scores_mut() = QualityScores::from(read_qual.to_vec());
639+
*record.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(read_qual));
638640
}
639641

640642
// Optional tags
@@ -685,7 +687,7 @@ impl SamWriter {
685687

686688
let seq1_bytes: Vec<u8> = mate1_seq.iter().map(|&b| decode_base(b)).collect();
687689
*rec1.sequence_mut() = Sequence::from(seq1_bytes);
688-
*rec1.quality_scores_mut() = QualityScores::from(mate1_qual.to_vec());
690+
*rec1.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(mate1_qual));
689691
maybe_insert_rg_tag(&mut rec1, rg_id);
690692
records.push(rec1);
691693

@@ -702,7 +704,7 @@ impl SamWriter {
702704

703705
let seq2_bytes: Vec<u8> = mate2_seq.iter().map(|&b| decode_base(b)).collect();
704706
*rec2.sequence_mut() = Sequence::from(seq2_bytes);
705-
*rec2.quality_scores_mut() = QualityScores::from(mate2_qual.to_vec());
707+
*rec2.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(mate2_qual));
706708
maybe_insert_rg_tag(&mut rec2, rg_id);
707709
records.push(rec2);
708710

@@ -860,6 +862,14 @@ fn maybe_insert_rg_tag(record: &mut RecordBuf, rg_id: Option<&str>) {
860862
}
861863
}
862864

865+
/// Convert FASTQ ASCII quality bytes (Phred+33) to raw Phred values (0-93) for
866+
/// the BAM binary QUAL field, per SAM spec §4.2.3.
867+
///
868+
/// `saturating_sub` clamps malformed bytes < 33 to 0 rather than underflowing.
869+
fn fastq_qual_to_phred(qual: &[u8]) -> Vec<u8> {
870+
qual.iter().map(|&b| b.saturating_sub(33)).collect()
871+
}
872+
863873
/// Convert Transcript to SAM record
864874
#[allow(clippy::too_many_arguments)]
865875
fn transcript_to_record(
@@ -927,13 +937,13 @@ fn transcript_to_record(
927937
*record.sequence_mut() = Sequence::from(seq_bytes);
928938

929939
// Reverse the quality scores
930-
let mut qual = read_qual.to_vec();
940+
let mut qual = fastq_qual_to_phred(read_qual);
931941
qual.reverse();
932942
*record.quality_scores_mut() = QualityScores::from(qual);
933943
} else {
934944
let seq_bytes: Vec<u8> = read_seq.iter().map(|&b| decode_base(b)).collect();
935945
*record.sequence_mut() = Sequence::from(seq_bytes);
936-
*record.quality_scores_mut() = QualityScores::from(read_qual.to_vec());
946+
*record.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(read_qual));
937947
}
938948

939949
// Optional tags: gated by --outSAMattributes
@@ -1259,13 +1269,13 @@ fn build_paired_mate_record(
12591269
.collect();
12601270
*record.sequence_mut() = Sequence::from(seq_bytes);
12611271

1262-
let mut qual = mate_qual.to_vec();
1272+
let mut qual = fastq_qual_to_phred(mate_qual);
12631273
qual.reverse();
12641274
*record.quality_scores_mut() = QualityScores::from(qual);
12651275
} else {
12661276
let seq_bytes: Vec<u8> = mate_seq.iter().map(|&b| decode_base(b)).collect();
12671277
*record.sequence_mut() = Sequence::from(seq_bytes);
1268-
*record.quality_scores_mut() = QualityScores::from(mate_qual.to_vec());
1278+
*record.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(mate_qual));
12691279
}
12701280

12711281
// Optional tags: gated by --outSAMattributes
@@ -1477,6 +1487,33 @@ mod tests {
14771487
assert!(writer.is_ok());
14781488
}
14791489

1490+
/// Regression test for issue #34: BAM binary QUAL must store raw Phred values
1491+
/// (0-93), not FASTQ ASCII bytes (Phred+33). Each FASTQ ASCII byte should be
1492+
/// reduced by 33 before being placed in the BAM QUAL field.
1493+
#[test]
1494+
fn test_qual_phred33_offset_stripped_for_bam() {
1495+
// FASTQ ASCII 'I' = 73 → Phred 40
1496+
let read_qual = b"III";
1497+
let read_seq = vec![0u8, 1, 2]; // ACG
1498+
let record = SamWriter::build_unmapped_record("read1", &read_seq, read_qual, None).unwrap();
1499+
1500+
let stored: &[u8] = record.quality_scores().as_ref();
1501+
assert_eq!(stored, &[40u8, 40, 40]);
1502+
}
1503+
1504+
/// `saturating_sub` must clamp malformed FASTQ bytes (< 33) to 0 rather
1505+
/// than underflowing.
1506+
#[test]
1507+
fn test_qual_phred33_offset_saturating_sub() {
1508+
// Byte 32 is below the Phred+33 floor; should clamp to 0.
1509+
let read_qual = &[32u8, 33, 34][..];
1510+
let read_seq = vec![0u8, 1, 2];
1511+
let record = SamWriter::build_unmapped_record("read1", &read_seq, read_qual, None).unwrap();
1512+
1513+
let stored: &[u8] = record.quality_scores().as_ref();
1514+
assert_eq!(stored, &[0u8, 0, 1]);
1515+
}
1516+
14801517
#[test]
14811518
fn test_transcript_to_record() {
14821519
let genome = make_test_genome();

0 commit comments

Comments
 (0)