Skip to content

Commit dc7db10

Browse files
committed
metrics: Split describe metrics into modules
1 parent 980321c commit dc7db10

8 files changed

+276
-141
lines changed

src/commands/describe.rs

+7-141
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use tracing::info;
55
use crate::{
66
cli::DescribeArgs,
77
fastq::{self, Record},
8+
metrics,
89
};
910

1011
pub fn describe(args: DescribeArgs) -> io::Result<()> {
@@ -13,154 +14,19 @@ pub fn describe(args: DescribeArgs) -> io::Result<()> {
1314
let mut reader = fastq::open(args.src)?;
1415
let mut record = Record::default();
1516

16-
let mut metrics = Metrics::default();
17+
let mut metrics = metrics::default();
1718

1819
while reader.read_record(&mut record)? != 0 {
19-
visit(&mut metrics, &record)?;
20-
}
21-
22-
print_metrics(&metrics);
23-
24-
info!("done");
25-
26-
Ok(())
27-
}
28-
29-
#[derive(Clone, Copy, Default)]
30-
struct ErrorProbability {
31-
sum: f64,
32-
count: u64,
33-
}
34-
35-
struct Metrics {
36-
record_count: u64,
37-
min_sequence_length: usize,
38-
max_sequence_length: usize,
39-
error_probability_sums_per_position: Vec<ErrorProbability>,
40-
}
41-
42-
impl Default for Metrics {
43-
fn default() -> Self {
44-
Self {
45-
record_count: 0,
46-
min_sequence_length: usize::MAX,
47-
max_sequence_length: usize::MIN,
48-
error_probability_sums_per_position: Vec::new(),
20+
for metric in &mut metrics {
21+
metric.visit(&record)?;
4922
}
5023
}
51-
}
52-
53-
fn visit(metrics: &mut Metrics, record: &Record) -> io::Result<()> {
54-
metrics.record_count += 1;
5524

56-
let read_length = record.sequence().len();
57-
58-
metrics.min_sequence_length = metrics.min_sequence_length.min(read_length);
59-
metrics.max_sequence_length = metrics.max_sequence_length.max(read_length);
60-
61-
if read_length > metrics.error_probability_sums_per_position.len() {
62-
metrics
63-
.error_probability_sums_per_position
64-
.resize(read_length, ErrorProbability::default());
25+
for metric in &metrics {
26+
metric.println();
6527
}
6628

67-
for (quality_score, error_probability) in record
68-
.quality_scores()
69-
.iter()
70-
.zip(&mut metrics.error_probability_sums_per_position)
71-
{
72-
let q = decode_score(*quality_score)?;
73-
let p = phred_score_to_error_probability(q);
74-
error_probability.sum += p;
75-
error_probability.count += 1;
76-
}
29+
info!("done");
7730

7831
Ok(())
7932
}
80-
81-
fn decode_score(c: u8) -> io::Result<u8> {
82-
const OFFSET: u8 = b'!';
83-
84-
c.checked_sub(OFFSET)
85-
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "invalid quality score"))
86-
}
87-
88-
// https://en.wikipedia.org/wiki/Phred_quality_score#Definition
89-
const BASE: f64 = 10.0;
90-
const FACTOR: f64 = 10.0;
91-
92-
fn phred_score_to_error_probability(n: u8) -> f64 {
93-
BASE.powf(-f64::from(n) / FACTOR)
94-
}
95-
96-
fn error_probability_to_phred_score(p: f64) -> f64 {
97-
-FACTOR * p.log10()
98-
}
99-
100-
fn print_metrics(metrics: &Metrics) {
101-
let record_count = metrics.record_count;
102-
103-
println!("record_count\t{record_count}");
104-
105-
let min_sequence_length = if record_count == 0 {
106-
0
107-
} else {
108-
metrics.min_sequence_length
109-
};
110-
111-
println!("min_sequence_length\t{min_sequence_length}");
112-
113-
let max_sequence_length = if record_count == 0 {
114-
0
115-
} else {
116-
metrics.max_sequence_length
117-
};
118-
119-
println!("max_sequence_length\t{max_sequence_length}");
120-
121-
let avg_quality_score_per_position: Vec<_> = metrics
122-
.error_probability_sums_per_position
123-
.iter()
124-
.map(|error_probability| {
125-
let n = error_probability.count as f64;
126-
let avg_error_probability = error_probability.sum / n;
127-
error_probability_to_phred_score(avg_error_probability)
128-
})
129-
.collect();
130-
131-
println!("avg_quality_score_per_position\t{avg_quality_score_per_position:?}");
132-
}
133-
134-
#[cfg(test)]
135-
mod tests {
136-
use super::*;
137-
138-
#[test]
139-
fn test_decode_score() -> io::Result<()> {
140-
assert_eq!(decode_score(b'!')?, 0);
141-
assert_eq!(decode_score(b'~')?, 93);
142-
assert!(matches!(
143-
decode_score(0x00),
144-
Err(e) if e.kind() == io::ErrorKind::InvalidData
145-
));
146-
Ok(())
147-
}
148-
149-
#[test]
150-
fn test_phred_score_to_error_probability() {
151-
assert_eq!(phred_score_to_error_probability(0), 1.0);
152-
assert_eq!(phred_score_to_error_probability(10), 0.1);
153-
assert_eq!(phred_score_to_error_probability(20), 0.01);
154-
assert_eq!(phred_score_to_error_probability(30), 0.001);
155-
assert_eq!(phred_score_to_error_probability(40), 0.0001);
156-
}
157-
158-
#[test]
159-
fn test_error_probability_to_phred_score() {
160-
assert_eq!(error_probability_to_phred_score(1.0), 0.0);
161-
assert_eq!(error_probability_to_phred_score(0.1), 10.0);
162-
assert_eq!(error_probability_to_phred_score(0.01), 20.0);
163-
assert_eq!(error_probability_to_phred_score(0.001), 30.0);
164-
assert_eq!(error_probability_to_phred_score(0.0001), 40.0);
165-
}
166-
}

src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ pub mod commands;
33
pub mod distributions;
44
pub mod fastq;
55
pub mod generator;
6+
mod metrics;
67
pub mod pair_writer;
78
pub mod validators;
89

src/metrics.rs

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
mod avg_quality_score_per_position;
2+
mod max_sequence_length;
3+
mod metric;
4+
mod min_sequence_length;
5+
mod record_count;
6+
7+
pub use self::metric::Metric;
8+
use self::{
9+
avg_quality_score_per_position::AvgQualityScorePerPosition,
10+
max_sequence_length::MaxSequenceLength, min_sequence_length::MinSequenceLength,
11+
record_count::RecordCount,
12+
};
13+
14+
pub fn default() -> Vec<Box<dyn Metric>> {
15+
vec![
16+
Box::new(RecordCount::default()),
17+
Box::new(MinSequenceLength::default()),
18+
Box::new(MaxSequenceLength::default()),
19+
Box::new(AvgQualityScorePerPosition::default()),
20+
]
21+
}
src/metrics/avg_quality_score_per_position.rs

+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
use std::io;
2+
3+
use super::Metric;
4+
use crate::fastq::Record;
5+
6+
const NAME: &str = "avg_quality_score_per_position";
7+
8+
/// Running total of error probabilities observed at a single read position.
#[derive(Clone, Copy, Default)]
struct ErrorProbability {
    /// Sum of the error probabilities seen so far.
    sum: f64,
    /// Number of probabilities that have been added to `sum`.
    count: u64,
}
13+
14+
#[derive(Default)]
15+
pub struct AvgQualityScorePerPosition {
16+
error_probability_sums_per_position: Vec<ErrorProbability>,
17+
}
18+
19+
impl Metric for AvgQualityScorePerPosition {
20+
fn visit(&mut self, record: &Record) -> io::Result<()> {
21+
let read_length = record.sequence().len();
22+
23+
if read_length > self.error_probability_sums_per_position.len() {
24+
self.error_probability_sums_per_position
25+
.resize(read_length, ErrorProbability::default());
26+
}
27+
28+
for (quality_score, error_probability) in record
29+
.quality_scores()
30+
.iter()
31+
.zip(&mut self.error_probability_sums_per_position)
32+
{
33+
let q = decode_score(*quality_score)?;
34+
let p = phred_score_to_error_probability(q);
35+
error_probability.sum += p;
36+
error_probability.count += 1;
37+
}
38+
39+
Ok(())
40+
}
41+
42+
fn println(&self) {
43+
let avg_quality_score_per_position: Vec<_> = self
44+
.error_probability_sums_per_position
45+
.iter()
46+
.map(|error_probability| {
47+
let n = error_probability.count as f64;
48+
let avg_error_probability = error_probability.sum / n;
49+
error_probability_to_phred_score(avg_error_probability)
50+
})
51+
.collect();
52+
53+
println!("{NAME}\t{avg_quality_score_per_position:?}");
54+
}
55+
}
56+
57+
/// Decodes a quality character into its numeric score by subtracting the
/// `b'!'` offset.
///
/// Only bytes in the printable range `b'!'..=b'~'` are accepted, giving scores
/// from 0 to 93.
///
/// # Errors
///
/// Returns an error of kind `io::ErrorKind::InvalidData` when `c` is below
/// `b'!'` or above `b'~'`.
fn decode_score(c: u8) -> io::Result<u8> {
    const OFFSET: u8 = b'!';
    const MAX: u8 = b'~';

    match c {
        // The subtraction cannot underflow: the pattern guarantees c >= OFFSET.
        OFFSET..=MAX => Ok(c - OFFSET),
        _ => Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "invalid quality score",
        )),
    }
}
63+
64+
// Phred quality score definition: Q = -10 * log10(P)
// https://en.wikipedia.org/wiki/Phred_quality_score#Definition
const BASE: f64 = 10.0;
const FACTOR: f64 = 10.0;

/// Converts a Phred quality score into the error probability it encodes.
fn phred_score_to_error_probability(n: u8) -> f64 {
    let exponent = -f64::from(n) / FACTOR;
    f64::powf(BASE, exponent)
}

/// Converts an error probability back into a (possibly fractional) Phred score.
fn error_probability_to_phred_score(p: f64) -> f64 {
    let log_p = f64::log10(p);
    -FACTOR * log_p
}
75+
76+
#[cfg(test)]
mod tests {
    use super::*;

    /// The lowest and highest printable quality characters decode to 0 and 93,
    /// and a byte below the offset is rejected as invalid data.
    #[test]
    fn test_decode_score() -> io::Result<()> {
        for (raw, expected) in [(b'!', 0), (b'~', 93)] {
            assert_eq!(decode_score(raw)?, expected);
        }

        let err = decode_score(0x00).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);

        Ok(())
    }

    /// Every 10 Phred points reduce the error probability by a factor of ten.
    #[test]
    fn test_phred_score_to_error_probability() {
        let cases = [(0, 1.0), (10, 0.1), (20, 0.01), (30, 0.001), (40, 0.0001)];

        for (score, probability) in cases {
            assert_eq!(phred_score_to_error_probability(score), probability);
        }
    }

    /// The inverse conversion maps the same probabilities back to their scores.
    #[test]
    fn test_error_probability_to_phred_score() {
        let cases = [
            (1.0, 0.0),
            (0.1, 10.0),
            (0.01, 20.0),
            (0.001, 30.0),
            (0.0001, 40.0),
        ];

        for (probability, score) in cases {
            assert_eq!(error_probability_to_phred_score(probability), score);
        }
    }
}

src/metrics/max_sequence_length.rs

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
use std::io;
2+
3+
use super::Metric;
4+
use crate::fastq::Record;
5+
6+
const NAME: &str = "max_sequence_length";
7+
8+
#[derive(Default)]
9+
pub struct MaxSequenceLength(usize);
10+
11+
impl Metric for MaxSequenceLength {
12+
fn visit(&mut self, record: &Record) -> io::Result<()> {
13+
let read_length = record.sequence().len();
14+
self.0 = self.0.max(read_length);
15+
Ok(())
16+
}
17+
18+
fn println(&self) {
19+
println!("{NAME}\t{}", self.0);
20+
}
21+
}
22+
23+
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh metric starts at zero and grows to the length of a visited sequence.
    #[test]
    fn test_visit() -> io::Result<()> {
        let mut longest = MaxSequenceLength::default();
        assert_eq!(longest.0, 0);

        longest.visit(&Record::new("", "ACGT", "", ""))?;
        assert_eq!(longest.0, 4);

        Ok(())
    }
}

src/metrics/metric.rs

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
use std::io;
2+
3+
use crate::fastq::Record;
4+
5+
pub trait Metric {
6+
fn visit(&mut self, record: &Record) -> io::Result<()>;
7+
fn println(&self);
8+
}

0 commit comments

Comments
 (0)