Skip to content

Commit 10776b3

Browse files
committed
feat: various minor code optimizations
* replace ahash with rustc-hash. rustc-hash was empirically faster than both ahash and fxhash. * fix the usage text for threads, which incorrectly stated that the minimum number of threads was three; the minimum is five. This also changes the README.
1 parent 715735e commit 10776b3

File tree

7 files changed

+73
-65
lines changed

7 files changed

+73
-65
lines changed

Cargo.lock

+8-14
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ path = "src/bin/main.rs"
3030
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
3131

3232
[dependencies]
33-
ahash = "0.8.2"
3433
anyhow = "1.0.38"
3534
bstr = "1.0.1"
3635
clap = { version = "4.0.25", features = ["derive"] }
@@ -49,6 +48,7 @@ serde-aux = "4.1.2"
4948
seq_io = "0.3.1"
5049
thiserror = "1.0.37"
5150
proglog = {version = "0.3.0", features = ["pretty_counts"] }
51+
rustc-hash = "2.1.1"
5252

5353
[dev-dependencies]
5454
csv = "1.1.6"

README.md

+5-4
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,13 @@ be concatenated using the `-` delimiter and placed in the given SAM record tag (
6262
default). Similarly, the sample barcode bases from the given read will be placed in the `BC`
6363
tag.
6464

65-
Metadata about the samples should be given as a headered metadata TSV file with at least the
65+
Metadata about the samples should be given as a headered metadata TSV file with at least the
6666
following two columns present:
6767

68-
1. `sample_id` - the id of the sample or library.
68+
1. `sample_id` - the id of the sample or library.
6969
2. `barcode` - the expected barcode sequence associated with the `sample_id`.
7070

71-
For reads containing multiple barcodes (such as dual-indexed reads), all barcodes should be
71+
For reads containing multiple barcodes (such as dual-indexed reads), all barcodes should be
7272
concatenated together in the order they are read and stored in the `barcode` field.
7373

7474
The read structures will be used to extract the observed sample barcode, template bases, and
@@ -80,6 +80,7 @@ An observed barcode matches an expected barcode if all the following are true:
8080
mismatches (see `--max-mismatches`).
8181
2. The difference between number of mismatches in the best and second best barcodes is greater
8282
than or equal to the minimum mismatch delta (`--min-mismatch-delta`).
83+
8384
The expected barcode sequence may contain Ns, which are not counted as mismatches regardless
8485
of the observed base (e.g. the expected barcode `AAN` will have zero mismatches relative to
8586
both the observed barcodes `AAA` and `AAN`).
@@ -154,7 +155,7 @@ Options:
154155
[default: 2]
155156

156157
-t, --threads <THREADS>
157-
The number of threads to use. Cannot be less than 3
158+
The number of threads to use. Cannot be less than 5
158159

159160
[default: 8]
160161

rust-toolchain.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
[toolchain]
2-
channel = "1.65.0"
2+
channel = "1.85.0"
33
components = ["rustfmt", "clippy"]

src/bin/commands/demux.rs

+32-17
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ impl ReadSet {
9393
const SPACE: u8 = b' ';
9494
const COLON: u8 = b':';
9595
const PLUS: u8 = b'+';
96+
const READ_NUMBERS: &[u8] = b"12345678";
9697

9798
/// Produces an iterator over references to the template segments stored in this ``ReadSet``.
9899
fn template_segments(&self) -> SegmentIter {
@@ -213,7 +214,12 @@ impl ReadSet {
213214
None => {
214215
// If no pre-existing comment, assume the read is a passing filter, non-control
215216
// read and generate a comment for it (sample barcode is added below).
216-
write!(writer, "{}:N:0:", read_num)?;
217+
if read_num < Self::READ_NUMBERS.len() {
218+
writer.write_all(&[Self::READ_NUMBERS[read_num - 1]])?;
219+
write!(writer, ":N:0:")?;
220+
} else {
221+
write!(writer, "{}:N:0:", read_num)?;
222+
}
217223
}
218224
Some(chars) => {
219225
// Else check it's a 4-part name... fix the read number at the front and
@@ -239,7 +245,11 @@ impl ReadSet {
239245
&chars[first_colon_idx + 1..chars.len()]
240246
};
241247

242-
write!(writer, "{}:", read_num)?;
248+
if read_num < Self::READ_NUMBERS.len() {
249+
writer.write_all(&[Self::READ_NUMBERS[read_num - 1], b':'])?;
250+
} else {
251+
write!(writer, "{}:", read_num)?;
252+
}
243253
writer.write_all(remainder)?;
244254

245255
if *remainder.last().unwrap() != Self::COLON {
@@ -518,13 +528,13 @@ impl DemuxMetric {
518528
/// default). Similarly, the sample barcode bases from the given read will be placed in the `BC`
519529
/// tag.
520530
///
521-
/// Metadata about the samples should be given as a headered metadata TSV file with at least the
531+
/// Metadata about the samples should be given as a headered metadata TSV file with at least the
522532
/// following two columns present:
523-
///
524-
/// 1. `sample_id` - the id of the sample or library.
533+
///
534+
/// 1. `sample_id` - the id of the sample or library.
525535
/// 2. `barcode` - the expected barcode sequence associated with the `sample_id`.
526-
///
527-
/// For reads containing multiple barcodes (such as dual-indexed reads), all barcodes should be
536+
///
537+
/// For reads containing multiple barcodes (such as dual-indexed reads), all barcodes should be
528538
/// concatenated together in the order they are read and stored in the `barcode` field.
529539
///
530540
/// The read structures will be used to extract the observed sample barcode, template bases, and
@@ -536,6 +546,7 @@ impl DemuxMetric {
536546
/// mismatches (see `--max-mismatches`).
537547
/// 2. The difference between number of mismatches in the best and second best barcodes is greater
538548
/// than or equal to the minimum mismatch delta (`--min-mismatch-delta`).
549+
///
539550
/// The expected barcode sequence may contain Ns, which are not counted as mismatches regardless
540551
/// of the observed base (e.g. the expected barcode `AAN` will have zero mismatches relative to
541552
/// both the observed barcodes `AAA` and `AAN`).
@@ -612,7 +623,7 @@ pub(crate) struct Demux {
612623
#[clap(long, short = 'd', default_value = "2")]
613624
min_mismatch_delta: usize,
614625

615-
/// The number of threads to use. Cannot be less than 3.
626+
/// The number of threads to use. Cannot be less than 5.
616627
#[clap(long, short = 't', default_value = "8")]
617628
threads: usize,
618629

@@ -659,9 +670,12 @@ impl Demux {
659670
read_structures.iter().map(|s| s.segments_by_type(*output_type).count()).sum();
660671

661672
for idx in 1..=segment_count {
662-
output_type_writers.push(BufWriter::new(File::create(
663-
output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
664-
)?));
673+
output_type_writers.push(BufWriter::with_capacity(
674+
65_536usize,
675+
File::create(
676+
output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
677+
)?,
678+
));
665679
}
666680

667681
match output_type {
@@ -741,7 +755,7 @@ impl Demux {
741755
let mut new_sample_barcode_writers = None;
742756
let mut new_molecular_barcode_writers = None;
743757

744-
for (optional_ws, target) in vec![
758+
for (optional_ws, target) in [
745759
(template_writers, &mut new_template_writers),
746760
(barcode_writers, &mut new_sample_barcode_writers),
747761
(mol_writers, &mut new_molecular_barcode_writers),
@@ -894,7 +908,7 @@ impl Command for Demux {
894908
);
895909

896910
let mut fq_iterators = fq_sources
897-
.zip(self.read_structures.clone().into_iter())
911+
.zip(self.read_structures.clone())
898912
.map(|(source, read_structure)| {
899913
ReadSetIterator::new(read_structure, source, self.skip_reasons.clone())
900914
.read_ahead(1000, 1000)
@@ -1181,6 +1195,7 @@ mod tests {
11811195
skip_reasons: vec![],
11821196
};
11831197
let demux_result = demux_inputs.execute();
1198+
#[allow(clippy::permissions_set_readonly_false)]
11841199
permissions.set_readonly(false);
11851200
fs::set_permissions(tmp.path(), permissions).unwrap();
11861201
demux_result.unwrap();
@@ -1875,11 +1890,11 @@ mod tests {
18751890
let read_structures =
18761891
vec![ReadStructure::from_str("+T").unwrap(), ReadStructure::from_str("7B").unwrap()];
18771892

1878-
let records = vec![
1893+
let records = [
18791894
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], // barcode too short
18801895
vec!["CCCCCCC", SAMPLE1_BARCODE], // barcode the correct length
18811896
vec!["", SAMPLE1_BARCODE], // template bases too short
1882-
vec!["G", SAMPLE1_BARCODE], // barcode the correct length
1897+
vec!["G", SAMPLE1_BARCODE],
18831898
];
18841899

18851900
let input_files = vec![
@@ -1911,11 +1926,11 @@ mod tests {
19111926
let read_structures =
19121927
vec![ReadStructure::from_str("+T").unwrap(), ReadStructure::from_str("7B").unwrap()];
19131928

1914-
let records = vec![
1929+
let records = [
19151930
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], // barcode too short
19161931
vec!["CCCCCCC", SAMPLE1_BARCODE], // barcode the correct length
19171932
vec!["", SAMPLE1_BARCODE], // template bases too short
1918-
vec!["G", SAMPLE1_BARCODE], // barcode the correct length
1933+
vec!["G", SAMPLE1_BARCODE],
19191934
];
19201935

19211936
let input_files = vec![

src/lib/barcode_matching.rs

+20-22
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
use super::byte_is_nocall;
22
use super::samples::Sample;
3-
use ahash::HashMap as AHashMap;
4-
use ahash::HashMapExt;
5-
6-
const STARTING_CACHE_SIZE: usize = 1_000_000;
3+
use rustc_hash::FxHashMap;
74

85
/// The struct that contains the info related to the best and next best sample barcode match.
96
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -33,7 +30,7 @@ pub struct BarcodeMatcher {
3330
/// If true will attempt to use the cache when matching.
3431
use_cache: bool,
3532
/// Caching struct for storing results of previous matches
36-
cache: AHashMap<Vec<u8>, BarcodeMatch>,
33+
cache: FxHashMap<Vec<u8>, BarcodeMatch>,
3734
}
3835

3936
impl BarcodeMatcher {
@@ -60,7 +57,7 @@ impl BarcodeMatcher {
6057
let mut modified_samples = samples.to_vec();
6158
for sample in modified_samples.iter_mut() {
6259
sample.barcode = sample.barcode.to_ascii_uppercase();
63-
let num_ns = sample.barcode.as_bytes().iter().filter(|&b| byte_is_nocall(*b)).count();
60+
let num_ns = sample.barcode_bytes.iter().filter(|&b| byte_is_nocall(*b)).count();
6461
max_ns_in_barcodes = max_ns_in_barcodes.max(num_ns);
6562
}
6663
Self {
@@ -69,28 +66,29 @@ impl BarcodeMatcher {
6966
max_mismatches,
7067
min_mismatch_delta,
7168
use_cache,
72-
cache: AHashMap::with_capacity(STARTING_CACHE_SIZE),
69+
cache: FxHashMap::default(),
7370
}
7471
}
7572

7673
/// Counts the number of bases that differ between two byte arrays.
7774
fn count_mismatches(observed_bases: &[u8], sample: &Sample) -> u8 {
78-
let expected_bases = sample.barcode.as_bytes();
79-
let observed_string =
80-
std::str::from_utf8(observed_bases).expect("Observed bases are not valid UTF-8");
81-
assert_eq!(
82-
observed_bases.len(),
83-
expected_bases.len(),
84-
"Read barcode ({}) length ({}) differs from expected barcode ({}) length ({}) for sample {}",
85-
observed_string,
86-
observed_bases.len(),
87-
sample.barcode,
88-
expected_bases.len(),
89-
sample.sample_id
90-
);
75+
if sample.barcode_bytes.len() != observed_bases.len() {
76+
let observed_string =
77+
std::str::from_utf8(observed_bases).expect("Observed bases are not valid UTF-8");
78+
panic!(
79+
"Read barcode ({}) length ({}) differs from expected barcode ({}) length ({}) for sample {}",
80+
observed_string,
81+
observed_bases.len(),
82+
sample.barcode,
83+
sample.barcode_bytes.len(),
84+
sample.sample_id
85+
);
86+
}
9187
let mut count: usize = 0;
92-
for (&expected_base, &observed_base) in expected_bases.iter().zip(observed_bases.iter()) {
93-
if !byte_is_nocall(expected_base) && expected_base != observed_base {
88+
for (&expected_base, &observed_base) in
89+
sample.barcode_bytes.iter().zip(observed_bases.iter())
90+
{
91+
if expected_base != observed_base && !byte_is_nocall(expected_base) {
9492
count += 1;
9593
}
9694
}

src/lib/samples.rs

+6-6
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ pub struct Sample {
1818
pub sample_id: String,
1919
/// DNA barcode associated with the sample
2020
pub barcode: String,
21+
/// DNA barcode as a byte vector (a copy of the bytes of `barcode`, as populated by `Sample::new`)
22+
#[serde(skip_deserializing)]
23+
pub barcode_bytes: Vec<u8>,
2124
/// index of the sample in the [`SampleGroup`] object, used for syncing indices across
2225
/// different structs
2326
#[serde(skip_deserializing)]
@@ -52,7 +55,8 @@ impl Sample {
5255
barcode.as_bytes().iter().all(|&b| is_valid_base(b)),
5356
"All sample barcode bases must be one of A, C, G, or T"
5457
);
55-
Self { sample_id: name, barcode, ordinal }
58+
let barcode_bytes = barcode.as_bytes().to_vec();
59+
Self { sample_id: name, barcode, barcode_bytes, ordinal }
5660
}
5761

5862
/// Returns the header line expected by serde when deserializing
@@ -293,11 +297,7 @@ mod tests {
293297
let barcode = "GATTACA".to_owned();
294298
let ordinal = 0;
295299
let sample = Sample::new(ordinal, name.clone(), barcode.clone());
296-
assert_eq!(
297-
Sample { sample_id: name, barcode, ordinal },
298-
sample,
299-
"Sample differed from expectation"
300-
);
300+
assert_eq!(Sample::new(ordinal, name, barcode), sample, "Sample differed from expectation");
301301
}
302302

303303
// ############################################################################################

0 commit comments

Comments
 (0)