Skip to content

Commit 4c6a22d

Browse files
committed
feat: various minor code optimizations
* replace ahash with rustc-hash. rustc-hash was faster than ahash and fxhash empirically. * optimize how the read name, specifically the read number, is written. * increase the buffer size by 4x for the output writers to 65K. * delay creating the read bases from bytes for the display in a panic until it is needed. This occurred during every comparison of the read sample barcode with the expected sample barcode, so while not individually expensive, it was expensive enough in aggregate. * create the necessary bytes for each sample barcode only once, since this was being done every time we compared barcodes. Miscellaneous changes: * fix the usage for threads, where it incorrectly said the minimum number of threads was three, and should be five. This also changes the README. * update rust toolchain to 1.85 (from 1.65.0). This requires minor code changes throughout based on clippy requirements and formatting.
1 parent c0de006 commit 4c6a22d

File tree

8 files changed

+437
-545
lines changed

8 files changed

+437
-545
lines changed

Cargo.lock

+398-514
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ path = "src/bin/main.rs"
3030
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
3131

3232
[dependencies]
33-
ahash = "0.8.11"
3433
anyhow = "1.0.38"
3534
bstr = "1.0.1"
3635
clap = { version = "4.0.25", features = ["derive"] }
@@ -49,6 +48,7 @@ serde-aux = "4.1.2"
4948
seq_io = "0.3.1"
5049
thiserror = "1.0.37"
5150
proglog = {version = "0.3.0", features = ["pretty_counts"] }
51+
rustc-hash = "2.1.1"
5252

5353
[dev-dependencies]
5454
csv = "1.1.6"

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ Options:
161161
[default: 2]
162162

163163
-t, --threads <THREADS>
164-
The number of threads to use. Cannot be less than 3
164+
The number of threads to use. Cannot be less than 5
165165

166166
[default: 8]
167167

@@ -176,10 +176,10 @@ Options:
176176
1. `too-few-bases`: there are too few bases or qualities to extract given the read structures. For example, if a read is 8bp long but the read structure is `10B`, or if a read is empty and the read structure is `+T`.
177177

178178
-h, --help
179-
Print help information (use `-h` for a summary)
179+
Print help (see a summary with '-h')
180180

181181
-V, --version
182-
Print version information
182+
Print version
183183
```
184184
<!-- end usage -->
185185

rust-toolchain.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
[toolchain]
2-
channel = "1.65.0"
2+
channel = "1.85.0"
33
components = ["rustfmt", "clippy"]

src/bin/commands/demux.rs

+22-8
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ impl ReadSet {
9393
const SPACE: u8 = b' ';
9494
const COLON: u8 = b':';
9595
const PLUS: u8 = b'+';
96+
const READ_NUMBERS: &[u8] = b"12345678";
9697

9798
/// Produces an iterator over references to the template segments stored in this ``ReadSet``.
9899
fn template_segments(&self) -> SegmentIter {
@@ -213,7 +214,12 @@ impl ReadSet {
213214
None => {
214215
// If no pre-existing comment, assume the read is a passing filter, non-control
215216
// read and generate a comment for it (sample barcode is added below).
216-
write!(writer, "{}:N:0:", read_num)?;
217+
if read_num < Self::READ_NUMBERS.len() {
218+
writer.write_all(&[Self::READ_NUMBERS[read_num - 1]])?;
219+
write!(writer, ":N:0:")?;
220+
} else {
221+
write!(writer, "{}:N:0:", read_num)?;
222+
}
217223
}
218224
Some(chars) => {
219225
// Else check it's a 4-part name... fix the read number at the front and
@@ -239,7 +245,11 @@ impl ReadSet {
239245
&chars[first_colon_idx + 1..chars.len()]
240246
};
241247

242-
write!(writer, "{}:", read_num)?;
248+
if read_num < Self::READ_NUMBERS.len() {
249+
writer.write_all(&[Self::READ_NUMBERS[read_num - 1], b':'])?;
250+
} else {
251+
write!(writer, "{}:", read_num)?;
252+
}
243253
writer.write_all(remainder)?;
244254

245255
if *remainder.last().unwrap() != Self::COLON {
@@ -619,7 +629,7 @@ pub(crate) struct Demux {
619629
#[clap(long, short = 'd', default_value = "2")]
620630
min_mismatch_delta: usize,
621631

622-
/// The number of threads to use. Cannot be less than 3.
632+
/// The number of threads to use. Cannot be less than 5.
623633
#[clap(long, short = 't', default_value = "8")]
624634
threads: usize,
625635

@@ -666,9 +676,12 @@ impl Demux {
666676
read_structures.iter().map(|s| s.segments_by_type(*output_type).count()).sum();
667677

668678
for idx in 1..=segment_count {
669-
output_type_writers.push(BufWriter::new(File::create(
670-
output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
671-
)?));
679+
output_type_writers.push(BufWriter::with_capacity(
680+
65_536usize,
681+
File::create(
682+
output_dir.join(format!("{}.{}{}.fq.gz", prefix, file_type_code, idx)),
683+
)?,
684+
));
672685
}
673686

674687
match output_type {
@@ -1189,6 +1202,7 @@ mod tests {
11891202
skip_reasons: vec![],
11901203
};
11911204
let demux_result = demux_inputs.execute();
1205+
#[allow(clippy::permissions_set_readonly_false)]
11921206
permissions.set_readonly(false);
11931207
fs::set_permissions(tmp.path(), permissions).unwrap();
11941208
demux_result.unwrap();
@@ -1963,7 +1977,7 @@ mod tests {
19631977
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], // barcode too short
19641978
vec!["CCCCCCC", SAMPLE1_BARCODE], // barcode the correct length
19651979
vec!["", SAMPLE1_BARCODE], // template basese too short
1966-
vec!["G", SAMPLE1_BARCODE], // barcode the correct length
1980+
vec!["G", SAMPLE1_BARCODE],
19671981
];
19681982

19691983
let input_files = vec![
@@ -1999,7 +2013,7 @@ mod tests {
19992013
vec!["AAAAAAA", &SAMPLE1_BARCODE[0..7]], // barcode too short
20002014
vec!["CCCCCCC", SAMPLE1_BARCODE], // barcode the correct length
20012015
vec!["", SAMPLE1_BARCODE], // template basese too short
2002-
vec!["G", SAMPLE1_BARCODE], // barcode the correct length
2016+
vec!["G", SAMPLE1_BARCODE],
20032017
];
20042018

20052019
let input_files = vec![

src/lib/barcode_matching.rs

+5-7
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@ use crate::encode;
66
use super::byte_is_nocall;
77
use super::samples::Sample;
88
use crate::bitenc::BitEnc;
9-
use ahash::HashMap as AHashMap;
10-
use ahash::HashMapExt;
9+
use rustc_hash::FxHashMap;
1110

1211
const STARTING_CACHE_SIZE: usize = 1_000_000;
1312

@@ -41,7 +40,7 @@ pub struct BarcodeMatcher {
4140
/// If true will attempt to use the cache when matching.
4241
use_cache: bool,
4342
/// Caching struct for storing results of previous matches
44-
cache: AHashMap<Vec<u8>, BarcodeMatch>,
43+
cache: FxHashMap<Vec<u8>, BarcodeMatch>,
4544
}
4645

4746
impl BarcodeMatcher {
@@ -81,7 +80,7 @@ impl BarcodeMatcher {
8180
max_mismatches,
8281
min_mismatch_delta,
8382
use_cache,
84-
cache: AHashMap::with_capacity(STARTING_CACHE_SIZE),
83+
cache: FxHashMap::with_capacity_and_hasher(STARTING_CACHE_SIZE, Default::default()),
8584
}
8685
}
8786

@@ -94,9 +93,7 @@ impl BarcodeMatcher {
9493
) -> u8 {
9594
if observed_bases.nr_symbols() != expected_bases.nr_symbols() {
9695
let observed_string = decode(observed_bases);
97-
assert_eq!(
98-
observed_bases.nr_symbols(),
99-
expected_bases.nr_symbols(),
96+
panic!(
10097
"Read barcode ({}) length ({}) differs from expected barcode ({}) length ({}) for sample {}",
10198
observed_string,
10299
observed_bases.nr_symbols(),
@@ -196,6 +193,7 @@ mod tests {
196193
fn barcode_to_sample(barcode: &str, idx: usize) -> Sample {
197194
Sample {
198195
barcode: barcode.to_string(),
196+
barcode_bytes: barcode.as_bytes().to_vec(),
199197
sample_id: format!("sample_{idx}").to_string(),
200198
ordinal: idx,
201199
}

src/lib/mod.rs

+1-5
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,7 @@ pub fn encode(bases: &[u8]) -> BitEnc {
5353
IUPAC_MASKS[b'N' as usize]
5454
} else {
5555
let value = base.to_ascii_uppercase() as usize;
56-
if value < 256 {
57-
IUPAC_MASKS[value]
58-
} else {
59-
0
60-
}
56+
if value < 256 { IUPAC_MASKS[value] } else { 0 }
6157
};
6258
vec.push(bit);
6359
}

src/lib/samples.rs

+6-6
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ pub struct Sample {
1919
pub sample_id: String,
2020
/// DNA barcode associated with the sample
2121
pub barcode: String,
22+
/// DNA barcode as a byte
23+
#[serde(skip_deserializing)]
24+
pub barcode_bytes: Vec<u8>,
2225
/// index of the sample in the [`SampleGroup`] object, used for syncing indices across
2326
/// different structs
2427
#[serde(skip_deserializing)]
@@ -53,7 +56,8 @@ impl Sample {
5356
barcode.as_bytes().iter().all(|&b| is_valid_iupac(b)),
5457
"All sample barcode bases must be one of A, C, G, T, U, R, Y, S, W, K, M, D, V, H, B, N"
5558
);
56-
Self { sample_id: name, barcode, ordinal }
59+
let barcode_bytes = barcode.as_bytes().to_vec();
60+
Self { sample_id: name, barcode, barcode_bytes, ordinal }
5761
}
5862

5963
/// Returns the header line expected by serde when deserializing
@@ -294,11 +298,7 @@ mod tests {
294298
let barcode = "GATTACA".to_owned();
295299
let ordinal = 0;
296300
let sample = Sample::new(ordinal, name.clone(), barcode.clone());
297-
assert_eq!(
298-
Sample { sample_id: name, barcode, ordinal },
299-
sample,
300-
"Sample differed from expectation"
301-
);
301+
assert_eq!(Sample::new(ordinal, name, barcode), sample, "Sample differed from expectation");
302302
}
303303

304304
// ############################################################################################

0 commit comments

Comments
 (0)