Skip to content

Commit dd803fc

Browse files
authored
Fix early return and counting of mismatches with no-calls in barcodes. (#48)
* fix: early return and counting of mismatches when n in barcodes.
1 parent 6ed98a3 commit dd803fc

File tree

2 files changed

+134
-2
lines changed

2 files changed

+134
-2
lines changed

src/bin/commands/demux.rs

+117
Original file line numberDiff line numberDiff line change
@@ -1373,6 +1373,123 @@ mod tests {
13731373
);
13741374
}
13751375

1376+
#[test]
1377+
fn test_demux_with_catchall_barcode() {
1378+
let tmp = TempDir::new().unwrap();
1379+
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
1380+
let s1_barcode = "NNNNNNN";
1381+
let sample_metadata = metadata_file(&tmp, &[s1_barcode]);
1382+
let input_files =
1383+
vec![fastq_file(&tmp, "ex", "ex", &[&(s1_barcode.to_owned() + &"A".repeat(100))])];
1384+
1385+
let output_dir = tmp.path().to_path_buf().join("output");
1386+
1387+
let demux_inputs = Demux {
1388+
inputs: input_files,
1389+
read_structures,
1390+
sample_metadata,
1391+
output_types: vec!['T'],
1392+
output: output_dir.clone(),
1393+
unmatched_prefix: "unmatched".to_owned(),
1394+
max_mismatches: 0,
1395+
min_mismatch_delta: 2,
1396+
threads: 5,
1397+
compression_level: 5,
1398+
skip_reasons: vec![],
1399+
};
1400+
demux_inputs.execute().unwrap();
1401+
1402+
let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
1403+
let unmatched_reads = read_fastq(&unmatched_path);
1404+
assert_eq!(unmatched_reads.len(), 0);
1405+
1406+
let output_path = output_dir.join("Sample0000.R1.fq.gz");
1407+
let fq_reads = read_fastq(&output_path);
1408+
1409+
assert_eq!(fq_reads.len(), 1);
1410+
assert_equal(
1411+
&fq_reads[0],
1412+
&OwnedRecord {
1413+
head: b"ex_0 1:N:0:NNNNNNN".to_vec(),
1414+
seq: "A".repeat(100).as_bytes().to_vec(),
1415+
qual: ";".repeat(100).as_bytes().to_vec(),
1416+
},
1417+
);
1418+
}
1419+
1420+
#[test]
1421+
fn test_demux_with_ns_in_barcode() {
1422+
let tmp = TempDir::new().unwrap();
1423+
let read_structures = vec![ReadStructure::from_str("7B+T").unwrap()];
1424+
let s1_barcode = "NNAAAAA";
1425+
let s2_barcode = "NNCCCCC";
1426+
let sample_metadata = metadata_file(&tmp, &[s1_barcode, s2_barcode]);
1427+
let input_files = vec![fastq_file(
1428+
&tmp,
1429+
"ex",
1430+
"ex",
1431+
&[
1432+
&("ANAAAAA".to_owned() + &"A".repeat(5)),
1433+
&("ANCCCCC".to_owned() + &"C".repeat(5)),
1434+
&("NNNAAAA".to_owned() + &"T".repeat(5)),
1435+
],
1436+
)];
1437+
1438+
let output_dir = tmp.path().to_path_buf().join("output");
1439+
1440+
let demux_inputs = Demux {
1441+
inputs: input_files,
1442+
read_structures,
1443+
sample_metadata,
1444+
output_types: vec!['T'],
1445+
output: output_dir.clone(),
1446+
unmatched_prefix: "unmatched".to_owned(),
1447+
max_mismatches: 0,
1448+
min_mismatch_delta: 0,
1449+
threads: 5,
1450+
compression_level: 5,
1451+
skip_reasons: vec![],
1452+
};
1453+
demux_inputs.execute().unwrap();
1454+
1455+
let output_path = output_dir.join("Sample0000.R1.fq.gz");
1456+
let fq_reads = read_fastq(&output_path);
1457+
assert_eq!(fq_reads.len(), 1);
1458+
assert_equal(
1459+
&fq_reads[0],
1460+
&OwnedRecord {
1461+
head: b"ex_0 1:N:0:ANAAAAA".to_vec(),
1462+
seq: "A".repeat(5).as_bytes().to_vec(),
1463+
qual: ";".repeat(5).as_bytes().to_vec(),
1464+
},
1465+
);
1466+
1467+
let output_path = output_dir.join("Sample0001.R1.fq.gz");
1468+
let fq_reads = read_fastq(&output_path);
1469+
assert_eq!(fq_reads.len(), 1);
1470+
assert_equal(
1471+
&fq_reads[0],
1472+
&OwnedRecord {
1473+
head: b"ex_1 1:N:0:ANCCCCC".to_vec(),
1474+
seq: "C".repeat(5).as_bytes().to_vec(),
1475+
qual: ";".repeat(5).as_bytes().to_vec(),
1476+
},
1477+
);
1478+
1479+
// Should not match since it has 3 no-calls, and the sample barcodes have at most 2 no-calls
1480+
let unmatched_path = output_dir.join("unmatched.R1.fq.gz");
1481+
let unmatched_reads = read_fastq(&unmatched_path);
1482+
assert_eq!(unmatched_reads.len(), 1);
1483+
assert_equal(
1484+
&unmatched_reads[0],
1485+
&OwnedRecord {
1486+
head: b"ex_2 1:N:0:NNNAAAA".to_vec(),
1487+
seq: "T".repeat(5).as_bytes().to_vec(),
1488+
qual: ";".repeat(5).as_bytes().to_vec(),
1489+
},
1490+
);
1491+
}
1492+
13761493
#[test]
13771494
fn test_demux_paired_reads_with_in_line_sample_barcodes() {
13781495
let tmp = TempDir::new().unwrap();

src/lib/barcode_matching.rs

+17-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ pub struct BarcodeMatcher {
2525
/// Note - this is to be replaced by a sample struct in task 3. For now we're keeping things
2626
/// very simple.
2727
sample_barcodes: Vec<BString>,
28+
/// The maximum number of Ns in any barcode in the set of sample barcodes
29+
max_ns_in_barcodes: usize,
2830
/// The maximum mismatches to match a sample barcode.
2931
max_mismatches: u8,
3032
/// The minimum difference between number of mismatches in the best and second best barcodes
@@ -56,12 +58,20 @@ impl BarcodeMatcher {
5658
"Sample barcode cannot be empty string"
5759
);
5860

61+
let mut max_ns_in_barcodes = 0;
5962
let modified_sample_barcodes = sample_barcodes
6063
.iter()
61-
.map(|barcode| BString::from(barcode.to_ascii_uppercase()))
64+
.map(|barcode| {
65+
let barcode = BString::from(barcode.to_ascii_uppercase());
66+
let num_ns = barcode.iter().filter(|&b| byte_is_nocall(*b)).count();
67+
max_ns_in_barcodes = max_ns_in_barcodes.max(num_ns);
68+
barcode
69+
})
6270
.collect::<Vec<_>>();
71+
6372
Self {
6473
sample_barcodes: modified_sample_barcodes,
74+
max_ns_in_barcodes,
6575
max_mismatches,
6676
min_mismatch_delta,
6777
use_cache,
@@ -132,7 +142,7 @@ impl BarcodeMatcher {
132142
return None;
133143
}
134144
let num_no_calls = read_bases.iter().filter(|&&b| byte_is_nocall(b)).count();
135-
if num_no_calls > self.max_mismatches as usize {
145+
if num_no_calls > (self.max_mismatches as usize) + self.max_ns_in_barcodes {
136146
None
137147
} else if self.use_cache {
138148
if let Some(cached_match) = self.cache.get(read_bases) {
@@ -207,6 +217,11 @@ mod tests {
207217
assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GANNACA".as_bytes()), 0,);
208218
}
209219

220+
#[test]
221+
fn all_ns_barcode_have_no_mismatches() {
222+
assert_eq!(BarcodeMatcher::count_mismatches("GANNACA".as_bytes(), "NNNNNNN".as_bytes()), 0,);
223+
}
224+
210225
#[test]
211226
fn find_two_mismatches() {
212227
assert_eq!(BarcodeMatcher::count_mismatches("GATTACA".as_bytes(), "GACCACA".as_bytes()), 2,);

0 commit comments

Comments
 (0)