Skip to content

Commit 5eab9a8

Browse files
committed
use paraseq in cmd barcode
1 parent c4b3471 commit 5eab9a8

3 files changed

Lines changed: 116 additions & 146 deletions

File tree

src/cli/barcode.rs

Lines changed: 103 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,40 @@
1-
use bio::io::fastq;
2-
use log::{error, info, warn};
3-
1+
use crate::{
2+
cli::misc::{reverse_complement, write_record},
3+
errors::FqkitError,
4+
utils::{file_reader, file_writer_append},
5+
};
6+
use log::{error, info};
7+
use paraseq::fastq;
48
use std::{
59
collections::HashMap,
610
io::BufRead,
711
path::{Path, PathBuf},
812
};
913

10-
use crate::{
11-
errors::FqkitError,
12-
utils::{file_reader, file_writer_append},
13-
};
14-
15-
fn barcode_list(file: &String, rev_comp: bool) -> Result<HashMap<String, String>, FqkitError> {
14+
fn barcode_list(file: &String, rev_comp: bool) -> Result<HashMap<Vec<u8>, String>, FqkitError> {
1615
let mut maps = HashMap::new();
17-
let mut error_flag = "";
1816
let fp = file_reader(Some(file))?;
1917

20-
if rev_comp {
21-
for line in fp.lines().map_while(Result::ok) {
22-
let item = line.split('\t').collect::<Vec<&str>>(); // barcode => sample
23-
let bar: String = item[0]
18+
for line in fp.lines().map_while(Result::ok) {
19+
let item = line.split('\t').collect::<Vec<&str>>(); // barcode => sample
20+
let bar = if rev_comp {
21+
reverse_complement(item[0].as_bytes())
22+
} else {
23+
item[0]
2424
.chars()
2525
.rev()
26-
.map(|x| match x {
27-
'A' => 'T',
28-
'G' => 'C',
29-
'T' => 'A',
30-
'C' => 'G',
31-
'a' => 'T',
32-
'g' => 'C',
33-
't' => 'A',
34-
'c' => 'G',
35-
_ => {
36-
warn!("invalid barcode base in sample {}, skipped", item[1]);
37-
error_flag = "err";
38-
'X'
39-
}
40-
})
41-
.collect();
42-
if error_flag != "err" {
43-
maps.entry(bar).or_insert(item[1].to_string());
44-
}
45-
error_flag = "";
46-
}
47-
} else {
48-
for line in fp.lines().map_while(Result::ok) {
49-
let item = line.split('\t').collect::<Vec<&str>>();
50-
let bar: String = item[0]
51-
.chars()
52-
.map(|x| match x {
53-
'A' => 'A',
54-
'T' => 'T',
55-
'G' => 'G',
56-
'C' => 'C',
57-
'a' => 'A',
58-
't' => 'T',
59-
'g' => 'G',
60-
'c' => 'C',
61-
_ => {
62-
warn!("invalid barcode base in sample {}, skipped", item[1]);
63-
error_flag = "err";
64-
'X'
65-
}
66-
})
67-
.collect();
68-
if error_flag != "err" {
69-
maps.entry(bar).or_insert(item[1].to_string());
70-
}
71-
error_flag = "";
72-
}
26+
.collect::<String>()
27+
.as_bytes()
28+
.to_vec()
29+
};
30+
maps.entry(bar).or_insert(item[1].to_string());
7331
}
7432
Ok(maps)
7533
}
7634

7735
#[inline]
78-
fn hamming_dis(bar: &str, seq: &[u8]) -> usize {
79-
let mut dist = 0usize;
80-
let bar = bar.as_bytes();
81-
for i in 0..bar.len() {
82-
if bar[i] != seq[i] {
83-
dist += 1;
84-
}
85-
}
86-
dist
36+
fn hamming_dis(bar: &[u8], seq: &[u8]) -> usize {
37+
bar.iter().zip(seq.iter()).filter(|(a, b)| a != b).count()
8738
}
8839

8940
#[allow(clippy::too_many_arguments)]
@@ -127,95 +78,104 @@ pub fn split_fq(
12778

12879
let mut fq_hand = Vec::new();
12980
for (bar_seq, name) in maps {
130-
let fq1 = if gzip {
131-
PathBuf::from(outdir).join(format!("{}_1.fq.gz", name))
81+
let (fq1, fq2, bar) = if gzip {
82+
(
83+
PathBuf::from(outdir).join(format!("{}_1.fq.gz", name)),
84+
PathBuf::from(outdir).join(format!("{}_2.fq.gz", name)),
85+
PathBuf::from(outdir).join(format!("{}_barcode.fq.gz", name)),
86+
)
13287
} else if bzip2 {
133-
PathBuf::from(outdir).join(format!("{}_1.fq.bz2", name))
88+
(
89+
PathBuf::from(outdir).join(format!("{}_1.fq.bz2", name)),
90+
PathBuf::from(outdir).join(format!("{}_2.fq.bz2", name)),
91+
PathBuf::from(outdir).join(format!("{}_barcode.fq.bz2", name)),
92+
)
13493
} else if xz {
135-
PathBuf::from(outdir).join(format!("{}_1.fq.xz", name))
94+
(
95+
PathBuf::from(outdir).join(format!("{}_1.fq.xz", name)),
96+
PathBuf::from(outdir).join(format!("{}_2.fq.xz", name)),
97+
PathBuf::from(outdir).join(format!("{}_barcode.fq.xz", name)),
98+
)
13699
} else {
137-
PathBuf::from(outdir).join(format!("{}_1.fq", name))
138-
};
139-
let fq2 = if gzip {
140-
PathBuf::from(outdir).join(format!("{}_2.fq.gz", name))
141-
} else if bzip2 {
142-
PathBuf::from(outdir).join(format!("{}_2.fq.bz2", name))
143-
} else if xz {
144-
PathBuf::from(outdir).join(format!("{}_2.fq.xz", name))
145-
} else {
146-
PathBuf::from(outdir).join(format!("{}_2.fq", name))
147-
};
148-
let bar = if gzip {
149-
PathBuf::from(outdir).join(format!("{}_barcode.fq.gz", name))
150-
} else if bzip2 {
151-
PathBuf::from(outdir).join(format!("{}_barcode.fq.bz2", name))
152-
} else if xz {
153-
PathBuf::from(outdir).join(format!("{}_barcode.fq.xz", name))
154-
} else {
155-
PathBuf::from(outdir).join(format!("{}_barcode.fq", name))
100+
(
101+
PathBuf::from(outdir).join(format!("{}_1.fq", name)),
102+
PathBuf::from(outdir).join(format!("{}_2.fq", name)),
103+
PathBuf::from(outdir).join(format!("{}_barcode.fq", name)),
104+
)
156105
};
157106

158-
let fh1 = fastq::Writer::new(file_writer_append(&fq1, compression_level)?);
159-
let fh2 = fastq::Writer::new(file_writer_append(&fq2, compression_level)?);
160-
let fhb = fastq::Writer::new(file_writer_append(&bar, compression_level)?);
161-
let len = bar_seq.len();
162-
fq_hand.push((bar_seq, len, fh1, fh2, fhb));
107+
let fh1 = file_writer_append(&fq1, compression_level)?;
108+
let fh2 = file_writer_append(&fq2, compression_level)?;
109+
let fhb = file_writer_append(&bar, compression_level)?;
110+
fq_hand.push((bar_seq.clone(), bar_seq.len(), fh1, fh2, fhb));
163111
}
164112

165-
let bar_count = fq_hand.len();
166-
let fq1_reader = fastq::Reader::new(file_reader(Some(big_fq1))?);
167-
let fq2_reader = fastq::Reader::new(file_reader(Some(big_fq2))?);
168-
let (mut read_pair, mut get_pair) = (0u64, 0u64);
169-
170113
info!("reading from read1 file: {}", big_fq1);
114+
let mut fq1_reader = fastq::Reader::new(file_reader(Some(big_fq1))?);
171115
info!("reading from read2 file: {}", big_fq2);
116+
let mut fq2_reader = fastq::Reader::new(file_reader(Some(big_fq2))?);
117+
let mut rset1 = fastq::RecordSet::default();
118+
let mut rset2 = fastq::RecordSet::default();
119+
let bar_count = fq_hand.len();
120+
let (mut read_pair, mut get_pair) = (0u64, 0u64);
172121
info!("barcode position mode: {}", mode);
173122

174123
if mode == 2 {
175-
for (rec1, rec2) in fq1_reader
176-
.records()
177-
.flatten()
178-
.zip(fq2_reader.records().flatten())
179-
{
180-
let read_len = rec2.seq().len();
181-
read_pair += 1;
182-
for idx in 0..bar_count {
183-
if let Some((bar_seq, bar_len, fh1, fh2, fhb)) = fq_hand.get_mut(idx) {
184-
let pos_index = read_len - *bar_len;
185-
let read_part1 = &rec2.seq()[0..pos_index];
186-
let read_part2 = &rec2.seq()[pos_index..];
187-
let qual_part1 = &rec2.qual()[0..pos_index];
188-
let qual_part2 = &rec2.qual()[pos_index..];
189-
if hamming_dis(bar_seq, read_part2) <= mismatch {
190-
get_pair += 1;
191-
fh1.write(rec1.id(), rec1.desc(), rec1.seq(), rec1.qual())?;
192-
fh2.write(rec2.id(), rec2.desc(), read_part1, qual_part1)?;
193-
fhb.write(rec2.id(), rec2.desc(), read_part2, qual_part2)?;
194-
break;
124+
while rset1.fill(&mut fq1_reader)? && rset2.fill(&mut fq2_reader)? {
125+
for (rec1, rec2) in rset1
126+
.iter()
127+
.map_while(Result::ok)
128+
.zip(rset2.iter().map_while(Result::ok))
129+
{
130+
let read_len = rec2.seq().len();
131+
read_pair += 1;
132+
133+
for idx in 0..bar_count {
134+
if let Some((bar_seq, bar_len, fh1, fh2, fhb)) = fq_hand.get_mut(idx) {
135+
if rec2.seq().len() < *bar_len {
136+
continue;
137+
}
138+
let pos_index = read_len - *bar_len;
139+
let read_part1 = &rec2.seq()[0..pos_index];
140+
let read_part2 = &rec2.seq()[pos_index..];
141+
let qual_part1 = &rec2.qual()[0..pos_index];
142+
let qual_part2 = &rec2.qual()[pos_index..];
143+
if hamming_dis(bar_seq, read_part2) <= mismatch {
144+
get_pair += 1;
145+
write_record(fh1, rec1.id(), rec1.seq(), rec1.qual())?;
146+
write_record(fh2, rec2.id(), read_part1, qual_part1)?;
147+
write_record(fhb, rec2.id(), read_part2, qual_part2)?;
148+
break;
149+
}
195150
}
196151
}
197152
}
198153
}
199154
} else if mode == 1 {
200-
for (rec1, rec2) in fq1_reader
201-
.records()
202-
.flatten()
203-
.zip(fq2_reader.records().flatten())
204-
{
205-
read_pair += 1;
206-
for idx in 0..bar_count {
207-
if let Some((bar_seq, bar_len, fh1, fh2, fhb)) = fq_hand.get_mut(idx) {
208-
let pos_index = *bar_len;
209-
let read_part1 = &rec2.seq()[0..pos_index];
210-
let read_part2 = &rec2.seq()[pos_index..];
211-
let qual_part1 = &rec2.qual()[0..pos_index];
212-
let qual_part2 = &rec2.qual()[pos_index..];
213-
if hamming_dis(bar_seq, read_part1) <= mismatch {
214-
get_pair += 1;
215-
fh1.write(rec1.id(), rec1.desc(), rec1.seq(), rec1.qual())?;
216-
fh2.write(rec2.id(), rec2.desc(), read_part2, qual_part2)?;
217-
fhb.write(rec2.id(), rec2.desc(), read_part1, qual_part1)?;
218-
break;
155+
while rset1.fill(&mut fq1_reader)? && rset2.fill(&mut fq2_reader)? {
156+
for (rec1, rec2) in rset1
157+
.iter()
158+
.map_while(Result::ok)
159+
.zip(rset2.iter().map_while(Result::ok))
160+
{
161+
read_pair += 1;
162+
for idx in 0..bar_count {
163+
if let Some((bar_seq, bar_len, fh1, fh2, fhb)) = fq_hand.get_mut(idx) {
164+
if rec2.seq().len() < *bar_len {
165+
continue;
166+
}
167+
let pos_index = *bar_len;
168+
let read_part1 = &rec2.seq()[0..pos_index];
169+
let read_part2 = &rec2.seq()[pos_index..];
170+
let qual_part1 = &rec2.qual()[0..pos_index];
171+
let qual_part2 = &rec2.qual()[pos_index..];
172+
if hamming_dis(bar_seq, read_part1) <= mismatch {
173+
get_pair += 1;
174+
write_record(fh1, rec1.id(), rec1.seq(), rec1.qual())?;
175+
write_record(fh2, rec2.id(), read_part2, qual_part2)?;
176+
write_record(fhb, rec2.id(), read_part1, qual_part1)?;
177+
break;
178+
}
219179
}
220180
}
221181
}
@@ -224,6 +184,7 @@ pub fn split_fq(
224184
error!("invalid mode arg, must be 1 or 2 !");
225185
std::process::exit(1);
226186
}
187+
227188
info!(
228189
"data split rate: {:.4}%",
229190
get_pair as f64 / read_pair as f64 * 100.0

src/cli/cutadapter.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use super::misc::write_record;
22
use crate::{errors::FqkitError, utils::file_reader, utils::file_writer};
33
use log::{error, warn};
4-
use paraseq::{fastq,fasta};
4+
use paraseq::{fasta, fastq};
55
use std::collections::HashMap;
66

77
pub fn cut_adapter(
@@ -20,10 +20,14 @@ pub fn cut_adapter(
2020
while faset.fill(&mut seqfile_reader)? {
2121
for rec in faset.iter().map_while(Result::ok) {
2222
if seqs.contains_key(rec.id()) {
23-
warn!("found duplicate sequence id: {}, keep first one", std::str::from_utf8(rec.id())?);
23+
warn!(
24+
"found duplicate sequence id: {}, keep first one",
25+
std::str::from_utf8(rec.id())?
26+
);
2427
continue;
2528
} else {
26-
seqs.entry(rec.id().to_owned()).or_insert(rec.seq().to_vec());
29+
seqs.entry(rec.id().to_owned())
30+
.or_insert(rec.seq().to_vec());
2731
}
2832
}
2933
}

src/command.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,12 @@ pub enum Subcli {
255255
#[arg(short = 'l', long = "length", default_value_t = 30, value_name = "INT")]
256256
length: usize,
257257
/// maximum mismatch rate count in overlap region
258-
#[arg(short = 'm', long = "miss", default_value_t = 0.1, value_name = "FLOAT")]
258+
#[arg(
259+
short = 'm',
260+
long = "miss",
261+
default_value_t = 0.1,
262+
value_name = "FLOAT"
263+
)]
259264
miss: f64,
260265
/// output joinde long fastq file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically
261266
#[arg(short = 'o', long = "output")]

0 commit comments

Comments
 (0)