Skip to content

Commit 357552e

Browse files
committed
add subcommand filter
1 parent a4413f1 commit 357552e

8 files changed

Lines changed: 265 additions & 21 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
[package]
22
name = "fqkit"
3-
version = "0.3.4"
3+
version = "0.3.5"
44
edition = "2021"
55
authors = ["sharkLoc <mmtinfo@163.com>"]
66
rust-version = "1.65.0"
77
homepage = "https://github.com/sharkLoc/fqkit"
8-
description = "fqkit: a simple program for fastq file manipulation"
8+
description = "fqkit: a simple and cross-platform program for fastq file manipulation"
99
keywords = ["fastq", "reads","bio", "hts", "barcode"]
1010
readme = "README.md"
1111
license = "GPL-3.0"

README.md

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@
1414

1515

1616
## install
17-
##### setp1:install cargo first
17+
##### setp1: install cargo first
1818
```bash
1919
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
2020
```
2121

22-
##### step2:
22+
##### step2: on linux or windows
2323
```bash
2424
cargo install fqkit
2525
# or
@@ -35,7 +35,7 @@ cargo b --release
3535
```bash
3636
FqKit -- A simple and cross-platform program for fastq file manipulation
3737

38-
Version: 0.3.4
38+
Version: 0.3.5
3939

4040
Authors: sharkLoc <mmtinfo@163.com>
4141
Source code: https://github.com/sharkLoc/fqkit.git
@@ -47,6 +47,7 @@ Commands:
4747
concat concat fastq files from different lanes
4848
subfq subsample sequences from big fastq file [aliases: sample]
4949
trim trim fastq file
50+
filter a simple filter for pair end fastq sqeuence
5051
range print fastq records in a range
5152
search search reads/motifs from fastq file
5253
grep grep fastq sequence by read id or full name
@@ -78,12 +79,9 @@ Options:
7879
-V, --version Print version
7980

8081
Global Arguments:
81-
--compress-level <COMPRESSION_LEVEL>
82-
set gzip compression level 1 (compress faster) - 9 (compress better) for gzip output file, just work with option -o/--out [default: 6]
83-
-v, --verbosity <VERBOSE>
84-
control verbosity of logging, possible values: {error,warn,info,debug,trace} [default: debug]
85-
--log <LOGFILE>
86-
if file name specified, write log message to this file, or write to stderr
82+
--compress-level <int> set gzip compression level 1 (compress faster) - 9 (compress better) for gzip output file, just work with option -o/--out [default: 6]
83+
--log <str> if file name specified, write log message to this file, or write to stderr
84+
-v, --verbosity <str> control verbosity of logging, possible values: {error,warn,info,debug,trace} [default: debug]
8785

8886
Global FLAGS:
8987
-q, --quiet be quiet and do not show any extra information
@@ -140,6 +138,10 @@ Use "fqkit help [command]" for more information about a command
140138
2023.12.11
141139
- update to version 0.3.2
142140
- add subcommand slide
141+
142+
2023.12.19
143+
- update to version 0.3.5
144+
- add subcommand filter
143145
</details>
144146
145147
#### ** any bugs please report issues **💖

src/command.rs

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use clap::{Parser,value_parser};
44
#[command(
55
name = "FqKit",
66
author = "sharkLoc",
7-
version = "0.3.4",
7+
version = "0.3.5",
88
about = "A simple and cross-platform program for fastq file manipulation",
99
long_about = None,
1010
next_line_help = false,
@@ -21,16 +21,19 @@ pub struct Args {
2121
/// set gzip compression level 1 (compress faster) - 9 (compress better) for gzip output file,
2222
/// just work with option -o/--out
2323
#[arg(long = "compress-level", default_value_t = 6, global = true,
24-
value_parser = value_parser!(u32).range(1..=9),
24+
value_parser = value_parser!(u32).range(1..=9), value_name = "int",
2525
help_heading = Some("Global Arguments")
2626
)]
2727
pub compression_level: u32,
28-
/// control verbosity of logging, possible values: {error,warn,info,debug,trace}
29-
#[arg(short = 'v', long = "verbosity", global = true, default_value_t = String::from("debug"), help_heading = Some("Global Arguments"))]
30-
pub verbose: String,
3128
/// if file name specified, write log message to this file, or write to stderr
32-
#[arg(long = "log", global = true, help_heading = Some("Global Arguments"))]
29+
#[arg(long = "log", global = true, help_heading = Some("Global Arguments"), value_name = "str")]
3330
pub logfile: Option<String>,
31+
/// control verbosity of logging, possible values: {error,warn,info,debug,trace}
32+
#[arg(short = 'v', long = "verbosity", global = true, value_name = "str",
33+
default_value_t = String::from("debug"),
34+
help_heading = Some("Global Arguments")
35+
)]
36+
pub verbose: String,
3437
/// be quiet and do not show any extra information
3538
#[arg(short = 'q', long = "quiet", global= true, help_heading = Some("Global FLAGS"))]
3639
pub quiet: bool,
@@ -97,6 +100,52 @@ pub enum Subcli {
97100
#[arg(short = 'o', long = "out")]
98101
out: Option<String>,
99102
},
103+
/// a simple filter for pair end fastq sqeuence
104+
filter {
105+
/// input read1 fastq[.gz] file
106+
#[arg(short = '1', long = "read1")]
107+
read1: String,
108+
/// input read2 fastq[.gz] file
109+
#[arg(short = '2', long = "read2")]
110+
read2: String,
111+
/// if one read number of N base is more then N base limit, then this read pair is discarded.
112+
#[arg(short = 'n', long = "n-limit", default_value_t=5)]
113+
nbase: usize,
114+
/// reads shorter than length_required will be discarded
115+
#[arg(short = 'l', long = "length", default_value_t=30)]
116+
length: usize,
117+
/// the complexity is defined as the percentage of base that is different from its next base (base[i] != base[i+1]),
118+
/// a 51-bp sequence, with 3 bases that is different from its next base
119+
/// seq = 'AAAATTTTTTTTTTTTTTTTTTTTTGGGGGGGGGGGGGGGGGGGGGGCCCC', and complexity = 3/(51-1) = 6%
120+
/// the threshold for low complexity filter (0~100). 30 is recommended, which means 30% complexity is required.
121+
#[arg(short = 'y', long = "complexity", default_value_t = 0,
122+
value_parser = value_parser!(u32).range(0..=100),
123+
verbatim_doc_comment
124+
)]
125+
complexity: u32,
126+
/// if one read's average quality score < average qual, then this read pair is discarded,
127+
/// eg. Q20 error 0.01, Q30 error 0.001, averaging the probability of error is 0.0055 => Q value 22.59637
128+
#[arg(short = 'q', long = "average_qual", default_value_t = 20, verbatim_doc_comment)]
129+
average_qual: u8,
130+
///phred score 33 or 64
131+
#[arg(short = 'p', long = "phred", default_value_t = 33)]
132+
phred: u8,
133+
/// the number of reads in the chunk on each thread
134+
#[arg(short, long, default_value_t = 5000)]
135+
chunk: usize,
136+
/// number of additional worker threads to use
137+
#[arg(short='@', long="thread", default_value_t = 6)]
138+
thread: usize,
139+
/// specify the file to store reads(interleaved) that cannot pass the filters, file ending in .gz will be compressed automatically
140+
#[arg(short='u', long = "failed")]
141+
failed: String,
142+
/// output pass filtered forward(read1) fastq file name, file ending in .gz will be compressed automatically
143+
#[arg(short='f', long = "out1")]
144+
out1: String,
145+
/// output pass filtered resverse(read2) fastq file name, file ending in .gz will be compressed automatically
146+
#[arg(short='r', long = "out2")]
147+
out2: String,
148+
},
100149
/// print fastq records in a range
101150
range {
102151
/// input fastq[.gz] file, or read from stdin

src/filter.rs

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
use crate::utils::*;
2+
use bio::io::fastq;
3+
use anyhow::{Result, Ok};
4+
use crossbeam::channel::unbounded;
5+
use log::*;
6+
use std::time::Instant;
7+
8+
9+
pub fn filter_fastq(
10+
read1: &str,
11+
read2: &str,
12+
nbase: usize,
13+
length: usize,
14+
complexity: u32,
15+
average_qual: u8,
16+
phred: u8,
17+
chunks: usize,
18+
ncpu: usize,
19+
failed: &str,
20+
out1: &str,
21+
out2: &str,
22+
compression_level: u32,
23+
quiet: bool,
24+
) -> Result<()> {
25+
let start = Instant::now();
26+
if !quiet {
27+
info!("read forward reads from file: {}", read1);
28+
info!("read reverse reads from file: {}", read2);
29+
if ![33u8, 64u8].contains(&phred) {
30+
error!("invalid phred value");
31+
std::process::exit(1);
32+
}
33+
if ncpu <= 1 {
34+
info!("thread num is: {}", ncpu);
35+
} else {
36+
info!("additional thread num is: {}", ncpu);
37+
}
38+
info!("output clean read1 file: {}", out1);
39+
info!("output clean read2 file: {}", out2);
40+
}
41+
42+
let fq_reader1 = file_reader(&Some(read1)).map(fastq::Reader::new)?;
43+
let fq_reader2 = file_reader(&Some(read2)).map(fastq::Reader::new)?;
44+
let mut out_writer1 = file_writer(&Some(out1), compression_level).map(fastq::Writer::new)?;
45+
let mut out_writer2 = file_writer(&Some(out2), compression_level).map(fastq::Writer::new)?;
46+
let mut failed_writer = file_writer(&Some(failed), compression_level).map(fastq::Writer::new)?;
47+
let complex = complexity as usize;
48+
let (mut pe_ok, mut pe_fail) = (0usize,0usize);
49+
if ncpu <= 1 {
50+
for (rec1,rec2) in fq_reader1.records().flatten().zip(fq_reader2.records().flatten()) {
51+
if rec1.seq().iter().filter(|v| v == &&b'N').count() > nbase || rec2.seq().iter().filter(|v| v == &&b'N').count() > nbase {
52+
failed_writer.write_record(&rec1)?;
53+
failed_writer.write_record(&rec2)?;
54+
pe_fail += 1;
55+
continue;
56+
}
57+
if rec1.seq().len() < length || rec2.seq().len() < length {
58+
failed_writer.write_record(&rec1)?;
59+
failed_writer.write_record(&rec2)?;
60+
pe_fail += 1;
61+
continue;
62+
}
63+
let complx1 = (rec1.seq().iter().skip(1)
64+
.zip(rec1.seq().iter())
65+
.filter(|(q1,q2)| {q1 != q2})
66+
.count() as f64 / rec1.seq().len() as f64 * 100.0 ) as usize;
67+
let complx2 = (rec2.seq().iter().skip(1)
68+
.zip(rec1.seq().iter())
69+
.filter(|(q1,q2)| {q1 != q2})
70+
.count() as f64 / rec2.seq().len() as f64 * 100.0 ) as usize;
71+
if complx1 < complex || complx2 < complex {
72+
failed_writer.write_record(&rec1)?;
73+
failed_writer.write_record(&rec2)?;
74+
pe_fail += 1;
75+
continue;
76+
}
77+
if phred_mean(rec1.qual(),phred) < average_qual || phred_mean(rec2.qual(), phred) < average_qual {
78+
failed_writer.write_record(&rec1)?;
79+
failed_writer.write_record(&rec2)?;
80+
pe_fail += 1;
81+
continue;
82+
}
83+
pe_ok += 1;
84+
out_writer1.write_record(&rec1)?;
85+
out_writer2.write_record(&rec2)?;
86+
}
87+
} else {
88+
let mut chunk = chunks;
89+
if chunk == 0 {
90+
warn!("pe read conut in chunk can't be: {}, changed to default value.",chunk);
91+
chunk = 5000;
92+
}
93+
94+
let (tx,rx) = unbounded();
95+
let mut fq_iter1 = fq_reader1.records();
96+
let mut fq_iter2 = fq_reader2.records();
97+
loop {
98+
let pe_vec: Vec<_> = fq_iter1.by_ref().take(chunk).flatten().zip(fq_iter2.by_ref().take(chunk).flatten()).collect();
99+
if pe_vec.is_empty() {
100+
break;
101+
}
102+
tx.send(pe_vec).unwrap();
103+
}
104+
drop(tx);
105+
106+
crossbeam::scope(|s| {
107+
let (tx2,rx2) = unbounded();
108+
let _handles: Vec<_> = (0..ncpu).map(|_| {
109+
let tx_tmp = tx2.clone();
110+
let rx_tmp = rx.clone();
111+
s.spawn(move |_| {
112+
for vec_pe in rx_tmp.iter() {
113+
let mut passed = vec![];
114+
let mut failed = vec![];
115+
for (rec1,rec2) in vec_pe {
116+
if rec1.seq().iter().filter(|v| v == &&b'N').count() > nbase || rec2.seq().iter().filter(|v| v == &&b'N').count() > nbase {
117+
failed.push((rec1,rec2));
118+
continue;
119+
}
120+
if rec1.seq().len() < length || rec2.seq().len() < length {
121+
failed.push((rec1,rec2));
122+
continue;
123+
}
124+
let complx1 = (rec1.seq().iter().skip(1)
125+
.zip(rec1.seq().iter())
126+
.filter(|(q1,q2)| {q1 != q2})
127+
.count() as f64 / rec1.seq().len() as f64 * 100.0 ) as usize;
128+
let complx2 = (rec2.seq().iter().skip(1)
129+
.zip(rec1.seq().iter())
130+
.filter(|(q1,q2)| {q1 != q2})
131+
.count() as f64 / rec2.seq().len() as f64 * 100.0 ) as usize;
132+
if complx1 < complex || complx2 < complex {
133+
failed.push((rec1,rec2));
134+
continue;
135+
}
136+
if phred_mean(rec1.qual(),phred) < average_qual || phred_mean(rec2.qual(), phred) < average_qual {
137+
failed.push((rec1,rec2));
138+
continue;
139+
}
140+
passed.push((rec1,rec2));
141+
}
142+
tx_tmp.send((passed,failed)).unwrap();
143+
}
144+
});
145+
}).collect();
146+
drop(tx2);
147+
148+
for (vec_pass,vec_failed) in rx2.iter() {
149+
for (rec1,rec2) in vec_pass {
150+
pe_ok += 1;
151+
out_writer1.write_record(&rec1).unwrap();
152+
out_writer2.write_record(&rec2).unwrap();
153+
}
154+
for (rec1,rec2) in vec_failed.iter() {
155+
pe_fail += 1;
156+
failed_writer.write_record(&rec1).unwrap();
157+
failed_writer.write_record(&rec2).unwrap();
158+
}
159+
}
160+
}).unwrap();
161+
}
162+
out_writer1.flush()?;
163+
out_writer2.flush()?;
164+
failed_writer.flush()?;
165+
166+
if !quiet {
167+
info!("total clean pe reads number: {}", pe_ok);
168+
info!("total failed pe reads number: {}", pe_fail);
169+
info!("time elapsed is: {:?}",start.elapsed());
170+
}
171+
Ok(())
172+
}
173+
174+
175+
fn phred_mean(
176+
qual: &[u8],
177+
phred: u8,
178+
) -> u8 {
179+
let ave_error = qual
180+
.iter()
181+
.map(|x| { 10.0f64.powf((x -phred) as f64 / 10.0) })
182+
.sum::<f64>() / qual.len() as f64;
183+
184+
(ave_error.log10() * -10.0f64) as u8 - phred
185+
}

src/join.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ pub fn join_fastq(
2828
} else {
2929
info!("additional thread num is: {}", ncpu);
3030
}
31-
info!("outout unmerged read1 file: {}", out_r1);
32-
info!("outout unmerged read2 file: {}", out_r2);
31+
info!("output unmerged read1 file: {}", out_r1);
32+
info!("output unmerged read2 file: {}", out_r2);
3333
info!("output merged pe reads file: {}",merge_fq);
3434
}
3535
let start = Instant::now();

src/logger.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@ pub fn logger(
1818
LevelFilter::Info
1919
} else if verbose == "debug".to_string() {
2020
LevelFilter::Debug
21-
} else {
21+
} else if verbose == "trace".to_string() {
2222
LevelFilter::Trace
23+
} else {
24+
LevelFilter::Off
2325
};
2426

2527
let mut builder = Builder::from_default_env();

src/main.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ use anyhow::{Error,Ok};
22
use clap::Parser;
33
use log::{error, warn};
44

5+
6+
mod filter;
7+
use filter::*;
58
mod join;
69
use join::*;
710
mod slide;
@@ -341,6 +344,9 @@ fn main() -> Result<(), Error> {
341344
Subcli::barcode { read1, read2, bar, mode, trans, mismatch, outdir, } => {
342345
split_fq(&read1, &read2, &bar, trans, mode, mismatch, &outdir, arg.compression_level, arg.quiet)?;
343346
}
347+
Subcli::filter { read1, read2, nbase, length, complexity, average_qual, phred, chunk, thread, failed, out1, out2 } => {
348+
filter_fastq(&read1, &read2, nbase, length, complexity, average_qual, phred, chunk, thread, &failed, &out1, &out2, arg.compression_level, arg.quiet)?;
349+
}
344350
Subcli::join { read1, read2, length, miss, chunk, thread, out1, out2, merged } => {
345351
join_fastq(&read1, &read2, length, miss, chunk, thread, &merged, &out1, &out2, arg.compression_level, arg.quiet)?;
346352
}

0 commit comments

Comments
 (0)