1- use bio:: io:: fastq;
2- use log:: { error, info, warn} ;
3-
1+ use crate :: {
2+ cli:: misc:: { reverse_complement, write_record} ,
3+ errors:: FqkitError ,
4+ utils:: { file_reader, file_writer_append} ,
5+ } ;
6+ use log:: { error, info} ;
7+ use paraseq:: fastq;
48use std:: {
59 collections:: HashMap ,
610 io:: BufRead ,
711 path:: { Path , PathBuf } ,
812} ;
913
10- use crate :: {
11- errors:: FqkitError ,
12- utils:: { file_reader, file_writer_append} ,
13- } ;
14-
15- fn barcode_list ( file : & String , rev_comp : bool ) -> Result < HashMap < String , String > , FqkitError > {
14+ fn barcode_list ( file : & String , rev_comp : bool ) -> Result < HashMap < Vec < u8 > , String > , FqkitError > {
1615 let mut maps = HashMap :: new ( ) ;
17- let mut error_flag = "" ;
1816 let fp = file_reader ( Some ( file) ) ?;
1917
20- if rev_comp {
21- for line in fp. lines ( ) . map_while ( Result :: ok) {
22- let item = line. split ( '\t' ) . collect :: < Vec < & str > > ( ) ; // barcode => sample
23- let bar: String = item[ 0 ]
18+ for line in fp. lines ( ) . map_while ( Result :: ok) {
19+ let item = line. split ( '\t' ) . collect :: < Vec < & str > > ( ) ; // barcode => sample
20+ let bar = if rev_comp {
21+ reverse_complement ( item[ 0 ] . as_bytes ( ) )
22+ } else {
23+ item[ 0 ]
2424 . chars ( )
2525 . rev ( )
26- . map ( |x| match x {
27- 'A' => 'T' ,
28- 'G' => 'C' ,
29- 'T' => 'A' ,
30- 'C' => 'G' ,
31- 'a' => 'T' ,
32- 'g' => 'C' ,
33- 't' => 'A' ,
34- 'c' => 'G' ,
35- _ => {
36- warn ! ( "invalid barcode base in sample {}, skipped" , item[ 1 ] ) ;
37- error_flag = "err" ;
38- 'X'
39- }
40- } )
41- . collect ( ) ;
42- if error_flag != "err" {
43- maps. entry ( bar) . or_insert ( item[ 1 ] . to_string ( ) ) ;
44- }
45- error_flag = "" ;
46- }
47- } else {
48- for line in fp. lines ( ) . map_while ( Result :: ok) {
49- let item = line. split ( '\t' ) . collect :: < Vec < & str > > ( ) ;
50- let bar: String = item[ 0 ]
51- . chars ( )
52- . map ( |x| match x {
53- 'A' => 'A' ,
54- 'T' => 'T' ,
55- 'G' => 'G' ,
56- 'C' => 'C' ,
57- 'a' => 'A' ,
58- 't' => 'T' ,
59- 'g' => 'G' ,
60- 'c' => 'C' ,
61- _ => {
62- warn ! ( "invalid barcode base in sample {}, skipped" , item[ 1 ] ) ;
63- error_flag = "err" ;
64- 'X'
65- }
66- } )
67- . collect ( ) ;
68- if error_flag != "err" {
69- maps. entry ( bar) . or_insert ( item[ 1 ] . to_string ( ) ) ;
70- }
71- error_flag = "" ;
72- }
26+ . collect :: < String > ( )
27+ . as_bytes ( )
28+ . to_vec ( )
29+ } ;
30+ maps. entry ( bar) . or_insert ( item[ 1 ] . to_string ( ) ) ;
7331 }
7432 Ok ( maps)
7533}
7634
7735#[ inline]
78- fn hamming_dis ( bar : & str , seq : & [ u8 ] ) -> usize {
79- let mut dist = 0usize ;
80- let bar = bar. as_bytes ( ) ;
81- for i in 0 ..bar. len ( ) {
82- if bar[ i] != seq[ i] {
83- dist += 1 ;
84- }
85- }
86- dist
36+ fn hamming_dis ( bar : & [ u8 ] , seq : & [ u8 ] ) -> usize {
37+ bar. iter ( ) . zip ( seq. iter ( ) ) . filter ( |( a, b) | a != b) . count ( )
8738}
8839
8940#[ allow( clippy:: too_many_arguments) ]
@@ -127,95 +78,104 @@ pub fn split_fq(
12778
12879 let mut fq_hand = Vec :: new ( ) ;
12980 for ( bar_seq, name) in maps {
130- let fq1 = if gzip {
131- PathBuf :: from ( outdir) . join ( format ! ( "{}_1.fq.gz" , name) )
81+ let ( fq1, fq2, bar) = if gzip {
82+ (
83+ PathBuf :: from ( outdir) . join ( format ! ( "{}_1.fq.gz" , name) ) ,
84+ PathBuf :: from ( outdir) . join ( format ! ( "{}_2.fq.gz" , name) ) ,
85+ PathBuf :: from ( outdir) . join ( format ! ( "{}_barcode.fq.gz" , name) ) ,
86+ )
13287 } else if bzip2 {
133- PathBuf :: from ( outdir) . join ( format ! ( "{}_1.fq.bz2" , name) )
88+ (
89+ PathBuf :: from ( outdir) . join ( format ! ( "{}_1.fq.bz2" , name) ) ,
90+ PathBuf :: from ( outdir) . join ( format ! ( "{}_2.fq.bz2" , name) ) ,
91+ PathBuf :: from ( outdir) . join ( format ! ( "{}_barcode.fq.bz2" , name) ) ,
92+ )
13493 } else if xz {
135- PathBuf :: from ( outdir) . join ( format ! ( "{}_1.fq.xz" , name) )
94+ (
95+ PathBuf :: from ( outdir) . join ( format ! ( "{}_1.fq.xz" , name) ) ,
96+ PathBuf :: from ( outdir) . join ( format ! ( "{}_2.fq.xz" , name) ) ,
97+ PathBuf :: from ( outdir) . join ( format ! ( "{}_barcode.fq.xz" , name) ) ,
98+ )
13699 } else {
137- PathBuf :: from ( outdir) . join ( format ! ( "{}_1.fq" , name) )
138- } ;
139- let fq2 = if gzip {
140- PathBuf :: from ( outdir) . join ( format ! ( "{}_2.fq.gz" , name) )
141- } else if bzip2 {
142- PathBuf :: from ( outdir) . join ( format ! ( "{}_2.fq.bz2" , name) )
143- } else if xz {
144- PathBuf :: from ( outdir) . join ( format ! ( "{}_2.fq.xz" , name) )
145- } else {
146- PathBuf :: from ( outdir) . join ( format ! ( "{}_2.fq" , name) )
147- } ;
148- let bar = if gzip {
149- PathBuf :: from ( outdir) . join ( format ! ( "{}_barcode.fq.gz" , name) )
150- } else if bzip2 {
151- PathBuf :: from ( outdir) . join ( format ! ( "{}_barcode.fq.bz2" , name) )
152- } else if xz {
153- PathBuf :: from ( outdir) . join ( format ! ( "{}_barcode.fq.xz" , name) )
154- } else {
155- PathBuf :: from ( outdir) . join ( format ! ( "{}_barcode.fq" , name) )
100+ (
101+ PathBuf :: from ( outdir) . join ( format ! ( "{}_1.fq" , name) ) ,
102+ PathBuf :: from ( outdir) . join ( format ! ( "{}_2.fq" , name) ) ,
103+ PathBuf :: from ( outdir) . join ( format ! ( "{}_barcode.fq" , name) ) ,
104+ )
156105 } ;
157106
158- let fh1 = fastq:: Writer :: new ( file_writer_append ( & fq1, compression_level) ?) ;
159- let fh2 = fastq:: Writer :: new ( file_writer_append ( & fq2, compression_level) ?) ;
160- let fhb = fastq:: Writer :: new ( file_writer_append ( & bar, compression_level) ?) ;
161- let len = bar_seq. len ( ) ;
162- fq_hand. push ( ( bar_seq, len, fh1, fh2, fhb) ) ;
107+ let fh1 = file_writer_append ( & fq1, compression_level) ?;
108+ let fh2 = file_writer_append ( & fq2, compression_level) ?;
109+ let fhb = file_writer_append ( & bar, compression_level) ?;
110+ fq_hand. push ( ( bar_seq. clone ( ) , bar_seq. len ( ) , fh1, fh2, fhb) ) ;
163111 }
164112
165- let bar_count = fq_hand. len ( ) ;
166- let fq1_reader = fastq:: Reader :: new ( file_reader ( Some ( big_fq1) ) ?) ;
167- let fq2_reader = fastq:: Reader :: new ( file_reader ( Some ( big_fq2) ) ?) ;
168- let ( mut read_pair, mut get_pair) = ( 0u64 , 0u64 ) ;
169-
170113 info ! ( "reading from read1 file: {}" , big_fq1) ;
114+ let mut fq1_reader = fastq:: Reader :: new ( file_reader ( Some ( big_fq1) ) ?) ;
171115 info ! ( "reading from read2 file: {}" , big_fq2) ;
116+ let mut fq2_reader = fastq:: Reader :: new ( file_reader ( Some ( big_fq2) ) ?) ;
117+ let mut rset1 = fastq:: RecordSet :: default ( ) ;
118+ let mut rset2 = fastq:: RecordSet :: default ( ) ;
119+ let bar_count = fq_hand. len ( ) ;
120+ let ( mut read_pair, mut get_pair) = ( 0u64 , 0u64 ) ;
172121 info ! ( "barcode position mode: {}" , mode) ;
173122
174123 if mode == 2 {
175- for ( rec1, rec2) in fq1_reader
176- . records ( )
177- . flatten ( )
178- . zip ( fq2_reader. records ( ) . flatten ( ) )
179- {
180- let read_len = rec2. seq ( ) . len ( ) ;
181- read_pair += 1 ;
182- for idx in 0 ..bar_count {
183- if let Some ( ( bar_seq, bar_len, fh1, fh2, fhb) ) = fq_hand. get_mut ( idx) {
184- let pos_index = read_len - * bar_len;
185- let read_part1 = & rec2. seq ( ) [ 0 ..pos_index] ;
186- let read_part2 = & rec2. seq ( ) [ pos_index..] ;
187- let qual_part1 = & rec2. qual ( ) [ 0 ..pos_index] ;
188- let qual_part2 = & rec2. qual ( ) [ pos_index..] ;
189- if hamming_dis ( bar_seq, read_part2) <= mismatch {
190- get_pair += 1 ;
191- fh1. write ( rec1. id ( ) , rec1. desc ( ) , rec1. seq ( ) , rec1. qual ( ) ) ?;
192- fh2. write ( rec2. id ( ) , rec2. desc ( ) , read_part1, qual_part1) ?;
193- fhb. write ( rec2. id ( ) , rec2. desc ( ) , read_part2, qual_part2) ?;
194- break ;
124+ while rset1. fill ( & mut fq1_reader) ? && rset2. fill ( & mut fq2_reader) ? {
125+ for ( rec1, rec2) in rset1
126+ . iter ( )
127+ . map_while ( Result :: ok)
128+ . zip ( rset2. iter ( ) . map_while ( Result :: ok) )
129+ {
130+ let read_len = rec2. seq ( ) . len ( ) ;
131+ read_pair += 1 ;
132+
133+ for idx in 0 ..bar_count {
134+ if let Some ( ( bar_seq, bar_len, fh1, fh2, fhb) ) = fq_hand. get_mut ( idx) {
135+ if rec2. seq ( ) . len ( ) < * bar_len {
136+ continue ;
137+ }
138+ let pos_index = read_len - * bar_len;
139+ let read_part1 = & rec2. seq ( ) [ 0 ..pos_index] ;
140+ let read_part2 = & rec2. seq ( ) [ pos_index..] ;
141+ let qual_part1 = & rec2. qual ( ) [ 0 ..pos_index] ;
142+ let qual_part2 = & rec2. qual ( ) [ pos_index..] ;
143+ if hamming_dis ( bar_seq, read_part2) <= mismatch {
144+ get_pair += 1 ;
145+ write_record ( fh1, rec1. id ( ) , rec1. seq ( ) , rec1. qual ( ) ) ?;
146+ write_record ( fh2, rec2. id ( ) , read_part1, qual_part1) ?;
147+ write_record ( fhb, rec2. id ( ) , read_part2, qual_part2) ?;
148+ break ;
149+ }
195150 }
196151 }
197152 }
198153 }
199154 } else if mode == 1 {
200- for ( rec1, rec2) in fq1_reader
201- . records ( )
202- . flatten ( )
203- . zip ( fq2_reader. records ( ) . flatten ( ) )
204- {
205- read_pair += 1 ;
206- for idx in 0 ..bar_count {
207- if let Some ( ( bar_seq, bar_len, fh1, fh2, fhb) ) = fq_hand. get_mut ( idx) {
208- let pos_index = * bar_len;
209- let read_part1 = & rec2. seq ( ) [ 0 ..pos_index] ;
210- let read_part2 = & rec2. seq ( ) [ pos_index..] ;
211- let qual_part1 = & rec2. qual ( ) [ 0 ..pos_index] ;
212- let qual_part2 = & rec2. qual ( ) [ pos_index..] ;
213- if hamming_dis ( bar_seq, read_part1) <= mismatch {
214- get_pair += 1 ;
215- fh1. write ( rec1. id ( ) , rec1. desc ( ) , rec1. seq ( ) , rec1. qual ( ) ) ?;
216- fh2. write ( rec2. id ( ) , rec2. desc ( ) , read_part2, qual_part2) ?;
217- fhb. write ( rec2. id ( ) , rec2. desc ( ) , read_part1, qual_part1) ?;
218- break ;
155+ while rset1. fill ( & mut fq1_reader) ? && rset2. fill ( & mut fq2_reader) ? {
156+ for ( rec1, rec2) in rset1
157+ . iter ( )
158+ . map_while ( Result :: ok)
159+ . zip ( rset2. iter ( ) . map_while ( Result :: ok) )
160+ {
161+ read_pair += 1 ;
162+ for idx in 0 ..bar_count {
163+ if let Some ( ( bar_seq, bar_len, fh1, fh2, fhb) ) = fq_hand. get_mut ( idx) {
164+ if rec2. seq ( ) . len ( ) < * bar_len {
165+ continue ;
166+ }
167+ let pos_index = * bar_len;
168+ let read_part1 = & rec2. seq ( ) [ 0 ..pos_index] ;
169+ let read_part2 = & rec2. seq ( ) [ pos_index..] ;
170+ let qual_part1 = & rec2. qual ( ) [ 0 ..pos_index] ;
171+ let qual_part2 = & rec2. qual ( ) [ pos_index..] ;
172+ if hamming_dis ( bar_seq, read_part1) <= mismatch {
173+ get_pair += 1 ;
174+ write_record ( fh1, rec1. id ( ) , rec1. seq ( ) , rec1. qual ( ) ) ?;
175+ write_record ( fh2, rec2. id ( ) , read_part2, qual_part2) ?;
176+ write_record ( fhb, rec2. id ( ) , read_part1, qual_part1) ?;
177+ break ;
178+ }
219179 }
220180 }
221181 }
@@ -224,6 +184,7 @@ pub fn split_fq(
224184 error ! ( "invalid mode arg, must be 1 or 2 !" ) ;
225185 std:: process:: exit ( 1 ) ;
226186 }
187+
227188 info ! (
228189 "data split rate: {:.4}%" ,
229190 get_pair as f64 / read_pair as f64 * 100.0
0 commit comments