@@ -5,139 +5,101 @@ use paraseq::{fastq, fastx::Record};
55use rand:: { Rng , prelude:: * } ;
66use rand_pcg:: Pcg64 ;
77
8- // reduce much memory but cost more time
9- fn select_fastq (
8+ pub fn sample_fastq (
109 file : Option < & String > ,
1110 n : usize ,
1211 seed : u64 ,
12+ two_pass : bool ,
1313 out : Option < & String > ,
1414 compression_level : u32 ,
1515 stdout_type : char ,
1616) -> Result < ( ) , FqkitError > {
17- let mut rng: rand_pcg:: Lcg128Xsl64 = Pcg64 :: seed_from_u64 ( seed) ;
18- let mut get: Vec < usize > = Vec :: with_capacity ( n) ;
19-
2017 if n == 0 {
2118 error ! ( "n must be greater than 0" ) ;
2219 std:: process:: exit ( 1 ) ;
2320 }
21+ info ! ( "subseq number: {}" , n) ;
2422
2523 let mut fq_reader = fastq:: Reader :: new ( file_reader ( file) ?) ;
26- info ! ( "rand seed: {}" , seed) ;
27- info ! ( "subseq number: {}" , n) ;
28- info ! ( "reduce much memory but cost more time" ) ;
2924 let mut rset = fastq:: RecordSet :: default ( ) ;
25+ info ! ( "rand seed: {}" , seed) ;
26+ let mut fq_writer = file_writer ( out, compression_level, stdout_type) ?;
27+
28+ let mut rng: rand_pcg:: Lcg128Xsl64 = Pcg64 :: seed_from_u64 ( seed) ;
3029 let mut order: usize = 0 ;
31- while rset. fill ( & mut fq_reader) ? {
32- for _ in rset. iter ( ) . map_while ( Result :: ok) {
33- if order < n {
34- get. push ( order) ;
35- } else {
36- let ret = rng. random_range ( 0 ..=order) ;
37- if ret < n {
38- get[ ret] = order;
30+ if two_pass {
31+ info ! ( "two pass mode enabled" ) ;
32+ let mut get: Vec < usize > = Vec :: with_capacity ( n) ;
33+ while rset. fill ( & mut fq_reader) ? {
34+ for _ in rset. iter ( ) . map_while ( Result :: ok) {
35+ if order < n {
36+ get. push ( order) ;
37+ } else {
38+ let ret = rng. random_range ( 0 ..=order) ;
39+ if ret < n {
40+ get[ ret] = order;
41+ }
3942 }
43+ order += 1 ;
4044 }
41- order += 1 ;
4245 }
43- }
4446
45- let mut fq_writer = file_writer ( out, compression_level, stdout_type) ?;
46- let mut fq_reader2 = fastq:: Reader :: new ( file_reader ( file) ?) ;
47- let mut rset2 = fastq:: RecordSet :: default ( ) ;
48- let mut order2: usize = 0 ;
49-
50- while rset2. fill ( & mut fq_reader2) ? {
51- for rec in rset2. iter ( ) . map_while ( Result :: ok) {
52- if get. contains ( & order2) {
53- write_record ( & mut fq_writer, rec. id ( ) , rec. seq ( ) , rec. qual ( ) ) ?;
47+ let mut fq_reader2 = fastq:: Reader :: new ( file_reader ( file) ?) ;
48+ let mut rset2 = fastq:: RecordSet :: default ( ) ;
49+ order = 0 ;
50+ get. sort_unstable ( ) ; // keep the order
51+ let mut idx = 0usize ;
52+ info ! ( "all records has been readed into memory, start write to output ..." ) ;
53+ while rset2. fill ( & mut fq_reader2) ? {
54+ for rec in rset2. iter ( ) . map_while ( Result :: ok) {
55+ if idx < get. len ( ) && order == get[ idx] {
56+ write_record ( & mut fq_writer, rec. id ( ) , rec. seq ( ) , rec. qual ( ) ) ?;
57+ idx += 1 ;
58+ }
59+ if idx >= get. len ( ) {
60+ break ;
61+ }
62+ order += 1 ;
5463 }
55- order2 += 1 ;
5664 }
57- }
58- fq_writer. flush ( ) ?;
59-
60- Ok ( ( ) )
61- }
62-
63- // fast mode but cost more memory
64- fn select_fastq2 (
65- file : Option < & String > ,
66- n : usize ,
67- seed : u64 ,
68- out : Option < & String > ,
69- compression_level : u32 ,
70- stdout_type : char ,
71- ) -> Result < ( ) , FqkitError > {
72- info ! ( "rand seed: {}" , seed) ;
73- info ! ( "subseq num: {}" , n) ;
74- info ! ( "fast mode but cost more memory" ) ;
75-
76- let mut rng = Pcg64 :: seed_from_u64 ( seed) ;
77- let mut get = Vec :: with_capacity ( n) ;
78- if n == 0 {
79- error ! ( "n must be greater than 0" ) ;
80- std:: process:: exit ( 1 ) ;
81- }
82-
83- let mut fq_reader = fastq:: Reader :: new ( file_reader ( file) ?) ;
84- let mut rset = fastq:: RecordSet :: default ( ) ;
85- let mut order: usize = 0 ;
86- while rset. fill ( & mut fq_reader) ? {
87- for rec in rset. iter ( ) . map_while ( Result :: ok) {
88- if order < n {
89- let rec_t = vec ! [
90- rec. id_str( ) . to_owned( ) ,
91- rec. seq_str( ) . to_owned( ) ,
92- rec. qual_str( ) . to_owned( ) ,
93- ] ;
94- get. push ( rec_t) ;
95- } else {
96- let ret = rng. random_range ( 0 ..=order) ;
97- if ret < n {
98- get[ ret] = vec ! [
65+ } else {
66+ let mut get = Vec :: with_capacity ( n) ;
67+ while rset. fill ( & mut fq_reader) ? {
68+ for rec in rset. iter ( ) . map_while ( Result :: ok) {
69+ if order < n {
70+ get. push ( (
71+ order,
9972 rec. id_str ( ) . to_owned ( ) ,
10073 rec. seq_str ( ) . to_owned ( ) ,
10174 rec. qual_str ( ) . to_owned ( ) ,
102- ] ;
75+ ) ) ;
76+ } else {
77+ let ret = rng. random_range ( 0 ..=order) ;
78+ if ret < n {
79+ get[ ret] = (
80+ order,
81+ rec. id_str ( ) . to_owned ( ) ,
82+ rec. seq_str ( ) . to_owned ( ) ,
83+ rec. qual_str ( ) . to_owned ( ) ,
84+ ) ;
85+ }
10386 }
87+ order += 1 ;
10488 }
105- order += 1 ;
89+ }
90+ info ! ( "all records has been readed into memory, start write to output ..." ) ;
91+ get. sort_unstable_by_key ( |x| x. 0 ) ; // sort by order to keep the raw order
92+ for ( _, id, seq, qual) in get {
93+ write_record (
94+ & mut fq_writer,
95+ id. as_bytes ( ) ,
96+ seq. as_bytes ( ) ,
97+ qual. as_bytes ( ) ,
98+ ) ?;
10699 }
107100 }
108101
109- let mut fq_writer = file_writer ( out, compression_level, stdout_type) ?;
110- for rec in get. iter ( ) {
111- write_record (
112- & mut fq_writer,
113- rec[ 0 ] . as_bytes ( ) ,
114- rec[ 1 ] . as_bytes ( ) ,
115- rec[ 2 ] . as_bytes ( ) ,
116- ) ?
117- }
118102 fq_writer. flush ( ) ?;
119103
120104 Ok ( ( ) )
121105}
122-
123- pub fn subset_fastq (
124- rdc : bool ,
125- file : Option < & String > ,
126- n : usize ,
127- seed : u64 ,
128- out : Option < & String > ,
129- compression_level : u32 ,
130- stdout_type : char ,
131- ) -> Result < ( ) , FqkitError > {
132- if rdc {
133- if file. is_none ( ) {
134- error ! ( "opt -r used, fastq data can't from stdin." ) ;
135- std:: process:: exit ( 1 ) ;
136- }
137- select_fastq ( file, n, seed, out, compression_level, stdout_type) ?;
138- } else {
139- select_fastq2 ( file, n, seed, out, compression_level, stdout_type) ?;
140- }
141-
142- Ok ( ( ) )
143- }
0 commit comments