1111from pkg_resources import resource_filename
1212from . import sequencefunctions as sf
1313from . import markov as mk
14+ from xopen import xopen
1415
1516
1617def parse_yes_no (astring ):
1718 if "yes" in astring :
18- return ( True )
19+ return True
1920 elif "no" in astring :
20- return ( False )
21+ return False
2122 else :
2223 sys .exit ("Please specify deamination (yes | no)" )
2324
@@ -27,7 +28,7 @@ def get_basename(file_name):
2728 basename = file_name .split ("/" )[- 1 ].split ("." )[0 ]
2829 else :
2930 basename = file_name .split ("." )[0 ]
30- return ( basename )
31+ return basename
3132
3233
3334def read_fasta (file_name ):
@@ -40,95 +41,155 @@ def read_fasta(file_name):
4041 """
4142 result = ""
4243 # fastadict = {}
43- with open (file_name , "r" ) as f :
44- for line in f :
45- if line [0 ] == ">" :
46- # seqname = line[1:]
47- # fastadict[seqname] = []
48- continue
49- else :
50- line = line .rstrip ()
51- # fastadict[seqname].append(line)
52- result += line
53- return ([result , len (result )])
54-
55-
56- def write_fastq_multi (fastq_list , outputfile ):
57- with open (outputfile + ".1.fastq" , "a" ) as f1 :
58- with open (outputfile + ".2.fastq" , "a" ) as f2 :
59- for read in fastq_list :
60- f1 .write (read [0 ])
61- f2 .write (read [1 ])
44+ if file_name .endswith (".gz" ):
45+ with xopen (file_name , "r" ) as f :
46+ for line in f :
47+ if line [0 ] == ">" :
48+ # seqname = line[1:]
49+ # fastadict[seqname] = []
50+ continue
51+ else :
52+ line = line .rstrip ()
53+ # fastadict[seqname].append(line)
54+ result += line
55+ else :
56+ with open (file_name , "r" ) as f :
57+ for line in f :
58+ if line [0 ] == ">" :
59+ # seqname = line[1:]
60+ # fastadict[seqname] = []
61+ continue
62+ else :
63+ line = line .rstrip ()
64+ # fastadict[seqname].append(line)
65+ result += line
66+ return [result , len (result )]
67+
68+
69+ def write_fastq_multi (fastq_list , outputfile , compressed = True ):
70+ if compressed :
71+ with xopen (outputfile + ".1.fastq.gz" , "ab" ) as f1 :
72+ with xopen (outputfile + ".2.fastq.gz" , "ab" ) as f2 :
73+ for read in fastq_list :
74+ f1 .write (read [0 ].encode ())
75+ f2 .write (read [1 ].encode ())
76+ else :
77+ with open (outputfile + ".1.fastq" , "a" ) as f1 :
78+ with open (outputfile + ".2.fastq" , "a" ) as f2 :
79+ for read in fastq_list :
80+ f1 .write (read [0 ])
81+ f2 .write (read [1 ])
6282
6383
6484def markov_wrapper_fwd (times ):
6585 a = 0
66- while (a == 0 ):
67- a = mk .mchain (starts = MARKOV_START_FWD , kmers = MARKOV_DICT_FWD ,
68- readsize = READSIZE , order = MARKOV_ORDER )
69- return (a )
86+ while a == 0 :
87+ a = mk .mchain (
88+ starts = MARKOV_START_FWD ,
89+ kmers = MARKOV_DICT_FWD ,
90+ readsize = READSIZE ,
91+ order = MARKOV_ORDER ,
92+ )
93+ return a
7094
7195
7296def markov_wrapper_rev (times ):
7397 a = 0
74- while (a == 0 ):
75- a = mk .mchain (starts = MARKOV_START_REV , kmers = MARKOV_DICT_REV ,
76- readsize = READSIZE , order = MARKOV_ORDER )
77- return (a )
98+ while a == 0 :
99+ a = mk .mchain (
100+ starts = MARKOV_START_REV ,
101+ kmers = MARKOV_DICT_REV ,
102+ readsize = READSIZE ,
103+ order = MARKOV_ORDER ,
104+ )
105+ return a
78106
79107
80108def markov_multi_fwd (process , nreads ):
81109 myIter = range (nreads )
82110 with multiprocessing .Pool (process ) as p :
83111 r = p .map (markov_wrapper_fwd , myIter )
84- return ( r )
112+ return r
85113
86114
87115def markov_multi_rev (process , nreads ):
88116 myIter = range (nreads )
89117 with multiprocessing .Pool (process ) as p :
90118 r = p .map (markov_wrapper_rev , myIter )
91- return ( r )
119+ return r
92120
93121
94122def get_fwd_qual ():
95123 try :
96- ret = pickle .load (open ("data/quality/fwd_qual.p" , 'rb' ))
97- return ( ret )
124+ ret = pickle .load (open ("data/quality/fwd_qual.p" , "rb" ))
125+ return ret
98126 except FileNotFoundError :
99- path = resource_filename (' adrsm' , ' /data/quality/fwd_qual.p' )
100- ret = pickle .load (open (path , 'rb' ))
101- return ( ret )
127+ path = resource_filename (" adrsm" , " /data/quality/fwd_qual.p" )
128+ ret = pickle .load (open (path , "rb" ))
129+ return ret
102130
103131
104132def get_rev_qual ():
105133 try :
106- ret = pickle .load (open ("data/quality/fwd_qual.p" , 'rb' ))
107- return ( ret )
134+ ret = pickle .load (open ("data/quality/fwd_qual.p" , "rb" ))
135+ return ret
108136 except FileNotFoundError :
109- path = resource_filename ('adrsm' , '/data/quality/rev_qual.p' )
110- ret = pickle .load (open (path , 'rb' ))
111- return (ret )
112-
113-
114- def multi_run (iterables , name , mutate , mutrate , damage , geom_p , themin , themax , fwd_adaptor , rev_adaptor , read_length , process ):
115- partial_run = partial (sf .generate_fq ,
116- name = name ,
117- mutate = mutate ,
118- mutrate = mutrate ,
119- damage = damage ,
120- geom_p = geom_p ,
121- themin = themin ,
122- themax = themax ,
123- fwd_adaptor = fwd_adaptor ,
124- rev_adaptor = rev_adaptor ,
125- read_length = read_length )
137+ path = resource_filename ("adrsm" , "/data/quality/rev_qual.p" )
138+ ret = pickle .load (open (path , "rb" ))
139+ return ret
140+
141+
142+ def multi_run (
143+ iterables ,
144+ name ,
145+ mutate ,
146+ mutrate ,
147+ damage ,
148+ geom_p ,
149+ themin ,
150+ themax ,
151+ fwd_adaptor ,
152+ rev_adaptor ,
153+ read_length ,
154+ process ,
155+ ):
156+ partial_run = partial (
157+ sf .generate_fq ,
158+ name = name ,
159+ mutate = mutate ,
160+ mutrate = mutrate ,
161+ damage = damage ,
162+ geom_p = geom_p ,
163+ themin = themin ,
164+ themax = themax ,
165+ fwd_adaptor = fwd_adaptor ,
166+ rev_adaptor = rev_adaptor ,
167+ read_length = read_length ,
168+ )
126169 with multiprocessing .Pool (process ) as p :
127170 r = p .map (partial_run , iterables )
128- return (r )
129-
130-
131- def run_read_simulation_multi (INFILE , COV , READLEN , INSERLEN , NBINOM , A1 , A2 , MINLENGTH , MUTATE , MUTRATE , AGE , DAMAGE , GEOM_P , THEMIN , THEMAX , PROCESS , FASTQ_OUT ):
171+ return r
172+
173+
174+ def run_read_simulation_multi (
175+ INFILE ,
176+ COV ,
177+ READLEN ,
178+ INSERLEN ,
179+ NBINOM ,
180+ A1 ,
181+ A2 ,
182+ MINLENGTH ,
183+ MUTATE ,
184+ MUTRATE ,
185+ AGE ,
186+ DAMAGE ,
187+ GEOM_P ,
188+ THEMIN ,
189+ THEMAX ,
190+ PROCESS ,
191+ FASTQ_OUT ,
192+ ):
132193 print ("===================\n ===================" )
133194 print ("Genome: " , INFILE )
134195 print ("Coverage: " , COV )
@@ -165,9 +226,11 @@ def run_read_simulation_multi(INFILE, COV, READLEN, INSERLEN, NBINOM, A1, A2, MI
165226 QUALIT_FWD = get_fwd_qual ()
166227 QUALIT_REV = get_rev_qual ()
167228 MARKOV_SEED_FWD = mk .generate_kmer (
168- qualities = QUALIT_FWD , order = MARKOV_ORDER , readsize = READLEN )
229+ qualities = QUALIT_FWD , order = MARKOV_ORDER , readsize = READLEN
230+ )
169231 MARKOV_SEED_REV = mk .generate_kmer (
170- qualities = QUALIT_REV , order = MARKOV_ORDER , readsize = READLEN )
232+ qualities = QUALIT_REV , order = MARKOV_ORDER , readsize = READLEN
233+ )
171234 MARKOV_START_FWD = MARKOV_SEED_FWD [0 ]
172235 MARKOV_START_REV = MARKOV_SEED_REV [0 ]
173236 MARKOV_DICT_FWD = MARKOV_SEED_FWD [1 ]
@@ -184,28 +247,30 @@ def run_read_simulation_multi(INFILE, COV, READLEN, INSERLEN, NBINOM, A1, A2, MI
184247 correct_mutrate = 0
185248
186249 # Prepare fragments and errors
187- all_fragments = sf .random_insert (
188- fasta , fragment_lengths , READLEN , MINLENGTH )
250+ all_fragments = sf .random_insert (fasta , fragment_lengths , READLEN , MINLENGTH )
189251 fwd_illu_err = markov_multi_fwd (process = PROCESS , nreads = len (all_fragments ))
190252 rev_illu_err = markov_multi_rev (process = PROCESS , nreads = len (all_fragments ))
191253
192- runlist = sf .prepare_run (all_frag = all_fragments ,
193- all_fwd_err = fwd_illu_err , all_rev_err = rev_illu_err )
194-
195- result = multi_run (iterables = runlist ,
196- name = basename ,
197- mutate = MUTATE ,
198- mutrate = correct_mutrate ,
199- damage = DAMAGE ,
200- geom_p = GEOM_P ,
201- themin = THEMIN ,
202- themax = THEMAX ,
203- fwd_adaptor = A1 ,
204- rev_adaptor = A2 ,
205- read_length = READLEN ,
206- process = PROCESS )
254+ runlist = sf .prepare_run (
255+ all_frag = all_fragments , all_fwd_err = fwd_illu_err , all_rev_err = rev_illu_err
256+ )
257+
258+ result = multi_run (
259+ iterables = runlist ,
260+ name = basename ,
261+ mutate = MUTATE ,
262+ mutrate = correct_mutrate ,
263+ damage = DAMAGE ,
264+ geom_p = GEOM_P ,
265+ themin = THEMIN ,
266+ themax = THEMAX ,
267+ fwd_adaptor = A1 ,
268+ rev_adaptor = A2 ,
269+ read_length = READLEN ,
270+ process = PROCESS ,
271+ )
207272 write_fastq_multi (fastq_list = result , outputfile = FASTQ_OUT )
208- return ( [nread * INSERLEN , INSERLEN , COV , DAMAGE ])
273+ return [nread * INSERLEN , INSERLEN , COV , DAMAGE ]
209274
210275
211276def specie_to_taxid (specie ):
@@ -222,7 +287,7 @@ def specie_to_taxid(specie):
222287 request = "http://taxonomy.jgi-psf.org/tax/pt_name/" + specie
223288 response = requests .get (request )
224289 answer = response .text
225- return ( answer )
290+ return answer
226291
227292
228293def write_stat (stat_dict , stat_out ):
@@ -232,8 +297,21 @@ def write_stat(stat_dict, stat_out):
232297 totbases = sum (nbases )
233298 with open (stat_out , "w" ) as fs :
234299 fs .write (
235- "Organism,taxonomy_id,percentage of metagenome,mean_insert_length,target_coverage,deamination\n " )
300+ "Organism,taxonomy_id,percentage of metagenome,mean_insert_length,target_coverage,deamination\n "
301+ )
236302 for akey in stat_dict :
237303 taxid = specie_to_taxid (akey )
238- fs .write (akey + "," + str (taxid ) + "," + str (round (stat_dict [akey ][0 ] / totbases , 2 )) + "," + str (
239- stat_dict [akey ][1 ]) + "," + str (stat_dict [akey ][2 ]) + "," + str (stat_dict [akey ][3 ]) + "\n " )
304+ fs .write (
305+ akey
306+ + ","
307+ + str (taxid )
308+ + ","
309+ + str (round (stat_dict [akey ][0 ] / totbases , 2 ))
310+ + ","
311+ + str (stat_dict [akey ][1 ])
312+ + ","
313+ + str (stat_dict [akey ][2 ])
314+ + ","
315+ + str (stat_dict [akey ][3 ])
316+ + "\n "
317+ )
0 commit comments