Commit f570b0a

add compressed I/O with xopen
1 parent: 72c5691

File tree

7 files changed: +176 -143298 lines changed

adrsm/lib/adrsmlib.py

Lines changed: 162 additions & 84 deletions
@@ -11,13 +11,14 @@
 from pkg_resources import resource_filename
 from . import sequencefunctions as sf
 from . import markov as mk
+from xopen import xopen
 
 
 def parse_yes_no(astring):
     if "yes" in astring:
-        return(True)
+        return True
     elif "no" in astring:
-        return(False)
+        return False
     else:
         sys.exit("Please specify deamination (yes | no)")
 
@@ -27,7 +28,7 @@ def get_basename(file_name):
         basename = file_name.split("/")[-1].split(".")[0]
     else:
         basename = file_name.split(".")[0]
-    return(basename)
+    return basename
 
 
 def read_fasta(file_name):
@@ -40,95 +41,155 @@ def read_fasta(file_name):
     """
     result = ""
     # fastadict = {}
-    with open(file_name, "r") as f:
-        for line in f:
-            if line[0] == ">":
-                # seqname = line[1:]
-                # fastadict[seqname] = []
-                continue
-            else:
-                line = line.rstrip()
-                # fastadict[seqname].append(line)
-                result += line
-    return([result, len(result)])
-
-
-def write_fastq_multi(fastq_list, outputfile):
-    with open(outputfile + ".1.fastq", "a") as f1:
-        with open(outputfile + ".2.fastq", "a") as f2:
-            for read in fastq_list:
-                f1.write(read[0])
-                f2.write(read[1])
+    if file_name.endswith(".gz"):
+        with xopen(file_name, "r") as f:
+            for line in f:
+                if line[0] == ">":
+                    # seqname = line[1:]
+                    # fastadict[seqname] = []
+                    continue
+                else:
+                    line = line.rstrip()
+                    # fastadict[seqname].append(line)
+                    result += line
+    else:
+        with open(file_name, "r") as f:
+            for line in f:
+                if line[0] == ">":
+                    # seqname = line[1:]
+                    # fastadict[seqname] = []
+                    continue
+                else:
+                    line = line.rstrip()
+                    # fastadict[seqname].append(line)
+                    result += line
+    return [result, len(result)]
+
+
+def write_fastq_multi(fastq_list, outputfile, compressed=True):
+    if compressed:
+        with xopen(outputfile + ".1.fastq.gz", "ab") as f1:
+            with xopen(outputfile + ".2.fastq.gz", "ab") as f2:
+                for read in fastq_list:
+                    f1.write(read[0].encode())
+                    f2.write(read[1].encode())
+    else:
+        with open(outputfile + ".1.fastq", "a") as f1:
+            with open(outputfile + ".2.fastq", "a") as f2:
+                for read in fastq_list:
+                    f1.write(read[0])
+                    f2.write(read[1])
 
 
 def markov_wrapper_fwd(times):
     a = 0
-    while(a == 0):
-        a = mk.mchain(starts=MARKOV_START_FWD, kmers=MARKOV_DICT_FWD,
-                      readsize=READSIZE, order=MARKOV_ORDER)
-    return(a)
+    while a == 0:
+        a = mk.mchain(
+            starts=MARKOV_START_FWD,
+            kmers=MARKOV_DICT_FWD,
+            readsize=READSIZE,
+            order=MARKOV_ORDER,
+        )
+    return a
 
 
 def markov_wrapper_rev(times):
     a = 0
-    while(a == 0):
-        a = mk.mchain(starts=MARKOV_START_REV, kmers=MARKOV_DICT_REV,
-                      readsize=READSIZE, order=MARKOV_ORDER)
-    return(a)
+    while a == 0:
+        a = mk.mchain(
+            starts=MARKOV_START_REV,
+            kmers=MARKOV_DICT_REV,
+            readsize=READSIZE,
+            order=MARKOV_ORDER,
+        )
+    return a
 
 
 def markov_multi_fwd(process, nreads):
     myIter = range(nreads)
     with multiprocessing.Pool(process) as p:
         r = p.map(markov_wrapper_fwd, myIter)
-    return(r)
+    return r
 
 
 def markov_multi_rev(process, nreads):
     myIter = range(nreads)
     with multiprocessing.Pool(process) as p:
         r = p.map(markov_wrapper_rev, myIter)
-    return(r)
+    return r
 
 
 def get_fwd_qual():
     try:
-        ret = pickle.load(open("data/quality/fwd_qual.p", 'rb'))
-        return(ret)
+        ret = pickle.load(open("data/quality/fwd_qual.p", "rb"))
+        return ret
     except FileNotFoundError:
-        path = resource_filename('adrsm', '/data/quality/fwd_qual.p')
-        ret = pickle.load(open(path, 'rb'))
-        return(ret)
+        path = resource_filename("adrsm", "/data/quality/fwd_qual.p")
+        ret = pickle.load(open(path, "rb"))
+        return ret
 
 
 def get_rev_qual():
     try:
-        ret = pickle.load(open("data/quality/fwd_qual.p", 'rb'))
-        return(ret)
+        ret = pickle.load(open("data/quality/fwd_qual.p", "rb"))
+        return ret
     except FileNotFoundError:
-        path = resource_filename('adrsm', '/data/quality/rev_qual.p')
-        ret = pickle.load(open(path, 'rb'))
-        return(ret)
-
-
-def multi_run(iterables, name, mutate, mutrate, damage, geom_p, themin, themax, fwd_adaptor, rev_adaptor, read_length, process):
-    partial_run = partial(sf.generate_fq,
-                          name=name,
-                          mutate=mutate,
-                          mutrate=mutrate,
-                          damage=damage,
-                          geom_p=geom_p,
-                          themin=themin,
-                          themax=themax,
-                          fwd_adaptor=fwd_adaptor,
-                          rev_adaptor=rev_adaptor,
-                          read_length=read_length)
+        path = resource_filename("adrsm", "/data/quality/rev_qual.p")
+        ret = pickle.load(open(path, "rb"))
+        return ret
+
+
+def multi_run(
+    iterables,
+    name,
+    mutate,
+    mutrate,
+    damage,
+    geom_p,
+    themin,
+    themax,
+    fwd_adaptor,
+    rev_adaptor,
+    read_length,
+    process,
+):
+    partial_run = partial(
+        sf.generate_fq,
+        name=name,
+        mutate=mutate,
+        mutrate=mutrate,
+        damage=damage,
+        geom_p=geom_p,
+        themin=themin,
+        themax=themax,
+        fwd_adaptor=fwd_adaptor,
+        rev_adaptor=rev_adaptor,
+        read_length=read_length,
+    )
     with multiprocessing.Pool(process) as p:
         r = p.map(partial_run, iterables)
-    return(r)
-
-
-def run_read_simulation_multi(INFILE, COV, READLEN, INSERLEN, NBINOM, A1, A2, MINLENGTH, MUTATE, MUTRATE, AGE, DAMAGE, GEOM_P, THEMIN, THEMAX, PROCESS, FASTQ_OUT):
+    return r
+
+
+def run_read_simulation_multi(
+    INFILE,
+    COV,
+    READLEN,
+    INSERLEN,
+    NBINOM,
+    A1,
+    A2,
+    MINLENGTH,
+    MUTATE,
+    MUTRATE,
+    AGE,
+    DAMAGE,
+    GEOM_P,
+    THEMIN,
+    THEMAX,
+    PROCESS,
+    FASTQ_OUT,
+):
     print("===================\n===================")
     print("Genome: ", INFILE)
     print("Coverage: ", COV)
@@ -165,9 +226,11 @@ def run_read_simulation_multi(INFILE, COV, READLEN, INSERLEN, NBINOM, A1, A2, MI
     QUALIT_FWD = get_fwd_qual()
     QUALIT_REV = get_rev_qual()
     MARKOV_SEED_FWD = mk.generate_kmer(
-        qualities=QUALIT_FWD, order=MARKOV_ORDER, readsize=READLEN)
+        qualities=QUALIT_FWD, order=MARKOV_ORDER, readsize=READLEN
+    )
     MARKOV_SEED_REV = mk.generate_kmer(
-        qualities=QUALIT_REV, order=MARKOV_ORDER, readsize=READLEN)
+        qualities=QUALIT_REV, order=MARKOV_ORDER, readsize=READLEN
+    )
     MARKOV_START_FWD = MARKOV_SEED_FWD[0]
     MARKOV_START_REV = MARKOV_SEED_REV[0]
     MARKOV_DICT_FWD = MARKOV_SEED_FWD[1]
@@ -184,28 +247,30 @@ def run_read_simulation_multi(INFILE, COV, READLEN, INSERLEN, NBINOM, A1, A2, MI
     correct_mutrate = 0
 
     # Prepare fragments and errors
-    all_fragments = sf.random_insert(
-        fasta, fragment_lengths, READLEN, MINLENGTH)
+    all_fragments = sf.random_insert(fasta, fragment_lengths, READLEN, MINLENGTH)
     fwd_illu_err = markov_multi_fwd(process=PROCESS, nreads=len(all_fragments))
     rev_illu_err = markov_multi_rev(process=PROCESS, nreads=len(all_fragments))
 
-    runlist = sf.prepare_run(all_frag=all_fragments,
-                             all_fwd_err=fwd_illu_err, all_rev_err=rev_illu_err)
-
-    result = multi_run(iterables=runlist,
-                       name=basename,
-                       mutate=MUTATE,
-                       mutrate=correct_mutrate,
-                       damage=DAMAGE,
-                       geom_p=GEOM_P,
-                       themin=THEMIN,
-                       themax=THEMAX,
-                       fwd_adaptor=A1,
-                       rev_adaptor=A2,
-                       read_length=READLEN,
-                       process=PROCESS)
+    runlist = sf.prepare_run(
+        all_frag=all_fragments, all_fwd_err=fwd_illu_err, all_rev_err=rev_illu_err
+    )
+
+    result = multi_run(
+        iterables=runlist,
+        name=basename,
+        mutate=MUTATE,
+        mutrate=correct_mutrate,
+        damage=DAMAGE,
+        geom_p=GEOM_P,
+        themin=THEMIN,
+        themax=THEMAX,
+        fwd_adaptor=A1,
+        rev_adaptor=A2,
+        read_length=READLEN,
+        process=PROCESS,
+    )
     write_fastq_multi(fastq_list=result, outputfile=FASTQ_OUT)
-    return([nread * INSERLEN, INSERLEN, COV, DAMAGE])
+    return [nread * INSERLEN, INSERLEN, COV, DAMAGE]
 
 
 def specie_to_taxid(specie):
@@ -222,7 +287,7 @@ def specie_to_taxid(specie):
     request = "http://taxonomy.jgi-psf.org/tax/pt_name/" + specie
     response = requests.get(request)
     answer = response.text
-    return(answer)
+    return answer
 
 
 def write_stat(stat_dict, stat_out):
@@ -232,8 +297,21 @@ def write_stat(stat_dict, stat_out):
    totbases = sum(nbases)
     with open(stat_out, "w") as fs:
         fs.write(
-            "Organism,taxonomy_id,percentage of metagenome,mean_insert_length,target_coverage,deamination\n")
+            "Organism,taxonomy_id,percentage of metagenome,mean_insert_length,target_coverage,deamination\n"
+        )
         for akey in stat_dict:
             taxid = specie_to_taxid(akey)
-            fs.write(akey + "," + str(taxid) + "," + str(round(stat_dict[akey][0] / totbases, 2)) + "," + str(
-                stat_dict[akey][1]) + "," + str(stat_dict[akey][2]) + "," + str(stat_dict[akey][3]) + "\n")
+            fs.write(
+                akey
+                + ","
+                + str(taxid)
+                + ","
+                + str(round(stat_dict[akey][0] / totbases, 2))
+                + ","
+                + str(stat_dict[akey][1])
+                + ","
+                + str(stat_dict[akey][2])
+                + ","
+                + str(stat_dict[akey][3])
+                + "\n"
+            )
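For context, a minimal sketch of the compressed I/O pattern this commit adopts in read_fasta and write_fastq_multi: xopen selects gzip (de)compression from the file extension, reading text transparently and expecting bytes when opened in binary append mode. The file names below are hypothetical, for illustration only.

from xopen import xopen

# Read a gzipped FASTA transparently: a ".gz" input is decompressed on the fly (text mode).
with xopen("genome.fasta.gz", "r") as f:
    seq = "".join(line.rstrip() for line in f if not line.startswith(">"))

# Append compressed FASTQ output: binary mode, so strings are encoded before writing.
with xopen("sample.1.fastq.gz", "ab") as out:
    out.write("@read_0\nACGT\n+\nIIII\n".encode())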

setup.py

Lines changed: 12 additions & 15 deletions
@@ -2,25 +2,22 @@
 from setuptools import setup, find_packages
 
 setup(
-    name='adrsm',
+    name="adrsm",
     version=__version__,
-    description='Ancient DNA Read Simulator for Metagenomic ',
+    description="Ancient DNA Read Simulator for Metagenomic ",
     long_description=open("README.md").read(),
-    url='https://github.com/maxibor/adrsm',
+    url="https://github.com/maxibor/adrsm",
     long_description_content_type="text/markdown",
-    license='MIT',
+    license="MIT",
     python_requires=">=3.6",
     install_requires=[
-        'numpy >=1.18.1',
-        'scipy >= 1.3.1',
-        'requests >= 2.22',
-        'click'
+        "numpy >=1.18.1",
+        "scipy >= 1.3.1",
+        "requests >= 2.22",
+        "click",
+        "xopen",
     ],
-    packages=find_packages(include=['adrsm', 'adrsm.lib']),
-    package_data= {'adrsm': ['data/quality/fwd_qual.p','data/quality/rev_qual.p'] },
-    entry_points={
-        'console_scripts': [
-            'adrsm= adrsm.adrsm:cli'
-        ]
-    }
+    packages=find_packages(include=["adrsm", "adrsm.lib"]),
+    package_data={"adrsm": ["data/quality/fwd_qual.p", "data/quality/rev_qual.p"]},
+    entry_points={"console_scripts": ["adrsm= adrsm.adrsm:cli"]},
 )
