Skip to content

Commit ca6ae0e

Browse files
committed
adding deamination
1 parent 090f5d4 commit ca6ae0e

File tree

4 files changed

+92
-19
lines changed

4 files changed

+92
-19
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,6 @@ python:
33
- "3.6"
44
# command to install dependencies
55
install: "pip install numpy"
6+
install: "pip install scipy"
67
# command to run tests
78
script: python adrsm -d ./data/genomes ./data/short_genome_list.csv

adrsm

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,21 @@ def _get_args():
4242
default=0.01,
4343
help="Illumina sequecing error. Default = 0.01")
4444
parser.add_argument(
45+
'-p',
46+
dest="geom_p",
47+
default=0.5,
48+
help="Geometric distribution parameter for deamination")
49+
parser.add_argument(
50+
'-m',
51+
dest="min",
52+
default=0.001,
53+
help="Deamination substitution base frequency.")
54+
parser.add_argument(
55+
'-M',
56+
dest="max",
57+
default=0.3,
58+
help="Deamination substitution max frequency.")
59+
parser.add_argument(
4560
'-o',
4661
dest="output",
4762
default="metagenome",
@@ -66,11 +81,15 @@ def _get_args():
6681
a1 = args.fwdAdapt
6782
a2 = args.revAdapt
6883
err = float(args.error)
84+
geom_p = args.geom_p
85+
themin = args.min
86+
themax = args.max
6987
outfile= args.output
7088
quality = args.quality
7189
stats = args.stats
7290

73-
return(infile, gendir, readlen, lendev, a1, a2, err, outfile, quality, stats)
91+
return(infile, gendir, readlen, lendev, a1, a2, err, geom_p, themin, themax, outfile, quality, stats)
92+
7493

7594
def read_config(infile, gendir):
7695
"""
@@ -85,11 +104,14 @@ def read_config(infile, gendir):
85104
agenome = splitline[0].replace(" ","")
86105
ainsert = int(splitline[1].replace(" ",""))
87106
acov = float(splitline[2].replace(" ",""))
88-
genomes[gendir+"/"+agenome] = [ainsert, acov]
107+
deambool = str(splitline[3].replace(" ",""))
108+
deamination = ad.parse_yes_no(deambool)
109+
genomes[gendir+"/"+agenome] = [ainsert, acov, deamination]
89110
return(genomes)
90111

91112
if __name__ == "__main__":
92-
INFILE, GENDIR, READLEN, LENDEV, A1, A2, ERR, OUTFILE, QUALITY, STATS = _get_args()
113+
INFILE,GENDIR,READLEN,LENDEV,A1,A2,ERR,GEOM_P,THEMIN,THEMAX,OUTFILE,QUALITY,STATS = _get_args()
114+
93115
MINLENGTH = 20
94116

95117
genome_dict = {}
@@ -106,9 +128,16 @@ if __name__ == "__main__":
106128
A2 = A2,
107129
MINLENGTH = MINLENGTH,
108130
ERR = ERR,
131+
DAMAGE = all_genomes[agenome][2],
132+
GEOM_P = GEOM_P,
133+
THEMIN = THEMIN,
134+
THEMAX = THEMAX,
109135
fastq_dict = genome_dict,
110136
QUALITY=QUALITY)
111137
stat_dict[ad.get_basename(agenome)] = stat_and_run
112138

113139
ad.write_fastq_multi(fastq_dict=genome_dict, outputfile=OUTFILE)
114140
ad.write_stat(stat_dict=stat_dict, stat_out=STATS)
141+
print("-- ADRSM Finished --")
142+
print("-- FASTQ files written to "+OUTFILE+".1.fastq and "+OUTFILE+".2.fastq --")
143+
print("-- Statistic file written to "+STATS+ " --")

data/short_genome_list.csv

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
genome, insert_size, coverage
2-
Agrobacterium_tumefaciens_genome.fa, 47 , 0.1
3-
Bacillus_anthracis_genome.fa, 48, 0.2
1+
genome(mandatory), insert_size(mandatory), coverage(mandatory), deamination(mandatory)
2+
Agrobacterium_tumefaciens_genome.fa, 47 , 0.1, yes
3+
Bacillus_anthracis_genome.fa, 48, 0.2, no

lib/adrsmlib.py

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
11
#!/usr/bin/env python
22

3+
import sys
4+
import numpy as np
35
from numpy import random as npr
6+
from scipy.stats import geom
47

5-
8+
def parse_yes_no(astring):
9+
if "yes" in astring:
10+
return(True)
11+
elif "no" in astring:
12+
return(False)
13+
else :
14+
sys.exit("Please specify deamination (yes | no)")
615

716
def get_basename(file_name):
817
if ("/") in file_name:
@@ -11,6 +20,9 @@ def get_basename(file_name):
1120
basename = file_name.split(".")[0]
1221
return(basename)
1322

23+
def scale(x, themin, themax):
24+
return(np.interp(x, (x.min(), x.max()), (themin, themax)))
25+
1426
def reverse_complement(dna) :
1527
dna = dna.upper()
1628
'''
@@ -75,6 +87,27 @@ def complement_read(all_inserts, adaptor, read_length):
7587
result.append("".join(read))
7688
return(result)
7789

90+
def add_damage(all_inserts, geom_p, scale_min, scale_max):
91+
for i in range(0, len(all_inserts)):
92+
insert = list(all_inserts[i])
93+
insertlen = len(insert)
94+
x = np.arange(1, insertlen+1)
95+
geom_dist=scale(geom.pmf(x, geom_p),scale_min,scale_max)
96+
97+
for j in range(0, insertlen):
98+
pos = j
99+
opp_pos = insertlen-1-j
100+
101+
## C -> T deamination
102+
if insert[pos] == "C" and geom_dist[j] >= npr.rand():
103+
insert[pos] = "T"
104+
105+
## G -> A deamination
106+
if insert[opp_pos] == "G" and geom_dist[j] >= npr.rand():
107+
insert[opp_pos] = "A"
108+
all_inserts[i] = "".join(insert)
109+
return(all_inserts)
110+
78111
def add_error(all_reads, error_rate):
79112
for i in range(0, len(all_reads)):
80113
read = list(all_reads[i])
@@ -83,7 +116,7 @@ def add_error(all_reads, error_rate):
83116
read[j] = "N"
84117
if npr.random() < error_rate:
85118
read[j] = npr.choice(["A","T","G","C"])
86-
all_reads[i] = "".join(read)
119+
all_reads[i] = "".join(read)
87120
return(all_reads)
88121

89122
def prepare_fastq(fastq_dict, fwd_reads, rev_reads, basename, read_length, quality):
@@ -111,18 +144,20 @@ def write_fastq_multi(fastq_dict, outputfile):
111144
f2.write(reads2)
112145

113146

114-
def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, MINLENGTH, ERR, fastq_dict, QUALITY):
115-
print("INFILE: ", INFILE)
147+
def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, MINLENGTH, ERR, DAMAGE, GEOM_P, THEMIN, THEMAX, fastq_dict, QUALITY):
148+
print("===================")
149+
print("Genome: ", INFILE)
116150
if COV:
117-
print("COV: ", COV)
151+
print("Coverage: ", COV)
118152
else:
119-
print("NREAD: ", NREAD)
120-
print("READLEN: ", READLEN)
121-
print("INSERLEN: ", INSERLEN)
122-
print("LENDEV: ", LENDEV)
123-
print("A1: ", A1)
124-
print("A2: ", A2)
125-
print("QUALITY", QUALITY)
153+
print("Number of reads: ", NREAD)
154+
print("Read length: ", READLEN)
155+
print("Mean Insert length: ", INSERLEN)
156+
print("Insert length standard deviation: ", LENDEV)
157+
print("Adaptor 1: ", A1)
158+
print("Adaptor 2: ", A2)
159+
print("Quality :", QUALITY)
160+
print("Deamination:", DAMAGE)
126161
nread = None
127162

128163

@@ -131,14 +166,22 @@ def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1,
131166

132167
if COV:
133168
nread = int(fasta[1]/INSERLEN)
134-
print("nread: ", nread)
169+
print("Number of reads: ", nread)
170+
print("===================\n")
135171

136172
insert_lengths = [int(n) for n in npr.normal(INSERLEN, LENDEV, nread)]
137173

138174

139175

140176

141177
all_inserts = random_insert(fasta, insert_lengths, READLEN, MINLENGTH)
178+
if DAMAGE == True:
179+
all_inserts = add_damage(
180+
all_inserts = all_inserts,
181+
geom_p = GEOM_P,
182+
scale_min = THEMIN,
183+
scale_max = THEMAX)
184+
142185
fwd_inserts = all_inserts
143186
rev_inserts = [reverse_complement(i) for i in all_inserts]
144187
fwd_reads = complement_read(fwd_inserts, A1, READLEN)

0 commit comments

Comments
 (0)