Skip to content

Commit 090f5d4

Browse files
committed
adding sequence checks
1 parent 0107994 commit 090f5d4

File tree

4 files changed

+38
-148
lines changed

4 files changed

+38
-148
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ adrsm -d ./data/genomes ./data/short_genome_list.csv
3131
```
3232
maxime@gph:~$ adrsm --help
3333
usage: ADRSM [-h] [-d DIRECTORY] [-r READLENGTH] [-l LENSTDEV] [-fwd FWDADAPT]
34-
[-rev REVADAPT] [-e ERROR] [-o OUTPUT] [-s STATS]
34+
[-rev REVADAPT] [-e ERROR] [-o OUTPUT] [-q QUALITY] [-s STATS]
3535
confFile
3636
3737
Ancient DNA Read Simulator for Metagenomics
@@ -50,6 +50,7 @@ optional arguments:
5050
AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
5151
-e ERROR Illumina sequecing error. Default = 0.01
5252
-o OUTPUT Output file basename. Default = ./metagenome.*
53+
-q QUALITY Base quality encoding. Default = d (PHRED+64)
5354
-s STATS Statistic file. Default = stats.csv
5455
5556
```

adrsm

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ def _get_args():
4747
default="metagenome",
4848
help="Output file basename. Default = ./metagenome.*")
4949
parser.add_argument(
50+
'-q',
51+
dest="quality",
52+
default="d",
53+
help="Base quality encoding. Default = d (PHRED+64)")
54+
parser.add_argument(
5055
'-s',
5156
dest="stats",
5257
default="stats.csv",
@@ -62,9 +67,10 @@ def _get_args():
6267
a2 = args.revAdapt
6368
err = float(args.error)
6469
outfile= args.output
70+
quality = args.quality
6571
stats = args.stats
6672

67-
return(infile, gendir, readlen, lendev, a1, a2, err, outfile, stats)
73+
return(infile, gendir, readlen, lendev, a1, a2, err, outfile, quality, stats)
6874

6975
def read_config(infile, gendir):
7076
"""
@@ -83,7 +89,7 @@ def read_config(infile, gendir):
8389
return(genomes)
8490

8591
if __name__ == "__main__":
86-
INFILE, GENDIR, READLEN, LENDEV, A1, A2, ERR, OUTFILE, STATS = _get_args()
92+
INFILE, GENDIR, READLEN, LENDEV, A1, A2, ERR, OUTFILE, QUALITY, STATS = _get_args()
8793
MINLENGTH = 20
8894

8995
genome_dict = {}
@@ -100,7 +106,8 @@ if __name__ == "__main__":
100106
A2 = A2,
101107
MINLENGTH = MINLENGTH,
102108
ERR = ERR,
103-
fastq_dict = genome_dict)
109+
fastq_dict = genome_dict,
110+
QUALITY=QUALITY)
104111
stat_dict[ad.get_basename(agenome)] = stat_and_run
105112

106113
ad.write_fastq_multi(fastq_dict=genome_dict, outputfile=OUTFILE)

adrsm_single

Lines changed: 0 additions & 80 deletions
This file was deleted.

lib/adrsmlib.py

Lines changed: 26 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from numpy import random as npr
44

5-
QUALITY = "G"
5+
66

77
def get_basename(file_name):
88
if ("/") in file_name:
@@ -18,7 +18,7 @@ def reverse_complement(dna) :
1818
'''
1919
dna = dna[::-1]
2020
revcom = []
21-
complement = {"A" : "T", "T" : "A" , "G" : "C" , "C" : "G"}
21+
complement = {"A" : "T", "T" : "A" , "G" : "C" , "C" : "G", "N": "N"}
2222
for letter in dna :
2323
for key in complement.keys() :
2424
if letter == key :
@@ -66,25 +66,37 @@ def complement_read(all_inserts, adaptor, read_length):
6666
read = insert
6767
elif inlen > read_length:
6868
read = insert[0:read_length]
69-
result.append(read)
69+
if len(read) == read_length:
70+
read = read.upper()
71+
read = list(read)
72+
for j in range(0, len(read)):
73+
if read[j] not in ["A","T","G","C","N"]:
74+
read[j] = "N"
75+
result.append("".join(read))
7076
return(result)
7177

7278
def add_error(all_reads, error_rate):
7379
for i in range(0, len(all_reads)):
7480
read = list(all_reads[i])
7581
for j in range(0, len(read)):
82+
if read[j].upper() not in ["A","T","G","C","N"]:
83+
read[j] = "N"
7684
if npr.random() < error_rate:
7785
read[j] = npr.choice(["A","T","G","C"])
7886
all_reads[i] = "".join(read)
7987
return(all_reads)
8088

81-
def prepare_fastq(fastq_dict, fwd_reads, rev_reads, basename, read_length):
89+
def prepare_fastq(fastq_dict, fwd_reads, rev_reads, basename, read_length, quality):
8290
fastq_dict[basename] = [[] for i in range(2)]
8391
cnt = 1
8492
for read1, read2 in zip(fwd_reads, rev_reads):
85-
towrite_fwd = "@"+basename+"_"+str(cnt)+"/1"+"\n"+read1+"\n+\n"+QUALITY*read_length+"\n"
93+
read1 = read1.rstrip()
94+
read2 = read2.rstrip()
95+
readlen1 = len(read1)
96+
readlen2 = len(read2)
97+
towrite_fwd = "@"+basename+"_"+str(cnt)+"/1"+"\n"+read1+"\n+\n"+quality*readlen1+"\n"
8698
fastq_dict[basename][0].append(towrite_fwd)
87-
towrite_rev = "@"+basename+"_"+str(cnt)+"/2"+"\n"+read2+"\n+\n"+QUALITY*read_length+"\n"
99+
towrite_rev = "@"+basename+"_"+str(cnt)+"/2"+"\n"+read2+"\n+\n"+quality*readlen2+"\n"
88100
fastq_dict[basename][1].append(towrite_rev)
89101
cnt += 1
90102
return(fastq_dict)
@@ -98,64 +110,8 @@ def write_fastq_multi(fastq_dict, outputfile):
98110
for reads2 in fastq_dict[akey][1]:
99111
f2.write(reads2)
100112

101-
def write_fastq(all_reads, basename, orientation, read_length, outfile):
102-
if not outfile:
103-
with open(basename+"."+str(orientation)+".fastq", "w") as fw:
104-
for read in all_reads:
105-
fw.write("@"+basename+"\n")
106-
fw.write(read+"\n")
107-
fw.write("+\n")
108-
fw.write(QUALITY*read_length+"\n")
109-
else:
110-
with open(outfile+"."+str(orientation)+".fastq", "w") as fw:
111-
for read in all_reads:
112-
fw.write("@"+basename+"\n")
113-
fw.write(read+"\n")
114-
fw.write("+\n")
115-
fw.write(QUALITY*read_length+"\n")
116-
117-
118-
119-
def run_read_simulation(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, OUTFILE, MINLENGTH, ERR):
120-
print("INFILE: ", INFILE)
121-
if COV:
122-
print("COV: ", COV)
123-
else:
124-
print("NREAD: ", NREAD)
125-
print("READLEN: ", READLEN)
126-
print("INSERLEN: ", INSERLEN)
127-
print("LENDEV: ", LENDEV)
128-
print("A1: ", A1)
129-
print("A2: ", A2)
130-
print("OUTFILE: ", OUTFILE)
131-
132-
nread = None
133-
134-
135-
basename = get_basename(INFILE)
136-
fasta = read_fasta(INFILE)
137-
138-
if COV:
139-
nread = int(fasta[1]/INSERLEN)
140-
print("nread: ", nread)
141-
142-
insert_lengths = [int(n) for n in npr.normal(INSERLEN, LENDEV, nread)]
143-
144-
145-
146-
147-
all_inserts = random_insert(fasta, insert_lengths, READLEN, MINLENGTH)
148-
fwd_inserts = all_inserts
149-
rev_inserts = [reverse_complement(i) for i in all_inserts]
150-
fwd_reads = complement_read(fwd_inserts, A1, READLEN)
151-
fwd_reads = add_error(fwd_reads, ERR)
152-
rev_reads = complement_read(rev_inserts, A2, READLEN)
153-
rev_reads = add_error(rev_reads, ERR)
154-
155-
write_fastq(fwd_reads, basename, 1, READLEN, OUTFILE)
156-
write_fastq(rev_reads, basename, 2, READLEN, OUTFILE)
157113

158-
def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, MINLENGTH, ERR, fastq_dict):
114+
def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, MINLENGTH, ERR, fastq_dict, QUALITY):
159115
print("INFILE: ", INFILE)
160116
if COV:
161117
print("COV: ", COV)
@@ -166,6 +122,7 @@ def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1,
166122
print("LENDEV: ", LENDEV)
167123
print("A1: ", A1)
168124
print("A2: ", A2)
125+
print("QUALITY", QUALITY)
169126
nread = None
170127

171128

@@ -189,7 +146,12 @@ def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1,
189146
rev_reads = complement_read(rev_inserts, A2, READLEN)
190147
rev_reads = add_error(rev_reads, ERR)
191148

192-
prepare_fastq(fastq_dict = fastq_dict, fwd_reads = fwd_reads, rev_reads = rev_reads, basename = basename, read_length = READLEN)
149+
prepare_fastq(fastq_dict = fastq_dict,
150+
fwd_reads = fwd_reads,
151+
rev_reads = rev_reads,
152+
basename = basename,
153+
read_length = READLEN,
154+
quality = QUALITY)
193155
return(nread * INSERLEN)
194156

195157
def write_stat(stat_dict, stat_out):

0 commit comments

Comments
 (0)