11#!/usr/bin/env python
22
3+ import sys
4+ import numpy as np
35from numpy import random as npr
6+ from scipy .stats import geom
47
5-
8+ def parse_yes_no (astring ):
9+ if "yes" in astring :
10+ return (True )
11+ elif "no" in astring :
12+ return (False )
13+ else :
14+ sys .exit ("Please specify deamination (yes | no)" )
615
716def get_basename (file_name ):
817 if ("/" ) in file_name :
@@ -11,6 +20,9 @@ def get_basename(file_name):
1120 basename = file_name .split ("." )[0 ]
1221 return (basename )
1322
23+ def scale (x , themin , themax ):
24+ return (np .interp (x , (x .min (), x .max ()), (themin , themax )))
25+
1426def reverse_complement (dna ) :
1527 dna = dna .upper ()
1628 '''
@@ -75,6 +87,27 @@ def complement_read(all_inserts, adaptor, read_length):
7587 result .append ("" .join (read ))
7688 return (result )
7789
90+ def add_damage (all_inserts , geom_p , scale_min , scale_max ):
91+ for i in range (0 , len (all_inserts )):
92+ insert = list (all_inserts [i ])
93+ insertlen = len (insert )
94+ x = np .arange (1 , insertlen + 1 )
95+ geom_dist = scale (geom .pmf (x , geom_p ),scale_min ,scale_max )
96+
97+ for j in range (0 , insertlen ):
98+ pos = j
99+ opp_pos = insertlen - 1 - j
100+
101+ ## C -> T deamination
102+ if insert [pos ] == "C" and geom_dist [j ] >= npr .rand ():
103+ insert [pos ] = "T"
104+
105+ ## G -> A deamination
106+ if insert [opp_pos ] == "G" and geom_dist [j ] >= npr .rand ():
107+ insert [opp_pos ] = "A"
108+ all_inserts [i ] = "" .join (insert )
109+ return (all_inserts )
110+
78111def add_error (all_reads , error_rate ):
79112 for i in range (0 , len (all_reads )):
80113 read = list (all_reads [i ])
@@ -83,7 +116,7 @@ def add_error(all_reads, error_rate):
83116 read [j ] = "N"
84117 if npr .random () < error_rate :
85118 read [j ] = npr .choice (["A" ,"T" ,"G" ,"C" ])
86- all_reads [i ] = "" .join (read )
119+ all_reads [i ] = "" .join (read )
87120 return (all_reads )
88121
89122def prepare_fastq (fastq_dict , fwd_reads , rev_reads , basename , read_length , quality ):
@@ -111,18 +144,20 @@ def write_fastq_multi(fastq_dict, outputfile):
111144 f2 .write (reads2 )
112145
113146
114- def run_read_simulation_multi (INFILE , NREAD , COV , READLEN , INSERLEN , LENDEV , A1 , A2 , MINLENGTH , ERR , fastq_dict , QUALITY ):
115- print ("INFILE: " , INFILE )
147+ def run_read_simulation_multi (INFILE , NREAD , COV , READLEN , INSERLEN , LENDEV , A1 , A2 , MINLENGTH , ERR , DAMAGE , GEOM_P , THEMIN , THEMAX , fastq_dict , QUALITY ):
148+ print ("===================" )
149+ print ("Genome: " , INFILE )
116150 if COV :
117- print ("COV : " , COV )
151+ print ("Coverage : " , COV )
118152 else :
119- print ("NREAD: " , NREAD )
120- print ("READLEN: " , READLEN )
121- print ("INSERLEN: " , INSERLEN )
122- print ("LENDEV: " , LENDEV )
123- print ("A1: " , A1 )
124- print ("A2: " , A2 )
125- print ("QUALITY" , QUALITY )
153+ print ("Number of reads: " , NREAD )
154+ print ("Read length: " , READLEN )
155+ print ("Mean Insert length: " , INSERLEN )
156+ print ("Insert length standard deviation: " , LENDEV )
157+ print ("Adaptor 1: " , A1 )
158+ print ("Adaptor 2: " , A2 )
159+ print ("Quality :" , QUALITY )
160+ print ("Deamination:" , DAMAGE )
126161 nread = None
127162
128163
@@ -131,14 +166,22 @@ def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1,
131166
132167 if COV :
133168 nread = int (fasta [1 ]/ INSERLEN )
134- print ("nread: " , nread )
169+ print ("Number of reads: " , nread )
170+ print ("===================\n " )
135171
136172 insert_lengths = [int (n ) for n in npr .normal (INSERLEN , LENDEV , nread )]
137173
138174
139175
140176
141177 all_inserts = random_insert (fasta , insert_lengths , READLEN , MINLENGTH )
178+ if DAMAGE == True :
179+ all_inserts = add_damage (
180+ all_inserts = all_inserts ,
181+ geom_p = GEOM_P ,
182+ scale_min = THEMIN ,
183+ scale_max = THEMAX )
184+
142185 fwd_inserts = all_inserts
143186 rev_inserts = [reverse_complement (i ) for i in all_inserts ]
144187 fwd_reads = complement_read (fwd_inserts , A1 , READLEN )
0 commit comments