22
33from  numpy  import  random  as  npr 
44
5- QUALITY   =   "G" 
5+ 
66
77def  get_basename (file_name ):
88    if  ("/" ) in  file_name :
@@ -18,7 +18,7 @@ def reverse_complement(dna) :
1818    ''' 
1919    dna  =  dna [::- 1 ]
2020    revcom  =  []
21-     complement  =  {"A"  : "T" , "T"  : "A"  , "G"  : "C"  , "C"  : "G" }
21+     complement  =  {"A"  : "T" , "T"  : "A"  , "G"  : "C"  , "C"  : "G" ,  "N" :  "N" }
2222    for  letter  in  dna  :
2323        for  key  in  complement .keys () :
2424            if  letter  ==  key  :
@@ -66,25 +66,37 @@ def complement_read(all_inserts, adaptor, read_length):
6666            read  =  insert 
6767        elif  inlen  >  read_length :
6868            read  =  insert [0 :read_length ]
69-         result .append (read )
69+         if  len (read ) ==  read_length :
70+             read  =  read .upper ()
71+             read  =  list (read )
72+             for  j  in  range (0 , len (read )):
73+                 if  read [j ] not  in "A" ,"T" ,"G" ,"C" ,"N" ]:
74+                     read [j ] =  "N" 
75+             result .append ("" .join (read ))
7076    return (result )
7177
7278def  add_error (all_reads , error_rate ):
7379    for  i  in  range (0 , len (all_reads )):
7480        read  =  list (all_reads [i ])
7581        for  j  in  range (0 , len (read )):
82+             if  read [j ].upper () not  in "A" ,"T" ,"G" ,"C" ,"N" ]:
83+                 read [j ] =  "N" 
7684            if  npr .random () <  error_rate :
7785                read [j ] =  npr .choice (["A" ,"T" ,"G" ,"C" ])
7886                all_reads [i ] =  "" .join (read )
7987    return (all_reads )
8088
81- def  prepare_fastq (fastq_dict , fwd_reads , rev_reads , basename , read_length ):
89+ def  prepare_fastq (fastq_dict , fwd_reads , rev_reads , basename , read_length ,  quality ):
8290    fastq_dict [basename ] =  [[] for  i  in  range (2 )]
8391    cnt  =  1 
8492    for  read1 , read2  in  zip (fwd_reads , rev_reads ):
85-         towrite_fwd  =  "@" + basename + "_" + str (cnt )+ "/1" + "\n " + read1 + "\n +\n " + QUALITY * read_length + "\n " 
93+         read1  =  read1 .rstrip ()
94+         read2  =  read2 .rstrip ()
95+         readlen1  =  len (read1 )
96+         readlen2  =  len (read2 )
97+         towrite_fwd  =  "@" + basename + "_" + str (cnt )+ "/1" + "\n " + read1 + "\n +\n " + quality * readlen1 + "\n " 
8698        fastq_dict [basename ][0 ].append (towrite_fwd )
87-         towrite_rev  =  "@" + basename + "_" + str (cnt )+ "/2" + "\n " + read2 + "\n +\n " + QUALITY * read_length + "\n " 
99+         towrite_rev  =  "@" + basename + "_" + str (cnt )+ "/2" + "\n " + read2 + "\n +\n " + quality * readlen2 + "\n " 
88100        fastq_dict [basename ][1 ].append (towrite_rev )
89101        cnt  +=  1 
90102    return (fastq_dict )
@@ -98,64 +110,8 @@ def write_fastq_multi(fastq_dict, outputfile):
98110                for  reads2  in  fastq_dict [akey ][1 ]:
99111                    f2 .write (reads2 )
100112
101- def  write_fastq (all_reads , basename , orientation , read_length , outfile ):
102-     if  not  outfile :
103-         with  open (basename + "." + str (orientation )+ ".fastq" , "w" ) as  fw :
104-             for  read  in  all_reads :
105-                 fw .write ("@" + basename + "\n " )
106-                 fw .write (read + "\n " )
107-                 fw .write ("+\n " )
108-                 fw .write (QUALITY * read_length + "\n " )
109-     else :
110-         with  open (outfile + "." + str (orientation )+ ".fastq" , "w" ) as  fw :
111-             for  read  in  all_reads :
112-                 fw .write ("@" + basename + "\n " )
113-                 fw .write (read + "\n " )
114-                 fw .write ("+\n " )
115-                 fw .write (QUALITY * read_length + "\n " )
116- 
117- 
118- 
119- def  run_read_simulation (INFILE , NREAD , COV , READLEN , INSERLEN , LENDEV , A1 , A2 , OUTFILE , MINLENGTH , ERR ):
120-     print ("INFILE: " , INFILE )
121-     if  COV :
122-         print ("COV: " , COV )
123-     else :
124-         print ("NREAD: " , NREAD )
125-     print ("READLEN: " , READLEN )
126-     print ("INSERLEN: " , INSERLEN )
127-     print ("LENDEV: " , LENDEV )
128-     print ("A1: " , A1 )
129-     print ("A2: " , A2 )
130-     print ("OUTFILE: " , OUTFILE )
131- 
132-     nread  =  None 
133- 
134- 
135-     basename  =  get_basename (INFILE )
136-     fasta  =  read_fasta (INFILE )
137- 
138-     if  COV :
139-         nread  =  int (fasta [1 ]/ INSERLEN )
140-         print ("nread: " , nread )
141- 
142-     insert_lengths  =  [int (n ) for  n  in  npr .normal (INSERLEN , LENDEV , nread )]
143- 
144- 
145- 
146- 
147-     all_inserts  =  random_insert (fasta , insert_lengths , READLEN , MINLENGTH )
148-     fwd_inserts  =  all_inserts 
149-     rev_inserts  =  [reverse_complement (i ) for  i  in  all_inserts ]
150-     fwd_reads  =  complement_read (fwd_inserts , A1 , READLEN )
151-     fwd_reads  =  add_error (fwd_reads , ERR )
152-     rev_reads  =  complement_read (rev_inserts , A2 , READLEN )
153-     rev_reads  =  add_error (rev_reads , ERR )
154- 
155-     write_fastq (fwd_reads , basename , 1 , READLEN , OUTFILE )
156-     write_fastq (rev_reads , basename , 2 , READLEN , OUTFILE )
157113
158- def  run_read_simulation_multi (INFILE , NREAD , COV , READLEN , INSERLEN , LENDEV , A1 , A2 , MINLENGTH , ERR , fastq_dict ):
114+ def  run_read_simulation_multi (INFILE , NREAD , COV , READLEN , INSERLEN , LENDEV , A1 , A2 , MINLENGTH , ERR , fastq_dict ,  QUALITY ):
159115    print ("INFILE: " , INFILE )
160116    if  COV :
161117        print ("COV: " , COV )
@@ -166,6 +122,7 @@ def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1,
166122    print ("LENDEV: " , LENDEV )
167123    print ("A1: " , A1 )
168124    print ("A2: " , A2 )
125+     print ("QUALITY" , QUALITY )
169126    nread  =  None 
170127
171128
@@ -189,7 +146,12 @@ def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1,
189146    rev_reads  =  complement_read (rev_inserts , A2 , READLEN )
190147    rev_reads  =  add_error (rev_reads , ERR )
191148
192-     prepare_fastq (fastq_dict  =  fastq_dict , fwd_reads  =  fwd_reads , rev_reads  =  rev_reads , basename  =  basename , read_length  =  READLEN )
149+     prepare_fastq (fastq_dict  =  fastq_dict ,
150+                   fwd_reads  =  fwd_reads ,
151+                   rev_reads  =  rev_reads ,
152+                   basename  =  basename ,
153+                   read_length  =  READLEN ,
154+                   quality  =  QUALITY )
193155    return (nread  *  INSERLEN )
194156
195157def  write_stat (stat_dict , stat_out ):
0 commit comments