adding deamination

maxibor · maxibor · commit ca6ae0e4dd7c · 2018-03-13T18:16:20.000+01:00
diff --git a/.travis.yml b/.travis.yml
@@ -3,5 +3,6 @@ python:
   - "3.6"
 # command to install dependencies
 install: "pip install numpy"
+install: "pip install scipy"  # NOTE(review): second `install` key in this mapping (the `pip install numpy` line above is kept) - confirm the CI does not keep only one of the two
 # command to run tests
 script: python adrsm -d ./data/genomes ./data/short_genome_list.csv
diff --git a/adrsm b/adrsm
@@ -42,6 +42,21 @@ def _get_args():
     default=0.01,
     help="Illumina sequecing error. Default = 0.01")
     parser.add_argument(
+    '-p',
+    dest="geom_p",
+    default=0.5,
+    help="Geometric distribution parameter for deamination")
+    parser.add_argument(
+    '-m',
+    dest="min",
+    default=0.001,
+    help="Deamination substitution base frequency.")
+    parser.add_argument(
+    '-M',
+    dest="max",
+    default=0.3,
+    help="Deamination substitution max frequency.")
+    parser.add_argument(
     '-o',
     dest="output",
     default="metagenome",
@@ -66,11 +81,15 @@ def _get_args():
     a1 = args.fwdAdapt
     a2 = args.revAdapt
     err = float(args.error)
+    geom_p = args.geom_p
+    themin = args.min
+    themax = args.max
     outfile= args.output
     quality = args.quality
     stats = args.stats
 
-    return(infile, gendir, readlen, lendev, a1, a2, err, outfile, quality, stats)
+    return(infile, gendir, readlen, lendev, a1, a2, err, geom_p, themin, themax, outfile, quality, stats)
+
 
 def read_config(infile, gendir):
     """
@@ -85,11 +104,14 @@ def read_config(infile, gendir):
             agenome = splitline[0].replace(" ","")
             ainsert = int(splitline[1].replace(" ",""))
             acov = float(splitline[2].replace(" ",""))
-            genomes[gendir+"/"+agenome] = [ainsert, acov]
+            deambool = str(splitline[3].replace(" ",""))
+            deamination = ad.parse_yes_no(deambool)
+            genomes[gendir+"/"+agenome] = [ainsert, acov, deamination]
     return(genomes)
 
 if __name__ == "__main__":
-    INFILE, GENDIR, READLEN, LENDEV, A1, A2, ERR, OUTFILE, QUALITY, STATS = _get_args()
+    INFILE,GENDIR,READLEN,LENDEV,A1,A2,ERR,GEOM_P,THEMIN,THEMAX,OUTFILE,QUALITY,STATS = _get_args()
+
     MINLENGTH = 20
 
     genome_dict = {}
@@ -106,9 +128,16 @@ if __name__ == "__main__":
                                   A2 = A2,
                                   MINLENGTH = MINLENGTH,
                                   ERR = ERR,
+                                  DAMAGE = all_genomes[agenome][2],
+                                  GEOM_P = GEOM_P,
+                                  THEMIN = THEMIN,
+                                  THEMAX = THEMAX,
                                   fastq_dict = genome_dict,
                                   QUALITY=QUALITY)
         stat_dict[ad.get_basename(agenome)] = stat_and_run
 
     ad.write_fastq_multi(fastq_dict=genome_dict, outputfile=OUTFILE)
     ad.write_stat(stat_dict=stat_dict, stat_out=STATS)
+    print("-- ADRSM Finished --")
+    print("-- FASTQ files written to "+OUTFILE+".1.fastq and "+OUTFILE+".2.fastq --")
+    print("-- Statistic file written to "+STATS+ " --")
diff --git a/data/short_genome_list.csv b/data/short_genome_list.csv
@@ -1,3 +1,3 @@
-genome, insert_size, coverage
-Agrobacterium_tumefaciens_genome.fa, 47 , 0.1
-Bacillus_anthracis_genome.fa, 48, 0.2
+genome(mandatory), insert_size(mandatory), coverage(mandatory), deamination(mandatory)
+Agrobacterium_tumefaciens_genome.fa, 47 , 0.1, yes
+Bacillus_anthracis_genome.fa, 48, 0.2, no
diff --git a/lib/adrsmlib.py b/lib/adrsmlib.py
@@ -1,8 +1,17 @@
 #!/usr/bin/env python
 
+import sys
+import numpy as np
 from numpy import random as npr
+from scipy.stats import geom
 
-
+def parse_yes_no(astring):  # Map a yes/no text flag to a bool; terminates the program for any other text.
+    if "yes" in astring:  # substring test, case-sensitive: any text containing "yes" returns True
+        return(True)
+    elif "no" in astring:  # NOTE(review): also a substring test, so text such as "unknown" returns False
+        return(False)
+    else :
+        sys.exit("Please specify deamination (yes | no)")  # raises SystemExit carrying this message
 
 def get_basename(file_name):
     if ("/") in file_name:
@@ -11,6 +20,9 @@ def get_basename(file_name):
         basename = file_name.split(".")[0]
     return(basename)
 
+def scale(x, themin, themax):  # Linearly rescale x so that x.min() maps to themin and x.max() maps to themax; x must provide .min()/.max().
+    return(np.interp(x, (x.min(), x.max()), (themin, themax)))  # NOTE(review): an empty x fails in x.min(); when x.min() == x.max() the interpolation breakpoints are not increasing - confirm the result
+
 def reverse_complement(dna) :
     dna = dna.upper()
     '''
@@ -75,6 +87,27 @@ def complement_read(all_inserts, adaptor, read_length):
             result.append("".join(read))
     return(result)
 
+def add_damage(all_inserts, geom_p, scale_min, scale_max):  # Apply random C->T / G->A substitutions to every insert; overwrites the entries of all_inserts and returns the same list. NOTE(review): the CLI options feeding geom_p/scale_min/scale_max declare no type=, so command-line values may arrive as str - confirm
+    for i in range(0, len(all_inserts)):
+        insert = list(all_inserts[i])  # list of characters so single positions can be reassigned
+        insertlen = len(insert)
+        x = np.arange(1, insertlen+1)  # offsets 1..insertlen; NOTE(review): an empty insert gives an empty array that scale() cannot take .min() of - confirm inserts are never empty
+        geom_dist=scale(geom.pmf(x, geom_p),scale_min,scale_max)  # geometric pmf at each offset, rescaled to span scale_min..scale_max; presumably highest at offset 1 when 0 < geom_p < 1 and scale_min < scale_max
+
+        for j in range(0, insertlen):
+            pos = j  # j-th base counted from the start of the insert
+            opp_pos = insertlen-1-j  # j-th base counted from the end of the insert
+
+            ## C -> T deamination
+            if insert[pos] == "C" and geom_dist[j] >= npr.rand():  # uppercase "C" only; the random draw happens only when the base matches
+                insert[pos] = "T"
+
+            ## G -> A deamination
+            if insert[opp_pos] == "G" and geom_dist[j] >= npr.rand():  # mirrored position, same threshold geom_dist[j], separate random draw
+                insert[opp_pos] = "A"
+        all_inserts[i] = "".join(insert)  # write the modified sequence back into the caller's list
+    return(all_inserts)
+
 def add_error(all_reads, error_rate):
     for i in range(0, len(all_reads)):
         read = list(all_reads[i])
@@ -83,7 +116,7 @@ def add_error(all_reads, error_rate):
                 read[j] = "N"
             if npr.random() < error_rate:
                 read[j] = npr.choice(["A","T","G","C"])
-                all_reads[i] = "".join(read)
+        all_reads[i] = "".join(read)
     return(all_reads)
 
 def prepare_fastq(fastq_dict, fwd_reads, rev_reads, basename, read_length, quality):
@@ -111,18 +144,20 @@ def write_fastq_multi(fastq_dict, outputfile):
                     f2.write(reads2)
 
 
-def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, MINLENGTH, ERR, fastq_dict, QUALITY):
-    print("INFILE: ", INFILE)
+def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, MINLENGTH, ERR,  DAMAGE, GEOM_P, THEMIN, THEMAX, fastq_dict, QUALITY):
+    print("===================")
+    print("Genome: ", INFILE)
     if COV:
-        print("COV: ", COV)
+        print("Coverage: ", COV)
     else:
-        print("NREAD: ", NREAD)
-    print("READLEN: ", READLEN)
-    print("INSERLEN: ", INSERLEN)
-    print("LENDEV: ", LENDEV)
-    print("A1: ", A1)
-    print("A2: ", A2)
-    print("QUALITY", QUALITY)
+        print("Number of reads: ", NREAD)
+    print("Read length: ", READLEN)
+    print("Mean Insert length: ", INSERLEN)
+    print("Insert length standard deviation: ", LENDEV)
+    print("Adaptor 1: ", A1)
+    print("Adaptor 2: ", A2)
+    print("Quality :", QUALITY)
+    print("Deamination:", DAMAGE)
     nread = None
 
 
@@ -131,14 +166,22 @@ def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1,
 
     if COV:
         nread = int(fasta[1]/INSERLEN)
-        print("nread: ", nread)
+        print("Number of reads: ", nread)
+        print("===================\n")
 
     insert_lengths = [int(n) for n in npr.normal(INSERLEN, LENDEV, nread)]
 
 
 
 
     all_inserts = random_insert(fasta, insert_lengths, READLEN, MINLENGTH)
+    if DAMAGE == True:
+        all_inserts = add_damage(
+        all_inserts = all_inserts,
+        geom_p = GEOM_P,
+        scale_min = THEMIN,
+        scale_max = THEMAX)
+
     fwd_inserts = all_inserts
     rev_inserts = [reverse_complement(i) for i in all_inserts]
     fwd_reads = complement_read(fwd_inserts, A1, READLEN)