Apply mutations to the simulated inserts instead of the whole genome sequence

maxibor · maxibor · commit 949de8e41e87 · 2018-11-15T15:20:36.000+01:00
diff --git a/adrsm b/adrsm
@@ -121,7 +121,7 @@ def read_config(infile):
             acov = float(splitline[2].replace(" ", ""))
             deambool = str(splitline[3].replace(" ", ""))
             deamination = ad.parse_yes_no(deambool)
-            if len(splitline) > 4:
+            if len(splitline) > 4 and float(splitline[4].replace(" ", "")) != 0.0:
                 mutate = True
                 mutrate = float(splitline[4].replace(" ", ""))
                 age = float(splitline[5].replace(" ", ""))
diff --git a/lib/adrsmlib.py b/lib/adrsmlib.py
@@ -25,12 +25,12 @@ def get_basename(file_name):
     return(basename)
 
 
-def add_mutation_multi(sequence, mutrate, process):
+def add_mutation_multi(sequences, mutrate, process):
     mutate_partial = partial(sf.mutate, mutrate=mutrate)
     print("Mutating...")
     with multiprocessing.Pool(process) as p:
-        mutseq = p.map(mutate_partial, list(sequence))
-    return("".join(mutseq))
+        mutseq = p.map(mutate_partial, sequences)
+    return(list(mutseq))
 
 
 def reverse_complement_multi(all_inserts, process):
@@ -49,10 +49,16 @@ def read_fasta(file_name):
         result(string): all of the sequences in fasta file, concatenated
     """
     result = ""
+    # fastadict = {}
     with open(file_name, "r") as f:
         for line in f:
-            if not line.startswith(">"):
+            if line[0] == ">":
+                # seqname = line[1:]
+                # fastadict[seqname] = []
+                continue
+            else:
                 line = line.rstrip()
+                # fastadict[seqname].append(line)
                 result = result + line
     return([result, len(result)])
 
@@ -159,12 +165,12 @@ def run_read_simulation_multi(INFILE, COV, READLEN, INSERLEN, NBINOM, A1, A2, MI
     prob = NBINOM / (NBINOM + INSERLEN)
     insert_lengths = npr.negative_binomial(NBINOM, prob, nread)
 
+    all_inserts = sf.random_insert(fasta, insert_lengths, READLEN, MINLENGTH)
+
     if MUTATE:
         correct_mutrate = (MUTRATE * AGE) / fasta[1]
-        fasta[0] = add_mutation_multi(
-            sequence=fasta[0], mutrate=correct_mutrate, process=PROCESS)
-
-    all_inserts = sf.random_insert(fasta, insert_lengths, READLEN, MINLENGTH)
+        all_inserts = add_mutation_multi(
+            sequences=all_inserts, mutrate=correct_mutrate, process=PROCESS)
 
     if DAMAGE:
         all_inserts = add_damage_multi(
diff --git a/lib/sequencefunctions.py b/lib/sequencefunctions.py
@@ -86,17 +86,21 @@ def add_error(read, error_rate):
     return("".join(read))
 
 
-def mutate(nucleotide, mutrate, alpha=0.4, beta=0.2):
+def mutate(sequence, mutrate, alpha=0.4, beta=0.2):
     """
     alpha: Transitions
     beta: Transversions
     https://en.wikipedia.org/wiki/Mutation_rate
     """
     a = int(10 * alpha)
     b = int(10 * beta)
+    newseq = ""
     dmut = {'A': b * ['C'] + b * ['T'] + a * ['G'], 'C': b * ['A'] + b * ['G'] + a * [
         'T'], 'G': b * ['C'] + b * ['T'] + a * ['A'], 'T': b * ['A'] + b * ['G'] + a * ['C']}
-    if npr.random() <= mutrate:
-        new_nucl = random.choice(dmut[nucleotide])
-        return(new_nucl)
-    return(nucleotide)
+    for nuc in sequence:
+        if npr.random() <= mutrate:
+            new_nucl = random.choice(dmut[nuc])
+            newseq += new_nucl
+        else:
+            newseq += nuc
+    return(newseq)