release 0.9.4

Dom Laetsch · Dom Laetsch · commit 45c1fccb764b · 2015-12-01T00:37:56.000Z
diff --git a/bam2cov.py b/bam2cov.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""usage: blobtools bam2cov         -i FASTA -b BAM [-h|--help] 
+    
+    Options:
+        -h --help                   show this
+        -i, --infile FASTA          FASTA file of assembly. Headers are truncated at the first whitespace.
+        -b, --bam <BAM>             BAM file (requires samtools in $PATH)
+"""
+
+from __future__ import division
+import lib.BtLog as BtLog
+from docopt import docopt
+import re
+import subprocess
+import os
+
+class Fasta():
+    """Per-sequence record: name, length, N count, non-N count and coverage."""
+
+    def __init__(self, name, seq):
+        # Only upper-case 'N' characters are tallied by str.count('N').
+        n_total = seq.count('N')
+        seq_len = len(seq)
+        self.name = name
+        self.length = seq_len
+        self.n_count = n_total
+        self.agct_count = seq_len - n_total  # every character that is not 'N'
+        self.cov = 0.0  # coverage starts at zero
+
+def which(program):
+    """Return the path of the executable `program`, or None if there is none.
+
+    A name with a directory component is tested exactly as given; a bare
+    name is looked up in every directory listed in the PATH variable.
+    """
+    def is_exe(candidate):
+        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)
+
+    directory = os.path.split(program)[0]
+    if directory:
+        return program if is_exe(program) else None
+    # Bare command name: the first executable match along PATH wins.
+    for entry in os.environ["PATH"].split(os.pathsep):
+        candidate = os.path.join(entry.strip('"'), program)
+        if is_exe(candidate):
+            return candidate
+    return None
+
+def runCmd(command):
+    """Start `command` and return an iterator over its output lines.
+
+    The command string is split on whitespace (no quoting support) and run
+    without a shell; stderr is merged into the stdout stream being read.
+    """
+    argv = command.split() # sanitation
+    child = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    # readline is called repeatedly until it returns the empty string (EOF)
+    return iter(child.stdout.readline, b'')
+
+def readFasta(infile):
+    """Yield (name, sequence) for every record of the FASTA file `infile`.
+
+    The name is the header line without '>' cut at the first whitespace;
+    the sequence is the concatenation of the record's lines. Line endings
+    are stripped rather than sliced off, so a last line lacking a newline
+    keeps its final base and CRLF files do not leak carriage returns into
+    names or sequences. Nothing is yielded when the file has no header line.
+    """
+    header, seqs = '', []
+    with open(infile) as fh:
+        for l in fh:
+            l = l.rstrip('\r\n')
+            if l.startswith('>'):
+                if (header):
+                    yield header, ''.join(seqs)
+                header, seqs = l[1:].split()[0], [] # Header is split at first whitespace
+            else:
+                seqs.append(l)
+    if (header): # do not emit an empty record for header-less (e.g. empty) input
+        yield header, ''.join(seqs)
+
+def parseFasta(infile):
+    """Return a dict mapping each sequence name in `infile` to its Fasta object."""
+    records = (Fasta(name, seq) for name, seq in readFasta(infile))
+    # Later records with a repeated name replace earlier ones, as in a plain dict fill
+    return dict((record.name, record) for record in records)
+
+def checkBam(infile):
+    """Return (reads_total, reads_mapped) parsed from `samtools flagstat`.
+
+    Calls BtLog.error('7') when samtools is not found in PATH. Raises
+    RuntimeError, quoting what samtools printed, when the read counts
+    cannot be found in its output.
+    """
+    print BtLog.status_d['10']
+    if not (which('samtools')):
+        BtLog.error('7')
+    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
+    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
+    command = "samtools flagstat " + infile
+    output = ''.join(runCmd(command))
+    mapped_match = reads_mapped_re.search(output)
+    total_match = reads_total_re.search(output)
+    if mapped_match is None or total_match is None:
+        # runCmd merges stderr into stdout, so `output` carries samtools' own message
+        raise RuntimeError("Could not parse output of '%s':\n%s" % (command, output))
+    reads_mapped = int(mapped_match.group(1))
+    reads_total = int(total_match.group(1))
+    # A BAM without reads reports 0 in total; do not divide by zero for the percentage
+    mapped_fraction = reads_mapped/reads_total if reads_total else 0.0
+    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(mapped_fraction))
+    return reads_total, reads_mapped
+
+def readBam(infile, fasta_headers):
+    """Sum CIGAR-matched bases per sequence over the mapped reads of a BAM file.
+
+    Read totals come from checkBam(infile); alignments come from
+    `samtools view -F 4`. A read is counted when its CIGAR string holds at
+    least one non-zero 'M' operation. Reads on sequences missing from
+    `fasta_headers` are counted but only trigger a warning.
+
+    Returns (base_cov_dict, reads_total, parsed_reads), where base_cov_dict
+    maps sequence name to its summed 'M' lengths.
+    """
+    reads_total, reads_mapped = checkBam(infile)
+    progress_unit = int(int(reads_total)/1000)
+    base_cov_dict = {}
+    cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's
+    # execute samtools to get only mapped reads
+    command = "samtools view -F 4 " + infile
+    # only one counter since only yields mapped reads
+    parsed_reads = 0
+    for line in runCmd(command):
+        match = line.split("\t")
+        # Compare the number of columns (not the list itself) so that lines
+        # without the 11 mandatory SAM fields are skipped instead of indexed.
+        if len(match) >= 11:
+            seq_name = match[2]
+            base_cov = sum(int(matching) for matching in cigar_match_re.findall(match[5]))
+            if (base_cov):
+                parsed_reads += 1
+                if seq_name not in fasta_headers:
+                    print BtLog.warn_d['2'] % (seq_name, infile)
+                else:
+                    base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
+        BtLog.progress(parsed_reads, progress_unit, reads_total)
+    BtLog.progress(reads_total, progress_unit, reads_total)
+
+    if not int(reads_mapped) == int(parsed_reads):
+        # warn_d lives in BtLog; the bare name is undefined in this module
+        print BtLog.warn_d['3'] % (reads_mapped, parsed_reads)
+    return base_cov_dict, reads_total, parsed_reads
+
+def parseBam(bam_f, fasta_dict):
+    """Set `cov` on each Fasta object in `fasta_dict` from the reads in `bam_f`.
+
+    Coverage is the sequence's summed matched bases divided by its count of
+    non-N characters. Sequences without any non-N character keep their
+    current `cov`, because that division is undefined. Returns `fasta_dict`.
+    """
+    base_cov_dict, reads_total, reads_mapped = readBam(bam_f, set(fasta_dict.keys()))
+    if reads_total == 0:
+        print BtLog.warn_d['4'] % bam_f
+    for name, base_cov in base_cov_dict.items():
+        fasta = fasta_dict[name]
+        if fasta.agct_count: # all-N sequence: nothing to divide by
+            fasta.cov = base_cov / fasta.agct_count
+    return fasta_dict
+
+def writeCov(fasta_dict, out_f):
+    """Write one "name<TAB>cov" line per entry of `fasta_dict` to `out_f`."""
+    with open(out_f, 'w') as out_fh:
+        out_fh.writelines(
+            "%s\t%s\n" % (seq_name, record.cov)
+            for seq_name, record in fasta_dict.items()
+        )
+
+if __name__ == '__main__':
+    # Parse the command line against the usage text in the module docstring
+    args = docopt(__doc__)
+
+    fasta_f, bam_f = args['--infile'], args['--bam']
+    # Output is named after the BAM file (directory part removed) plus ".cov"
+    out_f = "%s.cov" % os.path.basename(bam_f)
+
+    # Sequence stats from the FASTA, then coverage from the BAM, then the cov file
+    fasta_dict = parseBam(bam_f, parseFasta(fasta_f))
+    writeCov(fasta_dict, out_f)
+
diff --git a/blobtools b/blobtools
@@ -9,6 +9,8 @@ commands:
   view      print BlobDB
   plot      plot BlobDB as a blobplot
 
+  bam2cov   generate cov file from bam file
+
 -h --help    show this
 
 """
@@ -22,7 +24,7 @@ from docopt import docopt
 if __name__ == '__main__':
     main_dir = join(dirname(__file__), '')
     args = docopt(__doc__,
-                  version='version 0.1',
+                  version='version 0.9.4',
                   options_first=True)
     #print(args)
 
@@ -33,5 +35,7 @@ if __name__ == '__main__':
         exit(call(['python', main_dir + 'view.py'] + argv))
     elif args['<command>'] == 'plot':
         exit(call(['python', main_dir + 'plot.py'] + argv))
+    elif args['<command>'] == 'bam2cov':
+        exit(call(['python', main_dir + 'bam2cov.py'] + argv))
     else:
         exit("%r is not a blobtools command. See 'blobtools -h'." % args['<command>'])
diff --git a/lib/BtCore.py b/lib/BtCore.py
@@ -115,7 +115,7 @@ def load(self, BlobDb_f):
         self.hitLibs = blobDict['hitLibs']
         self.taxrules = blobDict['taxrules']
 
-    def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index):
+    def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
         data_dict = {}
         read_cov_dict = {}
         max_cov = 0.0
@@ -125,7 +125,9 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index):
         for blob in self.dict_of_blobs.values():
             name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
             
-            if (c_index): # annotation with c_index instead of taxonomic group 
+            if (catcolour_dict): # annotation with categories specified in catcolour
+                group = str(catcolour_dict[name])
+            elif (c_index): # annotation with c_index instead of taxonomic group
                 group = str(blob['taxonomy'][taxrule][rank]['c_index'])
             else: # annotation with taxonomic group
                 group = str(blob['taxonomy'][taxrule][rank]['tax'])
diff --git a/lib/BtIO.py b/lib/BtIO.py
@@ -12,7 +12,6 @@
 from __future__ import division
 import re
 import subprocess
-import commands
 from os.path import basename, isfile, abspath
 import os
 import lib.BtLog as BtLog
diff --git a/lib/BtLog.py b/lib/BtLog.py
@@ -51,9 +51,10 @@ def progress(iteration, steps, max_value):
     '18' : '[ERROR:18]\t: Please provide a tax file in BLAST format.',
     '19' : '[ERROR:19]\t: Sequence %s in file %s is not part of the assembly.',
     '20' : '[ERROR:20]\t: Please add "clc_mapping_info" to you PATH variable.',
-    '21' : '[ERROR:21]\t: Refcov FILE does not seem to have the right format.',
-    '22' : '[ERROR:22]\t: Tax file %s seems to have no taxids.'
-
+    '21' : '[ERROR:21]\t: Refcov file %s does not seem to have the right format.',
+    '22' : '[ERROR:22]\t: Tax file %s seems to have no taxids.',
+    '23' : '[ERROR:23]\t: Catcolour file %s does not seem to have the right format.',
+    '24' : '[ERROR:24]\t: Catcolour file incompatible with c-index colouring.'
 }
 
 warn_d = {
diff --git a/lib/BtPlot.py b/lib/BtPlot.py
@@ -28,7 +28,7 @@
 mat.rcParams['lines.antialiased'] = True
 
 FONTSIZE = 24
-COLOURMAP = "rainbow" # "Set1", "Paired", "Set2", "Spectral"
+COLOURMAP = "Spectral" # "Set1", "Paired", "Set2", "Spectral"
 BLACK, GREY, BGGREY, WHITE, DGREY = unicode('#262626'), unicode('#d3d3d3'), unicode('#F0F0F5'), unicode('#ffffff'), unicode('#4d4d4d')
 nullfmt = NullFormatter()
 
@@ -58,9 +58,21 @@ def parseRefCov(refcov_f):
                                         'reads_mapped' : int(reads_mapped_ref)
                                        }
             except:
-                BtLog.error('21')
+                BtLog.error('21', refcov_f)
     return refcov_dict
 
+def parseCatColour(catcolour_f):
+    """Return a dict mapping sequence name to category read from `catcolour_f`.
+
+    Every line must hold exactly two comma-separated fields,
+    "seq_name,category"; any other line is reported via
+    BtLog.error('23', catcolour_f).
+    """
+    catcolour_dict = {}
+    with open(catcolour_f) as fh:
+        for l in fh:
+            try:
+                # strip CR as well as LF so categories stay clean for CRLF files
+                seq_name, category = l.rstrip("\r\n").split(",")
+            except ValueError: # wrong number of fields on this line
+                BtLog.error('23', catcolour_f)
+            else:
+                catcolour_dict[seq_name] = category
+    return catcolour_dict
+
+
 def getSortedGroups(data_dict, sort_order):
     """ Returns list of sorted groups based on span or count. """
     sorted_groups = []
@@ -301,7 +313,7 @@ def relabel_and_colour(self, colour_f, user_labels):
             self.plot_order.append('other')
 
     def plotReadCov(self, refcov_dict):
-        mat.rcParams.update({'font.size': 18})
+        mat.rcParams.update({'font.size': 24})
         plot_data = {}
 
         main_columns = 2
@@ -418,7 +430,7 @@ def plotBlobs(self, cov_lib, info_flag):
             group_number_of_seqs = self.stats[group]['count_visible']
             group_n50 = self.stats[group]['n50']
             blob_size_array = []
-            s, lw, alpha, colour = 15, 0.5, 1, self.colours[group]
+            s, lw, alpha, colour = 15, 0.5, 0.5, self.colours[group]
             if (self.ignore_contig_length):
                 if not group == "no-hit":
                     s = 65
@@ -448,7 +460,7 @@ def plotBlobs(self, cov_lib, info_flag):
             if (self.multiplot): 
                 axLegend.legend(legend_handles, legend_labels, loc=6, numpoints=1, fontsize=FONTSIZE, frameon=True)
                 plot_ref_legend(axScatter)
-                m_out_f = "%s.%s.%s.blobs.%s" % (self.out_f, i, group, self.format)
+                m_out_f = "%s.%s.%s.blobs.%s" % (self.out_f, i, group.replace("/", "_").replace(" ", "_"), self.format)
                 print BtLog.status_d['8'] % m_out_f
                 plt.savefig(m_out_f, format=self.format)
         if not (self.ignore_contig_length):
diff --git a/plot.py b/plot.py
@@ -5,7 +5,8 @@
                             [-r RANK] [-x TAXRULE] [--label GROUPS...] 
                             [-o PREFIX] [-m] [--sort ORDER] [--hist HIST] [--title]
                             [--colours FILE] [--include FILE] [--exclude FILE]
-                            [--format FORMAT] [--noblobs] [--noreads] [--refcov FILE]
+                            [--format FORMAT] [--noblobs] [--noreads] 
+                            [--refcov FILE] [--catcolour FILE]
                             [-h|--help] 
 
     Options:
@@ -41,8 +42,11 @@
         --noblobs                   Omit blobplot [default: False]
         --noreads                   Omit plot of reads mapping [default: False]
         --refcov FILE               File containing number of "total" and "mapped" reads 
-                                    per coverage file. (e.g.: bam0,900,100). If provided, info
-                                    will be used in read coverage plot(s). 
+                                     per coverage file. (e.g.: bam0,900,100). If provided, info
+                                     will be used in read coverage plot(s). 
+        --catcolour FILE            Colour plot based on categories from FILE 
+                                     (format : "seq,category").
+                                    
 """
 
 from __future__ import division
@@ -79,6 +83,7 @@
     no_plot_blobs = args['--noblobs']
     no_plot_reads = args['--noreads']
     refcov_f = args['--refcov']
+    catcolour_f = args['--catcolour']
 
     # Does blobdb_f exist ?
     if not isfile(blobdb_f):
@@ -112,6 +117,14 @@
     if (refcov_f):
         refcov_dict = BtPlot.parseRefCov(refcov_f)
 
+    catcolour_dict = {}
+    if (catcolour_f) and (c_index):
+        BtLog.error('24')
+    elif (catcolour_f):
+        catcolour_dict = BtPlot.parseCatColour(catcolour_f)
+    else: 
+        pass
+
     # Load BlobDb
     print BtLog.status_d['9'] % blobdb_f
     blobDB = bt.BlobDb('new')
@@ -124,10 +137,8 @@
     # Is taxrule sane and was it computed?
     if taxrule not in blobDB.taxrules:
         BtLog.error('11', taxrule, blobDB.taxrules)
-
-    # Get arrays and filter_dict (filter_dict lists, span/count passing filter) for those groups passing min_length, rank, hide_nohits ...
-    # make it part of core , get data by group ... should be used by stats, generalise ...
-    data_dict, max_cov, cov_libs, cov_libs_total_reads = blobDB.getPlotData(rank, min_length, hide_nohits, taxrule, c_index)
+    
+    data_dict, max_cov, cov_libs, cov_libs_total_reads = blobDB.getPlotData(rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict)
     plotObj = BtPlot.PlotObj(data_dict, cov_libs, cov_libs_total_reads)
     plotObj.exclude_groups = exclude_groups
     plotObj.format = format
@@ -139,8 +150,7 @@
     plotObj.max_group_plot = max_group_plot
     plotObj.group_order = BtPlot.getSortedGroups(data_dict, sort_order)
     plotObj.labels.update(plotObj.group_order)
-    #if len(plotObj.group_order) > plotObj.max_group_plot:
-    #    plotObj.labels.add('other')
+    
     if (user_labels):
         for group, label in user_labels.items():
             plotObj.labels.add(label)
@@ -157,6 +167,8 @@
         out_f = "%s.%s.%s.p%s" % (title, hist_type, rank, max_group_plot)
         if out_prefix:
             out_f = "%s.%s" % (out_prefix, out_f)
+        if catcolour_dict:
+            out_f = "%s.%s" % (out_f, "catcolour")
         if ignore_contig_length:
             out_f = "%s.%s" % (out_f, "noscale")
         if c_index: