Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions parse_pwgs_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import json
import pandas as pd
import os
from collections import Counter
import zipfile
import argparse

def return_cnv_table(cnv_data):
    """Reshape a PhyloWGS cnv input table into one tidy column per field.

    The 'physical_cnvs' column (comma-separated "key=value" pairs) is split
    into chr/start/end/major_cn/minor_cn/cell_prev columns, the sampler
    count columns 'a' and 'd' are dropped, and 'cnv' is renamed to 'id'.
    Returns a new DataFrame; cnv_data itself is not modified.
    """
    column_names = {
        0: "chr",
        1: "start",
        2: "end",
        3: "major_cn",
        4: "minor_cn",
        5: "cell_prev",
        "cnv": "id",
    }
    # Split the packed physical_cnvs string, then strip every character that
    # is not a digit, '.' or '|' (e.g. "chrom=1" becomes "1").
    split_cols = cnv_data["physical_cnvs"].str.split(",", expand=True)
    split_cols = split_cols.replace({"[^0-9|.]": ""}, regex=True)
    result = (
        cnv_data.drop(["a", "d"], axis=1)
        .join(split_cols, how="outer")
        .rename(columns=column_names)
        .drop("physical_cnvs", axis=1)
    )
    return result

def return_ssm_table(ssm_data):
    """Reshape a PhyloWGS ssm input table into one tidy column per field.

    The 'gene' column (formatted "<chr>_<start>") is split into 'chr' and
    'start' columns, and the per-sample read counts in 'a' (reference reads)
    and 'd' (total reads) are expanded into one column per sample.  The
    prior columns 'mu_r' and 'mu_v' are dropped.

    Returns a new DataFrame; ssm_data itself is not modified.
    """
    full_df = ssm_data.drop(['mu_r', 'mu_v'], axis=1)
    # 'gene' holds "<chr>_<start>"; split only on the first underscore.
    # n=1 with expand=True replaces the deprecated positional-maxsplit /
    # .str tuple-unpack idiom, and assigning to a fresh copy means the
    # chained-assignment warning no longer needs to be globally silenced.
    full_df[['chr', 'start']] = full_df['gene'].str.split('_', n=1, expand=True)
    full_df = full_df.drop("gene", axis = 1)

    try:
        # Multi-sample input: 'a' and 'd' are comma-separated strings with
        # one count per sample -- expand into <i>_ref / <i>_total columns.
        # The original 'a'/'d' columns are deliberately kept alongside.
        one = full_df['a'].str.split(",", expand = True)
        two = full_df['d'].str.split(",", expand = True)
        one.columns = [str(col) + '_ref' for col in one.columns]
        two.columns = [str(col) + '_total' for col in two.columns]
        full_df = full_df.join([one, two], how = "outer")
    except AttributeError:
        # Single-sample input: 'a' and 'd' were parsed as plain integers,
        # so the .str accessor raises AttributeError; just rename them.
        full_df = full_df.rename(columns={'a': 'ref', 'd': 'total'})
    return full_df

def return_top_k_trees(summary_file, k = 5):
    """Return the k (tree_id, llh) pairs with the highest log-likelihood.

    summary_file is the path to a PhyloWGS *.summ.json file (already
    gunzipped).  Pairs are sorted by llh, best tree first.
    """
    with open(summary_file, "r") as summ_fh:
        summary = json.load(summ_fh)
    likelihoods = {tree_id: info['llh'] for tree_id, info in summary['trees'].items()}
    # Equivalent to Counter(likelihoods).most_common(k): sort descending by
    # llh (stable for ties) and keep the first k entries.
    ranked = sorted(likelihoods.items(), key=lambda item: item[1], reverse=True)
    return ranked[:k]

def tree_json(mutasgn_path, tree):
    """Load the mutation-assignment JSON for one tree from the .mutass zip.

    mutasgn_path: path to the PhyloWGS *.mutass.zip archive.
    tree: tree identifier; the archive member "<tree>.json" is read.

    Returns the parsed JSON as a dict.
    """
    # Context managers guarantee both the archive and the member handle are
    # closed even if reading or parsing fails (the original leaked both on
    # error, and never closed the archive at all).
    with zipfile.ZipFile(mutasgn_path) as archive:
        with archive.open(tree + ".json") as member:
            contents = member.read()
    # Normalise single quotes so the payload parses as strict JSON.
    return json.loads(contents.decode('utf8').replace("'", '"'))

def tree_to_df(tree_data, tree_number):
    """Flatten one tree's mutation assignments into a tidy DataFrame.

    Produces one row per ssm/cnv assignment with the columns
    id, population, aberation and tree (the tree identifier).
    """
    frames = []
    # One frame per population, ssms stacked above cnvs.
    for population, assignments in tree_data['mut_assignments'].items():
        ssm_frame = pd.DataFrame(assignments['ssms'])
        ssm_frame['population'] = population
        ssm_frame['aberation'] = 'ssm'
        cnv_frame = pd.DataFrame(assignments['cnvs'])
        cnv_frame['population'] = population
        cnv_frame['aberation'] = 'cnvs'
        frames.append(pd.concat([ssm_frame, cnv_frame], sort=False))
    df = pd.concat(frames, sort=False)
    df['tree'] = tree_number
    # The raw id column arrives unnamed (column 0); give all four columns
    # their final names in one go.
    df.columns = ['id', 'population', 'aberation', 'tree']
    return df

def read_best_trees_json(mutasgn_path, summary_file, k0 = 5):
    """Load the k0 best trees (highest llh) as tidy DataFrames.

    Per Morris's comment, the tree with the maximum llh is the 'best' one;
    the top k0 trees by that measure are read out of the .mutass zip and
    returned as a list of DataFrames, best tree first.
    """
    ranked = return_top_k_trees(summary_file, k = k0)
    # Each ranked entry is a (tree_id, llh) pair; only the id is needed to
    # pull that tree's JSON out of the archive.
    return [tree_to_df(tree_json(mutasgn_path, tree_id), tree_id)
            for tree_id, _llh in ranked]

def add_labels_best_trees(all_trees, all_data_simple):
    """Attach the original cnv/ssm annotations to each tree DataFrame.

    Every tree DataFrame is (inner) merged with all_data_simple on 'id'.
    A new list of merged DataFrames is returned; inputs are not modified.
    """
    return [pd.merge(tree_df, all_data_simple, on = 'id') for tree_df in all_trees]

def write_tree_likelihoods(output_folder, summary_file, k0 = 5):
    """Write the top-k0 tree log-likelihoods to tree_likelihoods.txt.

    One "<tree_id> <llh>" pair per line, best tree first, written into
    output_folder.
    """
    ranked = return_top_k_trees(summary_file, k = k0)
    out_path = output_folder + "/" + 'tree_likelihoods.txt'
    with open(out_path, 'w') as out_fh:
        lines = ['%s %s' % pair for pair in ranked]
        out_fh.write('\n'.join(lines))

def write_tree_csvs(output_folder, all_trees):
    """Write each tree DataFrame to <output_folder>/<tree_id>.csv."""
    for tree_df in all_trees:
        # Every row of a tree frame carries the same tree id, so the
        # first row names the output file.
        tree_id = str(tree_df['tree'].iloc[0])
        tree_df.to_csv(output_folder + "/" + tree_id + ".csv")


def main():
    """Parse PhyloWGS outputs into per-tree CSVs plus a likelihood table."""
    parser = argparse.ArgumentParser(
        description='Parse the output of PhyloWGS into useful tables for downstream analysis and exploration',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    # action='append' is kept for backward compatibility with existing
    # invocations; only the first occurrence of each option is used below.
    parser.add_argument('--cnv_input', dest='cnv_input', action='append', required=True,
                        help='The cnv.txt file which was originally fed into PhyloWGS')
    parser.add_argument('--ssm_input', dest='ssm_input', action='append', required=True,
                        help='The ssm.txt file which was originally fed into PhyloWGS')
    parser.add_argument('--summary_file', dest='summary_file', action='append', required=True,
                        help='File path to PhyloWGS output *.summ.json file, should be unzipped with gunzip first')
    parser.add_argument('--mutasgn_path', dest='mutasgn_path', action='append', required=True,
                        help='File path to PhyloWGS output *.mutass.zip file')
    parser.add_argument('--output_folder', dest='output_folder', action='append', required=True,
                        help='Folder you would like the parsed output sent to')
    parser.add_argument('--k', dest='k', type=int, default=5,
                        help='K for how many of the top trees to output')

    args = parser.parse_args()

    # Reshape the original PhyloWGS inputs into tidy tables so that tree
    # assignments can be annotated with chr/start/copy-number information.
    cnv_simple = return_cnv_table(pd.read_csv(args.cnv_input[0], sep = "\t"))
    ssm_simple = return_ssm_table(pd.read_csv(args.ssm_input[0], sep = "\t"))
    all_data_simple = pd.concat([cnv_simple, ssm_simple], sort = False)

    # Pull the k best trees, attach the annotations, and write everything out.
    all_trees = read_best_trees_json(args.mutasgn_path[0], args.summary_file[0], k0 = args.k)
    all_trees = add_labels_best_trees(all_trees, all_data_simple)

    write_tree_csvs(args.output_folder[0], all_trees)
    write_tree_likelihoods(args.output_folder[0], args.summary_file[0], k0 = args.k)

if __name__ == '__main__':
    main()
39 changes: 38 additions & 1 deletion parser/parse_cnvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def write_cnvs(self, cn_regions):
region['copy_number'] = region['major_cn'] + region['minor_cn']
self._write_cn_record(region)


self._cn_output.close()

class CnvParser(object):
Expand Down Expand Up @@ -79,6 +80,40 @@ def parse(self):

return cn_regions

class FacetsParser(CnvParser):
    """Parse FACETS segment output into the common cn_regions structure."""

    def __init__(self, fc_filename, cellularity):
        # fc_filename: path to the FACETS segment CSV (with columns chrom,
        # start, end, cf.em, tcn.em, lcn.em).
        # cellularity: stored for interface parity with the other parsers;
        # FACETS already reports per-segment cell fractions in cf.em.
        self._fc_filename = fc_filename
        self._cellularity = cellularity

    def parse(self):
        """Return {chrom: [cnv_record, ...]} for all fully-called segments.

        cf.em is the fraction of cells in the sample carrying the copy
        number; copy-neutral segments are assigned 1 by FACETS.  When too
        few heterozygous SNPs are present, FACETS calls only a total copy
        number (lcn.em is not an integer) -- those segments are skipped.
        """
        cn_regions = defaultdict(list)

        with open(self._fc_filename) as facets_fh:
            for record in csv.DictReader(facets_fh):
                # Guard clause replaces the original no-op "else: next":
                # skip segments lacking both total and minor copy numbers.
                if not (record['tcn.em'].isdigit() and record['lcn.em'].isdigit()):
                    continue
                cnv = {
                    'cellular_prevalence': float(record['cf.em']),
                    'major_cn': int(record['tcn.em']) - int(record['lcn.em']),
                    'minor_cn': int(record['lcn.em']),
                    'start': int(record['start']),
                    # float() first: FACETS occasionally writes the segment
                    # end in scientific notation (e.g. 2e+07).
                    'end': int(float(record['end'])),
                }
                cn_regions[record['chrom']].append(cnv)

        return cn_regions

class BattenbergParser(CnvParser):
def __init__(self, bb_filename, cellularity):
self._bb_filename = bb_filename
Expand Down Expand Up @@ -162,7 +197,7 @@ def main():
description='Create CNV input file for parser from Battenberg or TITAN data',
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('-f', '--cnv-format', dest='input_type', required=True, choices=('battenberg', 'battenberg-smchet', 'titan'),
parser.add_argument('-f', '--cnv-format', dest='input_type', required=True, choices=('battenberg', 'battenberg-smchet', 'titan','facets'),
help='Type of CNV input')
parser.add_argument('-c', '--cellularity', dest='cellularity', type=float, required=True,
help='Fraction of sample that is cancerous rather than somatic. Used only for estimating CNV confidence -- if no CNVs, need not specify argument.')
Expand All @@ -184,6 +219,8 @@ def main():
parser = BattenbergSmchetParser(args.cnv_file, cellularity)
elif args.input_type == 'titan':
parser = TitanParser(args.cnv_file, cellularity)
elif args.input_type == 'facets':
parser = FacetsParser(args.cnv_file, cellularity)
else:
raise Exception('Unknown input type')

Expand Down
13 changes: 8 additions & 5 deletions witness/index_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,25 @@

def main():
datasets = defaultdict(list)
#use a path from the location of the python script so that it can be called from outside the same folder
base_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'data')

base_dir = 'data'
for run_name in os.listdir(base_dir):

for summary_path in glob.glob(os.path.join(base_dir, run_name, '*.summ.json')):
dataset_name = summary_path.split('/')[-1].split('.')[0]

run = {
'summary_path': summary_path,
'muts_path': os.path.join(base_dir, run_name, dataset_name + '.muts.json'),
'summary_path': os.path.join('data', run_name, dataset_name + '.summ.json'),
'muts_path': os.path.join('data', run_name, dataset_name + '.muts.json'),
'name': dataset_name,
}

clusters_path = os.path.join(base_dir, run_name, dataset_name + '.clusters.json')
clusters_path = os.path.join('data', run_name, dataset_name + '.clusters.json')
if os.path.isfile(clusters_path):
run['clusters_path'] = clusters_path

mutass_path = os.path.join(base_dir, run_name, dataset_name + '.mutass')
mutass_path = os.path.join('data', run_name, dataset_name + '.mutass')
if os.path.isdir(mutass_path):
run['mutass_path'] = mutass_path

Expand Down