Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions parse_pwgs_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import json
import pandas as pd
import os
from collections import Counter
import zipfile
import argparse

def return_cnv_table(cnv_data):
    """Reshape a PhyloWGS cnv input table into one tidy column per field.

    The 'physical_cnvs' column (comma-separated "key=value" pairs) is split
    into chr/start/end/major_cn/minor_cn/cell_prev columns, the sampler
    count columns 'a' and 'd' are dropped, and 'cnv' is renamed to 'id'.
    Returns a new DataFrame; cnv_data itself is not modified.
    """
    column_names = {
        0: "chr",
        1: "start",
        2: "end",
        3: "major_cn",
        4: "minor_cn",
        5: "cell_prev",
        "cnv": "id",
    }
    # Split the packed physical_cnvs string, then strip every character that
    # is not a digit, '.' or '|' (e.g. "chrom=1" becomes "1").
    split_cols = cnv_data["physical_cnvs"].str.split(",", expand=True)
    split_cols = split_cols.replace({"[^0-9|.]": ""}, regex=True)
    result = (
        cnv_data.drop(["a", "d"], axis=1)
        .join(split_cols, how="outer")
        .rename(columns=column_names)
        .drop("physical_cnvs", axis=1)
    )
    return result

def return_ssm_table(ssm_data):
    """Reshape a PhyloWGS ssm input table into one tidy column per field.

    The 'gene' column (formatted "<chr>_<start>") is split into 'chr' and
    'start' columns, and the per-sample read counts in 'a' (reference reads)
    and 'd' (total reads) are expanded into one column per sample.  The
    prior columns 'mu_r' and 'mu_v' are dropped.

    Returns a new DataFrame; ssm_data itself is not modified.
    """
    full_df = ssm_data.drop(['mu_r', 'mu_v'], axis=1)
    # 'gene' holds "<chr>_<start>"; split only on the first underscore.
    # n=1 with expand=True replaces the deprecated positional-maxsplit /
    # .str tuple-unpack idiom, and assigning to a fresh copy means the
    # chained-assignment warning no longer needs to be globally silenced.
    full_df[['chr', 'start']] = full_df['gene'].str.split('_', n=1, expand=True)
    full_df = full_df.drop("gene", axis = 1)

    try:
        # Multi-sample input: 'a' and 'd' are comma-separated strings with
        # one count per sample -- expand into <i>_ref / <i>_total columns.
        # The original 'a'/'d' columns are deliberately kept alongside.
        one = full_df['a'].str.split(",", expand = True)
        two = full_df['d'].str.split(",", expand = True)
        one.columns = [str(col) + '_ref' for col in one.columns]
        two.columns = [str(col) + '_total' for col in two.columns]
        full_df = full_df.join([one, two], how = "outer")
    except AttributeError:
        # Single-sample input: 'a' and 'd' were parsed as plain integers,
        # so the .str accessor raises AttributeError; just rename them.
        full_df = full_df.rename(columns={'a': 'ref', 'd': 'total'})
    return full_df

def return_top_k_trees(summary_file, k = 5):
    """Return the k (tree_id, llh) pairs with the highest log-likelihood.

    summary_file is the path to a PhyloWGS *.summ.json file (already
    gunzipped).  Pairs are sorted by llh, best tree first.
    """
    with open(summary_file, "r") as summ_fh:
        summary = json.load(summ_fh)
    likelihoods = {tree_id: info['llh'] for tree_id, info in summary['trees'].items()}
    # Equivalent to Counter(likelihoods).most_common(k): sort descending by
    # llh (stable for ties) and keep the first k entries.
    ranked = sorted(likelihoods.items(), key=lambda item: item[1], reverse=True)
    return ranked[:k]

def tree_json(mutasgn_path, tree):
    """Load the mutation-assignment JSON for one tree from the .mutass zip.

    mutasgn_path: path to the PhyloWGS *.mutass.zip archive.
    tree: tree identifier; the archive member "<tree>.json" is read.

    Returns the parsed JSON as a dict.
    """
    # Context managers guarantee both the archive and the member handle are
    # closed even if reading or parsing fails (the original leaked both on
    # error, and never closed the archive at all).
    with zipfile.ZipFile(mutasgn_path) as archive:
        with archive.open(tree + ".json") as member:
            contents = member.read()
    # Normalise single quotes so the payload parses as strict JSON.
    return json.loads(contents.decode('utf8').replace("'", '"'))

def tree_to_df(tree_data, tree_number):
    """Flatten one tree's mutation assignments into a tidy DataFrame.

    Produces one row per ssm/cnv assignment with the columns
    id, population, aberation and tree (the tree identifier).
    """
    frames = []
    # One frame per population, ssms stacked above cnvs.
    for population, assignments in tree_data['mut_assignments'].items():
        ssm_frame = pd.DataFrame(assignments['ssms'])
        ssm_frame['population'] = population
        ssm_frame['aberation'] = 'ssm'
        cnv_frame = pd.DataFrame(assignments['cnvs'])
        cnv_frame['population'] = population
        cnv_frame['aberation'] = 'cnvs'
        frames.append(pd.concat([ssm_frame, cnv_frame], sort=False))
    df = pd.concat(frames, sort=False)
    df['tree'] = tree_number
    # The raw id column arrives unnamed (column 0); give all four columns
    # their final names in one go.
    df.columns = ['id', 'population', 'aberation', 'tree']
    return df

def read_best_trees_json(mutasgn_path, summary_file, k0 = 5):
    """Load the k0 best trees (highest llh) as tidy DataFrames.

    Per Morris's comment, the tree with the maximum llh is the 'best' one;
    the top k0 trees by that measure are read out of the .mutass zip and
    returned as a list of DataFrames, best tree first.
    """
    ranked = return_top_k_trees(summary_file, k = k0)
    # Each ranked entry is a (tree_id, llh) pair; only the id is needed to
    # pull that tree's JSON out of the archive.
    return [tree_to_df(tree_json(mutasgn_path, tree_id), tree_id)
            for tree_id, _llh in ranked]

def add_labels_best_trees(all_trees, all_data_simple):
    """Attach the original cnv/ssm annotations to each tree DataFrame.

    Every tree DataFrame is (inner) merged with all_data_simple on 'id'.
    A new list of merged DataFrames is returned; inputs are not modified.
    """
    return [pd.merge(tree_df, all_data_simple, on = 'id') for tree_df in all_trees]

def write_tree_likelihoods(output_folder, summary_file, k0 = 5):
    """Write the top-k0 tree log-likelihoods to tree_likelihoods.txt.

    One "<tree_id> <llh>" pair per line, best tree first, written into
    output_folder.
    """
    ranked = return_top_k_trees(summary_file, k = k0)
    out_path = output_folder + "/" + 'tree_likelihoods.txt'
    with open(out_path, 'w') as out_fh:
        lines = ['%s %s' % pair for pair in ranked]
        out_fh.write('\n'.join(lines))

def write_tree_csvs(output_folder, all_trees):
    """Write each tree DataFrame to <output_folder>/<tree_id>.csv."""
    for tree_df in all_trees:
        # Every row of a tree frame carries the same tree id, so the
        # first row names the output file.
        tree_id = str(tree_df['tree'].iloc[0])
        tree_df.to_csv(output_folder + "/" + tree_id + ".csv")


def main():
    """Parse PhyloWGS outputs into per-tree CSVs plus a likelihood table."""
    parser = argparse.ArgumentParser(
        description='Parse the output of PhyloWGS into useful tables for downstream analysis and exploration',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    # action='append' is kept for backward compatibility with existing
    # invocations; only the first occurrence of each option is used below.
    parser.add_argument('--cnv_input', dest='cnv_input', action='append', required=True,
                        help='The cnv.txt file which was originally fed into PhyloWGS')
    parser.add_argument('--ssm_input', dest='ssm_input', action='append', required=True,
                        help='The ssm.txt file which was originally fed into PhyloWGS')
    parser.add_argument('--summary_file', dest='summary_file', action='append', required=True,
                        help='File path to PhyloWGS output *.summ.json file, should be unzipped with gunzip first')
    parser.add_argument('--mutasgn_path', dest='mutasgn_path', action='append', required=True,
                        help='File path to PhyloWGS output *.mutass.zip file')
    parser.add_argument('--output_folder', dest='output_folder', action='append', required=True,
                        help='Folder you would like the parsed output sent to')
    parser.add_argument('--k', dest='k', type=int, default=5,
                        help='K for how many of the top trees to output')

    args = parser.parse_args()

    # Reshape the original PhyloWGS inputs into tidy tables so that tree
    # assignments can be annotated with chr/start/copy-number information.
    cnv_simple = return_cnv_table(pd.read_csv(args.cnv_input[0], sep = "\t"))
    ssm_simple = return_ssm_table(pd.read_csv(args.ssm_input[0], sep = "\t"))
    all_data_simple = pd.concat([cnv_simple, ssm_simple], sort = False)

    # Pull the k best trees, attach the annotations, and write everything out.
    all_trees = read_best_trees_json(args.mutasgn_path[0], args.summary_file[0], k0 = args.k)
    all_trees = add_labels_best_trees(all_trees, all_data_simple)

    write_tree_csvs(args.output_folder[0], all_trees)
    write_tree_likelihoods(args.output_folder[0], args.summary_file[0], k0 = args.k)

if __name__ == '__main__':
    main()
39 changes: 38 additions & 1 deletion parser/parse_cnvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def write_cnvs(self, cn_regions):
region['copy_number'] = region['major_cn'] + region['minor_cn']
self._write_cn_record(region)


self._cn_output.close()

class CnvParser(object):
Expand Down Expand Up @@ -79,6 +80,40 @@ def parse(self):

return cn_regions

class FacetsParser(CnvParser):
    """Parse FACETS segment output into the common cn_regions structure."""

    def __init__(self, fc_filename, cellularity):
        # fc_filename: path to the FACETS segment CSV (with columns chrom,
        # start, end, cf.em, tcn.em, lcn.em).
        # cellularity: stored for interface parity with the other parsers;
        # FACETS already reports per-segment cell fractions in cf.em.
        self._fc_filename = fc_filename
        self._cellularity = cellularity

    def parse(self):
        """Return {chrom: [cnv_record, ...]} for all fully-called segments.

        cf.em is the fraction of cells in the sample carrying the copy
        number; copy-neutral segments are assigned 1 by FACETS.  When too
        few heterozygous SNPs are present, FACETS calls only a total copy
        number (lcn.em is not an integer) -- those segments are skipped.
        """
        cn_regions = defaultdict(list)

        with open(self._fc_filename) as facets_fh:
            for record in csv.DictReader(facets_fh):
                # Guard clause replaces the original no-op "else: next":
                # skip segments lacking both total and minor copy numbers.
                if not (record['tcn.em'].isdigit() and record['lcn.em'].isdigit()):
                    continue
                cnv = {
                    'cellular_prevalence': float(record['cf.em']),
                    'major_cn': int(record['tcn.em']) - int(record['lcn.em']),
                    'minor_cn': int(record['lcn.em']),
                    'start': int(record['start']),
                    # float() first: FACETS occasionally writes the segment
                    # end in scientific notation (e.g. 2e+07).
                    'end': int(float(record['end'])),
                }
                cn_regions[record['chrom']].append(cnv)

        return cn_regions

class BattenbergParser(CnvParser):
def __init__(self, bb_filename, cellularity):
self._bb_filename = bb_filename
Expand Down Expand Up @@ -162,7 +197,7 @@ def main():
description='Create CNV input file for parser from Battenberg or TITAN data',
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('-f', '--cnv-format', dest='input_type', required=True, choices=('battenberg', 'battenberg-smchet', 'titan'),
parser.add_argument('-f', '--cnv-format', dest='input_type', required=True, choices=('battenberg', 'battenberg-smchet', 'titan','facets'),
help='Type of CNV input')
parser.add_argument('-c', '--cellularity', dest='cellularity', type=float, required=True,
help='Fraction of sample that is cancerous rather than somatic. Used only for estimating CNV confidence -- if no CNVs, need not specify argument.')
Expand All @@ -184,6 +219,8 @@ def main():
parser = BattenbergSmchetParser(args.cnv_file, cellularity)
elif args.input_type == 'titan':
parser = TitanParser(args.cnv_file, cellularity)
elif args.input_type == 'facets':
parser = FacetsParser(args.cnv_file, cellularity)
else:
raise Exception('Unknown input type')

Expand Down
13 changes: 8 additions & 5 deletions witness/index_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,25 @@

def main():
datasets = defaultdict(list)
#use a path from the location of the python script so that it can be called from outside the same folder
base_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'data')

base_dir = 'data'
for run_name in os.listdir(base_dir):

for summary_path in glob.glob(os.path.join(base_dir, run_name, '*.summ.json')):
dataset_name = summary_path.split('/')[-1].split('.')[0]

run = {
'summary_path': summary_path,
'muts_path': os.path.join(base_dir, run_name, dataset_name + '.muts.json'),
'summary_path': os.path.join('data', run_name, dataset_name + '.summ.json'),
'muts_path': os.path.join('data', run_name, dataset_name + '.muts.json'),
'name': dataset_name,
}

clusters_path = os.path.join(base_dir, run_name, dataset_name + '.clusters.json')
clusters_path = os.path.join('data', run_name, dataset_name + '.clusters.json')
if os.path.isfile(clusters_path):
run['clusters_path'] = clusters_path

mutass_path = os.path.join(base_dir, run_name, dataset_name + '.mutass')
mutass_path = os.path.join('data', run_name, dataset_name + '.mutass')
if os.path.isdir(mutass_path):
run['mutass_path'] = mutass_path

Expand Down