-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPON.py
143 lines (102 loc) · 4.81 KB
/
PON.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
from __future__ import print_function
import sys
import os.path
import argparse
import re
from subprocess import call
"""
Script for parsing a VCF file
"""
epi = ('\
\n\
Create PON from genome and list of normals\n\
Your genome needs to have a .fai and .dict file available too \n\
\n\
')
# Describe what the script does
parser = argparse.ArgumentParser(description='This script creates a GATK PON from genome and normal data', epilog= epi, formatter_class=argparse.RawTextHelpFormatter)
# Get inputs
parser.add_argument('-g', '--genome', default=None, dest='genome', action='store', required=True, help="genome.fa")
parser.add_argument('-l', '--list', default=None, dest='list', action='store', required=True, help="List of normal.bam files")
parser.add_argument('-p', '--prefix', default="test", dest='prefix', action='store', required=False, help="Output prefix")
# Check for no input
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
# Check if input files exist
if not os.path.isfile(args.genome)==True:
print("Cannot find input file ",args.genome)
sys.exit(1)
# Check if input files exist
if not os.path.isfile(args.list)==True:
print("Cannot find input file ",args.list)
sys.exit(1)
list = open(args.list, "r")
list2 = open(args.list, "r")
list3 = open(args.list, "r")
# Create versions of the genome, and link to here
genome=args.genome.split("/")[-1]
px=args.prefix
#print(genome,args.genome,"REF")
if ((str(genome)==str(args.genome))==False):
#call(["ln", "-s",args.genome])
print ("ln -s %s\nln -s %s.fai" % (args.genome,args.genome))
genomepx = '.'.join(genome.split(".")[0:-1])
# Create genome dict and fai if it doesnt exist
print('bsub -q bio -P Analysis -J CSD%s -o CSD%s.o -e CSD%s.e gatk -n 1 --java-options "-Xmx2g" CreateSequenceDictionary -R %s -O %s.dict ' % (px,px,px,genome,genomepx))
"""
# Create intervals from genome # FAST, easy, less than 1 minute
print ('gatk PreprocessIntervals -R %s --bin-length 1000 --padding 0 -O %s.interval_list' % (genome,genome ))
# Preprocess intervals; exomes
#print (" %s " % (args.genome))
print ('gatk PreprocessIntervals \
-L %s.interval_list \
-R %s \
--bin-length 0 \
--interval-merging-rule OVERLAPPING_ONLY \
-O %s.preprocessed.interval_list \
' % (genome, genome, genome) )
"""
# To generate consecutive bins of 1000 bases from the reference (e.g., for whole genome sequencing analyses):
print ('gatk PreprocessIntervals \
-R %s \
--bin-length 1000 \
--padding 0 \
-O %s.interval_list' % (genome,genomepx) )
# Filter the result to only include major chromosomes
print ('cat %s.interval_list | grep chr | grep -v chrEBV | grep -v chrUn | grep -v random | grep -v ChrM | grep -v ChrX | grep -v ChrY > %s.preprocessed_intervals.interval_list' %(genomepx,genomepx))
hdf5f=[]
# Make a small fake panel-of-normals side-by-side file # Takes long 7519 sec, is big 10G, preferably paralellise
for norm in list:
norm=norm.rstrip()
normpx = norm.split("/")[-1]
normpx = ''.join(normpx.split(".bam")[0:-1])
print ('bsub -P Analysis -J CRC%s -o CRC%s.o -e CRC%s.e -n 1 -R "rusage[mem=5500]" -q bio gatk --java-options "-Xmx5g" CollectReadCounts -I %s -L %s.preprocessed_intervals.interval_list --interval-merging-rule OVERLAPPING_ONLY -O %s.hdf5' % (normpx,normpx,normpx,norm,genomepx,normpx))
hdf5f.append("%s.hdf5" %(normpx))
# Create a PON intervals file from multiple samples input
print ('bsub -P Analysis -J PON%s -o PON%s.o -e PON%s.e -n 1 -R "rusage[mem=10000]" -q bio gatk --java-options "-Xmx9g" \
CreateReadCountPanelOfNormals --minimum-interval-median-percentile 5.0 -O %s.pon.hdf5 ' %(px,px,px,px), end=" " )
for file in hdf5f:
file = file.rstrip()
print('\ \n -I %s ' % (file), end=" ")
# Denoise the read counts for all normal samples, using the panel of normals
for norm in list2:
norm=norm.rstrip()
normpx = norm.split("/")[-1]
normpx = ''.join(normpx.split(".bam")[0:-1])
print('bsub -P Analysis -J DRC%s -o DRC%s.o -e DRC%s.e -n 1 -R "rusage[mem=8500]" -q bio \
gatk --java-options "-Xmx8g" DenoiseReadCounts -I %s.hdf5 \
--count-panel-of-normals %s.pon.hdf5 --standardized-copy-ratios %s.standardizedCR.tsv --denoised-copy-ratios %s.denoisedCR.tsv' % (normpx,normpx,normpx,normpx,px,normpx,normpx))
# Use customised R
print ('\nsource activate r-3.5.0\n')
print("\nmkdir Plots%s\n" %(px))
# Plot the two files showed above to highlight differences
for norm in list3:
norm=norm.rstrip()
normpx = norm.split("/")[-1]
normpx = ''.join(normpx.split(".bam")[0:-1])
print ('gatk PlotDenoisedCopyRatios --standardized-copy-ratios %s.standardizedCR.tsv --denoised-copy-ratios %s.denoisedCR.tsv \
--sequence-dictionary %s.dict --minimum-contig-length 46709983 --output Plots%s --output-prefix %s' % (normpx,normpx,genomepx,px,normpx))
quit()