-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVCF_compare_depths.py
136 lines (87 loc) · 3.01 KB
/
VCF_compare_depths.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
from __future__ import division
from __future__ import print_function
"""
Script for parsing and editing a VCF file
"""
import sys
import argparse
import os.path
import cyvcf2
from cyvcf2 import VCF
import pandas as pd
epi = ('\
\n\
Compares variant calls in a merged VCF-file\n\
\n\
')
# Describe what the script does
parser = argparse.ArgumentParser(description='This script compares variant calls in a merged VCF-file', epilog= epi, formatter_class=argparse.RawTextHelpFormatter)
# Get inputs
parser.add_argument('-i', '--input', default='921.somatic.vcf.gz', dest='inf', action='store', required=True, help="VCF file")
#parser.add_argument('-f', '--filter-string', default=None, dest='fi', action='store', required=True, help="String of filters to apply")
# Check for no input
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
# Check if input files exist
if not os.path.isfile(args.inf)==True:
print("Cannot find input file ",args.inf)
sys.exit(1)
vcf = cyvcf2.VCF(args.inf)
# create a new vcf Writer using the input vcf as a template.
w = Writer(f, vcf)
# Create other output
output=args.inf+".stats"
df = pd.DataFrame(columns=vcf.samples) #creates a new dataframe that's empty
v=-1
for variant in VCF(args.inf): # or VCF('some.bcf')
v=v+1
alt = [item.encode('utf-8') for item in variant.ALT]
#print(variant.REF, alt) # e.g. REF='A', ALT=['C', 'T']
# Somehow assessing the number of alternative alleles
if (len(alt)==1):
pass
# Multiple alternative alleles
elif (len(alt) >1):
#print("Long",str(variant))
pass
# No alernative variant - possibly a SN deletion
elif (len(alt) < 1):
#print("Null",str(variant))
pass
else:
print(str(variant))
# Get a numpy array of the depth per sample:
dp = variant.format('DP')
#print(str(variant))
#print(dp[0], dp[1], dp[2], dp[3], dp.size)
# parse through the format for each sample, splitting normal and tumour
for i in range(0,dp.size):
if i % 2 == 0:
#print("N", dp[i])
if (dp[i]>0):
#s = int(dp[i][0])
df.loc[v, vcf.samples[i]] = int(dp[i][0])
pass # Even
else:
#print("T", dp[i])
if (dp[i]>0):
#s= dp[i]
df.loc[v, vcf.samples[i]] = int(dp[i][0])
pass # Odd
# Split indels and SNPs
if (':'.join(variant.FORMAT)=="DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50" ):
print ("Is indel ",':'.join(variant.FORMAT), variant.REF, alt )
elif ( ':'.join(variant.FORMAT)=="DP:FDP:SDP:SUBDP:AU:CU:GU:TU"):
#print("Is SNV ", ':'.join(variant.FORMAT), variant.REF, alt)
pass
else:
#print ("Is other ", variant.format('DP'), variant.REF, alt)
pass
if variant.is_indel:
print("Indel is", ':'.join(variant.FORMAT), variant.REF, alt)
# Print output dataframe
df.to_csv(output,sep='\t',mode='w')
exit(0)