Skip to content

Commit 764a1f5

Browse files
committed
fixed symlink
1 parent 93ada90 commit 764a1f5

File tree

1 file changed

+262
-1
lines changed

1 file changed

+262
-1
lines changed

src/pcgr/vcf2tsv.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/pcgr/vcf2tsv.py

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
#!/usr/bin/env python
2+
3+
import argparse
4+
from cyvcf2 import VCF, Writer
5+
import numpy as np
6+
import re
7+
import subprocess
8+
9+
version = '0.3.1'
10+
11+
12+
def __main__():
13+
parser = argparse.ArgumentParser(description='Convert a VCF file with genomic variants to a file with tab-separated values (TSV). One entry (TSV line) per sample genotype', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
14+
parser.add_argument('query_vcf', help='Bgzipped input VCF file with query variants (SNVs/InDels)')
15+
parser.add_argument('out_tsv', help='Output TSV file with one line pr non-rejected sample genotype (Variant, genotype and annotation data as tab-separated values)')
16+
parser.add_argument("--skip_info_data",action = "store_true", help="Skip printing of data in INFO column")
17+
parser.add_argument("--skip_genotype_data", action="store_true", help="Skip printing of genotype_data (FORMAT columns)")
18+
parser.add_argument("--keep_rejected_calls", action="store_true", help="Print data for rejected calls")
19+
parser.add_argument("--print_data_type_header", action="store_true", help="Print a header line with data types of VCF annotations")
20+
parser.add_argument("--compress", action="store_true", help="Compress TSV file with gzip")
21+
args = parser.parse_args()
22+
23+
vcf2tsv(args.query_vcf, args.out_tsv, args.skip_info_data, args.skip_genotype_data, args.keep_rejected_calls, args.compress, args.print_data_type_header)
24+
25+
26+
def check_subprocess(command):
27+
try:
28+
output = subprocess.check_output(str(command), stderr=subprocess.STDOUT, shell=True)
29+
if len(output) > 0:
30+
print (str(output.decode()).rstrip())
31+
except subprocess.CalledProcessError as e:
32+
print (e.output)
33+
exit(0)
34+
35+
36+
def vcf2tsv(query_vcf, out_tsv, skip_info_data, skip_genotype_data, keep_rejected_calls, compress, print_data_type_header):
37+
38+
vcf = VCF(query_vcf, gts012 = True)
39+
out = open(out_tsv,'w')
40+
41+
fixed_columns_header = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER']
42+
fixed_columns_header_type = ['String','Integer','String','String','String','Float','String']
43+
samples = vcf.samples
44+
info_columns_header = []
45+
format_columns_header = []
46+
sample_columns_header = []
47+
column_types = {}
48+
gt_present_header = 0
49+
50+
if len(samples) > 0:
51+
sample_columns_header.append('VCF_SAMPLE_ID')
52+
53+
for e in vcf.header_iter():
54+
header_element = e.info()
55+
if 'ID' in header_element.keys() and 'HeaderType' in header_element.keys():
56+
if header_element['HeaderType'] == 'INFO' or header_element['HeaderType'] == 'FORMAT':
57+
column_types[header_element['ID']] = header_element['Type']
58+
if header_element['HeaderType'] == 'INFO':
59+
if skip_info_data is False:
60+
info_columns_header.append(header_element['ID'])
61+
if header_element['HeaderType'] == 'FORMAT':
62+
if len(sample_columns_header) > 0 and skip_genotype_data is False:
63+
if header_element['ID'] != 'GT':
64+
format_columns_header.append(header_element['ID'])
65+
else:
66+
gt_present_header = 1
67+
68+
header_line = '\t'.join(fixed_columns_header)
69+
if skip_info_data is False:
70+
header_line = '\t'.join(fixed_columns_header) + '\t' + '\t'.join(sorted(info_columns_header))
71+
if len(sample_columns_header) > 0:
72+
if skip_genotype_data is False:
73+
header_line = '\t'.join(fixed_columns_header) + '\t' + '\t'.join(sorted(info_columns_header)) + '\t' + '\t'.join(sample_columns_header) + '\t' + '\t'.join(sorted(format_columns_header)) + '\tGT'
74+
else:
75+
header_line = '\t'.join(fixed_columns_header) + '\t' + '\t'.join(sorted(info_columns_header))
76+
else:
77+
if len(sample_columns_header) > 0:
78+
if skip_genotype_data is False:
79+
header_line = '\t'.join(fixed_columns_header) + '\t' + '\t'.join(sample_columns_header) + '\t' + '\t'.join(sorted(format_columns_header)) + '\tGT'
80+
else:
81+
header_line = '\t'.join(fixed_columns_header)
82+
83+
out.write('#https://github.com/sigven/vcf2tsv version=' + str(version) + '\n')
84+
if print_data_type_header is True:
85+
header_tags = header_line.rstrip().split('\t')
86+
header_types = []
87+
for h in header_tags:
88+
if h in column_types:
89+
header_types.append(str(column_types[h]))
90+
header_line_type = '\t'.join(fixed_columns_header_type) + '\t' + '\t'.join(header_types)
91+
out.write('#' + str(header_line_type) + '\n')
92+
out.write(str(header_line) + '\n')
93+
else:
94+
out.write(str(header_line) + '\n')
95+
96+
for rec in vcf:
97+
rec_id = '.'
98+
rec_qual = '.'
99+
rec_filter = '.'
100+
alt = ",".join(str(n) for n in rec.ALT)
101+
if not rec.ID is None:
102+
rec_id = str(rec.ID)
103+
if not rec.QUAL is None:
104+
rec_qual = str("{0:.2f}".format(rec.QUAL))
105+
rec_filter = str(rec.FILTER)
106+
if rec.FILTER is None:
107+
rec_filter = 'PASS'
108+
109+
pos = int(rec.start) + 1
110+
fixed_fields_string = str(rec.CHROM) + '\t' + str(pos) + '\t' + str(rec_id) + '\t' + str(rec.REF) + '\t' + str(alt) + '\t' + str(rec_qual) + '\t' + str(rec_filter)
111+
112+
if not 'PASS' in rec_filter and not keep_rejected_calls:
113+
continue
114+
115+
variant_info = rec.INFO
116+
vcf_info_data = []
117+
if skip_info_data is False:
118+
for info_field in sorted(info_columns_header):
119+
if column_types[info_field] == 'Flag':
120+
if variant_info.get(info_field) is None:
121+
vcf_info_data.append('False')
122+
else:
123+
vcf_info_data.append('True')
124+
elif column_types[info_field] == 'Float' or column_types[info_field] == 'Integer' or column_types[info_field] == 'String':
125+
if type(variant_info.get(info_field)) is list:
126+
vcf_info_data.append(",".join(str(n) for n in variant_info.get(info_field).encode('utf-8')))
127+
else:
128+
if variant_info.get(info_field) is None:
129+
vcf_info_data.append('.')
130+
else:
131+
if column_types[info_field] == 'Float':
132+
if not isinstance(variant_info.get(info_field),float):
133+
if not ',' in str(alt):
134+
print('Warning: Multiple values in INFO tag for single ALT allele (VCF multiallelic sites not decomposed properly?):' + str(fixed_fields_string) + '\t' + str(info_field) + '=' + str(variant_info.get(info_field)))
135+
vcf_info_data.append('.')
136+
else:
137+
val = str("{0:.7f}".format(variant_info.get(info_field)))
138+
vcf_info_data.append(val)
139+
else:
140+
if column_types[info_field] == 'String':
141+
vcf_info_data.append(str(variant_info.get(info_field)))
142+
else:
143+
vcf_info_data.append(re.sub('\(|\)', '', str(variant_info.get(info_field))))
144+
145+
146+
#dictionary, with sample names as keys, values being genotype data (dictionary with format tags as keys)
147+
vcf_sample_genotype_data = {}
148+
if len(samples) > 0 and skip_genotype_data is False:
149+
gt_cyvcf = rec.gt_types
150+
i = 0
151+
while i < len(samples):
152+
vcf_sample_genotype_data[samples[i]] = {}
153+
gt = './.'
154+
if gt_present_header == 1:
155+
if gt_cyvcf[i] == 0:
156+
gt = '0/0'
157+
if gt_cyvcf[i] == 1:
158+
gt = '0/1'
159+
if gt_cyvcf[i] == 2:
160+
gt = '1/1'
161+
vcf_sample_genotype_data[samples[i]]['GT'] = gt
162+
i = i + 1
163+
164+
for format_tag in sorted(format_columns_header):
165+
if len(samples) > 0 and skip_genotype_data is False:
166+
sample_dat = rec.format(format_tag)
167+
if sample_dat is None:
168+
k = 0
169+
while k < len(samples):
170+
if samples[k] in vcf_sample_genotype_data:
171+
vcf_sample_genotype_data[samples[k]][format_tag] = '.'
172+
k = k + 1
173+
continue
174+
dim = sample_dat.shape
175+
j = 0
176+
## sample-wise
177+
while j < dim[0]:
178+
if sample_dat[j].size > 1:
179+
d = ','.join(str(e) for e in np.ndarray.tolist(sample_dat[j]))
180+
if samples[j] in vcf_sample_genotype_data:
181+
vcf_sample_genotype_data[samples[j]][format_tag] = d
182+
else:
183+
d = '.'
184+
if column_types[format_tag] == 'String':
185+
d = str(sample_dat[j])
186+
if column_types[format_tag] == 'Integer':
187+
d = str(sample_dat[j][0])
188+
if samples[j] in vcf_sample_genotype_data:
189+
vcf_sample_genotype_data[samples[j]][format_tag] = d
190+
j = j + 1
191+
192+
tsv_elements = []
193+
tsv_elements.append(fixed_fields_string)
194+
if skip_info_data is False:
195+
if skip_genotype_data is False:
196+
if len(sample_columns_header) > 0:
197+
tsv_elements.append('\t'.join(vcf_info_data))
198+
## one line per sample variant
199+
for s in sorted(vcf_sample_genotype_data.keys()):
200+
sample = s
201+
line_elements = []
202+
line_elements.extend(tsv_elements)
203+
line_elements.append(sample)
204+
gt_tag = '.'
205+
for tag in sorted(vcf_sample_genotype_data[sample].keys()):
206+
if tag != 'GT':
207+
line_elements.append(vcf_sample_genotype_data[sample][tag])
208+
else:
209+
gt_tag = vcf_sample_genotype_data[sample][tag]
210+
line_elements.append(gt_tag)
211+
if gt_tag == './.' or gt_tag == '.':
212+
if keep_rejected_calls:
213+
out.write('\t'.join(line_elements) + '\n')
214+
else:
215+
out.write('\t'.join(line_elements) + '\n')
216+
else:
217+
tsv_elements.append('\t'.join(vcf_info_data))
218+
line_elements = []
219+
line_elements.extend(tsv_elements)
220+
out.write('\t'.join(line_elements) + '\n')
221+
else:
222+
tsv_elements.append('\t'.join(vcf_info_data))
223+
line_elements = []
224+
line_elements.extend(tsv_elements)
225+
out.write('\t'.join(line_elements) + '\n')
226+
else:
227+
if skip_genotype_data is False:
228+
if len(sample_columns_header) > 0:
229+
## one line per sample variant
230+
for s in sorted(vcf_sample_genotype_data.keys()):
231+
sample = s
232+
line_elements = []
233+
line_elements.extend(tsv_elements)
234+
line_elements.append(sample)
235+
gt_tag = '.'
236+
for tag in sorted(vcf_sample_genotype_data[sample].keys()):
237+
if tag != 'GT':
238+
line_elements.append(vcf_sample_genotype_data[sample][tag])
239+
else:
240+
gt_tag = vcf_sample_genotype_data[sample][tag]
241+
line_elements.append(gt_tag)
242+
if gt_tag == './.' or gt_tag == '.':
243+
if keep_rejected_calls:
244+
out.write('\t'.join(line_elements) + '\n')
245+
else:
246+
out.write('\t'.join(line_elements) + '\n')
247+
else:
248+
line_elements = []
249+
line_elements.extend(tsv_elements)
250+
line_elements = tsv_elements
251+
out.write('\t'.join(line_elements) + '\n')
252+
253+
out.close()
254+
255+
if compress is True:
256+
command = 'gzip -f ' + str(out_tsv)
257+
check_subprocess(command)
258+
259+
if __name__=="__main__": __main__()
260+
261+
262+

0 commit comments

Comments
 (0)