|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +import argparse |
| 4 | +from cyvcf2 import VCF, Writer |
| 5 | +import numpy as np |
| 6 | +import re |
| 7 | +import subprocess |
| 8 | + |
| 9 | +version = '0.3.1' |
| 10 | + |
| 11 | + |
| 12 | +def __main__(): |
| 13 | + parser = argparse.ArgumentParser(description='Convert a VCF file with genomic variants to a file with tab-separated values (TSV). One entry (TSV line) per sample genotype', formatter_class=argparse.ArgumentDefaultsHelpFormatter) |
| 14 | + parser.add_argument('query_vcf', help='Bgzipped input VCF file with query variants (SNVs/InDels)') |
| 15 | + parser.add_argument('out_tsv', help='Output TSV file with one line pr non-rejected sample genotype (Variant, genotype and annotation data as tab-separated values)') |
| 16 | + parser.add_argument("--skip_info_data",action = "store_true", help="Skip printing of data in INFO column") |
| 17 | + parser.add_argument("--skip_genotype_data", action="store_true", help="Skip printing of genotype_data (FORMAT columns)") |
| 18 | + parser.add_argument("--keep_rejected_calls", action="store_true", help="Print data for rejected calls") |
| 19 | + parser.add_argument("--print_data_type_header", action="store_true", help="Print a header line with data types of VCF annotations") |
| 20 | + parser.add_argument("--compress", action="store_true", help="Compress TSV file with gzip") |
| 21 | + args = parser.parse_args() |
| 22 | + |
| 23 | + vcf2tsv(args.query_vcf, args.out_tsv, args.skip_info_data, args.skip_genotype_data, args.keep_rejected_calls, args.compress, args.print_data_type_header) |
| 24 | + |
| 25 | + |
| 26 | +def check_subprocess(command): |
| 27 | + try: |
| 28 | + output = subprocess.check_output(str(command), stderr=subprocess.STDOUT, shell=True) |
| 29 | + if len(output) > 0: |
| 30 | + print (str(output.decode()).rstrip()) |
| 31 | + except subprocess.CalledProcessError as e: |
| 32 | + print (e.output) |
| 33 | + exit(0) |
| 34 | + |
| 35 | + |
| 36 | +def vcf2tsv(query_vcf, out_tsv, skip_info_data, skip_genotype_data, keep_rejected_calls, compress, print_data_type_header): |
| 37 | + |
| 38 | + vcf = VCF(query_vcf, gts012 = True) |
| 39 | + out = open(out_tsv,'w') |
| 40 | + |
| 41 | + fixed_columns_header = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER'] |
| 42 | + fixed_columns_header_type = ['String','Integer','String','String','String','Float','String'] |
| 43 | + samples = vcf.samples |
| 44 | + info_columns_header = [] |
| 45 | + format_columns_header = [] |
| 46 | + sample_columns_header = [] |
| 47 | + column_types = {} |
| 48 | + gt_present_header = 0 |
| 49 | + |
| 50 | + if len(samples) > 0: |
| 51 | + sample_columns_header.append('VCF_SAMPLE_ID') |
| 52 | + |
| 53 | + for e in vcf.header_iter(): |
| 54 | + header_element = e.info() |
| 55 | + if 'ID' in header_element.keys() and 'HeaderType' in header_element.keys(): |
| 56 | + if header_element['HeaderType'] == 'INFO' or header_element['HeaderType'] == 'FORMAT': |
| 57 | + column_types[header_element['ID']] = header_element['Type'] |
| 58 | + if header_element['HeaderType'] == 'INFO': |
| 59 | + if skip_info_data is False: |
| 60 | + info_columns_header.append(header_element['ID']) |
| 61 | + if header_element['HeaderType'] == 'FORMAT': |
| 62 | + if len(sample_columns_header) > 0 and skip_genotype_data is False: |
| 63 | + if header_element['ID'] != 'GT': |
| 64 | + format_columns_header.append(header_element['ID']) |
| 65 | + else: |
| 66 | + gt_present_header = 1 |
| 67 | + |
| 68 | + header_line = '\t'.join(fixed_columns_header) |
| 69 | + if skip_info_data is False: |
| 70 | + header_line = '\t'.join(fixed_columns_header) + '\t' + '\t'.join(sorted(info_columns_header)) |
| 71 | + if len(sample_columns_header) > 0: |
| 72 | + if skip_genotype_data is False: |
| 73 | + header_line = '\t'.join(fixed_columns_header) + '\t' + '\t'.join(sorted(info_columns_header)) + '\t' + '\t'.join(sample_columns_header) + '\t' + '\t'.join(sorted(format_columns_header)) + '\tGT' |
| 74 | + else: |
| 75 | + header_line = '\t'.join(fixed_columns_header) + '\t' + '\t'.join(sorted(info_columns_header)) |
| 76 | + else: |
| 77 | + if len(sample_columns_header) > 0: |
| 78 | + if skip_genotype_data is False: |
| 79 | + header_line = '\t'.join(fixed_columns_header) + '\t' + '\t'.join(sample_columns_header) + '\t' + '\t'.join(sorted(format_columns_header)) + '\tGT' |
| 80 | + else: |
| 81 | + header_line = '\t'.join(fixed_columns_header) |
| 82 | + |
| 83 | + out.write('#https://github.com/sigven/vcf2tsv version=' + str(version) + '\n') |
| 84 | + if print_data_type_header is True: |
| 85 | + header_tags = header_line.rstrip().split('\t') |
| 86 | + header_types = [] |
| 87 | + for h in header_tags: |
| 88 | + if h in column_types: |
| 89 | + header_types.append(str(column_types[h])) |
| 90 | + header_line_type = '\t'.join(fixed_columns_header_type) + '\t' + '\t'.join(header_types) |
| 91 | + out.write('#' + str(header_line_type) + '\n') |
| 92 | + out.write(str(header_line) + '\n') |
| 93 | + else: |
| 94 | + out.write(str(header_line) + '\n') |
| 95 | + |
| 96 | + for rec in vcf: |
| 97 | + rec_id = '.' |
| 98 | + rec_qual = '.' |
| 99 | + rec_filter = '.' |
| 100 | + alt = ",".join(str(n) for n in rec.ALT) |
| 101 | + if not rec.ID is None: |
| 102 | + rec_id = str(rec.ID) |
| 103 | + if not rec.QUAL is None: |
| 104 | + rec_qual = str("{0:.2f}".format(rec.QUAL)) |
| 105 | + rec_filter = str(rec.FILTER) |
| 106 | + if rec.FILTER is None: |
| 107 | + rec_filter = 'PASS' |
| 108 | + |
| 109 | + pos = int(rec.start) + 1 |
| 110 | + fixed_fields_string = str(rec.CHROM) + '\t' + str(pos) + '\t' + str(rec_id) + '\t' + str(rec.REF) + '\t' + str(alt) + '\t' + str(rec_qual) + '\t' + str(rec_filter) |
| 111 | + |
| 112 | + if not 'PASS' in rec_filter and not keep_rejected_calls: |
| 113 | + continue |
| 114 | + |
| 115 | + variant_info = rec.INFO |
| 116 | + vcf_info_data = [] |
| 117 | + if skip_info_data is False: |
| 118 | + for info_field in sorted(info_columns_header): |
| 119 | + if column_types[info_field] == 'Flag': |
| 120 | + if variant_info.get(info_field) is None: |
| 121 | + vcf_info_data.append('False') |
| 122 | + else: |
| 123 | + vcf_info_data.append('True') |
| 124 | + elif column_types[info_field] == 'Float' or column_types[info_field] == 'Integer' or column_types[info_field] == 'String': |
| 125 | + if type(variant_info.get(info_field)) is list: |
| 126 | + vcf_info_data.append(",".join(str(n) for n in variant_info.get(info_field).encode('utf-8'))) |
| 127 | + else: |
| 128 | + if variant_info.get(info_field) is None: |
| 129 | + vcf_info_data.append('.') |
| 130 | + else: |
| 131 | + if column_types[info_field] == 'Float': |
| 132 | + if not isinstance(variant_info.get(info_field),float): |
| 133 | + if not ',' in str(alt): |
| 134 | + print('Warning: Multiple values in INFO tag for single ALT allele (VCF multiallelic sites not decomposed properly?):' + str(fixed_fields_string) + '\t' + str(info_field) + '=' + str(variant_info.get(info_field))) |
| 135 | + vcf_info_data.append('.') |
| 136 | + else: |
| 137 | + val = str("{0:.7f}".format(variant_info.get(info_field))) |
| 138 | + vcf_info_data.append(val) |
| 139 | + else: |
| 140 | + if column_types[info_field] == 'String': |
| 141 | + vcf_info_data.append(str(variant_info.get(info_field))) |
| 142 | + else: |
| 143 | + vcf_info_data.append(re.sub('\(|\)', '', str(variant_info.get(info_field)))) |
| 144 | + |
| 145 | + |
| 146 | + #dictionary, with sample names as keys, values being genotype data (dictionary with format tags as keys) |
| 147 | + vcf_sample_genotype_data = {} |
| 148 | + if len(samples) > 0 and skip_genotype_data is False: |
| 149 | + gt_cyvcf = rec.gt_types |
| 150 | + i = 0 |
| 151 | + while i < len(samples): |
| 152 | + vcf_sample_genotype_data[samples[i]] = {} |
| 153 | + gt = './.' |
| 154 | + if gt_present_header == 1: |
| 155 | + if gt_cyvcf[i] == 0: |
| 156 | + gt = '0/0' |
| 157 | + if gt_cyvcf[i] == 1: |
| 158 | + gt = '0/1' |
| 159 | + if gt_cyvcf[i] == 2: |
| 160 | + gt = '1/1' |
| 161 | + vcf_sample_genotype_data[samples[i]]['GT'] = gt |
| 162 | + i = i + 1 |
| 163 | + |
| 164 | + for format_tag in sorted(format_columns_header): |
| 165 | + if len(samples) > 0 and skip_genotype_data is False: |
| 166 | + sample_dat = rec.format(format_tag) |
| 167 | + if sample_dat is None: |
| 168 | + k = 0 |
| 169 | + while k < len(samples): |
| 170 | + if samples[k] in vcf_sample_genotype_data: |
| 171 | + vcf_sample_genotype_data[samples[k]][format_tag] = '.' |
| 172 | + k = k + 1 |
| 173 | + continue |
| 174 | + dim = sample_dat.shape |
| 175 | + j = 0 |
| 176 | + ## sample-wise |
| 177 | + while j < dim[0]: |
| 178 | + if sample_dat[j].size > 1: |
| 179 | + d = ','.join(str(e) for e in np.ndarray.tolist(sample_dat[j])) |
| 180 | + if samples[j] in vcf_sample_genotype_data: |
| 181 | + vcf_sample_genotype_data[samples[j]][format_tag] = d |
| 182 | + else: |
| 183 | + d = '.' |
| 184 | + if column_types[format_tag] == 'String': |
| 185 | + d = str(sample_dat[j]) |
| 186 | + if column_types[format_tag] == 'Integer': |
| 187 | + d = str(sample_dat[j][0]) |
| 188 | + if samples[j] in vcf_sample_genotype_data: |
| 189 | + vcf_sample_genotype_data[samples[j]][format_tag] = d |
| 190 | + j = j + 1 |
| 191 | + |
| 192 | + tsv_elements = [] |
| 193 | + tsv_elements.append(fixed_fields_string) |
| 194 | + if skip_info_data is False: |
| 195 | + if skip_genotype_data is False: |
| 196 | + if len(sample_columns_header) > 0: |
| 197 | + tsv_elements.append('\t'.join(vcf_info_data)) |
| 198 | + ## one line per sample variant |
| 199 | + for s in sorted(vcf_sample_genotype_data.keys()): |
| 200 | + sample = s |
| 201 | + line_elements = [] |
| 202 | + line_elements.extend(tsv_elements) |
| 203 | + line_elements.append(sample) |
| 204 | + gt_tag = '.' |
| 205 | + for tag in sorted(vcf_sample_genotype_data[sample].keys()): |
| 206 | + if tag != 'GT': |
| 207 | + line_elements.append(vcf_sample_genotype_data[sample][tag]) |
| 208 | + else: |
| 209 | + gt_tag = vcf_sample_genotype_data[sample][tag] |
| 210 | + line_elements.append(gt_tag) |
| 211 | + if gt_tag == './.' or gt_tag == '.': |
| 212 | + if keep_rejected_calls: |
| 213 | + out.write('\t'.join(line_elements) + '\n') |
| 214 | + else: |
| 215 | + out.write('\t'.join(line_elements) + '\n') |
| 216 | + else: |
| 217 | + tsv_elements.append('\t'.join(vcf_info_data)) |
| 218 | + line_elements = [] |
| 219 | + line_elements.extend(tsv_elements) |
| 220 | + out.write('\t'.join(line_elements) + '\n') |
| 221 | + else: |
| 222 | + tsv_elements.append('\t'.join(vcf_info_data)) |
| 223 | + line_elements = [] |
| 224 | + line_elements.extend(tsv_elements) |
| 225 | + out.write('\t'.join(line_elements) + '\n') |
| 226 | + else: |
| 227 | + if skip_genotype_data is False: |
| 228 | + if len(sample_columns_header) > 0: |
| 229 | + ## one line per sample variant |
| 230 | + for s in sorted(vcf_sample_genotype_data.keys()): |
| 231 | + sample = s |
| 232 | + line_elements = [] |
| 233 | + line_elements.extend(tsv_elements) |
| 234 | + line_elements.append(sample) |
| 235 | + gt_tag = '.' |
| 236 | + for tag in sorted(vcf_sample_genotype_data[sample].keys()): |
| 237 | + if tag != 'GT': |
| 238 | + line_elements.append(vcf_sample_genotype_data[sample][tag]) |
| 239 | + else: |
| 240 | + gt_tag = vcf_sample_genotype_data[sample][tag] |
| 241 | + line_elements.append(gt_tag) |
| 242 | + if gt_tag == './.' or gt_tag == '.': |
| 243 | + if keep_rejected_calls: |
| 244 | + out.write('\t'.join(line_elements) + '\n') |
| 245 | + else: |
| 246 | + out.write('\t'.join(line_elements) + '\n') |
| 247 | + else: |
| 248 | + line_elements = [] |
| 249 | + line_elements.extend(tsv_elements) |
| 250 | + line_elements = tsv_elements |
| 251 | + out.write('\t'.join(line_elements) + '\n') |
| 252 | + |
| 253 | + out.close() |
| 254 | + |
| 255 | + if compress is True: |
| 256 | + command = 'gzip -f ' + str(out_tsv) |
| 257 | + check_subprocess(command) |
| 258 | + |
| 259 | +if __name__=="__main__": __main__() |
| 260 | + |
| 261 | + |
| 262 | + |
0 commit comments