-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_sv_annotation_tsv.py
More file actions
173 lines (135 loc) · 6.49 KB
/
generate_sv_annotation_tsv.py
File metadata and controls
173 lines (135 loc) · 6.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
from pprint import pprint
from datetime import date
from os.path import exists
from sys import path
import argparse
import os
import sys
import read_resource_jsons as read_resources
def main():
# Parse the command line
args = parseCommandLine()
# Define the executable bcftools command
if args.tools_directory:
if not args.tools_directory.endswith('/'):
args.tools_directory = str(args.tools_directory) + '/'
bcftools = args.tools_directory + 'bcftools/bcftools'
else:
bcftools = 'bcftools'
# Read the mosaicJson file to get information on how to process different annotations
mosaic_info = read_resources.read_mosaic_json(args.mosaic_json, args.reference)
# Loop over the resources in the json file
annotations = {}
annotations_no_svtype = {}
for resource in mosaic_info['resources']:
#project_id = mosaic_info['resources'][resource]['project_id']
#if str(project_id) not in annotations:
# annotations[str(project_id)] = {}
# Loop over the annotations that are to be uploaded to Mosaic for this resource and get the annotation name, uid and
# type
for annotation in mosaic_info['resources'][resource]['annotations']:
uid = mosaic_info['resources'][resource]['annotations'][annotation]['uid']
tag_type = mosaic_info['resources'][resource]['annotations'][annotation]['type']
annotations[annotation] = {'uid': uid, 'type': tag_type}
if annotation != 'SVTYPE':
annotations_no_svtype[annotation] = {'uid': uid, 'type': tag_type}
#annotations[str(project_id)][annotation] = {'uid': uid, 'type': tag_type}
# We need to create a tsv for each project that annotations are uploaded to
#for project_id in annotations:
#output_file_name = args.output_tsv + '_' + str(project_id) + '.tsv'
# Open an output tsv file to write annotations to
output_file = open(args.output_tsv, 'w')
# Write the header line to the tsv file
print('CHROM\tSTART\tEND\tREF\tALT\t', '\t'.join(str(x['uid']) for x in annotations_no_svtype.values()), sep = '', file = output_file)
# Parse the vcf file, check the annotations and generate a tsv file for upload to Mosaic
process_vcf(bcftools, args.input_vcf, annotations, output_file)
# Close the output tsv file
output_file.close()
# Input options
def parseCommandLine():
global version
parser = argparse.ArgumentParser(description='Process the command line')
# Required arguments
parser.add_argument('--input_vcf', '-i', required = True, metavar = 'string', help = 'The input vcf file to annotate')
parser.add_argument('--output_tsv', '-o', required = True, metavar = 'string', help = 'The output tsv file stub')
#parser.add_argument('--resource', '-e', required = True, metavar = 'string', help = 'The name of the resource (used for the output tsv name')
parser.add_argument('--reference', '-r', required = True, metavar = 'string', help = 'The genome reference file used')
parser.add_argument('--tools_directory', '-s', required = True, metavar = 'string', help = 'The path to the directory where the tools live')
parser.add_argument('--mosaic_json', '-m', required = True, metavar = 'string', help = 'The json file describing the Mosaic parameters')
return parser.parse_args()
# The majority of annotations can be processed in a standard way. The type is checked and numerical
# annotations are checked to ensure they fall within the required bounds
def process_vcf(bcftools, vcf, annotations, output_file):
# Loop over all records in the vcf file
command = bcftools + ' query -f \'%CHROM\\t%POS\\t%END\\t%REF\\t%ALT\\t%INFO/' + '\\t%INFO/'.join(annotations.keys()) + '\\n\' ' + str(vcf)
observed_records = []
for record in os.popen(command).readlines():
# Split the record on tabs
fields = record.rstrip().split('\t')
# If all values are '.', this line can be ignored
unique_values = set(fields[5:])
if len(unique_values) == 1 and list(unique_values)[0] == '.':
continue
# Update the chromosome and position
fields[0], fields[2] = update_coords(fields[0], fields[2])
# Loop over the annotations in the order they appear in the bcftools query command
i = 5
all_zeros = True
for annotation in annotations:
# If the annotation is numeric
if str(annotations[annotation]['type']) == 'integer' or str(annotations[annotation]['type']) == 'float':
# If the are multiple values (e.g. there is a ',') in the value, ensure that all values are within
# the limits (1E-37 < x < 1E37), but output all values
values = fields[i].split(',') if ',' in fields[i] else [fields[i]]
finalValues = []
for value in values:
if value == '.':
finalValues.append('')
else:
try:
typeTest = float(value)
except:
fail('Invalid value for annotation: ' + record.rstrip())
# Check if the value is zero. If all values for this record are zero, it will be skipped
if (float(value) != 0.):
all_zeros = False
if abs(float(value)) <= 1e-37:
finalValues.append('0')
elif abs(float(value)) >= 1e37:
finalValues.append('1e37')
else:
finalValues.append(value)
# Join the values back together with commas
fields[i] = ','.join(finalValues)
# Else if the annotation is a string, the annotation should be output as is
elif str(annotations[annotation]['type']) == 'string':
pass
# If the type is unknown, fail
else:
fail('Annotation type (' + str(annotations[annotation]['type']) + ') for annotation ' + str(annotation) + ', is unknown')
# Iterate to the next annotation field
i += 1
# Build the output record from the updated fields
if not all_zeros:
# Replace the ALT with SVTYPE
fields[4] = fields[5]
del fields[5]
identifier = '_'.join(fields[0:5])
if identifier not in observed_records:
print('\t'.join(fields), file = output_file)
observed_records.append(identifier)
# Update the chromosome and position in the tsv file
def update_coords(chrom, pos):
# Check that the chromosome does not include "chr" prefix
if chrom.startswith('chr'): chrom = chrom.strip('chr')
# Add one to the end position
pos = str(int(pos) + 1)
# Return the updated values
return chrom, pos
# If the script fails, provide an error message and exit
def fail(message):
print(message, sep = "")
exit(1)
# Initialise global variables
if __name__ == "__main__":
main()