Skip to content

Commit 0119b3f

Browse files
authored
Deduplicate records and CPX variant IDs in ResolveComplexVariants (#576)
1 parent 08f3961 commit 0119b3f

File tree

4 files changed: +165 -118 lines changed
Lines changed: 103 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,103 @@
1+
#!/bin/env python
2+
3+
import argparse
import sys
from typing import Callable, Dict, List, Optional, Set, Text, Tuple

import pysam
8+
9+
MEMBERS_KEY = "MEMBERS"
10+
UNRESOLVED_KEY = "UNRESOLVED"
11+
12+
13+
def update_header(header: pysam.VariantHeader) -> None:
    """Add the UNRESOLVED and UNRESOLVED_TYPE INFO definitions to ``header`` in place."""
    info_lines = (
        '##INFO=<ID=UNRESOLVED,Number=0,Type=Flag,Description="Variant is unresolved.">',
        '##INFO=<ID=UNRESOLVED_TYPE,Number=1,Type=String,Description="Class of unresolved variant.">',
    )
    for info_line in info_lines:
        header.add_line(info_line)
16+
17+
18+
def is_unresolved(record: pysam.VariantRecord):
    """Return the record's UNRESOLVED INFO value, or None when the field is not set."""
    info = record.info
    return info.get(UNRESOLVED_KEY, None)
20+
21+
22+
def is_resolved(record: pysam.VariantRecord):
    """Return True when ``is_unresolved`` reports a falsy value for the record."""
    unresolved = is_unresolved(record)
    return not unresolved
24+
25+
26+
def get_members(record: pysam.VariantRecord) -> List:
    """Return the record's MEMBERS INFO value(s) as a list.

    A tuple value is converted to a list, a single value is wrapped in a
    one-element list, and a missing (or None) value yields an empty list.

    The field is read with ``.get`` rather than ``[]`` so that a record
    without MEMBERS produces the empty list instead of raising ``KeyError``;
    this matches how ``is_unresolved`` reads its INFO field.
    """
    members = record.info.get(MEMBERS_KEY, None)
    if members is None:
        return []
    if isinstance(members, tuple):
        return list(members)
    return [members]
30+
31+
32+
def get_vids_and_members_sets(vcf: pysam.VariantFile,
                              predicate: Callable) -> Tuple[Set, Set]:
    """Collect the IDs and members of every record accepted by ``predicate``.

    Parameters
    ----------
    vcf : pysam.VariantFile
        Open VCF to scan. It is reset after the scan so it can be iterated again.
    predicate : Callable
        Called with each record; records for which it returns a truthy
        value are collected.

    Returns
    -------
    (vids, members) : tuple of two sets
        ``vids`` holds the ``id`` of each selected record and ``members``
        holds the union of ``get_members(record)`` over the selected records.
    """
    selected_vids = set()
    selected_members = set()
    for record in vcf:
        if predicate(record):
            selected_vids.add(record.id)
            selected_members.update(get_members(record))
    # Rewind the file so the caller can iterate over it a second time.
    vcf.reset()
    return selected_vids, selected_members
42+
43+
44+
def write_vcf(header: pysam.VariantHeader,
              all_vcf: pysam.VariantFile,
              inv_vcf: pysam.VariantFile,
              inv_resolved_vids_set: Set,
              inv_resolved_members_set: Set,
              all_unresolved_vids_set: Set,
              all_unresolved_members_set: Set) -> None:
    """Write the header and the integrated set of records to stdout.

    Records from ``all_vcf`` are written unless their ID is both in
    ``all_unresolved_vids_set`` and in ``inv_resolved_members_set``.
    Records from ``inv_vcf`` are written when their ID is in
    ``inv_resolved_vids_set`` and every one of their members is in
    ``all_unresolved_members_set``. Output order is the ALL records followed
    by the INV records; no sorting is performed.
    """
    emit = sys.stdout.write
    emit(str(header))

    for record in all_vcf:
        vid = record.id
        # Dropped only when unresolved in the ALL vcf AND a member of a
        # variant resolved in the INV vcf; i.e. kept when resolved in the
        # ALL vcf, or unresolved in both VCFs.
        superseded_by_inv = vid in all_unresolved_vids_set and vid in inv_resolved_members_set
        if not superseded_by_inv:
            emit(str(record))

    for record in inv_vcf:
        if record.id not in inv_resolved_vids_set:
            continue
        # Resolved in the INV vcf: keep only if every member is unresolved in the ALL vcf
        if all(member in all_unresolved_members_set for member in get_members(record)):
            emit(str(record))
63+
64+
65+
def __parse_arguments(argv: List[Text]) -> argparse.Namespace:
    """Parse the command line.

    ``argv`` is the full argument vector including the program name at index 0.
    When no arguments follow the program name, the help text is printed and
    the process exits with status 0.
    """
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description="Integrates inversion-only and all-SV VCFs from the complex resolve module. "
                    "Unsorted output is written to stdout.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    required_vcf_options = (
        ("--all-vcf", "Complex-resolved VCF containing all SVs"),
        ("--inv-only-vcf", "Complex-resolved VCF containing only inversions"),
    )
    for flag, help_text in required_vcf_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    if len(argv) > 1:
        return parser.parse_args(argv[1:])

    # No arguments given: show usage (the help action itself exits).
    parser.parse_args(["--help"])
    sys.exit(0)
81+
82+
83+
def main(argv: Optional[List[Text]] = None):
    """Script entry point: integrate the all-SV and inversion-only VCFs and write the result to stdout.

    ``argv`` defaults to ``sys.argv`` when not supplied.
    """
    arguments = __parse_arguments(sys.argv if argv is None else argv)
    with pysam.VariantFile(arguments.all_vcf) as all_vcf, \
            pysam.VariantFile(arguments.inv_only_vcf) as inv_vcf:
        # The output header is the ALL vcf header plus the UNRESOLVED INFO lines.
        output_header = all_vcf.header
        update_header(output_header)

        inv_vids, inv_members = get_vids_and_members_sets(inv_vcf, is_resolved)
        all_vids, all_members = get_vids_and_members_sets(all_vcf, is_unresolved)

        write_vcf(
            header=output_header,
            all_vcf=all_vcf,
            inv_vcf=inv_vcf,
            inv_resolved_vids_set=inv_vids,
            inv_resolved_members_set=inv_members,
            all_unresolved_vids_set=all_vids,
            all_unresolved_members_set=all_members,
        )
100+
101+
102+
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()

src/svtk/svtk/cli/resolve.py

Lines changed: 40 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,6 @@
99
import argparse
1010
import sys
1111
import subprocess
12-
import numpy as np
13-
import string
1412
from collections import deque
1513
from operator import attrgetter
1614
import itertools
@@ -36,6 +34,17 @@
3634
]
3735

3836

37+
class RecordNamer:
    """Generate sequential variant IDs of the form ``<prefix>_<contig>_<counter>``.

    The counter starts at 0, is shared across all contigs, and is zero-padded
    to ``num_digits`` digits.
    """

    def __init__(self, prefix='CPX_', num_digits=6):
        self.prefix = prefix
        self.num_digits = num_digits
        self.count = 0

    def get_next_id(self, record):
        """Return the next ID for ``record`` (uses ``record.contig``) and advance the counter."""
        index = self.count
        self.count += 1
        # Only add the separator when the prefix does not already end with
        # one; otherwise the default prefix 'CPX_' would yield 'CPX__<contig>_...'.
        separator = '' if self.prefix.endswith('_') else '_'
        return f"{self.prefix}{separator}{record.contig}_{str(index).zfill(self.num_digits)}"
46+
47+
3948
def _merge_records(vcf, cpx_records, cpx_record_ids):
4049
"""
4150
r1, r2 : iter of pysam.VariantRecord
@@ -81,18 +90,6 @@ def _next_cpx():
8190
curr_cpx = _next_cpx()
8291

8392

84-
def remove_CPX_from_INV(resolve_CPX, resolve_INV):
    """
    Return list of inversion calls not overlapped by list of complex calls

    An inversion is dropped when some complex call is on the same chromosome
    and their [pos, stop] intervals intersect (end points inclusive). The
    relative order of the remaining inversions is preserved.
    """
    cpx_intervals = [(cpx.chrom, cpx.pos, cpx.stop) for cpx in resolve_CPX]
    # Compare against the inversion being filtered (`inv`); the overlap test
    # must not reference a name from outside this comprehension.
    return [
        inv for inv in resolve_INV
        if not any(chrom == inv.chrom and start <= inv.stop and inv.pos <= end
                   for chrom, start, end in cpx_intervals)
    ]
94-
95-
9693
def multisort(xs, specs):
9794
for key, reverse in reversed(specs):
9895
xs.sort(key=attrgetter(key), reverse=reverse)
@@ -137,17 +134,8 @@ def clusters_cleanup(clusters):
137134
return deque(cluster_single_cleanup(cluster) for cluster in clusters)
138135

139136

140-
def get_random_string(random_string_len):
    """
    Build a string of the requested length whose characters are drawn one at a
    time with np.random.choice from the upper-case ASCII letters and digits
    """
    alphabet = list(string.ascii_uppercase + string.digits)
    drawn = [np.random.choice(alphabet) for _ in range(random_string_len)]
    return ''.join(drawn)
146-
147-
148-
def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_',
149-
min_rescan_support=4, pe_blacklist=None, quiet=False,
150-
SR_only_cutoff=1000, random_resolved_id_length=10):
137+
def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, resolved_record_namer, unresolved_record_namer,
138+
min_rescan_support=4, pe_blacklist=None, quiet=False, SR_only_cutoff=1000):
151139
"""
152140
Resolve complex SV from CNV intervals and BCA breakpoints.
153141
Yields all resolved events, simple or complex, in sorted order.
@@ -157,8 +145,10 @@ def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_
157145
cytobands : pysam.TabixFile
158146
disc_pairs : pysam.TabixFile
159147
mei_bed : pybedtools.BedTool
160-
variant_prefix : str
161-
Prefix to assign to resolved variants
148+
resolved_record_namer: RecordNamer
149+
RecordNamer object for resolved variants
150+
unresolved_record_namer: RecordNamer
151+
RecordNamer object for unresolved variants
162152
min_rescan_support : int
163153
Number of pairs required to count a sample as
164154
supported during PE rescan
@@ -182,14 +172,8 @@ def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_
182172
'identified ' + str(len(clusters)) + ' candidate complex clusters ' +
183173
'during first pass', flush=True)
184174

185-
# resolved_idx = unresolved_idx = 1
186-
187-
if not variant_prefix.endswith('_'):
188-
variant_prefix += '_'
189-
190175
cpx_records = deque()
191176
cpx_record_ids = set()
192-
np.random.seed(1) # arbitrary fixed seed for reproducibility
193177

194178
for cluster in clusters:
195179
# Print status for each cluster
@@ -212,22 +196,15 @@ def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_
212196
for record in cluster:
213197
cpx = ComplexSV([record], cytobands, mei_bed, SR_only_cutoff)
214198
cpx_record_ids = cpx_record_ids.union(cpx.record_ids)
215-
216-
# Assign random string as resolved ID to handle sharding
217-
cpx.vcf_record.id = variant_prefix + \
218-
get_random_string(random_resolved_id_length)
199+
cpx.vcf_record.id = resolved_record_namer.get_next_id(cpx.vcf_record)
219200
cpx_records.append(cpx.vcf_record)
220-
# resolved_idx += 1
221201
outcome = 'treated as separate unrelated insertions'
222202
else:
223203
cpx = ComplexSV(cluster, cytobands, mei_bed, SR_only_cutoff)
224204
cpx_record_ids = cpx_record_ids.union(cpx.record_ids)
225205
if cpx.svtype == 'UNR':
226-
# Assign random string as unresolved ID to handle sharding
227-
unresolved_vid = 'UNRESOLVED_' + \
228-
get_random_string(random_resolved_id_length)
229206
for record in cpx.records:
230-
record.info['EVENT'] = unresolved_vid
207+
record.info['EVENT'] = unresolved_record_namer.get_next_id(cpx.vcf_record)
231208
record.info['UNRESOLVED'] = True
232209
cpx_records.append(record)
233210
# unresolved_idx += 1
@@ -252,8 +229,7 @@ def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_
252229
'The following records were merged into the INS record: ' + \
253230
', '.join(cnv_ids_to_append)
254231
else:
255-
cpx.vcf_record.id = variant_prefix + \
256-
get_random_string(random_resolved_id_length)
232+
cpx.vcf_record.id = resolved_record_namer.get_next_id(cpx.vcf_record)
257233
cpx_records.append(cpx.vcf_record)
258234
if 'CPX_TYPE' in cpx.vcf_record.info.keys():
259235
outcome = 'resolved as ' + \
@@ -310,15 +286,13 @@ def cluster_cleanup(clusters_v2):
310286

311287

312288
def resolve_complex_sv_v2(resolve_INV, cytobands, disc_pairs,
313-
mei_bed, variant_prefix='CPX_', min_rescan_support=4,
314-
pe_blacklist=None, quiet=False, SR_only_cutoff=1000,
315-
random_resolved_id_length=10):
289+
mei_bed, resolved_record_namer, unresolved_record_namer,
290+
min_rescan_support=4, pe_blacklist=None, quiet=False,
291+
SR_only_cutoff=1000):
316292
linked_INV = cluster_INV(resolve_INV)
317293
clusters_v2 = link_cpx_V2(linked_INV, cpx_dist=2000)
318294
clusters_v2 = cluster_cleanup(clusters_v2)
319295

320-
np.random.seed(0) # arbitrary fixed seed for reproducibility
321-
322296
# Print number of candidate clusters identified
323297
if not quiet:
324298
now = datetime.datetime.now()
@@ -349,29 +323,22 @@ def resolve_complex_sv_v2(resolve_INV, cytobands, disc_pairs,
349323
for record in cluster:
350324
cpx = ComplexSV([record], cytobands, mei_bed, SR_only_cutoff)
351325
cpx_record_ids_v2.update(cpx.record_ids)
352-
353-
# Assign random string as resolved ID to handle sharding
354-
cpx.vcf_record.id = variant_prefix + '_' + \
355-
get_random_string(random_resolved_id_length)
326+
cpx.vcf_record.id = resolved_record_namer.get_next_id(cpx.vcf_record)
356327
cpx_records_v2.append(cpx.vcf_record)
357328
# resolved_idx += 1
358329
outcome = 'treated as separate unrelated insertions'
359330
else:
360331
cpx = ComplexSV(cluster, cytobands, mei_bed, SR_only_cutoff)
361332
cpx_record_ids_v2.update(cpx.record_ids)
362333
if cpx.svtype == 'UNR':
363-
# Assign random string as unresolved ID to handle sharding
364-
unresolved_vid = 'UNRESOLVED_' + \
365-
get_random_string(random_resolved_id_length)
366334
for record in cpx.records:
367-
record.info['EVENT'] = unresolved_vid
335+
record.info['EVENT'] = unresolved_record_namer.get_next_id(cpx.vcf_record)
368336
record.info['UNRESOLVED'] = True
369337
cpx_records_v2.append(record)
370338
# unresolved_idx += 1
371339
outcome = 'is unresolved'
372340
else:
373-
cpx.vcf_record.id = variant_prefix + '_' + \
374-
get_random_string(random_resolved_id_length)
341+
cpx.vcf_record.id = resolved_record_namer.get_next_id(cpx.vcf_record)
375342
cpx_records_v2.append(cpx.vcf_record)
376343
if 'CPX_TYPE' in cpx.vcf_record.info.keys():
377344
outcome = 'resolved as ' + \
@@ -417,11 +384,6 @@ def main(argv):
417384
parser.add_argument('--cytobands', help='Cytoband file. Required to '
418385
'correctly classify interchromosomal events.',
419386
required=True)
420-
# parser.add_argument('--bincov', help='Bincov file.', required=True)
421-
# parser.add_argument('--medianfile', help='Medianfile', required=True)
422-
# parser.add_argument('--famfile', help='Fam file', required=True)
423-
# parser.add_argument('--cutoffs', help='Random forest cutoffs',
424-
# required=True)
425387
parser.add_argument('--min-rescan-pe-support', type=int, default=4,
426388
help='Minumum discordant pairs required during '
427389
'single-ender rescan.')
@@ -433,6 +395,10 @@ def main(argv):
433395
help='Unresolved complex breakpoints and CNV.')
434396
parser.add_argument('-p', '--prefix', default='CPX_',
435397
help='Variant prefix [CPX_]')
398+
parser.add_argument('-d', '--variant-id-digits', type=int, default=6,
399+
help='Number of digits in variant IDs.')
400+
parser.add_argument('-t', '--temp-dir', type=str, default=None,
401+
help='Temporary directory path for vcf sorting. [Default uses TMPDIR environment variable]')
436402
parser.add_argument('-q', '--quiet', default=False,
437403
help='Disable progress logging to stderr.')
438404

@@ -451,7 +417,10 @@ def main(argv):
451417
for line in CPX_INFO:
452418
vcf.header.add_line(line)
453419

454-
resolved_pipe = subprocess.Popen(['vcf-sort', '-c'],
420+
sort_command = ['bcftools', 'sort']
421+
if args.temp_dir:
422+
sort_command.extend(['--temp-dir', args.temp_dir])
423+
resolved_pipe = subprocess.Popen(sort_command,
455424
stdin=subprocess.PIPE,
456425
stdout=args.resolved)
457426

@@ -465,9 +434,6 @@ def main(argv):
465434
blacklist = pysam.TabixFile(args.pe_blacklist)
466435
else:
467436
blacklist = None
468-
# cutoffs = pd.read_table(args.cutoffs)
469-
# rdtest = svu.RdTest(args.bincov, args.medianfile, args.famfile,
470-
# list(vcf.header.samples), cutoffs)
471437

472438
if args.discfile is not None:
473439
disc_pairs = pysam.TabixFile(args.discfile)
@@ -481,10 +447,11 @@ def main(argv):
481447
resolved_records = []
482448
unresolved_records = []
483449
resolve_INV = []
484-
# cpx_dist = 20000
450+
resolved_record_namer = RecordNamer(prefix=args.prefix + '_CPX', num_digits=args.variant_id_digits)
451+
unresolved_record_namer = RecordNamer(prefix=args.prefix + '_UNRES', num_digits=args.variant_id_digits)
485452

486-
for record in resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, args.prefix,
487-
args.min_rescan_pe_support, blacklist, args.quiet):
453+
for record in resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, resolved_record_namer,
454+
unresolved_record_namer, args.min_rescan_pe_support, blacklist, args.quiet):
488455
# Move members to existing variant IDs unless variant is complex
489456
if record.info['SVTYPE'] != 'CPX' and args.prefix not in record.id:
490457
# Don't alter MEMBERS if the prefix of record.id is already in MEMBERS
@@ -500,7 +467,6 @@ def main(argv):
500467
else:
501468
resolved_records.append(record)
502469

503-
# out_rec = resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, args.prefix, args.min_rescan_pe_support, blacklist)
504470
# Print status
505471
if not args.quiet:
506472
now = datetime.datetime.now()
@@ -510,9 +476,8 @@ def main(argv):
510476

511477
# RLC: As of Sept 19, 2018, only considering inversion single-enders in second-pass
512478
# due to too many errors in second-pass linking and variant reporting
513-
cpx_records_v2 = resolve_complex_sv_v2(resolve_INV,
514-
cytobands, disc_pairs, mei_bed, args.prefix,
515-
args.min_rescan_pe_support, blacklist, args.quiet)
479+
cpx_records_v2 = resolve_complex_sv_v2(resolve_INV, cytobands, disc_pairs, mei_bed, resolved_record_namer,
480+
unresolved_record_namer, args.min_rescan_pe_support, blacklist, args.quiet)
516481

517482
for record in cpx_records_v2:
518483
# Move members to existing variant IDs unless variant is complex

0 commit comments

Comments
 (0)