99import argparse
1010import sys
1111import subprocess
12- import numpy as np
13- import string
1412from collections import deque
1513from operator import attrgetter
1614import itertools
3634]
3735
3836
class RecordNamer:
    """
    Hand out sequential, contig-tagged variant IDs.

    Each call to get_next_id() returns
    ``<prefix>_<contig>_<zero-padded counter>`` and advances the counter, so
    IDs are deterministic for a given call order (unlike random-string IDs).

    Parameters
    ----------
    prefix : str
        Leading part of every ID. A single trailing underscore is tolerated
        and is not duplicated in the generated ID.
    num_digits : int
        Minimum width of the zero-padded counter. Counters that need more
        digits are emitted in full, never truncated.

    Attributes
    ----------
    count : int
        Number of IDs issued so far; also the index of the next ID.
    """

    def __init__(self, prefix='CPX_', num_digits=6):
        self.prefix = prefix
        self.num_digits = num_digits
        self.count = 0

    def get_next_id(self, record):
        """
        Return the next ID for `record` and advance the counter.

        Parameters
        ----------
        record : object with a `contig` attribute
            Only `record.contig` is read (presumably a pysam.VariantRecord;
            confirm against callers).

        Returns
        -------
        str
            ID with no embedded whitespace, e.g. ``CPX_chr1_000000``.
        """
        index = self.count
        self.count += 1
        # The separator is added here, so drop one trailing underscore from
        # the prefix to avoid IDs such as "CPX__chr1_000000".
        stem = self.prefix[:-1] if self.prefix.endswith('_') else self.prefix
        return f"{stem}_{record.contig}_{str(index).zfill(self.num_digits)}"
46+
47+
3948def _merge_records (vcf , cpx_records , cpx_record_ids ):
4049 """
4150 r1, r2 : iter of pysam.VariantRecord
@@ -81,18 +90,6 @@ def _next_cpx():
8190 curr_cpx = _next_cpx ()
8291
8392
84- def remove_CPX_from_INV (resolve_CPX , resolve_INV ):
85- """
86- Return list of inversion calls not overlapped by list of complex calls
87- """
88- cpx_interval = [(i .chrom , i .pos , i .stop ) for i in resolve_CPX ]
89- out = [
90- inv for inv in resolve_INV
91- if not any (cpx [0 ] == inv .chrom and cpx [1 ] <= i .stop and i .pos <= cpx [2 ] for cpx in cpx_interval )
92- ]
93- return out
94-
95-
9693def multisort (xs , specs ):
9794 for key , reverse in reversed (specs ):
9895 xs .sort (key = attrgetter (key ), reverse = reverse )
@@ -137,17 +134,8 @@ def clusters_cleanup(clusters):
137134 return deque (cluster_single_cleanup (cluster ) for cluster in clusters )
138135
139136
140- def get_random_string (random_string_len ):
141- """
142- Produce string of random upper-case characters and digits, of requested length
143- """
144- return '' .join (np .random .choice (list (string .ascii_uppercase + string .digits ))
145- for _ in range (random_string_len ))
146-
147-
148- def resolve_complex_sv (vcf , cytobands , disc_pairs , mei_bed , variant_prefix = 'CPX_' ,
149- min_rescan_support = 4 , pe_blacklist = None , quiet = False ,
150- SR_only_cutoff = 1000 , random_resolved_id_length = 10 ):
137+ def resolve_complex_sv (vcf , cytobands , disc_pairs , mei_bed , resolved_record_namer , unresolved_record_namer ,
138+ min_rescan_support = 4 , pe_blacklist = None , quiet = False , SR_only_cutoff = 1000 ):
151139 """
152140 Resolve complex SV from CNV intervals and BCA breakpoints.
153141 Yields all resolved events, simple or complex, in sorted order.
@@ -157,8 +145,10 @@ def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_
157145 cytobands : pysam.TabixFile
158146 disc_pairs : pysam.TabixFile
159147 mei_bed : pybedtools.BedTool
160- variant_prefix : str
161- Prefix to assign to resolved variants
148+ resolved_record_namer: RecordNamer
149+ RecordNamer object for resolved variants
150+ unresolved_record_namer: RecordNamer
151+ RecordNamer object for unresolved variants
162152 min_rescan_support : int
163153 Number of pairs required to count a sample as
164154 supported during PE rescan
@@ -182,14 +172,8 @@ def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_
182172 'identified ' + str (len (clusters )) + ' candidate complex clusters ' +
183173 'during first pass' , flush = True )
184174
185- # resolved_idx = unresolved_idx = 1
186-
187- if not variant_prefix .endswith ('_' ):
188- variant_prefix += '_'
189-
190175 cpx_records = deque ()
191176 cpx_record_ids = set ()
192- np .random .seed (1 ) # arbitrary fixed seed for reproducibility
193177
194178 for cluster in clusters :
195179 # Print status for each cluster
@@ -212,22 +196,15 @@ def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_
212196 for record in cluster :
213197 cpx = ComplexSV ([record ], cytobands , mei_bed , SR_only_cutoff )
214198 cpx_record_ids = cpx_record_ids .union (cpx .record_ids )
215-
216- # Assign random string as resolved ID to handle sharding
217- cpx .vcf_record .id = variant_prefix + \
218- get_random_string (random_resolved_id_length )
199+ cpx .vcf_record .id = resolved_record_namer .get_next_id (cpx .vcf_record )
219200 cpx_records .append (cpx .vcf_record )
220- # resolved_idx += 1
221201 outcome = 'treated as separate unrelated insertions'
222202 else :
223203 cpx = ComplexSV (cluster , cytobands , mei_bed , SR_only_cutoff )
224204 cpx_record_ids = cpx_record_ids .union (cpx .record_ids )
225205 if cpx .svtype == 'UNR' :
226- # Assign random string as unresolved ID to handle sharding
227- unresolved_vid = 'UNRESOLVED_' + \
228- get_random_string (random_resolved_id_length )
229206 for record in cpx .records :
230- record .info ['EVENT' ] = unresolved_vid
207+ record .info ['EVENT' ] = unresolved_record_namer . get_next_id ( cpx . vcf_record )
231208 record .info ['UNRESOLVED' ] = True
232209 cpx_records .append (record )
233210 # unresolved_idx += 1
@@ -252,8 +229,7 @@ def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_
252229 'The following records were merged into the INS record: ' + \
253230 ', ' .join (cnv_ids_to_append )
254231 else :
255- cpx .vcf_record .id = variant_prefix + \
256- get_random_string (random_resolved_id_length )
232+ cpx .vcf_record .id = resolved_record_namer .get_next_id (cpx .vcf_record )
257233 cpx_records .append (cpx .vcf_record )
258234 if 'CPX_TYPE' in cpx .vcf_record .info .keys ():
259235 outcome = 'resolved as ' + \
@@ -310,15 +286,13 @@ def cluster_cleanup(clusters_v2):
310286
311287
312288def resolve_complex_sv_v2 (resolve_INV , cytobands , disc_pairs ,
313- mei_bed , variant_prefix = 'CPX_' , min_rescan_support = 4 ,
314- pe_blacklist = None , quiet = False , SR_only_cutoff = 1000 ,
315- random_resolved_id_length = 10 ):
289+ mei_bed , resolved_record_namer , unresolved_record_namer ,
290+ min_rescan_support = 4 , pe_blacklist = None , quiet = False ,
291+ SR_only_cutoff = 1000 ):
316292 linked_INV = cluster_INV (resolve_INV )
317293 clusters_v2 = link_cpx_V2 (linked_INV , cpx_dist = 2000 )
318294 clusters_v2 = cluster_cleanup (clusters_v2 )
319295
320- np .random .seed (0 ) # arbitrary fixed seed for reproducibility
321-
322296 # Print number of candidate clusters identified
323297 if not quiet :
324298 now = datetime .datetime .now ()
@@ -349,29 +323,22 @@ def resolve_complex_sv_v2(resolve_INV, cytobands, disc_pairs,
349323 for record in cluster :
350324 cpx = ComplexSV ([record ], cytobands , mei_bed , SR_only_cutoff )
351325 cpx_record_ids_v2 .update (cpx .record_ids )
352-
353- # Assign random string as resolved ID to handle sharding
354- cpx .vcf_record .id = variant_prefix + '_' + \
355- get_random_string (random_resolved_id_length )
326+ cpx .vcf_record .id = resolved_record_namer .get_next_id (cpx .vcf_record )
356327 cpx_records_v2 .append (cpx .vcf_record )
357328 # resolved_idx += 1
358329 outcome = 'treated as separate unrelated insertions'
359330 else :
360331 cpx = ComplexSV (cluster , cytobands , mei_bed , SR_only_cutoff )
361332 cpx_record_ids_v2 .update (cpx .record_ids )
362333 if cpx .svtype == 'UNR' :
363- # Assign random string as unresolved ID to handle sharding
364- unresolved_vid = 'UNRESOLVED_' + \
365- get_random_string (random_resolved_id_length )
366334 for record in cpx .records :
367- record .info ['EVENT' ] = unresolved_vid
335+ record .info ['EVENT' ] = unresolved_record_namer . get_next_id ( cpx . vcf_record )
368336 record .info ['UNRESOLVED' ] = True
369337 cpx_records_v2 .append (record )
370338 # unresolved_idx += 1
371339 outcome = 'is unresolved'
372340 else :
373- cpx .vcf_record .id = variant_prefix + '_' + \
374- get_random_string (random_resolved_id_length )
341+ cpx .vcf_record .id = resolved_record_namer .get_next_id (cpx .vcf_record )
375342 cpx_records_v2 .append (cpx .vcf_record )
376343 if 'CPX_TYPE' in cpx .vcf_record .info .keys ():
377344 outcome = 'resolved as ' + \
@@ -417,11 +384,6 @@ def main(argv):
417384 parser .add_argument ('--cytobands' , help = 'Cytoband file. Required to '
418385 'correctly classify interchromosomal events.' ,
419386 required = True )
420- # parser.add_argument('--bincov', help='Bincov file.', required=True)
421- # parser.add_argument('--medianfile', help='Medianfile', required=True)
422- # parser.add_argument('--famfile', help='Fam file', required=True)
423- # parser.add_argument('--cutoffs', help='Random forest cutoffs',
424- # required=True)
425387 parser .add_argument ('--min-rescan-pe-support' , type = int , default = 4 ,
426388 help = 'Minumum discordant pairs required during '
427389 'single-ender rescan.' )
@@ -433,6 +395,10 @@ def main(argv):
433395 help = 'Unresolved complex breakpoints and CNV.' )
434396 parser .add_argument ('-p' , '--prefix' , default = 'CPX_' ,
435397 help = 'Variant prefix [CPX_]' )
398+ parser .add_argument ('-d' , '--variant-id-digits' , type = int , default = 6 ,
399+ help = 'Number of digits in variant IDs.' )
400+ parser .add_argument ('-t' , '--temp-dir' , type = str , default = None ,
401+ help = 'Temporary directory path for vcf sorting. [Default uses TMPDIR environment variable]' )
436402 parser .add_argument ('-q' , '--quiet' , default = False ,
437403 help = 'Disable progress logging to stderr.' )
438404
@@ -451,7 +417,10 @@ def main(argv):
451417 for line in CPX_INFO :
452418 vcf .header .add_line (line )
453419
454- resolved_pipe = subprocess .Popen (['vcf-sort' , '-c' ],
420+ sort_command = ['bcftools' , 'sort' ]
421+ if args .temp_dir :
422+ sort_command .extend (['--temp-dir' , args .temp_dir ])
423+ resolved_pipe = subprocess .Popen (sort_command ,
455424 stdin = subprocess .PIPE ,
456425 stdout = args .resolved )
457426
@@ -465,9 +434,6 @@ def main(argv):
465434 blacklist = pysam .TabixFile (args .pe_blacklist )
466435 else :
467436 blacklist = None
468- # cutoffs = pd.read_table(args.cutoffs)
469- # rdtest = svu.RdTest(args.bincov, args.medianfile, args.famfile,
470- # list(vcf.header.samples), cutoffs)
471437
472438 if args .discfile is not None :
473439 disc_pairs = pysam .TabixFile (args .discfile )
@@ -481,10 +447,11 @@ def main(argv):
481447 resolved_records = []
482448 unresolved_records = []
483449 resolve_INV = []
484- # cpx_dist = 20000
450+ resolved_record_namer = RecordNamer (prefix = args .prefix + '_CPX' , num_digits = args .variant_id_digits )
451+ unresolved_record_namer = RecordNamer (prefix = args .prefix + '_UNRES' , num_digits = args .variant_id_digits )
485452
486- for record in resolve_complex_sv (vcf , cytobands , disc_pairs , mei_bed , args . prefix ,
487- args .min_rescan_pe_support , blacklist , args .quiet ):
453+ for record in resolve_complex_sv (vcf , cytobands , disc_pairs , mei_bed , resolved_record_namer ,
454+ unresolved_record_namer , args .min_rescan_pe_support , blacklist , args .quiet ):
488455 # Move members to existing variant IDs unless variant is complex
489456 if record .info ['SVTYPE' ] != 'CPX' and args .prefix not in record .id :
490457 # Don't alter MEMBERS if the prefix of record.id is already in MEMBERS
@@ -500,7 +467,6 @@ def main(argv):
500467 else :
501468 resolved_records .append (record )
502469
503- # out_rec = resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed, args.prefix, args.min_rescan_pe_support, blacklist)
504470 # Print status
505471 if not args .quiet :
506472 now = datetime .datetime .now ()
@@ -510,9 +476,8 @@ def main(argv):
510476
511477 # RLC: As of Sept 19, 2018, only considering inversion single-enders in second-pass
512478 # due to too many errors in second-pass linking and variant reporting
513- cpx_records_v2 = resolve_complex_sv_v2 (resolve_INV ,
514- cytobands , disc_pairs , mei_bed , args .prefix ,
515- args .min_rescan_pe_support , blacklist , args .quiet )
479+ cpx_records_v2 = resolve_complex_sv_v2 (resolve_INV , cytobands , disc_pairs , mei_bed , resolved_record_namer ,
480+ unresolved_record_namer , args .min_rescan_pe_support , blacklist , args .quiet )
516481
517482 for record in cpx_records_v2 :
518483 # Move members to existing variant IDs unless variant is complex
0 commit comments