Skip to content

Commit f4630d2

Browse files
authored
Version 3.1.2 (#31)
Add --write-nocalls-in-vcf option to write no-call sites in the VCF
1 parent 75facb3 commit f4630d2

3 files changed

Lines changed: 109 additions & 35 deletions

File tree

paraphase/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "3.1.1"
1+
__version__ = "3.1.2"

paraphase/paraphase.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ def process_gene(
171171
sample_id,
172172
outdir,
173173
phaser_call,
174+
args,
174175
)
175176
vcf_generater.set_parameter(
176177
config, tmpdir=tmpdir, prog_cmd=prog_cmd
@@ -181,6 +182,7 @@ def process_gene(
181182
sample_id,
182183
outdir,
183184
phaser_call,
185+
args,
184186
)
185187
vcf_generater.set_parameter(
186188
config, tmpdir=tmpdir, prog_cmd=prog_cmd
@@ -646,6 +648,12 @@ def load_parameters(self):
646648
required=False,
647649
action="store_true",
648650
)
651+
parser.add_argument(
652+
"--write-nocalls-in-vcf",
653+
help="Optional. If specified, Paraphase will write no-call sites in the VCFs, marked with LowQual filter.",
654+
required=False,
655+
action="store_true",
656+
)
649657
parser.add_argument(
650658
"--samtools",
651659
help="Optional path to samtools",

paraphase/prepare_bam_and_vcf.py

Lines changed: 100 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -344,10 +344,13 @@ class VcfGenerater:
344344
search_range = 200
345345
min_base_quality_for_variant_calling = 25
346346

347-
def __init__(self, sample_id, outdir, call_sum):
347+
def __init__(self, sample_id, outdir, call_sum, args=None):
348348
self.sample_id = sample_id
349349
self.outdir = outdir
350350
self.call_sum = call_sum
351+
self.lowqual = False
352+
if args is not None:
353+
self.lowqual = args.write_nocalls_in_vcf
351354
self.match = {}
352355

353356
def set_parameter(self, config, tmpdir=None, prog_cmd=None):
@@ -395,7 +398,7 @@ def write_header(self, fout):
395398
"""Write VCF header"""
396399
fout.write("##fileformat=VCFv4.2\n")
397400
fout.write('##FILTER=<ID=PASS,Description="All filters passed">\n')
398-
# fout.write('##FILTER=<ID=LowQual,Description="Nonpassing variant">\n')
401+
fout.write('##FILTER=<ID=LowQual,Description="Nonpassing variant">\n')
399402
fout.write(
400403
'##INFO=<ID=HPBOUND,Number=.,Type=String,Description="Boundary coordinates of the phased haplotype">\n'
401404
)
@@ -495,16 +498,55 @@ def merge_vcf(self, vars_list):
495498
call_info = variants_info[pos]
496499
# unique variants at this site
497500
variant_observed = set([a[0] for a in call_info if a is not None])
501+
ref_only = False
502+
if len(variant_observed) == 1:
503+
_, ref, alt = list(variant_observed)[0].split("_")
504+
if ref == alt:
505+
ref_only = True
506+
if len(variant_observed) == 2:
507+
has_ref = False
508+
has_del = False
509+
for var in variant_observed:
510+
_, a, b = var.split("_")
511+
if a == b:
512+
has_ref = True
513+
elif b == "*":
514+
has_del = True
515+
if has_ref and has_del:
516+
ref_only = True
517+
498518
for variant in variant_observed:
499519
_, ref, alt = variant.split("_")
520+
valid_gts = []
500521
merge_gt = []
501522
merge_ad = []
502523
merge_dp = []
503-
for each_call in call_info:
524+
for each_call_index, each_call in enumerate(call_info):
504525
if each_call is None:
505526
merge_gt.append(".")
506527
merge_ad.append(".")
507528
merge_dp.append(".")
529+
hap_info = haps_info[each_call_index]
530+
(
531+
_,
532+
hap_info_bound1,
533+
hap_info_bound2,
534+
hap_info_truncated,
535+
) = hap_info
536+
if (
537+
hap_info_truncated is None
538+
or hap_info_truncated is False
539+
):
540+
valid_gts.append(".")
541+
elif hap_info_truncated == ["5p"]:
542+
if pos > hap_info_bound1:
543+
valid_gts.append(".")
544+
elif hap_info_truncated == ["3p"]:
545+
if pos < hap_info_bound2:
546+
valid_gts.append(".")
547+
elif hap_info_truncated == ["5p", "3p"]:
548+
if hap_info_bound1 < pos < hap_info_bound2:
549+
valid_gts.append(".")
508550
else:
509551
var_name, dp, ad, var_filter, gt, counter = each_call
510552
if counter is None:
@@ -527,39 +569,51 @@ def merge_vcf(self, vars_list):
527569
]
528570
)
529571
else:
530-
this_ad = ",".join([str(a) for a in [ad[0], 0]])
572+
this_ad = ",".join(
573+
[str(a) for a in [ad[0], dp - ad[0]]]
574+
)
531575
if var_filter != []:
532576
gt = "."
533577
merge_dp.append(str(dp))
534578
if gt == "0":
535579
merge_gt.append(gt)
580+
valid_gts.append(gt)
536581
merge_ad.append(this_ad)
537582
elif var_name == variant:
538583
merge_gt.append(gt)
584+
valid_gts.append(gt)
539585
merge_ad.append(this_ad)
540586
else:
541587
merge_gt.append(".")
588+
valid_gts.append(".")
542589
merge_ad.append(this_ad)
543-
if list_counter == 0 and haps_ids != haps_ids1:
544-
for _ in range(len(haps_ids2)):
545-
merge_gt.append(".")
546-
merge_ad.append(".")
547-
merge_dp.append(".")
548-
elif list_counter > 0:
549-
for _ in range(len(haps_ids1)):
550-
merge_gt.insert(0, ".")
551-
merge_ad.insert(0, ".")
552-
merge_dp.insert(0, ".")
590+
write_variant = False
591+
if self.lowqual is True:
592+
if (
593+
(alt != ref or ref_only)
594+
and alt not in [".", "*"]
595+
and ("1" in merge_gt or "." in valid_gts)
596+
):
597+
write_variant = True
598+
elif alt != ref and alt not in [".", "*"] and "1" in merge_gt:
599+
write_variant = True
553600
final_qual = "."
554-
if (
555-
alt != ref
556-
and alt not in [".", "*"]
557-
and "1" in merge_gt # or "." in merge_gt
558-
):
601+
if write_variant:
602+
if list_counter == 0 and haps_ids != haps_ids1:
603+
for _ in range(len(haps_ids2)):
604+
merge_gt.append(".")
605+
merge_ad.append(".")
606+
merge_dp.append(".")
607+
elif list_counter > 0:
608+
for _ in range(len(haps_ids1)):
609+
merge_gt.insert(0, ".")
610+
merge_ad.insert(0, ".")
611+
merge_dp.insert(0, ".")
612+
559613
if "1" in merge_gt:
560614
variant_filter = "PASS"
561-
# else:
562-
# variant_filter = "LowQual"
615+
else:
616+
variant_filter = "LowQual"
563617
info_field = "HPBOUND=" + ",".join(haps_bounds)
564618
alleles = self.call_sum.get("alleles_final")
565619
if alleles is not None and alleles != []:
@@ -778,7 +832,7 @@ def pileup_to_variant(
778832
var_filter = []
779833
if dp < min_depth:
780834
var_filter.append("LowDP")
781-
if ad[1] < dp * 0.7:
835+
if (gt == "1" and ad[1] < dp * 0.7) or (gt == "0" and ad[0] < dp * 0.7):
782836
var_filter.append("LowQual")
783837
if var_filter != []:
784838
gt = "."
@@ -843,7 +897,7 @@ def run_without_realign(
843897
two_cp_haplotypes = self.call_sum.get("two_copy_haplotypes")
844898
nhap = len(final_haps)
845899
if two_cp_haplotypes is not None:
846-
nhap += len(two_cp_haplotypes)
900+
nhap += len([a for a in two_cp_haplotypes if a in final_haps.values()])
847901
hap_info = []
848902

849903
# gene1only, or two-gene mode but gene1 side
@@ -862,6 +916,11 @@ def run_without_realign(
862916
hap_name = f"{self.gene}_homozygous_hap1"
863917
pileups_raw = {}
864918
read_names = {}
919+
920+
for pos in range(self.left_boundary, self.right_boundary):
921+
pileups_raw.setdefault(pos, [])
922+
read_names.setdefault(pos, [])
923+
865924
for pileupcolumn in bamh.pileup(
866925
nchr,
867926
truncate=True,
@@ -871,8 +930,8 @@ def run_without_realign(
871930
this_pos_bases = [
872931
a.upper() for a in pileupcolumn.get_query_sequences(add_indels=True)
873932
]
874-
pileups_raw.setdefault(pos, this_pos_bases)
875-
read_names.setdefault(pos, pileupcolumn.get_query_names())
933+
pileups_raw[pos] = this_pos_bases
934+
read_names[pos] = pileupcolumn.get_query_names()
876935
variants_called = self.pileup_to_variant(
877936
pileups_raw,
878937
read_names,
@@ -887,13 +946,14 @@ def run_without_realign(
887946
)
888947

889948
for pos, var_name, dp, ad, var_filter, gt, counter in variants_called:
890-
variants_info.setdefault(
891-
pos,
892-
[
893-
[var_name, dp, ad, var_filter, gt, counter],
894-
[var_name, dp, ad, var_filter, gt, counter],
895-
],
896-
)
949+
if pos > self.left_boundary and pos < self.right_boundary:
950+
variants_info.setdefault(
951+
pos,
952+
[
953+
[var_name, dp, ad, var_filter, gt, counter],
954+
[var_name, dp, ad, var_filter, gt, counter],
955+
],
956+
)
897957

898958
i = 0
899959
for hap_name in final_haps.values():
@@ -943,6 +1003,12 @@ def run_without_realign(
9431003
# by HP tag
9441004
pileups_raw = {}
9451005
read_names = {}
1006+
1007+
if hap_bound != []:
1008+
for pos in range(hap_bound[0], hap_bound[1]):
1009+
pileups_raw.setdefault(pos, [])
1010+
read_names.setdefault(pos, [])
1011+
9461012
for pileupcolumn in bamh.pileup(
9471013
nchr,
9481014
truncate=True,
@@ -1024,8 +1090,8 @@ class TwoGeneVcfGenerater(VcfGenerater):
10241090
Make vcf for two-gene scenario
10251091
"""
10261092

1027-
def __init__(self, sample_id, outdir, call_sum):
1028-
VcfGenerater.__init__(self, sample_id, outdir, call_sum)
1093+
def __init__(self, sample_id, outdir, call_sum, args):
1094+
VcfGenerater.__init__(self, sample_id, outdir, call_sum, args)
10291095

10301096
def set_parameter(self, config, tmpdir=None, prog_cmd=None):
10311097
super().set_parameter(config, tmpdir, prog_cmd)

0 commit comments

Comments
 (0)