@@ -344,10 +344,13 @@ class VcfGenerater:
344344 search_range = 200
345345 min_base_quality_for_variant_calling = 25
346346
347- def __init__ (self , sample_id , outdir , call_sum ):
347+ def __init__ (self , sample_id , outdir , call_sum , args = None ):
348348 self .sample_id = sample_id
349349 self .outdir = outdir
350350 self .call_sum = call_sum
351+ self .lowqual = False
352+ if args is not None :
353+ self .lowqual = args .write_nocalls_in_vcf
351354 self .match = {}
352355
353356 def set_parameter (self , config , tmpdir = None , prog_cmd = None ):
@@ -395,7 +398,7 @@ def write_header(self, fout):
395398 """Write VCF header"""
396399 fout .write ("##fileformat=VCFv4.2\n " )
397400 fout .write ('##FILTER=<ID=PASS,Description="All filters passed">\n ' )
398- # fout.write('##FILTER=<ID=LowQual,Description="Nonpassing variant">\n')
401+ fout .write ('##FILTER=<ID=LowQual,Description="Nonpassing variant">\n ' )
399402 fout .write (
400403 '##INFO=<ID=HPBOUND,Number=.,Type=String,Description="Boundary coordinates of the phased haplotype">\n '
401404 )
@@ -495,16 +498,55 @@ def merge_vcf(self, vars_list):
495498 call_info = variants_info [pos ]
496499 # unique variants at this site
497500 variant_observed = set ([a [0 ] for a in call_info if a is not None ])
501+ ref_only = False
502+ if len (variant_observed ) == 1 :
503+ _ , ref , alt = list (variant_observed )[0 ].split ("_" )
504+ if ref == alt :
505+ ref_only = True
506+ if len (variant_observed ) == 2 :
507+ has_ref = False
508+ has_del = False
509+ for var in variant_observed :
510+ _ , a , b = var .split ("_" )
511+ if a == b :
512+ has_ref = True
513+ elif b == "*" :
514+ has_del = True
515+ if has_ref and has_del :
516+ ref_only = True
517+
498518 for variant in variant_observed :
499519 _ , ref , alt = variant .split ("_" )
520+ valid_gts = []
500521 merge_gt = []
501522 merge_ad = []
502523 merge_dp = []
503- for each_call in call_info :
524+ for each_call_index , each_call in enumerate ( call_info ) :
504525 if each_call is None :
505526 merge_gt .append ("." )
506527 merge_ad .append ("." )
507528 merge_dp .append ("." )
529+ hap_info = haps_info [each_call_index ]
530+ (
531+ _ ,
532+ hap_info_bound1 ,
533+ hap_info_bound2 ,
534+ hap_info_truncated ,
535+ ) = hap_info
536+ if (
537+ hap_info_truncated is None
538+ or hap_info_truncated is False
539+ ):
540+ valid_gts .append ("." )
541+ elif hap_info_truncated == ["5p" ]:
542+ if pos > hap_info_bound1 :
543+ valid_gts .append ("." )
544+ elif hap_info_truncated == ["3p" ]:
545+ if pos < hap_info_bound2 :
546+ valid_gts .append ("." )
547+ elif hap_info_truncated == ["5p" , "3p" ]:
548+ if hap_info_bound1 < pos < hap_info_bound2 :
549+ valid_gts .append ("." )
508550 else :
509551 var_name , dp , ad , var_filter , gt , counter = each_call
510552 if counter is None :
@@ -527,39 +569,51 @@ def merge_vcf(self, vars_list):
527569 ]
528570 )
529571 else :
530- this_ad = "," .join ([str (a ) for a in [ad [0 ], 0 ]])
572+ this_ad = "," .join (
573+ [str (a ) for a in [ad [0 ], dp - ad [0 ]]]
574+ )
531575 if var_filter != []:
532576 gt = "."
533577 merge_dp .append (str (dp ))
534578 if gt == "0" :
535579 merge_gt .append (gt )
580+ valid_gts .append (gt )
536581 merge_ad .append (this_ad )
537582 elif var_name == variant :
538583 merge_gt .append (gt )
584+ valid_gts .append (gt )
539585 merge_ad .append (this_ad )
540586 else :
541587 merge_gt .append ("." )
588+ valid_gts .append ("." )
542589 merge_ad .append (this_ad )
543- if list_counter == 0 and haps_ids != haps_ids1 :
544- for _ in range ( len ( haps_ids2 )) :
545- merge_gt . append ( "." )
546- merge_ad . append ( "." )
547- merge_dp . append ( "." )
548- elif list_counter > 0 :
549- for _ in range ( len ( haps_ids1 ) ):
550- merge_gt . insert ( 0 , "." )
551- merge_ad . insert ( 0 , "." )
552- merge_dp . insert ( 0 , "." )
590+ write_variant = False
591+ if self . lowqual is True :
592+ if (
593+ ( alt != ref or ref_only )
594+ and alt not in [ "." , "*" ]
595+ and ( "1" in merge_gt or "." in valid_gts )
596+ ):
597+ write_variant = True
598+ elif alt != ref and alt not in [ "." , "*" ] and "1" in merge_gt :
599+ write_variant = True
553600 final_qual = "."
554- if (
555- alt != ref
556- and alt not in ["." , "*" ]
557- and "1" in merge_gt # or "." in merge_gt
558- ):
601+ if write_variant :
602+ if list_counter == 0 and haps_ids != haps_ids1 :
603+ for _ in range (len (haps_ids2 )):
604+ merge_gt .append ("." )
605+ merge_ad .append ("." )
606+ merge_dp .append ("." )
607+ elif list_counter > 0 :
608+ for _ in range (len (haps_ids1 )):
609+ merge_gt .insert (0 , "." )
610+ merge_ad .insert (0 , "." )
611+ merge_dp .insert (0 , "." )
612+
559613 if "1" in merge_gt :
560614 variant_filter = "PASS"
561- # else:
562- # variant_filter = "LowQual"
615+ else :
616+ variant_filter = "LowQual"
563617 info_field = "HPBOUND=" + "," .join (haps_bounds )
564618 alleles = self .call_sum .get ("alleles_final" )
565619 if alleles is not None and alleles != []:
@@ -778,7 +832,7 @@ def pileup_to_variant(
778832 var_filter = []
779833 if dp < min_depth :
780834 var_filter .append ("LowDP" )
781- if ad [1 ] < dp * 0.7 :
835+ if ( gt == "1" and ad [1 ] < dp * 0.7 ) or ( gt == "0" and ad [ 0 ] < dp * 0.7 ) :
782836 var_filter .append ("LowQual" )
783837 if var_filter != []:
784838 gt = "."
@@ -843,7 +897,7 @@ def run_without_realign(
843897 two_cp_haplotypes = self .call_sum .get ("two_copy_haplotypes" )
844898 nhap = len (final_haps )
845899 if two_cp_haplotypes is not None :
846- nhap += len (two_cp_haplotypes )
900+ nhap += len ([ a for a in two_cp_haplotypes if a in final_haps . values ()] )
847901 hap_info = []
848902
849903 # gene1only, or two-gene mode but gene1 side
@@ -862,6 +916,11 @@ def run_without_realign(
862916 hap_name = f"{ self .gene } _homozygous_hap1"
863917 pileups_raw = {}
864918 read_names = {}
919+
920+ for pos in range (self .left_boundary , self .right_boundary ):
921+ pileups_raw .setdefault (pos , [])
922+ read_names .setdefault (pos , [])
923+
865924 for pileupcolumn in bamh .pileup (
866925 nchr ,
867926 truncate = True ,
@@ -871,8 +930,8 @@ def run_without_realign(
871930 this_pos_bases = [
872931 a .upper () for a in pileupcolumn .get_query_sequences (add_indels = True )
873932 ]
874- pileups_raw . setdefault ( pos , this_pos_bases )
875- read_names . setdefault ( pos , pileupcolumn .get_query_names () )
933+ pileups_raw [ pos ] = this_pos_bases
934+ read_names [ pos ] = pileupcolumn .get_query_names ()
876935 variants_called = self .pileup_to_variant (
877936 pileups_raw ,
878937 read_names ,
@@ -887,13 +946,14 @@ def run_without_realign(
887946 )
888947
889948 for pos , var_name , dp , ad , var_filter , gt , counter in variants_called :
890- variants_info .setdefault (
891- pos ,
892- [
893- [var_name , dp , ad , var_filter , gt , counter ],
894- [var_name , dp , ad , var_filter , gt , counter ],
895- ],
896- )
949+ if pos > self .left_boundary and pos < self .right_boundary :
950+ variants_info .setdefault (
951+ pos ,
952+ [
953+ [var_name , dp , ad , var_filter , gt , counter ],
954+ [var_name , dp , ad , var_filter , gt , counter ],
955+ ],
956+ )
897957
898958 i = 0
899959 for hap_name in final_haps .values ():
@@ -943,6 +1003,12 @@ def run_without_realign(
9431003 # by HP tag
9441004 pileups_raw = {}
9451005 read_names = {}
1006+
1007+ if hap_bound != []:
1008+ for pos in range (hap_bound [0 ], hap_bound [1 ]):
1009+ pileups_raw .setdefault (pos , [])
1010+ read_names .setdefault (pos , [])
1011+
9461012 for pileupcolumn in bamh .pileup (
9471013 nchr ,
9481014 truncate = True ,
@@ -1024,8 +1090,8 @@ class TwoGeneVcfGenerater(VcfGenerater):
10241090 Make vcf for two-gene scenario
10251091 """
10261092
1027- def __init__ (self , sample_id , outdir , call_sum ):
1028- VcfGenerater .__init__ (self , sample_id , outdir , call_sum )
1093+ def __init__ (self , sample_id , outdir , call_sum , args ):
1094+ VcfGenerater .__init__ (self , sample_id , outdir , call_sum , args )
10291095
10301096 def set_parameter (self , config , tmpdir = None , prog_cmd = None ):
10311097 super ().set_parameter (config , tmpdir , prog_cmd )
0 commit comments