Skip to content

Commit a72afbc

Browse files
authored
Version 3.4 (#59)
- Fix bug where MM/ML tags are off in supplementary alignments - Fix bug where the last base of a read is sometimes used in phasing - Fix bug in variant calling where in some reads a wrong base is chosen between primary and supplementary alignments - Fix bug where some alleles may include redundant haplotype names - Fix bug where it's not possible to lower the minimum variant frequency for variant calling for individual target regions. This change affects opn1lw and ikbkg. - Improve large deletion calling in reads. This change may affect all targets, particularly ikbkg and pms2. - Rename haplotypes to label gene1 and gene2 for regions with fusion calling(GBA, CYP2D6, CYP11B1 and CFH/CFHR3) - For smn1, consider more scenarios for adjusting SMN2 copy number - For ncf1, adjust copy number for the scenario where three haplotypes are found and all are present at two copies. - Update two copy haplotypes for CFHclust
1 parent 08fc26d commit a72afbc

9 files changed

Lines changed: 375 additions & 135 deletions

File tree

paraphase/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "3.3.4"
1+
__version__ = "3.4.0"

paraphase/genes/cfhclust.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def call(self):
5252
len(self.cfh["final_haplotypes"]),
5353
len(self.cfhr3["final_haplotypes"]),
5454
)
55+
if total_cn < self.cfh["total_cn"] or total_cn < self.cfhr3["total_cn"]:
56+
two_cp_haps = []
5557

5658
return self.GeneCall(
5759
total_cn,

paraphase/genes/ncf1_phaser.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,23 @@ def call(self):
119119
haplotypes = tmp
120120

121121
two_cp_haps = []
122-
if counter_gene == 1:
122+
# scenario where only three haplotypes are found, possibly each at CN2
123+
if total_cn == 3:
124+
if self.mdepth is not None:
125+
probs = self.depth_prob(self.region_avg_depth.median, self.mdepth)
126+
if probs is not None and probs[2] + probs[3] > 0.95:
127+
two_cp_haps = list(ass_haps.values())
128+
for hap in two_cp_haps:
129+
total_cn += 1
130+
if "pseudo" in hap:
131+
counter_pseudo += 1
132+
else:
133+
counter_gene += 1
134+
if counter_gene == 1 and counter_pseudo == 2:
135+
counter_gene = None
136+
total_cn = None
137+
138+
elif counter_gene == 1:
123139
two_cp_hap_candidate = self.compare_depth(haplotypes, ass_haps)
124140
if two_cp_hap_candidate == []:
125141
# check if one haplotype has more reads than others
@@ -131,18 +147,15 @@ def call(self):
131147
counter_gene += 1
132148
total_cn += 1
133149

134-
if self.mdepth is not None:
150+
if self.mdepth is not None and counter_gene is not None:
135151
prob = self.depth_prob(gene_reads, self.mdepth / 2)
136-
if prob[0] < 0.9 and counter_gene == 1:
137-
counter_gene = None
138-
total_cn = None
139-
if prob[0] > 0.95 and counter_gene > 1 and two_cp_haps != []:
140-
counter_gene = None
141-
total_cn = None
142-
# scenario where only three haplotypes are found, possibly each at CN2
143-
if counter_gene == 1 and counter_pseudo == 2 and total_cn == 3:
144-
counter_gene = None
145-
total_cn = None
152+
if prob is not None:
153+
if prob[0] < 0.9 and counter_gene == 1:
154+
counter_gene = None
155+
total_cn = None
156+
if prob[0] > 0.95 and counter_gene > 1 and two_cp_haps != []:
157+
counter_gene = None
158+
total_cn = None
146159

147160
# homozygous case
148161
if total_cn == 0:

paraphase/genes/pms2_phaser.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ def call(self):
2828
phase_region=f"{self.genome_build}:{self.nchr}:{self.left_boundary}-{self.right_boundary}",
2929
)
3030
self.get_homopolymer()
31-
self.find_big_deletion(min_size=2900)
31+
self.find_big_deletion(min_size=1000)
32+
pms2cl_clip_site = self.clip_3p_positions[0]
33+
self.find_clip_site()
34+
self.clip_3p_positions = [pms2cl_clip_site]
3235

3336
if self.deletion1_size is not None:
3437
self.del1_reads, self.del1_reads_partial = self.get_long_del_reads(
@@ -52,6 +55,7 @@ def call(self):
5255
# for distinguishing pms2 from pms2cl
5356
raw_read_haps = self.get_haplotypes_from_reads(
5457
clip_buffer=50,
58+
min_clip_len=100,
5559
check_clip=True,
5660
kept_sites=homo_sites_to_add,
5761
add_sites=self.add_sites,
@@ -93,13 +97,22 @@ def call(self):
9397
for hap in ass_haps:
9498
clip_position = self.get_3pclip_from_hap(hap)
9599
if clip_position is None:
96-
counter_unknown += 1
97-
hap_name = f"{self.gene}_unknownhap{counter_unknown}"
98-
elif clip_position in self.clip_3p_positions:
100+
num_pms2_bases = 0
101+
for i, base in enumerate(hap):
102+
pos = int(self.het_sites[i].split("_")[0])
103+
if pos > pms2cl_clip_site + 50 and base not in ["0", "x"]:
104+
num_pms2_bases += 1
105+
if num_pms2_bases >= 3:
106+
counter_gene += 1
107+
hap_name = f"{self.gene}_pms2hap{counter_gene}"
108+
else:
109+
counter_unknown += 1
110+
hap_name = f"{self.gene}_unknownhap{counter_unknown}"
111+
elif clip_position == pms2cl_clip_site:
99112
counter_pseudo += 1
100113
hap_name = f"{self.gene}_pms2clhap{counter_pseudo}"
101114
else:
102-
assert clip_position == 0
115+
# assert clip_position == 0
103116
counter_gene += 1
104117
hap_name = f"{self.gene}_pms2hap{counter_gene}"
105118
tmp.setdefault(hap, hap_name)

paraphase/genes/smn1_phaser.py

Lines changed: 72 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -311,21 +311,22 @@ def adjust_smn1_cn(self, smn1_cn, smn2_cn, hcn, ass_haps, read_counts, smn1_haps
311311
haploid_depth = genome_depth / 2
312312
depth1 = self.smn1_reads_splice
313313
copy_number_probs = self.depth_prob(depth1, haploid_depth)
314-
if smn1_cn == 1:
315-
# when there is only one haplotype, check if depth is
316-
# consistent with haploid depth
317-
copy_one_prob = copy_number_probs[0]
318-
if copy_one_prob < 0.05:
319-
two_cp_haps = list(smn1_haps.values())
320-
return 2, two_cp_haps
321-
if copy_one_prob < 0.25:
322-
new_smn1_cn = None
323-
elif smn1_cn == 2:
324-
# scenario where two two-copy alleles are identical
325-
copy_four_prob = copy_number_probs[3]
326-
if copy_four_prob > 0.75:
327-
two_cp_haps = list(smn1_haps.values())
328-
return 4, two_cp_haps
314+
if copy_number_probs is not None:
315+
if smn1_cn == 1:
316+
# when there is only one haplotype, check if depth is
317+
# consistent with haploid depth
318+
copy_one_prob = copy_number_probs[0]
319+
if copy_one_prob < 0.05:
320+
two_cp_haps = list(smn1_haps.values())
321+
return 2, two_cp_haps
322+
if copy_one_prob < 0.25:
323+
new_smn1_cn = None
324+
elif smn1_cn == 2:
325+
# scenario where two two-copy alleles are identical
326+
copy_four_prob = copy_number_probs[3]
327+
if copy_four_prob > 0.75:
328+
two_cp_haps = list(smn1_haps.values())
329+
return 4, two_cp_haps
329330
if smn1_cn in [2, 3] and read_counts is not None and self.targeted is False:
330331
# check if one smn1 haplotype has more reads than others
331332
haps = list(read_counts.keys())
@@ -335,7 +336,7 @@ def adjust_smn1_cn(self, smn1_cn, smn2_cn, hcn, ass_haps, read_counts, smn1_haps
335336
if cp2_hap in smn1_haps:
336337
others_max = sorted(counts, reverse=True)[1]
337338
probs = self.depth_prob(max_count, others_max)
338-
if probs[0] < 0.0005 and others_max >= 10:
339+
if probs is not None and probs[0] < 0.0005 and others_max >= 10:
339340
two_cp_haps.append(smn1_haps[cp2_hap])
340341
return smn1_cn + 1, two_cp_haps
341342

@@ -346,12 +347,13 @@ def adjust_smn1_cn(self, smn1_cn, smn2_cn, hcn, ass_haps, read_counts, smn1_haps
346347
copy_number_probs = self.depth_prob(depth1, haploid_depth)
347348
# when there is only one haplotype, check if depth is
348349
# consistent with haploid depth
349-
copy_one_prob = copy_number_probs[0]
350-
if copy_one_prob < 0.25:
351-
new_smn1_cn = None
352-
if smn2_cn > 1 and copy_one_prob < 0.05:
353-
two_cp_haps = list(smn1_haps.values())
354-
return 2, two_cp_haps
350+
if copy_number_probs is not None:
351+
copy_one_prob = copy_number_probs[0]
352+
if copy_one_prob < 0.25:
353+
new_smn1_cn = None
354+
if smn2_cn > 1 and copy_one_prob < 0.05:
355+
two_cp_haps = list(smn1_haps.values())
356+
return 2, two_cp_haps
355357
# if we see more haplotypes in other regions of the gene
356358
if smn1_cn == 1 and hcn > len(ass_haps):
357359
if self.has_smn2 is False:
@@ -361,7 +363,7 @@ def adjust_smn1_cn(self, smn1_cn, smn2_cn, hcn, ass_haps, read_counts, smn1_haps
361363
new_smn1_cn = None
362364
return new_smn1_cn, two_cp_haps
363365

364-
def adjust_smn2_cn(self, smn1_cn, smn2_cn, smn2_haps):
366+
def adjust_smn2_cn(self, smn1_cn, smn2_cn, read_counts, smn2_haps):
365367
"""
366368
Adjust SMN2 CN if there is only one haplotype found.
367369
"""
@@ -377,19 +379,43 @@ def adjust_smn2_cn(self, smn1_cn, smn2_cn, smn2_haps):
377379
depth1 = self.smn2_reads_splice
378380
copy_number_probs = self.depth_prob(depth1, haploid_depth)
379381
# if smn1 cn is 3, then no need to adjust smn2 cn
380-
if (
381-
smn2_cn == 1
382-
and len(self.smn2_del_reads_partial) <= 1
383-
and (smn1_cn is None or smn1_cn <= 2)
384-
):
385-
# when there is only one haplotype, check if depth is
386-
# consistent with haploid depth
387-
copy_one_prob = copy_number_probs[0]
388-
if copy_one_prob < 0.25:
389-
new_smn2_cn = None
390-
if copy_one_prob < 0.05:
382+
if copy_number_probs is not None:
383+
if (
384+
smn2_cn == 1
385+
and len(self.smn2_del_reads_partial) <= 1
386+
and (smn1_cn is None or smn1_cn <= 2)
387+
):
388+
# when there is only one haplotype, check if depth is
389+
# consistent with haploid depth
390+
copy_one_prob = copy_number_probs[0]
391+
if copy_one_prob < 0.25:
392+
new_smn2_cn = None
393+
if copy_one_prob < 0.05:
394+
two_cp_haps = list(smn2_haps.values())
395+
return 2, two_cp_haps
396+
if smn1_cn == 0 and smn2_cn == 2:
397+
# scenario where two two-copy alleles are identical
398+
copy_four_prob = copy_number_probs[3]
399+
if copy_four_prob > 0.75:
391400
two_cp_haps = list(smn2_haps.values())
392-
return 2, two_cp_haps
401+
return 4, two_cp_haps
402+
if (
403+
smn1_cn in [0, 1]
404+
and smn2_cn in [2, 3]
405+
and read_counts is not None
406+
and self.targeted is False
407+
):
408+
# check if one smn2 haplotype has more reads than others
409+
haps = list(read_counts.keys())
410+
counts = list(read_counts.values())
411+
max_count = max(counts)
412+
cp2_hap = haps[counts.index(max_count)]
413+
if cp2_hap in smn2_haps:
414+
others_max = sorted(counts, reverse=True)[1]
415+
probs = self.depth_prob(max_count, others_max)
416+
if probs is not None and probs[0] < 0.0005 and others_max >= 10:
417+
two_cp_haps.append(smn2_haps[cp2_hap])
418+
return smn2_cn + 1, two_cp_haps
393419
if smn1_cn is not None:
394420
if smn2_cn == 1 and smn1_cn == 2 and len(self.smn2_del_reads_partial) <= 1:
395421
haploid_depth = self.smn1_reads_splice / smn1_cn
@@ -398,12 +424,13 @@ def adjust_smn2_cn(self, smn1_cn, smn2_cn, smn2_haps):
398424
copy_number_probs = self.depth_prob(depth1, haploid_depth)
399425
# when there is only one haplotype, check if depth is
400426
# consistent with haploid depth
401-
copy_one_prob = copy_number_probs[0]
402-
if copy_one_prob < 0.25:
403-
new_smn2_cn = None
404-
if copy_one_prob < 0.05:
405-
two_cp_haps = list(smn2_haps.values())
406-
return 2, two_cp_haps
427+
if copy_number_probs is not None:
428+
copy_one_prob = copy_number_probs[0]
429+
if copy_one_prob < 0.25:
430+
new_smn2_cn = None
431+
if copy_one_prob < 0.05:
432+
two_cp_haps = list(smn2_haps.values())
433+
return 2, two_cp_haps
407434
return new_smn2_cn, two_cp_haps
408435

409436
def get_best_match(
@@ -629,7 +656,7 @@ def call(self):
629656
cp2_hap = haps[counts.index(max_count)]
630657
others_max = sorted(counts, reverse=True)[1]
631658
probs = self.depth_prob(max_count, others_max)
632-
if probs[0] < 0.05 and others_max >= 10:
659+
if probs is not None and probs[0] < 0.05 and others_max >= 10:
633660
two_cp_haps.append(haps_to_compare[cp2_hap])
634661
for hap in two_cp_haps:
635662
if "smn1hap" in hap:
@@ -645,7 +672,9 @@ def call(self):
645672
for hap in two_cp_haps_smn1:
646673
if hap not in two_cp_haps:
647674
two_cp_haps.append(hap)
648-
smn2_cn, two_cp_haps_smn2 = self.adjust_smn2_cn(smn1_cn_old, smn2_cn, smn2_haps)
675+
smn2_cn, two_cp_haps_smn2 = self.adjust_smn2_cn(
676+
smn1_cn_old, smn2_cn, read_counts, smn2_haps
677+
)
649678
for hap in two_cp_haps_smn2:
650679
if hap not in two_cp_haps:
651680
two_cp_haps.append(hap)

0 commit comments

Comments
 (0)