Skip to content

Commit ce232a9

Browse files
authored
Version 3.2.1 (#34)
- Fix problem working with CRAM Other minor changes: - Add RN tag to output bam. RN stands for region name, indicating reads used to analyze one region (paralog group) - Sort some output fields in the JSON, making it consistent from run to run - Clean up reported alleles, filtering out cases where all haplotypes are linked into one allele, or there are missing haplotypes when two alleles are reported - Fix getting sample ID from bam header when there are blank spaces - Fix logic for writing homozygous haplotypes in VCF (no haplotypes phased -> no haplotypes phased and no heterozygous variant sites) - Update two_cp_haplotypes when adjusting total_cn from 2 to 4 based on depth - Use all reads instead of unique reads for variant calling at edges of clipped haplotypes
1 parent 933f7d1 commit ce232a9

19 files changed

Lines changed: 194 additions & 120 deletions

paraphase/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "3.2.0"
1+
__version__ = "3.2.1"

paraphase/genes/cfc1_phaser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,5 +102,5 @@ def call(self):
102102
self.mdepth,
103103
self.region_avg_depth._asdict(),
104104
self.sample_sex,
105-
None,
105+
self.init_het_sites,
106106
)

paraphase/genes/cfhclust.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,5 +72,7 @@ def call(self):
7272
None,
7373
None,
7474
None,
75+
None,
76+
None,
7577
fusions,
7678
)

paraphase/genes/f8_phaser.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,12 @@
55
from collections import namedtuple
66
import pysam
77
from ..phaser import Phaser
8+
from paraphase.prepare_bam_and_vcf import pysam_handle
89

910

1011
class F8Phaser(Phaser):
1112
new_fields = copy.deepcopy(Phaser.fields)
1213
new_fields.remove("gene_cn")
13-
new_fields.remove("alleles_final")
14-
new_fields.remove("hap_links")
1514
new_fields.insert(3, "sv_called")
1615
new_fields.insert(3, "flanking_summary")
1716
new_fields.insert(3, "exon1_to_exon22_depth")
@@ -33,6 +32,9 @@ def __init__(
3332
Phaser.__init__(
3433
self, sample_id, outdir, args, genome_depth, genome_bam, sample_sex
3534
)
35+
self.reference_fasta = None
36+
if args is not None:
37+
self.reference_fasta = args.reference
3638

3739
def set_parameter(self, config):
3840
super().set_parameter(config)
@@ -41,11 +43,11 @@ def set_parameter(self, config):
4143
"extract_regions"
4244
].split()
4345

44-
def get_read_positions(self, min_extension=5000):
46+
def get_read_positions(self, genome_bamh, min_extension=5000):
4547
"""Get mapped region of the part of reads not overlapping repeat"""
4648
dpos5 = {}
4749
dpos3 = {}
48-
genome_bamh = pysam.AlignmentFile(self.genome_bam, "rb")
50+
4951
for i, extract_region in enumerate(
5052
[self.extract_region1, self.extract_region2, self.extract_region3]
5153
):
@@ -79,18 +81,17 @@ def get_read_positions(self, min_extension=5000):
7981
dpos3.setdefault(read_name, []).append(pos_name)
8082
else:
8183
dpos5.setdefault(read_name, []).append(pos_name)
82-
genome_bamh.close()
8384
return dpos5, dpos3
8485

8586
def call(self):
8687
if self.check_coverage_before_analysis() is False:
8788
return self.GeneCall()
8889

89-
genome_bamh = pysam.AlignmentFile(self.genome_bam, "rb")
90+
genome_bamh = pysam_handle(self.genome_bam, self.reference_fasta)
9091
e1_e22_depth = self.get_regional_depth(genome_bamh, self.depth_region)[0].median
91-
genome_bamh.close()
9292

93-
dpos5, dpos3 = self.get_read_positions()
93+
dpos5, dpos3 = self.get_read_positions(genome_bamh)
94+
genome_bamh.close()
9495
self.get_homopolymer()
9596
self.get_candidate_pos()
9697

@@ -114,6 +115,7 @@ def call(self):
114115

115116
self.het_sites = sorted(list(self.candidate_pos))
116117
self.remove_noisy_sites()
118+
self.init_het_sites = [a for a in self.het_sites]
117119

118120
raw_read_haps = self.get_haplotypes_from_reads(
119121
check_clip=True,
@@ -215,6 +217,8 @@ def call(self):
215217
e1_e22_depth,
216218
flanking_sum,
217219
sv_hap,
220+
None,
221+
None,
218222
hcn,
219223
original_haps,
220224
self.het_sites,
@@ -227,4 +231,5 @@ def call(self):
227231
self.mdepth,
228232
self.region_avg_depth._asdict(),
229233
self.sample_sex,
234+
self.init_het_sites,
230235
)

paraphase/genes/hba_phaser.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from collections import namedtuple
66
import pysam
77
from ..phaser import Phaser
8+
from paraphase.prepare_bam_and_vcf import pysam_handle
89

910

1011
class HbaPhaser(Phaser):
@@ -31,6 +32,9 @@ def __init__(
3132
Phaser.__init__(
3233
self, sample_id, outdir, args, genome_depth, genome_bam, sample_sex
3334
)
35+
self.reference_fasta = None
36+
if args is not None:
37+
self.reference_fasta = args.reference
3438

3539
def set_parameter(self, config):
3640
super().set_parameter(config)
@@ -39,7 +43,7 @@ def set_parameter(self, config):
3943
def call(self):
4044
if self.check_coverage_before_analysis() is False:
4145
return self.GeneCall()
42-
genome_bamh = pysam.AlignmentFile(self.genome_bam, "rb")
46+
genome_bamh = pysam_handle(self.genome_bam, self.reference_fasta)
4347
surrounding_region_depth = self.get_regional_depth(
4448
genome_bamh, self.depth_region
4549
)[0].median
@@ -227,13 +231,15 @@ def call(self):
227231

228232
# phase
229233
alleles = []
234+
linked_haps = []
230235
hap_links = {}
231236
if self.to_phase is True:
232237
(
233238
alleles,
234239
hap_links,
235240
_,
236241
_,
242+
linked_haps,
237243
) = self.phase_alleles(
238244
uniquely_supporting_reads,
239245
nonuniquely_supporting_reads,
@@ -245,6 +251,7 @@ def call(self):
245251
if len(alleles) == 1:
246252
if len(alleles[0]) == len(ass_haps):
247253
alleles = []
254+
linked_haps = []
248255
self.close_handle()
249256

250257
return self.GeneCall(
@@ -268,4 +275,6 @@ def call(self):
268275
self.mdepth,
269276
self.region_avg_depth._asdict(),
270277
self.sample_sex,
278+
self.init_het_sites,
279+
linked_haps,
271280
)

paraphase/genes/ikbkg_phaser.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def call(self):
7171
kept_sites = [self.add_sites[1]]
7272
self.het_sites = sorted(self.het_sites)
7373
self.remove_noisy_sites()
74+
self.init_het_sites = [a for a in self.het_sites]
7475
homo_sites_to_add = self.add_homo_sites(min_no_var_region_size=6000)
7576
# print(homo_sites_to_add)
7677
kept_sites += homo_sites_to_add
@@ -189,13 +190,23 @@ def call(self):
189190
hap_links,
190191
_,
191192
_,
193+
linked_haps,
192194
) = self.phase_alleles(
193195
uniquely_supporting_reads,
194196
nonuniquely_supporting_reads,
195197
raw_read_haps,
196198
ass_haps,
197199
reverse=self.is_reverse,
198200
)
201+
# all haplotypes are phased into one allele
202+
if (
203+
len(alleles) == 1
204+
and sorted(alleles[0]) == sorted(ass_haps.values())
205+
and self.sample_sex is not None
206+
and self.sample_sex == "female"
207+
):
208+
alleles = []
209+
linked_haps = []
199210

200211
if gene_counter == 0 or pseudo_counter == 0:
201212
total_cn = None
@@ -228,4 +239,6 @@ def call(self):
228239
self.mdepth,
229240
self.region_avg_depth._asdict(),
230241
self.sample_sex,
242+
self.init_het_sites,
243+
linked_haps,
231244
)

paraphase/genes/ncf1_phaser.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def call(self):
5252
self.get_candidate_pos()
5353
self.het_sites = sorted(list(self.candidate_pos))
5454
self.remove_noisy_sites()
55+
self.init_het_sites = [a for a in self.het_sites]
5556

5657
raw_read_haps = self.get_haplotypes_from_reads(add_sites=self.add_sites)
5758

@@ -155,4 +156,5 @@ def call(self):
155156
self.mdepth,
156157
self.region_avg_depth._asdict(),
157158
self.sample_sex,
159+
self.init_het_sites,
158160
)

paraphase/genes/neb_phaser.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def call(self):
4040
self.get_candidate_pos()
4141
self.het_sites = sorted(list(self.candidate_pos))
4242
self.remove_noisy_sites()
43+
self.init_het_sites = [a for a in self.het_sites]
4344
homo_sites_to_add = self.add_homo_sites()
4445
raw_read_haps = self.get_haplotypes_from_reads(
4546
kept_sites=homo_sites_to_add,
@@ -120,13 +121,15 @@ def call(self):
120121
tri3.append(tri3[0])
121122

122123
alleles = []
124+
linked_haps = []
123125
hap_links = {}
124126
if two_cp_haps == []:
125127
(
126128
alleles,
127129
hap_links,
128130
_,
129131
_,
132+
linked_haps,
130133
) = self.phase_alleles(
131134
uniquely_supporting_reads,
132135
nonuniquely_supporting_reads,
@@ -139,6 +142,7 @@ def call(self):
139142
# incorrect phasing suggests haplotypes with cn > 1
140143
if len(alleles) == 1 and sorted(alleles[0]) == sorted(ass_haps.values()):
141144
alleles = []
145+
linked_haps = []
142146
total_cn = None
143147
elif len(tri1) > 2 or len(tri3) > 2:
144148
total_cn = None
@@ -155,6 +159,7 @@ def call(self):
155159
total_cn = 6
156160
two_cp_haps = [tri2[0]]
157161
alleles = []
162+
linked_haps = []
158163

159164
# homozygous case
160165
if total_cn == 0:
@@ -181,4 +186,6 @@ def call(self):
181186
self.mdepth,
182187
self.region_avg_depth._asdict(),
183188
self.sample_sex,
189+
self.init_het_sites,
190+
linked_haps,
184191
)

paraphase/genes/opn1lw_phaser.py

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# paraphase
22
# Author: Xiao Chen <xchen@pacificbiosciences.com>
33

4-
4+
import copy
55
from collections import namedtuple
66
from ..phaser import Phaser
77

88

99
class Opn1lwPhaser(Phaser):
10-
fields = (
10+
new_fields = (
1111
"total_cn",
1212
"opn1lw_cn",
1313
"opn1mw_cn",
@@ -25,23 +25,13 @@ class Opn1lwPhaser(Phaser):
2525
"directional_links",
2626
"links_loose",
2727
"alleles_raw",
28-
"highest_total_cn",
29-
"assembled_haplotypes",
30-
"sites_for_phasing",
31-
"unique_supporting_reads",
32-
"het_sites_not_used_in_phasing",
33-
"homozygous_sites",
34-
"haplotype_details",
35-
"nonunique_supporting_reads",
36-
"read_details",
37-
"genome_depth",
38-
"region_depth",
39-
"sample_sex",
4028
)
29+
# new_fields = copy.deepcopy(Phaser.fields)
30+
new_fields += tuple(Phaser.fields[6:])
4131
GeneCall = namedtuple(
4232
"GeneCall",
43-
fields,
44-
defaults=(None,) * len(fields),
33+
new_fields,
34+
defaults=(None,) * len(new_fields),
4535
)
4636
pathogenic_haps = [
4737
"LIAVA",
@@ -189,6 +179,7 @@ def call(self):
189179
self.get_candidate_pos(min_vaf=0.08)
190180
self.het_sites = sorted(list(self.candidate_pos))
191181
self.remove_noisy_sites()
182+
self.init_het_sites = [a for a in self.het_sites]
192183
homo_sites_to_add = self.add_homo_sites()
193184
raw_read_haps = self.get_haplotypes_from_reads(
194185
check_clip=True,
@@ -284,13 +275,15 @@ def call(self):
284275
hap_links,
285276
directional_links,
286277
directional_links_loose,
278+
linked_haps,
287279
) = self.phase_alleles(
288280
uniquely_supporting_reads,
289281
nonuniquely_supporting_reads,
290282
raw_read_haps,
291283
ass_haps,
292284
reverse=self.is_reverse,
293285
)
286+
alleles = linked_haps
294287

295288
# incorrect phasing suggests haplotypes with cn > 1
296289
if (
@@ -455,4 +448,6 @@ def call(self):
455448
self.mdepth,
456449
self.region_avg_depth._asdict(),
457450
self.sample_sex,
451+
self.init_het_sites,
452+
alleles_1st_2nd,
458453
)

paraphase/genes/pms2_phaser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def call(self):
4242
self.get_candidate_pos()
4343
self.het_sites = sorted(list(self.candidate_pos))
4444
self.remove_noisy_sites()
45+
self.init_het_sites = [a for a in self.het_sites]
4546
homo_sites_to_add = self.add_homo_sites(min_no_var_region_size=6000)
4647
# for distinguishing pms2 from pms2cl
4748
raw_read_haps = self.get_haplotypes_from_reads(
@@ -149,5 +150,5 @@ def call(self):
149150
self.mdepth,
150151
self.region_avg_depth._asdict(),
151152
self.sample_sex,
152-
None,
153+
self.init_het_sites,
153154
)

0 commit comments

Comments
 (0)