Skip to content

Commit e5a8c47

Browse files
committed
*v0.3.2 (Sep 24, 2024)*:
1. add an "INFO/DNP" score to all of the child's "0/1" output; 2. add an "INFO/HDN" tag for "high-quality de novo variant", set by default when DNP >= 0.8; 3. add reference calls to the gVCF output; 4. add an "INFO/RPL" tag for saving the reference-call PL scores for gVCF output.
1 parent 80127b0 commit e5a8c47

File tree

7 files changed

+95
-19
lines changed

7 files changed

+95
-19
lines changed

README.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,17 @@ Clair3-Nova is the 2nd generation of [Clair3-Trio](https://github.com/HKU-BAL/Cl
3333
----
3434

3535
## Latest Updates
36-
*v0.3.1 (July 9, 2024)*: fix bug in multiple alternative sites
37-
*v0.3 (June 23, 2024)*: add r10.4.1 hac model and add base_err feature
36+
*v0.3.2 (Sep 24, 2024)*:
37+
38+
1. add an "INFO/DNP" score to all of the child's "0/1" output;
39+
2. add an "INFO/HDN" tag for "high-quality de novo variant", set by default when DNP >= 0.8;
40+
3. add reference calls to the gVCF output;
41+
4. add an "INFO/RPL" tag for saving the reference-call PL scores for gVCF output.
42+
43+
*v0.3.1 (July 9, 2024)*: fix bug in multiple alternative sites.
44+
45+
*v0.3 (June 23, 2024)*: add r10.4.1 hac model and add base_err feature.
46+
3847
1. add r10 HAC model trained at HG002 trio
3948
2. add `--base_err` [flag](https://github.com/HKU-BAL/Clair3/issues/220) for reducing "./." in gvcf output
4049
3. add `--keep_iupac_bases` [flag](https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry) for showing IUPAC characters.

clair3/CallVariants.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ def output_header():
272272
##FILTER=<ID=RefCall,Description="Reference call">
273273
##INFO=<ID=P,Number=0,Type=Flag,Description="Result from pileup calling">
274274
##INFO=<ID=F,Number=0,Type=Flag,Description="Result from full-alignment calling">
275+
##INFO=<ID=RPL,Number=.,Type=String,Description="For reference call's Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
275276
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
276277
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
277278
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ<20 or selected by 'samtools view -F 2316' are filtered)">
@@ -1331,9 +1332,14 @@ def decode_alt_info(alt_info_dict):
13311332
','.join(["%.4f" % (min(1.0, 1.0 * item / read_depth)) for item in alt_list_count])
13321333

13331334
if output_config.gvcf:
1334-
PLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base,
1335-
alternate_base)
1335+
PLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base)
13361336
PLs = ','.join([str(x) for x in PLs])
1337+
if alternate_base == ".":
1338+
RPLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base,
1339+
alternate_base, is_output_ref_pl = True)
1340+
RPLs = [str(x) for x in RPLs]
1341+
information_string += ";RPL=%s" % (",".join(RPLs))
1342+
13371343

13381344
output_utilities.output("%s\t%d\t.\t%s\t%s\t%.2f\t%s\t%s\tGT:GQ:DP:AD:AF:PL\t%s:%d:%d:%s:%s:%s" % (
13391345
chromosome,
@@ -1368,7 +1374,7 @@ def decode_alt_info(alt_info_dict):
13681374

13691375

13701376

1371-
def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base):
1377+
def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base, is_output_ref_pl=False):
13721378
'''
13731379
PL computation
13741380
for bi-allelic: AA(00), AB(01), BB(11)
@@ -1393,7 +1399,10 @@ def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, refe
13931399
except:
13941400
#skip N positions
13951401
if alternate_base == ".":
1396-
return [990]
1402+
if is_output_ref_pl:
1403+
break
1404+
else:
1405+
return [990]
13971406
else:
13981407
return [990] * len(genotypes[alt_num])
13991408
genotype_prob_21 = gt21_probabilities[gt21_prob_index]
@@ -1406,7 +1415,16 @@ def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, refe
14061415
_p = genotype_prob_21 * genotype_prob_zygosity
14071416
# _p = genotype_prob_21
14081417
likelihoods.append(_p)
1409-
pass
1418+
1419+
# for reference call, compute pl as [p_ref, p_alt, p_alt^2]
1420+
if is_output_ref_pl and alternate_base == ".":
1421+
#likelihoods.append(1-likelihoods[-1])
1422+
try:
1423+
_rp = math.sqrt(1 - likelihoods[-1] + 0.25) - 0.5
1424+
likelihoods += [_rp, _rp**2]
1425+
except:
1426+
return [990] * 3
1427+
14101428

14111429
# genotype likelihood normalization
14121430
# p/sum(p)

preprocess/utils.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ def readCalls(self, callPath, callType='variant', ctgName=None, ctgStart=None, c
115115
cur_variant_start = int(line.strip('\n').split('\t')[1])
116116
cur_variant_end = cur_variant_start - 1 + len(ref)
117117
is_reference_call = (alt == '.') or (ref == alt)
118+
#import pdb; pdb.set_trace()
118119
if not is_reference_call:
119120
# assuming AD is at the columns [-3], add 0 to AD for gVCF
120121
ori_info = tmp[-1].split(':')
@@ -125,14 +126,36 @@ def readCalls(self, callPath, callType='variant', ctgName=None, ctgStart=None, c
125126
# add <NON_REF> to variant calls
126127
tmp[4] = tmp[4] + ',<NON_REF>'
127128
if (n_alt == 1):
128-
129129
tmp[-1] = tmp[-1] + ',990,990,990'
130-
131130
elif (n_alt == 2):
132131
tmp[-1] = tmp[-1] + ',990,990,990,990'
133132
else:
134-
# skip reference calls
135-
continue
133+
# reference calls
134+
# original order GT:GQ:DP:AD:AF:PL
135+
# assuming AD is at the columns [-3]
136+
ori_info = tmp[-1].split(':')
137+
138+
# update DP to MIN_DP
139+
t_n = tmp[-2].split(":")
140+
t_n[2] = "MIN_DP"
141+
tmp[-2] = ":".join(t_n)
142+
143+
# set AD field
144+
_dp = int(ori_info[2])
145+
_ad = int(ori_info[3])
146+
new_ad = "%s,%s" % (_ad, _dp-_ad)
147+
ori_info[3] = new_ad
148+
149+
# get PL from the INFO/RPL tag and update the INFO field
150+
_rpl = tmp[7].split(';')[1].split("=")[1]
151+
ori_info[-1] = _rpl
152+
tmp[7] = tmp[7].split(';')[0] + ";END=%s" % (tmp[1])
153+
tmp[-1] = ':'.join(ori_info)
154+
155+
# reference call: set ALT to <NON_REF>
156+
tmp[4] = '<NON_REF>'
157+
#import pdb; pdb.set_trace()
158+
136159
new_line = '\t'.join(tmp)
137160

138161
cur_variant_chr = tmp[0]

trio/CallVariants_Denovo.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,18 +1458,25 @@ def decode_alt_info(alt_info_dict):
14581458
allele_frequency_s = "%.4f" % allele_frequency if len(alt_list_count) <= 1 else \
14591459
','.join(["%.4f" % (min(1.0, 1.0 * item / read_depth)) for item in alt_list_count])
14601460

1461-
14621461
#import pdb; pdb.set_trace()
14631462
DNP_info=""
1464-
if output_config.is_denovo:
1465-
DNP_info = ";DNP=%.3f" % denovo_probabilities[1] if denovo_probabilities[1] > denovo_probabilities[0] else ""
1463+
if output_config.is_denovo and output_config.trio_n_id == 0:
1464+
#DNP_info = ";DNP=%.3f" % denovo_probabilities[1] if denovo_probabilities[1] > denovo_probabilities[0] else ""
1465+
DNP_info = ";DNP=%.3f" % denovo_probabilities[1] if ((denovo_probabilities[1] >= param.output_DNP_p) or (genotype_string == "0/1")) else ""
1466+
if denovo_probabilities[1] >= param.high_DNP_p:
1467+
DNP_info += ";HDN"
1468+
14661469

14671470
##INFO=<ID=DNP,Number=.,Type=Float,Description="de novo variant probability">
1468-
14691471
if output_config.gvcf:
14701472
PLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base,
14711473
alternate_base)
14721474
PLs = ','.join([str(x) for x in PLs])
1475+
if alternate_base == ".":
1476+
RPLs = compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base,
1477+
alternate_base, is_output_ref_pl = True)
1478+
RPLs = [str(x) for x in RPLs]
1479+
information_string += ";RPL=%s" % (",".join(RPLs))
14731480

14741481
output_utilities.output("%s\t%d\t.\t%s\t%s\t%.2f\t%s\t%s%s\tGT:GQ:DP:AD:AF:PL\t%s:%d:%d:%s:%s:%s" % (
14751482
chromosome,
@@ -1506,7 +1513,7 @@ def decode_alt_info(alt_info_dict):
15061513

15071514

15081515

1509-
def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base):
1516+
def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, reference_base, alternate_base, is_output_ref_pl=False):
15101517
'''
15111518
PL computation
15121519
for bi-allelic: AA(00), AB(01), BB(11)
@@ -1531,7 +1538,11 @@ def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, refe
15311538
except:
15321539
#skip N positions
15331540
if alternate_base == ".":
1534-
return [990]
1541+
# for ref call, set PL as p(ref), p(non_ref)
1542+
if is_output_ref_pl:
1543+
break
1544+
else:
1545+
return [990]
15351546
else:
15361547
return [990] * len(genotypes[alt_num])
15371548
genotype_prob_21 = gt21_probabilities[gt21_prob_index]
@@ -1544,7 +1555,15 @@ def compute_PL(genotype_string, genotype_probabilities, gt21_probabilities, refe
15441555
_p = genotype_prob_21 * genotype_prob_zygosity
15451556
# _p = genotype_prob_21
15461557
likelihoods.append(_p)
1547-
pass
1558+
1559+
# for reference call, compute pl as [p_ref, p_alt, p_alt^2]
1560+
if is_output_ref_pl and alternate_base == ".":
1561+
#likelihoods.append(1-likelihoods[-1])
1562+
try:
1563+
_rp = math.sqrt(1 - likelihoods[-1] + 0.25) - 0.5
1564+
likelihoods += [_rp, _rp**2]
1565+
except:
1566+
return [990] * 3
15481567

15491568
# genotype likelihood normalization
15501569
# p/sum(p)

trio/CheckEnvs_Trio.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,9 @@ def output_header(output_fn, reference_file_path, sample_name=None, cmdline=None
178178
##INFO=<ID=P,Number=0,Type=Flag,Description="Result from pileup calling">
179179
##INFO=<ID=F,Number=0,Type=Flag,Description="Result from full-alignment calling">
180180
##INFO=<ID=T,Number=0,Type=Flag,Description="Result from trio calling">
181+
##INFO=<ID=RPL,Number=.,Type=String,Description="For reference call's Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
181182
##INFO=<ID=DNP,Number=.,Type=Float,Description="de novo variant probability">
183+
##INFO=<ID=HDN,Number=0,Type=Flag,Description="is high quality de novo variant">
182184
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
183185
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
184186
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ<20 or selected by 'samtools view -F 2316' are filtered)">

trio/param_t.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
TOOL_NAME = "clair3_nova"
2-
VERSION='v0.3.1'
2+
VERSION='v0.3.2'
33

44
from itertools import accumulate
55

@@ -75,3 +75,6 @@
7575
padding_value_p1 = "60"
7676
padding_value_p2 = "90"
7777

78+
79+
high_DNP_p = 0.8
80+
output_DNP_p = 0.5

trio/print_header.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ def get_header(tmp_path=None, sample_name="TMP"):
2222
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ<20 or selected by 'samtools view -F 2316' are filtered)">
2323
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
2424
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
25+
##INFO=<ID=RPL,Number=.,Type=String,Description="For reference call's Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
2526
##INFO=<ID=DNP,Number=.,Type=Float,Description="de novo variant probability">
27+
##INFO=<ID=HDN,Number=0,Type=Flag,Description="is high quality de novo variant">
2628
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Observed allele frequency in reads, for each ALT allele, in the same order as listed, or the REF allele for a RefCall">\n"""
2729
)
2830

0 commit comments

Comments
 (0)