Skip to content

Commit fe33c9f

Browse files
committed
Make --indel-bias more sensitive to indels
by switching off an existing heuristics which reduces indelQ and, in the extreme case, has the power to discard an indel entirely when the reference alignment has low score. This is a problem for long reads, so newly `--indel-bias 1.01` is added to 'ont' and 'pacbio-ccs' presets. Tentative fix to #1459
1 parent 42b6e91 commit fe33c9f

File tree

4 files changed

+24
-17
lines changed

4 files changed

+24
-17
lines changed

bam2bcf.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
/* bam2bcf.h -- variant calling.
2+
mplp.indel_bias = 1.01;
23
34
Copyright (C) 2010-2012 Broad Institute.
4-
Copyright (C) 2012-2021 Genome Research Ltd.
5+
Copyright (C) 2012-2022 Genome Research Ltd.
56
67
Author: Heng Li <[email protected]>
78
@@ -99,7 +100,7 @@ typedef struct __bcf_callaux_t {
99100
uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types)
100101
errmod_t *e;
101102
void *rghash;
102-
float indel_bias; // adjusts indel score threshold; lower => call more.
103+
float indel_bias_inverted; // adjusts indel score threshold, 1/--indel-bias, so lower => call more.
103104
} bcf_callaux_t;
104105

105106
// per-sample values

bam2bcf_indel.c

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* bam2bcf_indel.c -- indel caller.
22
33
Copyright (C) 2010, 2011 Broad Institute.
4-
Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd.
4+
Copyright (C) 2012-2014,2016-2017,2021-2022 Genome Research Ltd.
55
66
Author: Heng Li <[email protected]>
77
@@ -540,7 +540,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
540540
}
541541

542542
// used for adjusting indelQ below
543-
l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias;
543+
l = (int)((100. * sc / (qend - qbeg) + .499) * bca->indel_bias_inverted);
544544
*score = sc<<8 | MIN(255, l);
545545

546546
rep_ele *reps, *elt, *tmp;
@@ -623,8 +623,14 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
623623
seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
624624
}
625625
tmp = sc[0]>>6 & 0xff;
626+
627+
// Don't know how this indelQ reduction threshold of 111 was derived,
628+
// but it does not function well for longer reads that span multiple
629+
// events.
630+
//
626631
// reduce indelQ
627-
indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499);
632+
if ( bca->indel_bias_inverted >= 1 )
633+
indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499);
628634

629635
// Doesn't really help accuracy, but permits -h to take
630636
// affect still.
@@ -633,7 +639,7 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
633639
if (seqQ > 255) seqQ = 255;
634640
p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
635641
sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
636-
// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
642+
// fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
637643
}
638644
}
639645
// determine bca->indel_types[] and bca->inscns
@@ -922,7 +928,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
922928
fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s "
923929
"qbeg=%d tbeg=%d score=%d\n",
924930
pos, types[t], s, i, bam_get_qname(p->b),
925-
qbeg, tbeg, sc);
931+
qbeg, tbeg, score[K*n_types + t]);
926932
#endif
927933
}
928934
}

doc/bcftools.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2150,8 +2150,8 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for
21502150

21512151
1.12 -Q13 -h100 -m1
21522152
illumina [ default values ]
2153-
ont -B -Q5 --max-BQ 30 -I
2154-
pacbio-ccs -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 -M99999
2153+
ont -B -Q5 --max-BQ 30 --indel-bias 1.01 -I
2154+
pacbio-ccs -D -Q5 --max-BQ 50 --indel-bias 1.01 -F0.1 -o25 -e1 -M99999
21552155

21562156
*--ar, --ambig-reads* 'drop'|'incAD'|'incAD0'::
21572157
What to do with ambiguous indel reads that do not span an entire

mpileup.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -842,7 +842,7 @@ static int mpileup(mplp_conf_t *conf)
842842
conf->delta_baseQ);
843843
conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
844844
conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
845-
conf->bca->indel_bias = conf->indel_bias;
845+
conf->bca->indel_bias_inverted = 1 / conf->indel_bias;
846846
conf->bca->min_frac = conf->min_frac;
847847
conf->bca->min_support = conf->min_support;
848848
conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
@@ -1180,8 +1180,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
11801180
"Configuration profiles activated with -X, --config:\n"
11811181
" 1.12: -Q13 -h100 -m1 -F0.002\n"
11821182
" illumina: [ default values ]\n"
1183-
" ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n"
1184-
" pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
1183+
" ont: -B -Q5 --max-BQ 30 --indel-bias 1.01 -I [also try eg |bcftools call -P0.01]\n"
1184+
" pacbio-ccs: -D -Q5 --max-BQ 50 --indel-bias 1.01 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n"
11851185
"\n"
11861186
"Notes: Assuming diploid individuals.\n"
11871187
"\n"
@@ -1400,11 +1400,9 @@ int main_mpileup(int argc, char *argv[])
14001400
break;
14011401
case 'e': mplp.extQ = atoi(optarg); break;
14021402
case 'h': mplp.tandemQ = atoi(optarg); break;
1403-
case 10: // --indel-bias (inverted so higher => more indels called)
1404-
if (atof(optarg) < 1e-2)
1405-
mplp.indel_bias = 1/1e2;
1406-
else
1407-
mplp.indel_bias = 1/atof(optarg);
1403+
case 10: // --indel-bias
1404+
mplp.indel_bias = atof(optarg);
1405+
if ( mplp.indel_bias < 1e-2 ) mplp.indel_bias = 1e-2;
14081406
break;
14091407
case 15: {
14101408
char *tmp;
@@ -1441,13 +1439,15 @@ int main_mpileup(int argc, char *argv[])
14411439
mplp.extQ = 1;
14421440
mplp.flag |= MPLP_REALN_PARTIAL;
14431441
mplp.max_read_len = 99999;
1442+
mplp.indel_bias = 1.01;
14441443
} else if (strcasecmp(optarg, "ont") == 0) {
14451444
fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with "
14461445
"a higher -P, eg -P0.01 or -P 0.1\n");
14471446
mplp.min_baseQ = 5;
14481447
mplp.max_baseQ = 30;
14491448
mplp.flag &= ~MPLP_REALN;
14501449
mplp.flag |= MPLP_NO_INDEL;
1450+
mplp.indel_bias = 1.01;
14511451
} else if (strcasecmp(optarg, "1.12") == 0) {
14521452
// 1.12 and earlier
14531453
mplp.min_frac = 0.002;

0 commit comments

Comments
 (0)