Skip to content

Commit 50bc903

Browse files
committed
Add a new indel caller.
This is a combination of PR #1679 from 2022 coupled with more recent changes to replace BAQ with edlib for evaluating the alignment of reads against candidate alleles. Due to the complexity of rebasing many commits with many conflicts, these have been squashed together for simplicity. Please consult the original commit log in the PR for more detailed history. PR 1679 is a major restructuring of the indel caller that produces proper consensus sequences. TODO: paste collected commit messages here. The newer edlib changes are designed to dramtically increase the speed of indel calling on long read data by use of a modern algorithm. Note edlib's alignments are very crude, with just +1 for all differences including substitutions and each indel base. However if we've computed our candidate allele consensus sequences correctly then the aligning against the real allele should be a minimal score regardless of scoring regime, so the cost of an HMM or even affine weights do not help much. The lack of quality values is compensated for by detection of minimum quality within STRs.
1 parent 7a4f801 commit 50bc903

File tree

11 files changed

+3874
-32
lines changed

11 files changed

+3874
-32
lines changed

LICENSE

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,3 +772,28 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
772772
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
773773
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
774774
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
775+
776+
-----------------------------------------------------------------------------
777+
778+
License for edlib.[ch]
779+
780+
The MIT License (MIT)
781+
782+
Copyright (c) 2014 Martin Šošić
783+
784+
Permission is hereby granted, free of charge, to any person obtaining a copy of
785+
this software and associated documentation files (the "Software"), to deal in
786+
the Software without restriction, including without limitation the rights to
787+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
788+
the Software, and to permit persons to whom the Software is furnished to do so,
789+
subject to the following conditions:
790+
791+
The above copyright notice and this permission notice shall be included in all
792+
copies or substantial portions of the Software.
793+
794+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
795+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
796+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
797+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
798+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
799+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Makefile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ OBJS = main.o vcfindex.o tabix.o \
4040
vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o tsv2vcf.o \
4141
vcfcnv.o vcfhead.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \
4242
regidx.o smpl_ilist.o csq.o vcfbuf.o \
43-
mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o read_consensus.o bam_sample.o \
43+
mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o bam2bcf_edlib.o \
44+
read_consensus.o bam_sample.o \
4445
vcfsort.o cols.o extsort.o dist.o abuf.o \
45-
ccall.o em.o prob1.o kmin.o str_finder.o gff.o
46+
ccall.o em.o prob1.o kmin.o str_finder.o gff.o edlib.o
4647
PLUGIN_OBJS = vcfplugin.o
4748

4849
prefix = /usr/local
@@ -234,6 +235,7 @@ vcfbuf_h = vcfbuf.h $(htslib_vcf_h)
234235
abuf_h = abuf.h $(htslib_vcf_h)
235236
dbuf_h = dbuf.h $(htslib_vcf_h)
236237
bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h)
238+
edlib.h = edlib.h
237239
bam_sample_h = bam_sample.h $(htslib_sam_h)
238240
cigar_state_h = cigar_state.h $(htslib_hts_h) $(htslib_sam_h)
239241
read_consensus_h = read_consensus.h $(htslib_hts_h) $(htslib_sam_h)
@@ -285,6 +287,7 @@ mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(hts
285287
bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h
286288
bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) $(str_finder_h)
287289
bam2bcf_iaux.o: bam2bcf_iaux.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h)
290+
bam2bcf_edlib.o: bam2bcf_edlib.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bcftools_h) $(bam2bcf_h) $(htslib_ksort_h) $(read_consensus_h) $(cigar_state_h) $(edlib.h)
288291
read_consensus.o: read_consensus.c $(read_consensus_h) $(cigar_state_h) $(bcftools_h) kheap.h
289292
bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h)
290293
version.o: version.h version.c

bam2bcf.c

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,15 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
268268
bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
269269
}
270270

271+
// Detect if indel occurs anywhere in this sample
272+
int indel_in_sample = 0;
273+
if (bca->edlib) {
274+
for (i = n = 0; i < _n; ++i) {
275+
const bam_pileup1_t *p = pl + i;
276+
if (p->indel) indel_in_sample = 1;
277+
}
278+
}
279+
271280
// fill the bases array
272281
double nqual_over_60 = bca->nqual / 60.0;
273282
int ADR_ref_missed[4] = {0};
@@ -298,7 +307,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
298307
b = p->aux>>16&0x3f; // indel type
299308
seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
300309

301-
if ( !bca->indels_v20 )
310+
if (bca->edlib) {
311+
if (indel_in_sample) {
312+
seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
313+
} else {
314+
// An indel in another sample, but not this. So just use
315+
// basic sequence confidences.
316+
q = bam_get_qual(p->b)[p->qpos];
317+
if (q > bca->max_baseQ) q = bca->max_baseQ;
318+
seqQ = 99;
319+
}
320+
}
321+
322+
if ( !bca->indels_v20 && !bca->edlib )
302323
{
303324
/*
304325
This heuristics was introduced by e4e161068 and claims to fix #1446. However, we obtain
@@ -341,6 +362,25 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
341362
}
342363
continue;
343364
}
365+
366+
// FIXME: CHECK if this is still needed with edlib mode
367+
// It's a slight variant on the one above guarded by --indels-2.0
368+
if (bca->edlib) {
369+
if (indel_in_sample && p->indel == 0 && (q < _n/2 || _n > 20)) {
370+
// high quality indel calls without p->indel set aren't
371+
// particularly indicative of being a good REF match either,
372+
// at least not in low coverage. So require solid coverage
373+
// before we start utilising such quals.
374+
if (b != 0)
375+
b = 5;
376+
q = (int)bam_get_qual(p->b)[p->qpos];
377+
seqQ = (3*seqQ + 2*q)/8;
378+
}
379+
if (_n > 20 && seqQ > 40) seqQ = 40;
380+
}
381+
382+
// Note baseQ changes some output fields such as I16, but has no
383+
// significant affect on "call".
344384
baseQ = p->aux>>8&0xff;
345385
}
346386
else
@@ -478,9 +518,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
478518
for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp);
479519
}
480520

521+
// Else consider downgrading bca->bases[] scores by AD vs AD_ref_missed
522+
// ratios. This is detrimental on Illumina, but beneficial on PacBio CCS.
523+
// It's possibly related to the homopolyer error likelihoods or overall
524+
// Indel accuracy. Maybe tie this in to the -h option?
525+
481526
r->ori_depth = ori_depth;
482527
// glfgen
483528
errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
529+
530+
// TODO: account for the number of unassigned reads. If depth is 50,
531+
// but AD is 5,7 then it may look like a variant but it probably
532+
// should be low quality.
533+
484534
return n;
485535
}
486536

bam2bcf.h

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,15 @@ typedef struct __bcf_callaux_t {
122122
// for internal uses
123123
int max_bases;
124124
int indel_types[4]; // indel lengths
125-
int indel_win_size, indels_v20;
125+
int indel_win_size, indels_v20, edlib;
126126
int maxins, indelreg;
127127
int read_len;
128128
char *inscns;
129129
uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types)
130130
errmod_t *e;
131131
void *rghash;
132132
float indel_bias; // adjusts indel score threshold; lower => call more.
133+
float del_bias; // (-.9 < x < .9) error profile; >0 => more del, <0 => more ins
133134
int32_t *ref_nm, *alt_nm; // pointers to bcf_call_t.{ref_nm,alt_nm}
134135
unsigned int nnm[2]; // number of nm observations
135136
float nm[2]; // cumulative count of mismatches in ref and alt reads
@@ -193,11 +194,35 @@ extern "C" {
193194
const bcf_callaux_t *bca, const char *ref);
194195
int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
195196
int bcf_iaux_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
197+
int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
198+
bcf_callaux_t *bca, const char *ref, int ref_len);
199+
196200
void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call);
197201

198202
int bcf_cgp_l_run(const char *ref, int pos);
199203
int est_indelreg(int pos, const char *ref, int l, char *ins4);
200204

205+
/* ----------------------------------------------------------------------
206+
* Shared between bam2bcf_indel.c and bam2bcf_edlib.c
207+
*/
208+
209+
// Take a reference position tpos and convert to a query position (returned).
210+
// This uses the CIGAR string plus alignment c->pos to do the mapping.
211+
//
212+
// *_tpos is returned as tpos if query overlaps tpos, but for deletions
213+
// it'll be either the start (is_left) or end (!is_left) ref position.
214+
int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos);
215+
216+
// Identify spft-clip length, position in seq, and clipped seq len
217+
void get_pos(const bcf_callaux_t *bca, bam_pileup1_t *p,
218+
int *sc_len_r, int *slen_r, int *epos_r, int *end);
219+
220+
// Compute the consensus for this sample 's', minus indels which
221+
// get added later.
222+
char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp,
223+
int pos, int *types, int n_types,
224+
int max_ins, int s);
225+
201226
#ifdef __cplusplus
202227
}
203228
#endif

0 commit comments

Comments
 (0)