diff --git a/NEWS b/NEWS index 402d203cb..d7532302a 100644 --- a/NEWS +++ b/NEWS @@ -55,6 +55,9 @@ Changes affecting specific commands: --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set + - New option --no-indelQ-tweaks to increase sensitivity for indels, especially + in long reads. + * bcftools query - Make the `--samples` and `--samples-file` options work also in the `--list-samples` @@ -78,7 +81,7 @@ Changes affecting specific commands: Changes affecting the whole of bcftools, or multiple commands: * New `--regions-overlap` and `--targets-overlap` options which address - a long-standing design problem with subsetting VCF files by region. + a long-standing design problem with subsetting VCF files by region. BCFtools recognize two sets of options, one for streaming (`-t/-T`) and one for index-gumping (`-r/-R`). They behave differently, the first includes only records with POS coordinate within the regions, the other @@ -106,11 +109,11 @@ Changes affecting specific commands: by using `-c INFO/END`. - add a new '.' modifier to control wheter missing values should be carried - over from a tab-delimited file or not. For example: + over from a tab-delimited file or not. For example: -c TAG .. adds TAG if the source value is not missing. If TAG exists in the target file, it will be overwritten - + -c .TAG .. adds TAG even if the source value is missing. This can overwrite non-missing values with a missing value and can create empty VCF fields (`TAG=.`) @@ -239,7 +242,7 @@ Changes affecting specific commands: * bcftools +fill-tags: - Generalization and better support for custom functions that allow - adding new INFO tags based on arbitrary `-i, --include` type of + adding new INFO tags based on arbitrary `-i, --include` type of expressions. For example, to calculate a missing INFO/DP annotation from FORMAT/AD, it is possible to use: @@ -303,7 +306,7 @@ Changes affecting specific commands: - Atomization of AD and QS tags now correctly updates occurrences of duplicate alleles within different haplotypes - + - Fix a bug in atomization of Number=A,R tags * bcftools reheader: @@ -315,7 +318,7 @@ Changes affecting specific commands: - A wider range of genotypes can be set by the plugin by allowing specifying custom genotypes. For example, to force a heterozygous genotype it is now possible to use expressions like: - + c:'m|M' c:0/1 c:0 @@ -327,7 +330,7 @@ Changes affecting specific commands: - Better handling of ambiguous keys such as INFO/AF and CSQ/AD. The `-p, --annot-prefix` option is now applied before doing anything else which allows its use with `-f, --format` and `-c, --columns` options. - + - Some consequence field names may not constitute a valid tag name, such as "pos(1-based)". Newly field names are trimmed to exclude brackets. @@ -457,7 +460,7 @@ Changes affecting specific commands: * bcftools csq: - - Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too + - Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too many per-sample consequences - Fix a bug which incorrectly handled the --ncsq parameter and could clash diff --git a/bam2bcf.h b/bam2bcf.h index e778b8952..782c80770 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -1,7 +1,8 @@ /* bam2bcf.h -- variant calling. + mplp.indel_bias = 1.01; Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Author: Heng Li @@ -99,7 +100,8 @@ typedef struct __bcf_callaux_t { uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types) errmod_t *e; void *rghash; - float indel_bias; // adjusts indel score threshold; lower => call more. + float indel_bias_inverted; // adjusts indel score threshold, 1/--indel-bias, so lower => call more. + int no_indelQ_tweaks; } bcf_callaux_t; // per-sample values diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 108d50557..b98f515e8 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -1,7 +1,7 @@ /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd. + Copyright (C) 2012-2014,2016-2017,2021-2022 Genome Research Ltd. Author: Heng Li @@ -540,7 +540,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, } // used for adjusting indelQ below - l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias; + l = (int)((100. * sc / (qend - qbeg) + .499) * bca->indel_bias_inverted); *score = sc<<8 | MIN(255, l); rep_ele *reps, *elt, *tmp; @@ -623,8 +623,14 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); } tmp = sc[0]>>6 & 0xff; + + // Don't know how this indelQ reduction threshold of 111 was derived, + // but it does not function well for longer reads that span multiple + // events. + // // reduce indelQ - indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); + if ( !bca->no_indelQ_tweaks ) + indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); // Doesn't really help accuracy, but permits -h to take // affect still. @@ -633,7 +639,7 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (seqQ > 255) seqQ = 255; p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; - // fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + // fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns @@ -922,7 +928,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s " "qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), - qbeg, tbeg, sc); + qbeg, tbeg, score[K*n_types + t]); #endif } } diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 00ac621ba..2ecb7cc7b 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -213,7 +213,7 @@ specific commands to see if they apply. *--regions-overlap* '0'|'1'|'2':: This option controls how overlapping records are determined: set to *0* if the VCF record has to have POS inside a region - (this corresponds to the default behavior of *-t/-T*); + (this corresponds to the default behavior of *-t/-T*); set to *1* if also overlapping records with POS outside a region should be included (this is the default behavior of *-r/-R*); or set to *2* to include only true overlapping variation (compare @@ -278,7 +278,7 @@ The program ignores the first column and the last indicates sex (1=male, 2=femal *-T, --targets-file* \[^]'FILE':: Same *-t, --targets*, but reads regions from a file. Note that *-T* - cannot be used in combination with *-t*. + cannot be used in combination with *-t*. + With the *call -C* 'alleles' command, third column of the targets file must be comma-separated list of alleles, starting with the reference allele. @@ -478,7 +478,7 @@ Add or remove annotations. *--single-overlaps*:: use this option to keep memory requirements low with very large annotation files. Note, however, that this comes at a cost, only single overlapping intervals - are considered in this mode. This was the default mode until the commit + are considered in this mode. This was the default mode until the commit af6f0c9 (Feb 24 2019). *--threads* 'INT':: @@ -633,7 +633,7 @@ demand. The original calling model can be invoked with the *-c* option. text file with sample names in the first column and group names in the second column. If '-' is given instead, no HWE assumption is made at all and single-sample calling is performed. (Note that in low coverage data this inflates the rate of false positives.) The *-G* option requires the presence of - per-sample FORMAT/QS or FORMAT/AD tag generated with *bcftools mpileup -a QS* (or *-a AD*). + per-sample FORMAT/QS or FORMAT/AD tag generated with *bcftools mpileup -a QS* (or *-a AD*). *-g, --gvcf* 'INT':: output also gVCF blocks of homozygous REF calls. The parameter 'INT' is the @@ -892,7 +892,7 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *-H, --haplotype* '1'|'2'|'R'|'A'|'I'|'LR'|'LA'|'SR'|'SA'|'1pIu'|'2pIu':: choose which allele from the FORMAT/GT field to use (the codes are case-insensitive): - + '1';; the first allele, regardless of phasing @@ -1018,8 +1018,8 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the ==== GEN/SAMPLE conversion: *-G, --gensample2vcf* 'prefix' or 'gen-file','sample-file':: convert IMPUTE2 output to VCF. One of the ID columns ("SNP ID" or "rsID" in - https://www.cog-genomics.org/plink/2.0/formats#gen) must be of the form - "CHROM:POS_REF_ALT" to detect possible strand swaps. + https://www.cog-genomics.org/plink/2.0/formats#gen) must be of the form + "CHROM:POS_REF_ALT" to detect possible strand swaps. {nbsp} + When the *--vcf-ids* option is given, the other column (autodetected) is used to fill the ID column of the VCF. @@ -1279,7 +1279,7 @@ output VCF and are ignored for the prediction analysis. # # Attributes required for # gene lines: - # - ID=gene: + # - ID=gene: # - biotype= # - Name= [optional] # @@ -1553,7 +1553,7 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i that average score is used to determine the top matches, not absolute values. *--no-HWE-prob*:: - Disable calculation of HWE probability to reduce memory requirements with + Disable calculation of HWE probability to reduce memory requirements with comparisons between very large number of sample pairs. *-p, --pairs* 'LIST':: @@ -1622,11 +1622,11 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i // present, a constant value '99' is used for the unseen genotypes. With // *-G*, the value '1' can be used instead; the discordance value then // gives exactly the number of differing genotypes. -// +// // ERR, error rate;; // Pairwise error rate calculated as number of differences divided // by the total number of comparisons. -// +// // CLUSTER, TH, DOT;; // In presence of multiple samples, related samples and outliers can be // identified by clustering samples by error rate. A simple hierarchical @@ -1861,7 +1861,7 @@ For "vertical" merge take a look at *<>* or *<>* option is supplied. See also *--atom-overlaps* and *--old-rec-tag*. *--atom-overlaps* '.'|'*':: - Alleles missing because of an overlapping variant can be set either + Alleles missing because of an overlapping variant can be set either to missing (.) or to the star alele (*), as recommended by the VCF specification. IMPORTANT: Note that asterisk is expaneded by shell and must be put in quotes or escaped by a backslash: @@ -2286,7 +2293,7 @@ the *<>* option is supplied. can swap alleles and will update genotypes (GT) and AC counts, but will not attempt to fix PL or other fields. Also note, and this cannot be stressed enough, that 's' will NOT fix strand issues in - your VCF, do NOT use it for that purpose!!! (Instead see + your VCF, do NOT use it for that purpose!!! (Instead see and .) @@ -2330,7 +2337,7 @@ the *<>* option is supplied. *--old-rec-tag* 'STR':: Add INFO/STR annotation with the original record. The format of the - annotation is CHROM|POS|REF|ALT|USED_ALT_IDX. + annotation is CHROM|POS|REF|ALT|USED_ALT_IDX. *-o, --output* 'FILE':: see *<>* @@ -2949,11 +2956,11 @@ Transition probabilities: *-M, --rec-rate* 'FLOAT':: constant recombination rate per bp. In combination with *--genetic-map*, - the *--rec-rate* parameter is interpreted differently, as 'FLOAT'-fold increase of + the *--rec-rate* parameter is interpreted differently, as 'FLOAT'-fold increase of transition probabilities, which allows the model to become more sensitive yet still account for recombination hotspots. Note that also the range of the values is therefore different in both cases: normally the - parameter will be in the range (1e-3,1e-9) but with *--genetic-map* + parameter will be in the range (1e-3,1e-9) but with *--genetic-map* it will be in the range (10,1000). *-o, --output* 'FILE':: @@ -3192,7 +3199,7 @@ Convert between VCF and BCF. Former *bcftools subset*. Note that filter options below dealing with counting the number of alleles will, for speed, first check for the values of AC and AN in the INFO column to avoid parsing all the genotype (FORMAT/GT) fields in the VCF. This means -that a filter like '--min-af 0.1' will be calculated from INFO/AC and INFO/AN +that a filter like '--min-af 0.1' will be calculated from INFO/AC and INFO/AN when available or FORMAT/GT otherwise. However, it will not attempt to use any other existing field, like INFO/AF for example. For that, use '--exclude AF<0.1' instead. @@ -3411,7 +3418,7 @@ to require that all alleles are of the given type. Compare * array subscripts (0-based), "*" for any element, "-" to indicate a range. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and an element of the vector, as shown in the examples below - + INFO/AF[0] > 0.3 .. first AF value bigger than 0.3 FORMAT/AD[0:0] > 30 .. first AD value of the first sample bigger than 30 FORMAT/AD[0:1] .. first sample, second AD value @@ -3524,7 +3531,7 @@ used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluat TYPE="snp" && QUAL>=10 && (DP4[2]+DP4[3] > 2) - COUNT(GT="hom")=0 .. no homozygous genotypes at the site + COUNT(GT="hom")=0 .. no homozygous genotypes at the site AVG(GQ)>50 .. average (arithmetic mean) of genotype qualities bigger than 50 diff --git a/mpileup.c b/mpileup.c index fd5aa510e..a768b5934 100644 --- a/mpileup.c +++ b/mpileup.c @@ -73,6 +73,7 @@ typedef struct { int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels double indel_bias; + int no_indelQ_tweaks; char *reg_fname, *pl_list, *fai_fname, *output_fname; int reg_is_file, record_cmd_line, n_threads, clevel; faidx_t *fai; @@ -842,7 +843,8 @@ static int mpileup(mplp_conf_t *conf) conf->delta_baseQ); conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; - conf->bca->indel_bias = conf->indel_bias; + conf->bca->indel_bias_inverted = 1 / conf->indel_bias; + conf->bca->no_indelQ_tweaks = conf->no_indelQ_tweaks; conf->bca->min_frac = conf->min_frac; conf->bca->min_support = conf->min_support; conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; @@ -1175,13 +1177,15 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); fprintf(fp, " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); + fprintf(fp, + " --no-indelQ-tweaks Increase sensitivity by disabling indel quality adjustments\n"); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" " 1.12: -Q13 -h100 -m1 -F0.002\n" " illumina: [ default values ]\n" - " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" - " pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n" + " ont: -B -Q5 --max-BQ 30 --no-indelQ-tweaks -I [also try eg |bcftools call -P0.01]\n" + " pacbio-ccs: -D -Q5 --max-BQ 50 --no-indelQ-tweaks -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n" "\n" "Notes: Assuming diploid individuals.\n" "\n" @@ -1283,6 +1287,7 @@ int main_mpileup(int argc, char *argv[]) {"gap-frac", required_argument, NULL, 'F'}, {"indel-bias", required_argument, NULL, 10}, {"indel-size", required_argument, NULL, 15}, + {"no-indelQ-tweaks", no_argument, NULL, 20}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, @@ -1400,11 +1405,9 @@ int main_mpileup(int argc, char *argv[]) break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; - case 10: // --indel-bias (inverted so higher => more indels called) - if (atof(optarg) < 1e-2) - mplp.indel_bias = 1/1e2; - else - mplp.indel_bias = 1/atof(optarg); + case 10: // --indel-bias + mplp.indel_bias = atof(optarg); + if ( mplp.indel_bias < 1e-2 ) mplp.indel_bias = 1e-2; break; case 15: { char *tmp; @@ -1417,6 +1420,7 @@ int main_mpileup(int argc, char *argv[]) } } break; + case 20: mplp.no_indelQ_tweaks = 1; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1441,6 +1445,7 @@ int main_mpileup(int argc, char *argv[]) mplp.extQ = 1; mplp.flag |= MPLP_REALN_PARTIAL; mplp.max_read_len = 99999; + mplp.no_indelQ_tweaks = 1; } else if (strcasecmp(optarg, "ont") == 0) { fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with " "a higher -P, eg -P0.01 or -P 0.1\n"); @@ -1448,6 +1453,7 @@ int main_mpileup(int argc, char *argv[]) mplp.max_baseQ = 30; mplp.flag &= ~MPLP_REALN; mplp.flag |= MPLP_NO_INDEL; + mplp.no_indelQ_tweaks = 1; } else if (strcasecmp(optarg, "1.12") == 0) { // 1.12 and earlier mplp.min_frac = 0.002;