Skip to content

Commit f0ad6aa

Browse files
committed
Work around a bug in the LOFTEE VEP plugin used to annotate gnomAD VCFs
The LoF_info subfield contains commas which, in general, makes it impossible to parse the VEP subfields in automated way. The +split-vep plugin can now work with such files, replacing the offending commas with slash (/) characters. Note that this makes two assumptions: 1) the number of subfields delimited by the pipe characters (|) are consistent with the header definition 2) the first subfield never contains a comma, otherwise it woud be impossible to distinguish between A|A,A,B,B|B and A|A,A,A,B|B See also Ensembl/ensembl-vep#1351
1 parent f4fac3c commit f0ad6aa

File tree

5 files changed

+81
-12
lines changed

5 files changed

+81
-12
lines changed

NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,11 @@ Changes affecting specific commands:
128128

129129
- New `-H, --print-header` option to print the header with `-f`
130130

131+
- Work around a bug in the LOFTEE VEP plugin used to annotate gnomAD VCFs. There the
132+
LoF_info subfield contains commas which, in general, makes it impossible to parse the
133+
VEP subfields. The +split-vep plugin can now work with such files, replacing the offending
134+
commas with slash (/) characters. See also https://github.com/Ensembl/ensembl-vep/issues/1351
135+
131136
* bcftools stats
132137

133138
- The per-sample stats (PSC) would not be computed when `-i/-e` filtering options and

plugins/split-vep.c

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* The MIT License
22
3-
Copyright (c) 2019-2022 Genome Research Ltd.
3+
Copyright (c) 2019-2023 Genome Research Ltd.
44
55
Author: Petr Danecek <[email protected]>
66
@@ -1095,6 +1095,55 @@ static void restrict_csqs_to_genes(args_t *args)
10951095
}
10961096
args->ncols_csq = nhit;
10971097
}
1098+
1099+
// Split the VEP annotation by transcript and by field, then check if the number of subfields looks alright.
1100+
// Unfortunately, we cannot enforce the number of subfields to match the header definition because that can
1101+
// be variable: `bcftools csq` outputs different number of fields for different consequence types.
1102+
// So we need to distinguish between this reasonable case and incorrectly formated consequences such
1103+
// as those reported for LoF_info subfield here https://github.com/Ensembl/ensembl-vep/issues/1351.
1104+
static void split_csq_fields(args_t *args, bcf1_t *rec, int csq_str_len)
1105+
{
1106+
int i;
1107+
args->cols_tr = cols_split(args->csq_str, args->cols_tr, ',');
1108+
if ( args->cols_tr->n > args->mcols_csq )
1109+
{
1110+
args->cols_csq = (cols_t**)realloc(args->cols_csq,args->cols_tr->n*sizeof(*args->cols_csq));
1111+
for (i=args->mcols_csq; i<args->cols_tr->n; i++) args->cols_csq[i] = NULL;
1112+
args->mcols_csq = args->cols_tr->n;
1113+
}
1114+
args->ncols_csq = args->cols_tr->n;
1115+
int nfield_diff = 0, need_fix = 0;
1116+
for (i=0; i<args->cols_tr->n; i++)
1117+
{
1118+
args->cols_csq[i] = cols_split(args->cols_tr->off[i], args->cols_csq[i], '|');
1119+
if ( args->csq_idx >= args->cols_csq[i]->n ) need_fix = 1;
1120+
if ( nfield_diff < abs(args->cols_csq[i]->n - args->nfield) ) nfield_diff = args->cols_csq[i]->n - args->nfield;
1121+
}
1122+
if ( !csq_str_len ) return; // called 2nd time, don't attempt to fix
1123+
if ( !need_fix ) return;
1124+
1125+
static int warned = 0;
1126+
if ( !warned )
1127+
fprintf(stderr,"Warning: The number of INFO/%s subfields at %s:%"PRIhts_pos" does not match the header definition,\n"
1128+
" expected %d subfields, found as %s as %d. (This warning is printed only once.)\n",
1129+
args->vep_tag,bcf_seqname(args->hdr,rec),rec->pos+1,args->nfield,nfield_diff>0?"much":"few",args->nfield+nfield_diff);
1130+
warned = 1;
1131+
1132+
// One known failure mode is LoF_info subfield which can contain commas. Work around this by relying on
1133+
// the number of pipe delimiters matching the header definition, replacing offending commas with slash
1134+
// characters. This assumes that there is never a comma in the first subfield, otherwise it would be
1135+
// impossible to distinguish between A|A,A,B,B|B and A|A,A,A,B|B
1136+
int npipe = 0;
1137+
while ( csq_str_len > 0 )
1138+
{
1139+
csq_str_len--;
1140+
if ( args->csq_str[csq_str_len]=='|' ) { npipe++; continue; }
1141+
if ( args->csq_str[csq_str_len]!=',' ) continue;
1142+
if ( npipe && !(npipe % (args->nfield-1)) ) { npipe = 0; continue; } // wholesome number of pipes encountered, this is a valid comma
1143+
args->csq_str[csq_str_len] = '/';
1144+
}
1145+
split_csq_fields(args,rec,0); // try once more
1146+
}
10981147
static void process_record(args_t *args, bcf1_t *rec)
10991148
{
11001149
int i,len = bcf_get_info_string(args->hdr,rec,args->vep_tag,&args->csq_str,&args->ncsq_str);
@@ -1108,17 +1157,7 @@ static void process_record(args_t *args, bcf1_t *rec)
11081157
return;
11091158
}
11101159

1111-
// split by transcript and by field
1112-
args->cols_tr = cols_split(args->csq_str, args->cols_tr, ',');
1113-
if ( args->cols_tr->n > args->mcols_csq )
1114-
{
1115-
args->cols_csq = (cols_t**)realloc(args->cols_csq,args->cols_tr->n*sizeof(*args->cols_csq));
1116-
for (i=args->mcols_csq; i<args->cols_tr->n; i++) args->cols_csq[i] = NULL;
1117-
args->mcols_csq = args->cols_tr->n;
1118-
}
1119-
args->ncols_csq = args->cols_tr->n;
1120-
for (i=0; i<args->cols_tr->n; i++)
1121-
args->cols_csq[i] = cols_split(args->cols_tr->off[i], args->cols_csq[i], '|');
1160+
split_csq_fields(args,rec,len);
11221161

11231162
// restrict to -g genes
11241163
if ( args->genes ) restrict_csqs_to_genes(args);

test/split-vep.broken-LoF.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
chr21:5032064 missense_variant .
2+
chr21:5032064 missense_variant .
3+
chr21:5032064 missense_variant .
4+
chr21:5032064 3_prime_UTR_variant&NMD_transcript_variant .
5+
chr21:5032064 missense_variant .
6+
chr21:5032064 missense_variant .
7+
chr21:5032064 missense_variant .
8+
chr21:5032064 missense_variant .
9+
chr21:5032064 missense_variant .
10+
chr21:5032064 frameshift_variant PERCENTILE:0.773118279569892/GERP_DIST:-366.377766615897/BP_DIST:218/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745
11+
chr21:5032064 frameshift_variant PERCENTILE:0.635578583765112/GERP_DIST:-366.377766615897/BP_DIST:218/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745
12+
chr21:5032064 frameshift_variant PERCENTILE:0.659498207885305/GERP_DIST:-372.525567065179/BP_DIST:197/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745
13+
chr21:5032064 3_prime_UTR_variant&NMD_transcript_variant .
14+
chr21:5032064 frameshift_variant PERCENTILE:0.790979097909791/GERP_DIST:-372.525567065179/BP_DIST:197/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/ANN_ORF:-698.745/MAX_ORF:-698.745
15+
chr21:5032064 frameshift_variant PERCENTILE:0.790979097909791/GERP_DIST:-372.525567065179/BP_DIST:197/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT
16+
chr21:5032064 frameshift_variant PERCENTILE:0.463571889103804/GERP_DIST:-1141.14512844086/BP_DIST:840/DIST_FROM_LAST_EXON:152/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT
17+
chr21:5032064 frameshift_variant PERCENTILE:0.662062615101289/GERP_DIST:-354.294564935565/BP_DIST:374/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT
18+
chr21:5032064 frameshift_variant PERCENTILE:0.785792349726776/GERP_DIST:-391.862466733158/BP_DIST:203/DIST_FROM_LAST_EXON:187/50_BP_RULE:PASS/PHYLOCSF_TOO_SHORT

test/split-vep.broken-LoF.vcf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
##fileformat=VCFv4.2
2+
##INFO=<ID=vep,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|ALLELE_NUM|DISTANCE|STRAND|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF|LoF_filter|LoF_flags|LoF_info">
3+
##contig=<ID=chr21,length=46709983,assembly=gnomAD_GRCh38>
4+
#CHROM POS ID REF ALT QUAL FILTER INFO
5+
chr21 5032064 . G A . . vep=A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000612610|protein_coding|5/7||ENST00000612610.4:c.709G>A|ENSP00000483732.1:p.Gly237Arg|896|709|237|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene|||1|A2||ENSP00000483732|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000620481|protein_coding|4/6||ENST00000620481.4:c.358G>A|ENSP00000484302.1:p.Gly120Arg|545|358|120|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene|||5|||ENSP00000484302|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000623795|protein_coding|4/6||ENST00000623795.1:c.358G>A|ENSP00000485649.1:p.Gly120Arg|505|358|120|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene|||2|||ENSP00000485649|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|3_prime_UTR_variant&NMD_transcript_variant|MODIFIER|FP565260.3|ENSG00000277117|Transcript|ENST00000623903|nonsense_mediated_decay|5/7||ENST00000623903.3:c.*323G>A||706|||||1||1|SNV||Clone_based_ensembl_gene|||2|||ENSP00000485557||||||||||||||||,A|missense_variant|MODERATE|FP565260.3|ENSG00000277117|Transcript|ENST00000623960|protein_coding|5/7||ENST00000623960.4:c.709G>A|ENSP00000485129.1:p.Gly237Arg|858|709|237|G/R|Gga/Aga|1||1|SNV||Clone_based_ensembl_gene||YES|1|P2|CCDS86973.1|ENSP00000485129|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|NM_001363770.2|protein_coding|5/7||NM_001363770.2:c.709G>A|NP_001350699.1:p.Gly237Arg|858|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||YES||||NP_001350699.1||||||||||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|XM_006723899.2|protein_coding|5/6||XM_006723899.2:c.709G>A|XP_006723962.1:p.Gly237Arg|1345|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||||||XP_006723962.1||||||||||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|XM_011546078.2|protein_coding|5/7||XM_011546078.2:c.709G>A|XP_011544380.1:p.Gly237Arg|1345|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||||||XP_011544380.1||||||||||||||||,A|missense_variant|MODERATE|LOC102723996|102723996|Transcript|XM_011546079.1|protein_coding|5/7||XM_011546079.1:c.709G>A|XP_011544381.1:p.Gly237Arg|1345|709|237|G/R|Gga/Aga|1||1|SNV||EntrezGene||||||XP_011544381.1||||||||||||||||
6+
chr21 5032064 . G GGA . . vep=GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000612610|protein_coding|5/7||ENST00000612610.4:c.718_719dup|ENSP00000483732.1:p.Asp240GlufsTer35|896-897|709-710|237|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene|||1|A2||ENSP00000483732|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.773118279569892,GERP_DIST:-366.377766615897,BP_DIST:218,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000620481|protein_coding|4/6||ENST00000620481.4:c.367_368dup|ENSP00000484302.1:p.Asp123GlufsTer35|545-546|358-359|120|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene|||5|||ENSP00000484302|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.635578583765112,GERP_DIST:-366.377766615897,BP_DIST:218,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000623795|protein_coding|4/6||ENST00000623795.1:c.367_368dup|ENSP00000485649.1:p.Asp123GlufsTer35|505-506|358-359|120|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene|||2|||ENSP00000485649|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.659498207885305,GERP_DIST:-372.525567065179,BP_DIST:197,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|3_prime_UTR_variant&NMD_transcript_variant|MODIFIER|FP565260.3|ENSG00000277117|Transcript|ENST00000623903|nonsense_mediated_decay|5/7||ENST00000623903.3:c.*332_*333dup||706-707|||||1||1|insertion||Clone_based_ensembl_gene|||2|||ENSP00000485557||||||||||||||||,GA|frameshift_variant|HIGH|FP565260.3|ENSG00000277117|Transcript|ENST00000623960|protein_coding|5/7||ENST00000623960.4:c.718_719dup|ENSP00000485129.1:p.Asp240GlufsTer35|858-859|709-710|237|G/GX|gga/gGAga|1||1|insertion||Clone_based_ensembl_gene||YES|1|P2|CCDS86973.1|ENSP00000485129|||||||PANTHER:PTHR24100&PANTHER:PTHR24100|10|||||HC||PHYLOCSF_WEAK|PERCENTILE:0.790979097909791,GERP_DIST:-372.525567065179,BP_DIST:197,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,ANN_ORF:-698.745,MAX_ORF:-698.745,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|NM_001363770.2|protein_coding|5/7||NM_001363770.2:c.718_719dup|NP_001350699.1:p.Asp240GlufsTer35|858-859|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||YES||||NP_001350699.1||||||||10|||||HC|||PERCENTILE:0.790979097909791,GERP_DIST:-372.525567065179,BP_DIST:197,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|XM_006723899.2|protein_coding|5/6||XM_006723899.2:c.718_719dup|XP_006723962.1:p.Asp240GlufsTer35|1345-1346|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||||||XP_006723962.1||||||||10|||||HC|||PERCENTILE:0.463571889103804,GERP_DIST:-1141.14512844086,BP_DIST:840,DIST_FROM_LAST_EXON:152,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|XM_011546078.2|protein_coding|5/7||XM_011546078.2:c.718_719dup|XP_011544380.1:p.Asp240GlufsTer35|1345-1346|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||||||XP_011544380.1||||||||10|||||HC|||PERCENTILE:0.662062615101289,GERP_DIST:-354.294564935565,BP_DIST:374,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT,GA|frameshift_variant|HIGH|LOC102723996|102723996|Transcript|XM_011546079.1|protein_coding|5/7||XM_011546079.1:c.718_719dup|XP_011544381.1:p.Asp240GlufsTer35|1345-1346|709-710|237|G/GX|gga/gGAga|1||1|insertion||EntrezGene||||||XP_011544381.1||||||||10|||||HC|||PERCENTILE:0.785792349726776,GERP_DIST:-391.862466733158,BP_DIST:203,DIST_FROM_LAST_EXON:187,50_BP_RULE:PASS,PHYLOCSF_TOO_SHORT

test/test.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,7 @@
659659
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.1.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n']);
660660
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.2.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g {PATH}/split-vep.gene-list.txt]);
661661
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.gene-list',out=>'split-vep.gene-list.3.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Gene %Consequence\\n' -g +{PATH}/split-vep.gene-list.txt]);
662+
run_test(\&test_vcf_plugin,$opts,in=>'split-vep.broken-LoF',out=>'split-vep.broken-LoF.out',cmd=>'+split-vep',args=>qq[-d -f '%CHROM:%POS %Consequence %LoF_info\\n' -a vep]);
662663
run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.1.out',cmd=>'+parental-origin',args=>qq[-r 20:100 -p proband,father,mother -t del | grep -v ^#]);
663664
run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.2.out',cmd=>'+parental-origin',args=>qq[-r 20:101 -p proband,father,mother -t del | grep -v ^#]);
664665
run_test(\&test_vcf_plugin,$opts,in=>'parental-origin',out=>'parental-origin.3.out',cmd=>'+parental-origin',args=>qq[-r 20:102 -p proband,father,mother -t del | grep -v ^#]);

0 commit comments

Comments
 (0)