Skip to content

Commit ea3032d

Browse files
authored
Adding hard filter to M2 for polymorphic NuMTs and low VAF sites (#5842)
1 parent 21da1c0 commit ea3032d

File tree

11 files changed

+115
-111
lines changed

11 files changed

+115
-111
lines changed

src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/PolymorphicNuMT.java

Lines changed: 0 additions & 85 deletions
This file was deleted.

src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
4444
public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
4545
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";
4646
public static final String IGNORE_ITR_ARTIFACTS_LONG_NAME = "ignore-itr-artifacts";
47-
public static final String MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME = "median-autosomal-coverage";
4847
public static final String MITOCHONDRIA_MODE_LONG_NAME = "mitochondria-mode";
4948
public static final String CALLABLE_DEPTH_LONG_NAME = "callable-depth";
5049
public static final String PCR_SNV_QUAL_LONG_NAME = "pcr-snv-qual";
@@ -230,13 +229,6 @@ public double getInitialLod() {
230229
@Argument(fullName= IGNORE_ITR_ARTIFACTS_LONG_NAME, doc="Turn off read transformer that clips artifacts associated with end repair insertions near inverted tandem repeats.", optional = true)
231230
public boolean dontClipITRArtifacts = false;
232231

233-
/**
234-
* Used to model autosomal coverage when calling mitochondria. The median tends to be a more robust center statistic.
235-
*/
236-
@Advanced
237-
@Argument(fullName = MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME, doc="For mitochondrial calling only; Annotate possible polymorphic NuMT based on Poisson distribution given median autosomal coverage", optional = true)
238-
public double autosomalCoverage;
239-
240232
/**
241233
* When Mutect2 is run in reference confidence mode with banding compression enabled (-ERC GVCF), homozygous-reference
242234
* sites are compressed into bands of similar tumor LOD (TLOD) that are emitted as a single VCF record. See

src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -306,9 +306,6 @@ public void onTraversalStart() {
306306
public Collection<Annotation> makeVariantAnnotations(){
307307
final Collection<Annotation> annotations = super.makeVariantAnnotations();
308308

309-
if (MTAC.autosomalCoverage > 0) {
310-
annotations.add(new PolymorphicNuMT(MTAC.autosomalCoverage));
311-
}
312309
if (MTAC.mitochondria) {
313310
annotations.add(new OriginalAlignment());
314311
}

src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/M2FiltersArgumentCollection.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public class M2FiltersArgumentCollection {
3636
public double initialPosteriorThreshold = DEFAULT_INITIAL_POSTERIOR_THRESHOLD;
3737

3838
/**
39-
* Mitochondria mode includes the filter{@link ChimericOriginalAlignmentFilter}
39+
* Mitochondria mode includes the filters {@link ChimericOriginalAlignmentFilter} and {@link PolymorphicNuMTFilter},
4040
* and excludes the filters {@link ClusteredEventsFilter}, {@link MultiallelicFilter}, {@link PolymeraseSlippageFilter},
4141
* {@link FilteredHaplotypeFilter}, and {@link GermlineFilter}
4242
*/
@@ -55,9 +55,10 @@ public class M2FiltersArgumentCollection {
5555
public static final String MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_LONG_NAME = "max-median-fragment-length-difference";
5656
public static final String MIN_MEDIAN_READ_POSITION_LONG_NAME = "min-median-read-position";
5757
public static final String MAX_N_RATIO_LONG_NAME = "max-n-ratio";
58-
public static final String MIN_LOG_10_ODDS_DIVIDED_BY_DEPTH = "lod-divided-by-depth";
5958
public static final String MIN_READS_ON_EACH_STRAND_LONG_NAME = "min-reads-per-strand";
6059
public static final String MAX_NUMT_FRACTION_LONG_NAME = "max-numt-fraction";
60+
public static final String MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME = "autosomal-coverage";
61+
public static final String MIN_AF_LONG_NAME = "min-allele-fraction";
6162

6263
private static final int DEFAULT_MAX_EVENTS_IN_REGION = 2;
6364
private static final int DEFAULT_MAX_ALT_ALLELES = 1;
@@ -69,6 +70,8 @@ public class M2FiltersArgumentCollection {
6970
private static final double DEFAULT_MAX_N_RATIO = Double.POSITIVE_INFINITY;
7071
private static final int DEFAULT_MIN_READS_ON_EACH_STRAND = 0;
7172
private static final double DEFAULT_MAX_NUMT_FRACTION = 0.85;
73+
private static final double DEFAULT_MEDIAN_AUTOSOMAL_COVERAGE = 0;
74+
private static final double DEFAULT_MIN_AF = 0;
7275

7376
@Argument(fullName = MAX_EVENTS_IN_REGION_LONG_NAME, optional = true, doc = "Maximum events in a single assembly region. Filter all variants if exceeded.")
7477
public int maxEventsInRegion = DEFAULT_MAX_EVENTS_IN_REGION;
@@ -97,9 +100,15 @@ public class M2FiltersArgumentCollection {
97100
@Argument(fullName = MIN_READS_ON_EACH_STRAND_LONG_NAME, optional = true, doc = "Minimum alt reads required on both forward and reverse strands")
98101
public int minReadsOnEachStrand = DEFAULT_MIN_READS_ON_EACH_STRAND;
99102

103+
@Argument(fullName = MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME, optional = true, doc = "Median autosomal coverage for filtering potential polymporphic NuMTs when calling on mitochondria.")
104+
public double medianAutosomalCoverage = DEFAULT_MEDIAN_AUTOSOMAL_COVERAGE;
105+
100106
@Argument(fullName = MAX_NUMT_FRACTION_LONG_NAME, doc="Maximum fraction of alt reads that originally aligned outside the mitochondria. These are due to NuMTs.", optional = true)
101107
public double maxNuMTFraction = DEFAULT_MAX_NUMT_FRACTION;
102108

109+
@Argument(fullName = MIN_AF_LONG_NAME, doc="Minimum allele fraction required", optional = true)
110+
public double minAf = DEFAULT_MIN_AF;
111+
103112

104113
/**
105114
* Input files and values to use if inputs are missing
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package org.broadinstitute.hellbender.tools.walkers.mutect.filtering;
2+
3+
import htsjdk.variant.variantcontext.VariantContext;
4+
import org.apache.commons.lang.mutable.MutableBoolean;
5+
import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils;
6+
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
7+
8+
import java.util.*;
9+
import java.util.stream.IntStream;
10+
11+
public class MinAlleleFractionFilter extends HardFilter {
12+
private final double minAf;
13+
14+
public MinAlleleFractionFilter(final double minAf) {
15+
this.minAf = minAf;
16+
}
17+
18+
@Override
19+
public ErrorType errorType() { return ErrorType.ARTIFACT; }
20+
21+
@Override
22+
public boolean isArtifact(final VariantContext vc, final Mutect2FilteringEngine filteringEngine) {
23+
return vc.getGenotypes().stream().filter(filteringEngine::isTumor)
24+
.filter(g -> g.hasExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY))
25+
.anyMatch(g -> {
26+
final double[] alleleFractions = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(g, GATKVCFConstants.ALLELE_FRACTION_KEY, () -> null, 1.0);
27+
final int numRealAlleles = vc.hasSymbolicAlleles() ? alleleFractions.length - 1 : alleleFractions.length;
28+
final OptionalDouble max = IntStream.range(0, numRealAlleles).mapToDouble(a -> alleleFractions[a]).max();
29+
return max.getAsDouble() < minAf;
30+
});
31+
}
32+
33+
@Override
34+
public String filterName() {
35+
return GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME;
36+
}
37+
38+
@Override
39+
protected List<String> requiredAnnotations() { return Collections.emptyList(); }
40+
}

src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/Mutect2FilteringEngine.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ private void buildFiltersList(final M2FiltersArgumentCollection MTFAC) {
206206
filters.add(new NRatioFilter(MTFAC.nRatio));
207207
filters.add(new StrictStrandBiasFilter(MTFAC.minReadsOnEachStrand));
208208
filters.add(new ReadPositionFilter(MTFAC.minMedianReadPosition));
209+
filters.add(new MinAlleleFractionFilter(MTFAC.minAf));
209210

210211
if (!MTFAC.readOrientationPriorTarGzs.isEmpty()) {
211212
final List<File> artifactTables = MTFAC.readOrientationPriorTarGzs.stream().flatMap(tarGz -> {
@@ -219,6 +220,7 @@ private void buildFiltersList(final M2FiltersArgumentCollection MTFAC) {
219220

220221
if (MTFAC.mitochondria) {
221222
filters.add(new ChimericOriginalAlignmentFilter(MTFAC.maxNuMTFraction));
223+
filters.add(new PolymorphicNuMTFilter(MTFAC.medianAutosomalCoverage));
222224
} else {
223225
filters.add(new ClusteredEventsFilter(MTFAC.maxEventsInRegion));
224226
filters.add(new MultiallelicFilter(MTFAC.numAltAllelesThreshold));
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package org.broadinstitute.hellbender.tools.walkers.mutect.filtering;
2+
3+
import htsjdk.variant.variantcontext.Genotype;
4+
import htsjdk.variant.variantcontext.VariantContext;
5+
import org.apache.commons.lang.mutable.MutableBoolean;
6+
import org.apache.commons.math3.distribution.PoissonDistribution;
7+
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
8+
9+
import java.util.Collections;
10+
import java.util.List;
11+
import java.util.OptionalInt;
12+
import java.util.stream.IntStream;
13+
14+
public class PolymorphicNuMTFilter extends HardFilter {
15+
private static final double LOWER_BOUND_PROB = .01;
16+
private static final double MULTIPLE_COPIES_MULTIPLIER = 1.5;
17+
private final int maxAltDepthCutoff;
18+
19+
public PolymorphicNuMTFilter(final double medianAutosomalCoverage){
20+
if (medianAutosomalCoverage != 0) {
21+
final PoissonDistribution autosomalCoverage = new PoissonDistribution(medianAutosomalCoverage * MULTIPLE_COPIES_MULTIPLIER);
22+
maxAltDepthCutoff = autosomalCoverage.inverseCumulativeProbability(1 - LOWER_BOUND_PROB);
23+
} else {
24+
maxAltDepthCutoff = 0;
25+
}
26+
}
27+
28+
@Override
29+
public ErrorType errorType() { return ErrorType.NON_SOMATIC; }
30+
31+
@Override
32+
public boolean isArtifact(final VariantContext vc, final Mutect2FilteringEngine filteringEngine) {
33+
return vc.getGenotypes().stream().filter(filteringEngine::isTumor)
34+
.filter(Genotype::hasAD)
35+
.anyMatch(g -> {
36+
final int[] alleleDepths = g.getAD();
37+
final int numRealAlleles = vc.hasSymbolicAlleles() ? alleleDepths.length - 1 : alleleDepths.length;
38+
//Start at first alternate allele depth (the ref allele is first)
39+
final OptionalInt max = IntStream.range(1, numRealAlleles).map(a -> alleleDepths[a]).max();
40+
return max.getAsInt() < maxAltDepthCutoff;
41+
});
42+
}
43+
44+
@Override
45+
public String filterName() {
46+
return GATKVCFConstants.POTENTIAL_POLYMORPHIC_NUMT_FILTER_NAME;
47+
}
48+
49+
@Override
50+
protected List<String> requiredAnnotations() { return Collections.emptyList(); }
51+
}

src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFConstants.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,6 @@ public final class GATKVCFConstants {
144144

145145
// M2-specific FORMAT keys
146146
public static final String ALLELE_FRACTION_KEY = "AF";
147-
public static final String POTENTIAL_POLYMORPHIC_NUMT_KEY = "NUMT";
148147

149148
//FILTERS
150149
/* Note that many filters used throughout GATK (most notably in VariantRecalibration) are dynamic,
@@ -171,6 +170,8 @@ their names (or descriptions) depend on some threshold. Those filters are not i
171170
public final static String STRICT_STRAND_BIAS_FILTER_NAME = "strict_strand";
172171
public final static String N_RATIO_FILTER_NAME = "n_ratio";
173172
public final static String CHIMERIC_ORIGINAL_ALIGNMENT_FILTER_NAME = "numt_chimera"; //mitochondria
173+
public final static String ALLELE_FRACTION_FILTER_NAME = "low_allele_frac";
174+
public static final String POTENTIAL_POLYMORPHIC_NUMT_FILTER_NAME = "numt_novel";
174175

175176
public static final List<String> MUTECT_FILTER_NAMES = Arrays.asList(POLYMERASE_SLIPPAGE,
176177
PON_FILTER_NAME, CLUSTERED_EVENTS_FILTER_NAME, TUMOR_EVIDENCE_FILTER_NAME, GERMLINE_RISK_FILTER_NAME,
@@ -179,7 +180,7 @@ their names (or descriptions) depend on some threshold. Those filters are not i
179180
MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_FILTER_NAME,
180181
READ_POSITION_FILTER_NAME, CONTAMINATION_FILTER_NAME, DUPLICATED_EVIDENCE_FILTER_NAME,
181182
READ_ORIENTATION_ARTIFACT_FILTER_NAME, BAD_HAPLOTYPE_FILTER_NAME, CHIMERIC_ORIGINAL_ALIGNMENT_FILTER_NAME,
182-
STRICT_STRAND_BIAS_FILTER_NAME, N_RATIO_FILTER_NAME);
183+
STRICT_STRAND_BIAS_FILTER_NAME, N_RATIO_FILTER_NAME, ALLELE_FRACTION_FILTER_NAME, POTENTIAL_POLYMORPHIC_NUMT_FILTER_NAME);
183184

184185
// Symbolic alleles
185186
public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT";

src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFHeaderLines.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,11 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
9292
addFilterLine(new VCFFilterHeaderLine(BAD_HAPLOTYPE_FILTER_NAME, "Variant near filtered variant on same haplotype."));
9393
addFilterLine(new VCFFilterHeaderLine(STRICT_STRAND_BIAS_FILTER_NAME, "Evidence for alt allele is not represented in both directions"));
9494
addFilterLine(new VCFFilterHeaderLine(N_RATIO_FILTER_NAME, "Ratio of N to alt exceeds specified ratio"));
95+
addFilterLine(new VCFFilterHeaderLine(ALLELE_FRACTION_FILTER_NAME, "Allele fraction is below specified threshold"));
9596

9697
//Mitochondrial M2-related filters
9798
addFilterLine(new VCFFilterHeaderLine(CHIMERIC_ORIGINAL_ALIGNMENT_FILTER_NAME, "NuMT variant with too many ALT reads originally from autosome"));
99+
addFilterLine(new VCFFilterHeaderLine(POTENTIAL_POLYMORPHIC_NUMT_FILTER_NAME, "Alt depth is below expected coverage of NuMT in autosome"));
98100

99101
addFormatLine(new VCFFormatHeaderLine(ALLELE_BALANCE_KEY, 1, VCFHeaderLineType.Float, "Allele balance for each het genotype"));
100102
addFormatLine(new VCFFormatHeaderLine(MAPPING_QUALITY_ZERO_BY_SAMPLE_KEY, 1, VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample"));
@@ -122,7 +124,6 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
122124
addFormatLine(new VCFFormatHeaderLine(ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele fractions of alternate alleles in the tumor"));
123125
addFormatLine(new VCFFormatHeaderLine(F1R2_KEY, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Count of reads in F1R2 pair orientation supporting each allele"));
124126
addFormatLine(new VCFFormatHeaderLine(F2R1_KEY, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Count of reads in F2R1 pair orientation supporting each allele"));
125-
addFormatLine(new VCFFormatHeaderLine(POTENTIAL_POLYMORPHIC_NUMT_KEY, 1, VCFHeaderLineType.String, "Potentially a polymorphic NuMT false positive rather than a real mitochondrial variant."));
126127

127128
addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"));
128129
addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"));

0 commit comments

Comments
 (0)