Skip to content

Commit a21447f

Browse files
LeeTL1220lbergelson
authored andcommitted
Fixing hash collision issue when adding filter value to the variant context (#3305)
* Fixing hash collision issue when adding filter value to the variant context. * adding a new class UniqueIDWrapper which wraps an object and associates it with a (per jvm invocation) unique id.
1 parent 349ac2d commit a21447f

File tree

5 files changed

+130
-27
lines changed

5 files changed

+130
-27
lines changed

src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFilterer.java

Lines changed: 45 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,16 @@
66
import org.apache.logging.log4j.LogManager;
77
import org.apache.logging.log4j.Logger;
88
import org.broadinstitute.hellbender.engine.ProgressMeter;
9-
import org.broadinstitute.hellbender.exceptions.UserException;
9+
import org.broadinstitute.hellbender.exceptions.GATKException;
1010
import org.broadinstitute.hellbender.tools.picard.analysis.artifacts.Transition;
1111
import org.broadinstitute.hellbender.utils.SimpleInterval;
12+
import org.broadinstitute.hellbender.utils.UniqueIDWrapper;
1213
import org.broadinstitute.hellbender.utils.Utils;
1314
import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList;
1415
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
1516
import org.broadinstitute.hellbender.utils.param.ParamUtils;
1617
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
17-
18+
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
1819
import java.util.*;
1920
import java.util.stream.Collectors;
2021
import java.util.stream.IntStream;
@@ -58,7 +59,8 @@ public static VariantContext annotateVariantContextWithPreprocessingValues(final
5859

5960
final List<Allele> alleles = genotype.getAlleles();
6061
if (genotype.getPloidy() != 2) {
61-
logger.warn("No action required: This tool will skip non-diploid sites. Saw GT: " + genotype.getGenotypeString() + " at " + vc.toStringWithoutGenotypes());
62+
logger.debug("No action required: This tool will skip non-diploid sites. Saw GT: {} at {}",
63+
genotype::getGenotypeString, vc::toStringWithoutGenotypes);
6264
}
6365

6466
// Get the reference allele as a String and make sure that there is only one ref allele and that it is length
@@ -131,7 +133,7 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
131133
}
132134

133135
final List<String> sampleNames = preAdapterQAnnotatedVariants.get(0).getSampleNamesOrderedByName();
134-
final Map<String, SortedMap<Genotype, VariantContext>> sampleNameToVariants = createSampleToGenotypeVariantContextSortedMap(sampleNames, preAdapterQAnnotatedVariants);
136+
final Map<String, SortedMap<UniqueIDWrapper<Genotype>, VariantContext>> sampleNameToVariants = createSampleToGenotypeVariantContextSortedMap(sampleNames, preAdapterQAnnotatedVariants);
135137

136138
// This map will hold all updated genotypes (across samples)
137139
final Map<VariantContext, List<Genotype>> newGenotypes = new HashMap<>();
@@ -143,7 +145,7 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
143145
// I.e. only remove filtered or ref/ref genotypes and count the rest.
144146
final long unfilteredGenotypeCount = OrientationBiasUtils.calculateUnfilteredNonRefGenotypeCount(preAdapterQAnnotatedVariants, sampleName);
145147

146-
final SortedMap<Genotype, VariantContext> genotypesToConsiderForFiltering = sampleNameToVariants.get(sampleName);
148+
final SortedMap<UniqueIDWrapper<Genotype>, VariantContext> genotypesToConsiderForFiltering = sampleNameToVariants.get(sampleName);
147149

148150
// Save some time, especially for the normal sample
149151
if (genotypesToConsiderForFiltering.keySet().size() == 0) {
@@ -171,7 +173,8 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
171173

172174
final Map<Transition, Long> transitionCutSoFar = new HashMap<>();
173175
relevantTransitions.stream().forEach(transition -> transitionCutSoFar.put(transition, 0L));
174-
for (final Genotype genotype : genotypesToConsiderForFiltering.keySet()) {
176+
for (final UniqueIDWrapper<Genotype> genotypeWrapped : genotypesToConsiderForFiltering.keySet()) {
177+
final Genotype genotype = genotypeWrapped.getWrapped();
175178
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(genotype);
176179
// Since we only have the ALT_F2R1 and ALT_F1R2 counts for the first alt allele, we will not consider a site where transition is not artifact mode in the first alt allele.
177180
final Transition transition = Transition.transitionOf(genotype.getAllele(0).getBaseString().charAt(0), genotype.getAllele(1).getBaseString().charAt(0));
@@ -195,7 +198,7 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
195198
}
196199
}
197200

198-
newGenotypes.computeIfAbsent(genotypesToConsiderForFiltering.get(genotype), v -> new ArrayList<>()).add(genotypeBuilder.make());
201+
newGenotypes.computeIfAbsent(genotypesToConsiderForFiltering.get(genotypeWrapped), v -> new ArrayList<>()).add(genotypeBuilder.make());
199202
}
200203
}
201204

@@ -210,8 +213,11 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
210213
final List<Genotype> newGenotypesForThisVariantContext = newGenotypes.get(vc);
211214
newGenotypesForThisVariantContext.forEach(gcc::replace);
212215
final VariantContextBuilder variantContextBuilder = new VariantContextBuilder(vc).genotypes(gcc);
216+
217+
// Add the orientation bias filter to the variant context.
213218
if (newGenotypesForThisVariantContext.stream().anyMatch(g -> (g != null) && (g.getFilters() != null) && (g.getFilters().contains(OrientationBiasFilterConstants.IS_ORIENTATION_BIAS_CUT)))) {
214-
variantContextBuilder.filter(OrientationBiasFilterConstants.IS_ORIENTATION_BIAS_CUT);
219+
final List<String> result = GATKVariantContextUtils.createFilterListWithAppend(vc, OrientationBiasFilterConstants.IS_ORIENTATION_BIAS_CUT);
220+
variantContextBuilder.filters(result.toArray(new String[result.size()]));
215221
}
216222
final VariantContext updatedVariantContext = variantContextBuilder.make();
217223
finalVariants.add(updatedVariantContext);
@@ -225,7 +231,7 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
225231
}
226232

227233

228-
private static Map<Transition, Long> createTransitionToNumCutPrePreAdapterQ(double fdrThresh, String sampleName, long unfilteredGenotypeCount, final SortedMap<Genotype, VariantContext> genotypesToConsiderForFiltering, final Map<Transition, Long> transitionCount) {
234+
private static Map<Transition, Long> createTransitionToNumCutPrePreAdapterQ(double fdrThresh, String sampleName, long unfilteredGenotypeCount, final SortedMap<UniqueIDWrapper<Genotype>, VariantContext> genotypesToConsiderForFiltering, final Map<Transition, Long> transitionCount) {
229235
final long allTransitionCount = transitionCount.values().stream().mapToLong(Long::longValue).sum();
230236
final int totalNumToCut = calculateTotalNumToCut(fdrThresh, unfilteredGenotypeCount, genotypesToConsiderForFiltering);
231237

@@ -241,9 +247,9 @@ private static Map<Transition, Long> createTransitionToNumCutPrePreAdapterQ(doub
241247
return transitionNumToCut;
242248
}
243249

244-
private static int calculateTotalNumToCut(final double fdrThresh, final long unfilteredGenotypeCount, final SortedMap<Genotype, VariantContext> genotypesToConsiderForFiltering) {
250+
private static int calculateTotalNumToCut(final double fdrThresh, final long unfilteredGenotypeCount, final SortedMap<UniqueIDWrapper<Genotype>, VariantContext> genotypesToConsiderForFiltering) {
245251
final List<Double> pArtifactScores = genotypesToConsiderForFiltering.keySet().stream()
246-
.map(g -> OrientationBiasUtils.getGenotypeDouble(g, OrientationBiasFilterConstants.P_ARTIFACT_FIELD_NAME, 0.0))
252+
.map(g -> OrientationBiasUtils.getGenotypeDouble(g.getWrapped(), OrientationBiasFilterConstants.P_ARTIFACT_FIELD_NAME, 0.0))
247253
.collect(Collectors.toList());
248254

249255
// When doing the Benjamini-Hochberg procedure, we need to include the non-artifact mode SNVs to guarantee the
@@ -274,12 +280,12 @@ static int calculateTotalNumToCut(final double fdrThreshold, final long unfilter
274280
.findFirst().orElse(pArtifactScoresIncludingNonArtifact.size() - 1);
275281
}
276282

277-
private static Map<Transition, Long> createTransitionCountMap(SortedSet<Transition> relevantTransitions, SortedMap<Genotype, VariantContext> genotypesToConsiderForFiltering) {
283+
private static Map<Transition, Long> createTransitionCountMap(SortedSet<Transition> relevantTransitions, SortedMap<UniqueIDWrapper<Genotype>, VariantContext> genotypesToConsiderForFiltering) {
278284
final Map<Transition, Long> transitionCount = new HashMap<>();
279285
relevantTransitions.stream().forEach(transition -> transitionCount.put(transition, 0L));
280-
for (final Genotype g : genotypesToConsiderForFiltering.keySet()) {
286+
for (final UniqueIDWrapper<Genotype> g : genotypesToConsiderForFiltering.keySet()) {
281287
relevantTransitions.stream()
282-
.filter(transition -> OrientationBiasUtils.isGenotypeInTransition(g, transition))
288+
.filter(transition -> OrientationBiasUtils.isGenotypeInTransition(g.getWrapped(), transition))
283289
.forEach(transition -> transitionCount.put(transition, transitionCount.get(transition) + 1));
284290
}
285291
return transitionCount;
@@ -291,28 +297,29 @@ private static Map<Transition, Long> createTransitionCountMap(SortedSet<Transiti
291297
* @param variants The associated VariantContexts. The given sample names should be included.
292298
* @return a mapping from the sampleNames to the a sorted (by p_artifact score) map that associates genotypes to their enclosing variant context.
293299
*/
294-
public static Map<String, SortedMap<Genotype, VariantContext>> createSampleToGenotypeVariantContextSortedMap(final List<String> sampleNames, final Collection<VariantContext> variants) {
300+
public static Map<String, SortedMap<UniqueIDWrapper<Genotype>, VariantContext>> createSampleToGenotypeVariantContextSortedMap(final List<String> sampleNames, final Collection<VariantContext> variants) {
295301

296302
// Sorts in reverse order (highest p_artifact goes first and will not allow anything to be equal
297-
// unless they share the same reference)
303+
// unless they share the same reference). Unfortunately, we cannot check the actual reference, so we rely on
304+
// hashCode and the genotype rendered as a string to be universally unique.
298305
// Note the negative sign is to sort in reverse error.
299-
final Comparator<Genotype> genotypePArtifactComparator = Comparator
300-
.comparingDouble((Genotype g) -> -OrientationBiasUtils.getGenotypeDouble(g, OrientationBiasFilterConstants.P_ARTIFACT_FIELD_NAME, 0.0))
301-
.thenComparingInt(g -> g.hashCode());
306+
final Comparator<UniqueIDWrapper<Genotype>> genotypePArtifactComparator = Comparator
307+
.comparingDouble((UniqueIDWrapper<Genotype> g) -> -OrientationBiasUtils.getGenotypeDouble(g.getWrapped(), OrientationBiasFilterConstants.P_ARTIFACT_FIELD_NAME, 0.0))
308+
.thenComparingLong(UniqueIDWrapper::getId);
302309

303-
final Map<String, SortedMap<Genotype, VariantContext>> sampleNameToVariants = new HashMap<>();
310+
final Map<String, SortedMap<UniqueIDWrapper<Genotype>, VariantContext>> sampleNameToVariants = new HashMap<>();
304311

305312
final ProgressMeter customProgressMeter = new ProgressMeter(0.1);
306313
customProgressMeter.start();
307314

308315
// Populate a mapping of genotypes that we might want to filter to their variant context.
309316
// Make sure that the keys are sorted by cumulative probability of being an artifact.
310317
for (final String sampleName : sampleNames) {
311-
final SortedMap<Genotype, VariantContext> genotypesToConsiderForFiltering = new TreeMap<>(genotypePArtifactComparator);
318+
final SortedMap<UniqueIDWrapper<Genotype>, VariantContext> genotypesToConsiderForFiltering = new TreeMap<>(genotypePArtifactComparator);
312319
for (final VariantContext vc : variants) {
313320
vc.getGenotypes(sampleName).stream()
314321
.filter(g -> isFilteringCandidate(g, vc))
315-
.forEach(genotype -> genotypesToConsiderForFiltering.put(genotype, vc));
322+
.forEach(genotype -> putDisallowingKeyOverwrite(genotypesToConsiderForFiltering, genotype, vc));
316323
customProgressMeter.update(new SimpleInterval(vc.getContig(), vc.getStart(), vc.getEnd()));
317324
}
318325
sampleNameToVariants.put(sampleName, genotypesToConsiderForFiltering);
@@ -321,6 +328,21 @@ public static Map<String, SortedMap<Genotype, VariantContext>> createSampleToGen
321328
return sampleNameToVariants;
322329
}
323330

331+
/**
332+
* Add the genotype:vc entry into the given map, but fail if a key already exists.
333+
*
334+
* @param genotypesToConsiderForFiltering SortedMap, changed in place
335+
* @param genotype key to insert into the sorted map
336+
* @param vc value to insert into the sorted map
337+
*/
338+
private static void putDisallowingKeyOverwrite(SortedMap<UniqueIDWrapper<Genotype>, VariantContext> genotypesToConsiderForFiltering, Genotype genotype, VariantContext vc) {
339+
340+
final VariantContext vcTemp = genotypesToConsiderForFiltering.putIfAbsent(new UniqueIDWrapper<>(genotype), vc);
341+
if (vcTemp != null) {
342+
throw new GATKException.ShouldNeverReachHereException("This error may disappear on a subsequent run of this tool, so please try running this tool again. Otherwise, this can only be fixed by a GATK developer. Attempting to overwrite a key in the genotypes for filtering map. This means that genotypePArtifactComparator is (still) not generating unique keys. See https://github.com/broadinstitute/gatk/issues/3291");
343+
}
344+
}
345+
324346
/**
325347
* Determine whether this genotype can be filtered by the orientation bias filter.
326348
*
@@ -362,4 +384,5 @@ public static VCFHeader createVCFHeader(final VCFHeader inputVCFHeader, final St
362384
final Set<String> sampleNameSet = samples.asSetOfSamples();
363385
return new VCFHeader(headerLines, sampleNameSet);
364386
}
387+
365388
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package org.broadinstitute.hellbender.utils;
2+
3+
import java.util.concurrent.atomic.AtomicLong;
4+
5+
/**
6+
* Create a unique ID for an arbitrary object and wrap it.
7+
*
8+
* Warning: This will not work as desired on a Spark cluster (local spark should be fine)
9+
*/
10+
public class UniqueIDWrapper<A> {
11+
private static final AtomicLong counter = new AtomicLong();
12+
13+
private final long id;
14+
private final A wrapped;
15+
16+
public UniqueIDWrapper(A toWrap) {
17+
id = counter.getAndIncrement();
18+
wrapped = toWrap;
19+
}
20+
21+
public long getId() {
22+
return id;
23+
}
24+
25+
public A getWrapped() {
26+
return wrapped;
27+
}
28+
}

src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtils.java

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
1010
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
1111
import htsjdk.variant.vcf.VCFConstants;
12-
import org.apache.commons.io.FilenameUtils;
1312
import htsjdk.variant.vcf.VCFHeaderLine;
1413
import htsjdk.variant.vcf.VCFStandardHeaderLines;
14+
import org.apache.commons.io.FilenameUtils;
1515
import org.apache.commons.lang3.ArrayUtils;
1616
import org.apache.commons.lang3.tuple.MutablePair;
1717
import org.apache.commons.lang3.tuple.Pair;
@@ -1469,4 +1469,21 @@ public static int calculateGQFromPLs(final int[] plValues) {
14691469
}
14701470
return second - first;
14711471
}
1472+
1473+
/**
1474+
* Create a string with existing filters plus the one to append
1475+
* @param vc VariantContext to base initial filter values. Not {@code null}
1476+
* @param filterToAppend the filter value to append to the strings Not {@code null}
1477+
* @return String of filter values. Sorted alphabetically.
1478+
*/
1479+
public static List<String> createFilterListWithAppend(VariantContext vc, String filterToAppend) {
1480+
Utils.nonNull(vc);
1481+
Utils.nonNull(filterToAppend);
1482+
1483+
final List<String> filtersAsList = new ArrayList<>(vc.getFilters());
1484+
filtersAsList.add(filterToAppend);
1485+
filtersAsList.sort(String::compareToIgnoreCase);
1486+
return filtersAsList;
1487+
}
14721488
}
1489+

src/test/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFiltererUnitTest.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import org.apache.commons.lang3.tuple.Pair;
77
import org.broadinstitute.hellbender.engine.FeatureDataSource;
88
import org.broadinstitute.hellbender.tools.picard.analysis.artifacts.Transition;
9+
import org.broadinstitute.hellbender.utils.UniqueIDWrapper;
910
import org.broadinstitute.hellbender.utils.test.BaseTest;
1011
import org.testng.Assert;
1112
import org.testng.annotations.DataProvider;
@@ -155,7 +156,7 @@ public void testAnnotateVariantContextWithFilterValuesMultiArtifact() {
155156
final List<String> sampleNames = updatedVariants.get(0).getSampleNamesOrderedByName();
156157

157158
// Create a mapping from sample name to a genotype->variant context map
158-
final Map<String, SortedMap<Genotype, VariantContext>> sampleNameToVariants = OrientationBiasFilterer.createSampleToGenotypeVariantContextSortedMap(sampleNames, updatedVariants);
159+
final Map<String, SortedMap<UniqueIDWrapper<Genotype>, VariantContext>> sampleNameToVariants = OrientationBiasFilterer.createSampleToGenotypeVariantContextSortedMap(sampleNames, updatedVariants);
159160
}
160161

161162

@@ -194,7 +195,7 @@ public void testCreateSampleToGenotypeVCMap() {
194195

195196
// Do the test
196197
// Create a mapping from sample name to a genotype->variant context map with the second map sorted by p_artifact (i.e. OrientationBiasFilterConstants.P_ARTIFACT_FIELD_NAME)
197-
final Map<String, SortedMap<Genotype, VariantContext>> sampleNameToVariants = OrientationBiasFilterer.createSampleToGenotypeVariantContextSortedMap(sampleNames, updatedVariants);
198+
final Map<String, SortedMap<UniqueIDWrapper<Genotype>, VariantContext>> sampleNameToVariants = OrientationBiasFilterer.createSampleToGenotypeVariantContextSortedMap(sampleNames, updatedVariants);
198199
Assert.assertEquals(sampleNameToVariants.keySet().size(), 2);
199200
Assert.assertTrue(sampleNameToVariants.keySet().contains("TUMOR"));
200201
Assert.assertTrue(sampleNameToVariants.keySet().contains("NORMAL"));
@@ -206,7 +207,8 @@ public void testCreateSampleToGenotypeVCMap() {
206207

207208
// Check that the sorted map is getting smaller (or same) values of p_artifact and not staying put.
208209
double previousPArtifact = Double.POSITIVE_INFINITY;
209-
for (final Genotype genotypeTumor : sampleNameToVariants.get("TUMOR").keySet()) {
210+
for (final UniqueIDWrapper<Genotype> genotypeTumorWrapped : sampleNameToVariants.get("TUMOR").keySet()) {
211+
final Genotype genotypeTumor = genotypeTumorWrapped.getWrapped();
210212
final Double pArtifact = OrientationBiasUtils.getGenotypeDouble(genotypeTumor, OrientationBiasFilterConstants.P_ARTIFACT_FIELD_NAME, Double.POSITIVE_INFINITY);
211213
Assert.assertNotNull(pArtifact);
212214
Assert.assertTrue(pArtifact <= previousPArtifact);

0 commit comments

Comments
 (0)