66import org .apache .logging .log4j .LogManager ;
77import org .apache .logging .log4j .Logger ;
88import org .broadinstitute .hellbender .engine .ProgressMeter ;
9- import org .broadinstitute .hellbender .exceptions .UserException ;
9+ import org .broadinstitute .hellbender .exceptions .GATKException ;
1010import org .broadinstitute .hellbender .tools .picard .analysis .artifacts .Transition ;
1111import org .broadinstitute .hellbender .utils .SimpleInterval ;
12+ import org .broadinstitute .hellbender .utils .UniqueIDWrapper ;
1213import org .broadinstitute .hellbender .utils .Utils ;
1314import org .broadinstitute .hellbender .utils .genotyper .IndexedSampleList ;
1415import org .broadinstitute .hellbender .utils .genotyper .SampleList ;
1516import org .broadinstitute .hellbender .utils .param .ParamUtils ;
1617import org .broadinstitute .hellbender .utils .variant .GATKVCFConstants ;
17-
18+ import org . broadinstitute . hellbender . utils . variant . GATKVariantContextUtils ;
1819import java .util .*;
1920import java .util .stream .Collectors ;
2021import java .util .stream .IntStream ;
@@ -58,7 +59,8 @@ public static VariantContext annotateVariantContextWithPreprocessingValues(final
5859
5960 final List <Allele > alleles = genotype .getAlleles ();
6061 if (genotype .getPloidy () != 2 ) {
61- logger .warn ("No action required: This tool will skip non-diploid sites. Saw GT: " + genotype .getGenotypeString () + " at " + vc .toStringWithoutGenotypes ());
62+ logger .debug ("No action required: This tool will skip non-diploid sites. Saw GT: {} at {}" ,
63+ genotype ::getGenotypeString , vc ::toStringWithoutGenotypes );
6264 }
6365
6466 // Get the reference allele as a String and make sure that there is only one ref allele and that it is length
@@ -131,7 +133,7 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
131133 }
132134
133135 final List <String > sampleNames = preAdapterQAnnotatedVariants .get (0 ).getSampleNamesOrderedByName ();
134- final Map <String , SortedMap <Genotype , VariantContext >> sampleNameToVariants = createSampleToGenotypeVariantContextSortedMap (sampleNames , preAdapterQAnnotatedVariants );
136+ final Map <String , SortedMap <UniqueIDWrapper < Genotype > , VariantContext >> sampleNameToVariants = createSampleToGenotypeVariantContextSortedMap (sampleNames , preAdapterQAnnotatedVariants );
135137
136138 // This map will hold all updated genotypes (across samples)
137139 final Map <VariantContext , List <Genotype >> newGenotypes = new HashMap <>();
@@ -143,7 +145,7 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
143145 // I.e. only remove filtered or ref/ref genotypes and count the rest.
144146 final long unfilteredGenotypeCount = OrientationBiasUtils .calculateUnfilteredNonRefGenotypeCount (preAdapterQAnnotatedVariants , sampleName );
145147
146- final SortedMap <Genotype , VariantContext > genotypesToConsiderForFiltering = sampleNameToVariants .get (sampleName );
148+ final SortedMap <UniqueIDWrapper < Genotype > , VariantContext > genotypesToConsiderForFiltering = sampleNameToVariants .get (sampleName );
147149
148150 // Save some time, especially for the normal sample
149151 if (genotypesToConsiderForFiltering .keySet ().size () == 0 ) {
@@ -171,7 +173,8 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
171173
172174 final Map <Transition , Long > transitionCutSoFar = new HashMap <>();
173175 relevantTransitions .stream ().forEach (transition -> transitionCutSoFar .put (transition , 0L ));
174- for (final Genotype genotype : genotypesToConsiderForFiltering .keySet ()) {
176+ for (final UniqueIDWrapper <Genotype > genotypeWrapped : genotypesToConsiderForFiltering .keySet ()) {
177+ final Genotype genotype = genotypeWrapped .getWrapped ();
175178 final GenotypeBuilder genotypeBuilder = new GenotypeBuilder (genotype );
176179 // Since we only have the ALT_F2R1 and ALT_F1R2 counts for the first alt allele, we will not consider a site where transition is not artifact mode in the first alt allele.
177180 final Transition transition = Transition .transitionOf (genotype .getAllele (0 ).getBaseString ().charAt (0 ), genotype .getAllele (1 ).getBaseString ().charAt (0 ));
@@ -195,7 +198,7 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
195198 }
196199 }
197200
198- newGenotypes .computeIfAbsent (genotypesToConsiderForFiltering .get (genotype ), v -> new ArrayList <>()).add (genotypeBuilder .make ());
201+ newGenotypes .computeIfAbsent (genotypesToConsiderForFiltering .get (genotypeWrapped ), v -> new ArrayList <>()).add (genotypeBuilder .make ());
199202 }
200203 }
201204
@@ -210,8 +213,11 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
210213 final List <Genotype > newGenotypesForThisVariantContext = newGenotypes .get (vc );
211214 newGenotypesForThisVariantContext .forEach (gcc ::replace );
212215 final VariantContextBuilder variantContextBuilder = new VariantContextBuilder (vc ).genotypes (gcc );
216+
217+ // Add the orientation bias filter to the variant context.
213218 if (newGenotypesForThisVariantContext .stream ().anyMatch (g -> (g != null ) && (g .getFilters () != null ) && (g .getFilters ().contains (OrientationBiasFilterConstants .IS_ORIENTATION_BIAS_CUT )))) {
214- variantContextBuilder .filter (OrientationBiasFilterConstants .IS_ORIENTATION_BIAS_CUT );
219+ final List <String > result = GATKVariantContextUtils .createFilterListWithAppend (vc , OrientationBiasFilterConstants .IS_ORIENTATION_BIAS_CUT );
220+ variantContextBuilder .filters (result .toArray (new String [result .size ()]));
215221 }
216222 final VariantContext updatedVariantContext = variantContextBuilder .make ();
217223 finalVariants .add (updatedVariantContext );
@@ -225,7 +231,7 @@ public static List<VariantContext> annotateVariantContextsWithFilterResults(fina
225231 }
226232
227233
228- private static Map <Transition , Long > createTransitionToNumCutPrePreAdapterQ (double fdrThresh , String sampleName , long unfilteredGenotypeCount , final SortedMap <Genotype , VariantContext > genotypesToConsiderForFiltering , final Map <Transition , Long > transitionCount ) {
234+ private static Map <Transition , Long > createTransitionToNumCutPrePreAdapterQ (double fdrThresh , String sampleName , long unfilteredGenotypeCount , final SortedMap <UniqueIDWrapper < Genotype > , VariantContext > genotypesToConsiderForFiltering , final Map <Transition , Long > transitionCount ) {
229235 final long allTransitionCount = transitionCount .values ().stream ().mapToLong (Long ::longValue ).sum ();
230236 final int totalNumToCut = calculateTotalNumToCut (fdrThresh , unfilteredGenotypeCount , genotypesToConsiderForFiltering );
231237
@@ -241,9 +247,9 @@ private static Map<Transition, Long> createTransitionToNumCutPrePreAdapterQ(doub
241247 return transitionNumToCut ;
242248 }
243249
244- private static int calculateTotalNumToCut (final double fdrThresh , final long unfilteredGenotypeCount , final SortedMap <Genotype , VariantContext > genotypesToConsiderForFiltering ) {
250+ private static int calculateTotalNumToCut (final double fdrThresh , final long unfilteredGenotypeCount , final SortedMap <UniqueIDWrapper < Genotype > , VariantContext > genotypesToConsiderForFiltering ) {
245251 final List <Double > pArtifactScores = genotypesToConsiderForFiltering .keySet ().stream ()
246- .map (g -> OrientationBiasUtils .getGenotypeDouble (g , OrientationBiasFilterConstants .P_ARTIFACT_FIELD_NAME , 0.0 ))
252+ .map (g -> OrientationBiasUtils .getGenotypeDouble (g . getWrapped () , OrientationBiasFilterConstants .P_ARTIFACT_FIELD_NAME , 0.0 ))
247253 .collect (Collectors .toList ());
248254
249255 // When doing the Benjamini-Hochberg procedure, we need to include the non-artifact mode SNVs to guarantee the
@@ -274,12 +280,12 @@ static int calculateTotalNumToCut(final double fdrThreshold, final long unfilter
274280 .findFirst ().orElse (pArtifactScoresIncludingNonArtifact .size () - 1 );
275281 }
276282
277- private static Map <Transition , Long > createTransitionCountMap (SortedSet <Transition > relevantTransitions , SortedMap <Genotype , VariantContext > genotypesToConsiderForFiltering ) {
283+ private static Map <Transition , Long > createTransitionCountMap (SortedSet <Transition > relevantTransitions , SortedMap <UniqueIDWrapper < Genotype > , VariantContext > genotypesToConsiderForFiltering ) {
278284 final Map <Transition , Long > transitionCount = new HashMap <>();
279285 relevantTransitions .stream ().forEach (transition -> transitionCount .put (transition , 0L ));
280- for (final Genotype g : genotypesToConsiderForFiltering .keySet ()) {
286+ for (final UniqueIDWrapper < Genotype > g : genotypesToConsiderForFiltering .keySet ()) {
281287 relevantTransitions .stream ()
282- .filter (transition -> OrientationBiasUtils .isGenotypeInTransition (g , transition ))
288+ .filter (transition -> OrientationBiasUtils .isGenotypeInTransition (g . getWrapped () , transition ))
283289 .forEach (transition -> transitionCount .put (transition , transitionCount .get (transition ) + 1 ));
284290 }
285291 return transitionCount ;
@@ -291,28 +297,29 @@ private static Map<Transition, Long> createTransitionCountMap(SortedSet<Transiti
291297 * @param variants The associated VariantContexts. The given sample names should be included.
292298 * @return a mapping from the sampleNames to the a sorted (by p_artifact score) map that associates genotypes to their enclosing variant context.
293299 */
294- public static Map <String , SortedMap <Genotype , VariantContext >> createSampleToGenotypeVariantContextSortedMap (final List <String > sampleNames , final Collection <VariantContext > variants ) {
300+ public static Map <String , SortedMap <UniqueIDWrapper < Genotype > , VariantContext >> createSampleToGenotypeVariantContextSortedMap (final List <String > sampleNames , final Collection <VariantContext > variants ) {
295301
296302 // Sorts in reverse order (highest p_artifact goes first and will not allow anything to be equal
297- // unless they share the same reference)
303+ // unless they share the same reference). Unfortunately, we cannot check the actual reference, so we rely on
304+ // hashCode and the genotype rendered as a string to be universally unique.
298305 // Note the negative sign is to sort in reverse error.
299- final Comparator <Genotype > genotypePArtifactComparator = Comparator
300- .comparingDouble ((Genotype g ) -> -OrientationBiasUtils .getGenotypeDouble (g , OrientationBiasFilterConstants .P_ARTIFACT_FIELD_NAME , 0.0 ))
301- .thenComparingInt ( g -> g . hashCode () );
306+ final Comparator <UniqueIDWrapper < Genotype > > genotypePArtifactComparator = Comparator
307+ .comparingDouble ((UniqueIDWrapper < Genotype > g ) -> -OrientationBiasUtils .getGenotypeDouble (g . getWrapped () , OrientationBiasFilterConstants .P_ARTIFACT_FIELD_NAME , 0.0 ))
308+ .thenComparingLong ( UniqueIDWrapper :: getId );
302309
303- final Map <String , SortedMap <Genotype , VariantContext >> sampleNameToVariants = new HashMap <>();
310+ final Map <String , SortedMap <UniqueIDWrapper < Genotype > , VariantContext >> sampleNameToVariants = new HashMap <>();
304311
305312 final ProgressMeter customProgressMeter = new ProgressMeter (0.1 );
306313 customProgressMeter .start ();
307314
308315 // Populate a mapping of genotypes that we might want to filter to their variant context.
309316 // Make sure that the keys are sorted by cumulative probability of being an artifact.
310317 for (final String sampleName : sampleNames ) {
311- final SortedMap <Genotype , VariantContext > genotypesToConsiderForFiltering = new TreeMap <>(genotypePArtifactComparator );
318+ final SortedMap <UniqueIDWrapper < Genotype > , VariantContext > genotypesToConsiderForFiltering = new TreeMap <>(genotypePArtifactComparator );
312319 for (final VariantContext vc : variants ) {
313320 vc .getGenotypes (sampleName ).stream ()
314321 .filter (g -> isFilteringCandidate (g , vc ))
315- .forEach (genotype -> genotypesToConsiderForFiltering . put ( genotype , vc ));
322+ .forEach (genotype -> putDisallowingKeyOverwrite ( genotypesToConsiderForFiltering , genotype , vc ));
316323 customProgressMeter .update (new SimpleInterval (vc .getContig (), vc .getStart (), vc .getEnd ()));
317324 }
318325 sampleNameToVariants .put (sampleName , genotypesToConsiderForFiltering );
@@ -321,6 +328,21 @@ public static Map<String, SortedMap<Genotype, VariantContext>> createSampleToGen
321328 return sampleNameToVariants ;
322329 }
323330
331+ /**
332+ * Add the genotype:vc entry into the given map, but fail if a key already exists.
333+ *
334+ * @param genotypesToConsiderForFiltering SortedMap, changed in place
335+ * @param genotype key to insert into the sorted map
336+ * @param vc value to insert into the sorted map
337+ */
338+ private static void putDisallowingKeyOverwrite (SortedMap <UniqueIDWrapper <Genotype >, VariantContext > genotypesToConsiderForFiltering , Genotype genotype , VariantContext vc ) {
339+
340+ final VariantContext vcTemp = genotypesToConsiderForFiltering .putIfAbsent (new UniqueIDWrapper <>(genotype ), vc );
341+ if (vcTemp != null ) {
342+ throw new GATKException .ShouldNeverReachHereException ("This error may disappear on a subsequent run of this tool, so please try running this tool again. Otherwise, this can only be fixed by a GATK developer. Attempting to overwrite a key in the genotypes for filtering map. This means that genotypePArtifactComparator is (still) not generating unique keys. See https://github.com/broadinstitute/gatk/issues/3291" );
343+ }
344+ }
345+
324346 /**
325347 * Determine whether this genotype can be filtered by the orientation bias filter.
326348 *
@@ -362,4 +384,5 @@ public static VCFHeader createVCFHeader(final VCFHeader inputVCFHeader, final St
362384 final Set <String > sampleNameSet = samples .asSetOfSamples ();
363385 return new VCFHeader (headerLines , sampleNameSet );
364386 }
387+
365388}
0 commit comments