Skip to content

Commit 657b4ee

Browse files
authored
Add variant stratification and overlap metrics in SVConcordance (#9125)
1 parent 76edc75 commit 657b4ee

26 files changed

+2251
-1060
lines changed

src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java

+5
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,11 @@ public enum ComplexVariantSubtype {
189189
public static final String TRUTH_ALLELE_NUMBER_INFO = "TRUTH_AN";
190190
public static final String TRUTH_ALLELE_FREQUENCY_INFO = "TRUTH_AF";
191191

192+
public static final String TRUTH_RECIPROCAL_OVERLAP_INFO = "TRUTH_RECIPROCAL_OVERLAP";
193+
public static final String TRUTH_SIZE_SIMILARITY_INFO = "TRUTH_SIZE_SIMILARITY";
194+
public static final String TRUTH_DISTANCE_START_INFO = "TRUTH_DISTANCE_START";
195+
public static final String TRUTH_DISTANCE_END_INFO = "TRUTH_DISTANCE_END";
196+
192197
// stratification
193198
public static final String STRATUM_INFO_KEY = "STRAT";
194199

src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVLinkage.java

+15-10
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
* {@link org.broadinstitute.hellbender.tools.copynumber.GermlineCNVCaller GermlineCNVCaller}. Each variant is first padded by a fraction
2020
* of its length and then merged with any other overlapping variants that meet the minimum sample overlap. Additionally,
2121
* singleton variants (with only one carrier sample) will only be linked with variants of the same copy number.
22+
*
23+
* This class does not return any metadata via {@link org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVLinkage.CanonicalLinkageResult}.
2224
*/
2325
public class CNVLinkage extends SVClusterLinkage<SVCallRecord> {
2426

@@ -38,15 +40,15 @@ public CNVLinkage(final SAMSequenceDictionary dictionary, final double paddingFr
3840
}
3941

4042
@Override
41-
public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) {
43+
public LinkageResult areClusterable(final SVCallRecord a, final SVCallRecord b) {
4244
// Only do clustering on depth-only CNVs
43-
if (!a.isDepthOnly() || !b.isDepthOnly()) return false;
44-
if (!a.isSimpleCNV() || !b.isSimpleCNV()) return false;
45+
if (!a.isDepthOnly() || !b.isDepthOnly()) return new LinkageResult(false);
46+
if (!a.isSimpleCNV() || !b.isSimpleCNV()) return new LinkageResult(false);
4547
Utils.validate(a.getContigA().equals(a.getContigB()), "Variant A is a CNV but interchromosomal");
4648
Utils.validate(b.getContigA().equals(b.getContigB()), "Variant B is a CNV but interchromosomal");
4749

4850
// Types match
49-
if (a.getType() != b.getType()) return false;
51+
if (a.getType() != b.getType()) return new LinkageResult(false);
5052

5153
// Interval overlap
5254
// Positions should be validated already by the SVCallRecord class - these checks are for thoroughness
@@ -56,11 +58,14 @@ public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) {
5658
final SimpleInterval intervalB = getPaddedRecordInterval(b.getContigA(), b.getPositionA(), b.getPositionB());
5759
Utils.nonNull(intervalB, "Invalid interval " + new SimpleInterval(b.getContigA(), b.getPositionA(),
5860
b.getPositionB()) + " for record " + b.getId());
59-
if (!intervalA.overlaps(intervalB)) return false;
61+
if (!intervalA.overlaps(intervalB)) return new LinkageResult(false);
6062

6163
// Sample overlap
62-
if (!hasSampleOverlap(a, b, minSampleOverlap)) {
63-
return false;
64+
if (minSampleOverlap > 0) {
65+
final double sampleOverlap = computeSampleOverlap(a, b);
66+
if (!testSampleOverlap(sampleOverlap, minSampleOverlap)) {
67+
return new LinkageResult(false);
68+
}
6469
}
6570

6671
// In the single-sample case, match copy number strictly if we're looking at the same sample
@@ -76,17 +81,17 @@ public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) {
7681
final int copyNumberDeltaA = genotypeA.getPloidy() - copyNumberA;
7782
final int copyNumberDeltaB = genotypeB.getPloidy() - copyNumberB;
7883
if (copyNumberDeltaA != copyNumberDeltaB) {
79-
return false;
84+
return new LinkageResult(false);
8085
}
8186
} else {
8287
final List<Allele> sortedAllelesA = SVCallRecordUtils.sortAlleles(genotypeA.getAlleles());
8388
final List<Allele> sortedAllelesB = SVCallRecordUtils.sortAlleles(genotypeB.getAlleles());
8489
if (!sortedAllelesA.equals(sortedAllelesB)) {
85-
return false;
90+
return new LinkageResult(false);
8691
}
8792
}
8893
}
89-
return true;
94+
return new LinkageResult(true);
9095
}
9196

9297
/**

src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java

+148-88
Large diffs are not rendered by default.

src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ private final List<Integer> cluster(final Integer itemId) {
138138
final Set<Integer> linkedItems = idToClusterMap.values().stream().map(Cluster::getItemIds)
139139
.flatMap(List::stream)
140140
.distinct()
141-
.filter(other -> !other.equals(itemId) && linkage.areClusterable(item, getItem(other)))
141+
.filter(other -> !other.equals(itemId) && linkage.areClusterable(item, getItem(other)).getResult())
142142
.collect(Collectors.toCollection(LinkedHashSet::new));
143143

144144
// Find clusters to which this item belongs, and which active clusters we're definitely done with

src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java

+46-43
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public abstract class SVClusterLinkage<T extends SVLocatable> {
2121
* @param a first item
2222
* @param b second item
2323
*/
24-
public abstract boolean areClusterable(final T a, final T b);
24+
public abstract LinkageResult areClusterable(final T a, final T b);
2525

2626
/**
2727
* Returns the maximum feasible starting position of any other item with the given item. That is, given item A and
@@ -46,62 +46,56 @@ public int getMaxClusterableStartingPosition(final Collection<T> items) {
4646
}
4747

4848
/**
49-
* Checks for minimum fractional sample overlap of the two sets. Defaults to true if both sets are empty.
49+
* Returns number of overlapping items
5050
*/
51-
protected static boolean hasSampleSetOverlap(final Set<String> samplesA, final Set<String> samplesB, final double minSampleOverlap) {
52-
final int denom = Math.max(samplesA.size(), samplesB.size());
51+
protected static double getSampleSetOverlap(final Collection<String> a, final Set<String> b) {
52+
final double denom = Math.max(a.size(), b.size());
5353
if (denom == 0) {
54-
return true;
54+
return 1;
5555
}
56-
final double sampleOverlap = getSampleSetOverlap(samplesA, samplesB) / (double) denom;
57-
return sampleOverlap >= minSampleOverlap;
56+
return a.stream().filter(b::contains).count() / denom;
5857
}
5958

6059
/**
61-
* Returns number of overlapping items
60+
* Returns true if the overlap is null or exceeds threshold.
6261
*/
63-
protected static int getSampleSetOverlap(final Collection<String> a, final Set<String> b) {
64-
return (int) a.stream().filter(b::contains).count();
62+
protected static boolean testSampleOverlap(final Double sampleOverlap, final double threshold) {
63+
return sampleOverlap == null || sampleOverlap >= threshold;
6564
}
6665

66+
6767
/**
68-
* Returns true if there is sufficient fractional carrier sample overlap in the two records. For CNVs, returns true
69-
* if sufficient fraction of copy number states match.
68+
* Returns fractional carrier sample overlap in the two records. For CNVs, returns fraction of copy number states that match.
7069
*/
71-
protected static boolean hasSampleOverlap(final SVCallRecord a, final SVCallRecord b, final double minSampleOverlap) {
72-
if (minSampleOverlap > 0) {
73-
if (a.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV || b.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
74-
// CNV sample overlap
75-
final GenotypesContext genotypesA = a.getGenotypes();
76-
final GenotypesContext genotypesB = b.getGenotypes();
77-
final Set<String> samples = new HashSet<>(SVUtils.hashMapCapacity(genotypesA.size() + genotypesB.size()));
78-
samples.addAll(genotypesA.getSampleNames());
79-
samples.addAll(genotypesB.getSampleNames());
80-
if (samples.isEmpty()) {
81-
// Empty case considered perfect overlap
82-
return true;
83-
}
84-
int numMatches = 0;
85-
for (final String sample : samples) {
86-
final Genotype genotypeA = genotypesA.get(sample);
87-
final Genotype genotypeB = genotypesB.get(sample);
88-
// If one sample doesn't exist in the other set, assume reference copy state
89-
final int cnA = getCopyState(genotypeA, genotypeB);
90-
final int cnB = getCopyState(genotypeB, genotypeA);
91-
if (cnA == cnB) {
92-
numMatches++;
93-
}
70+
protected static Double computeSampleOverlap(final SVCallRecord a, final SVCallRecord b) {
71+
if (a.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV || b.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
72+
// CNV sample overlap
73+
final GenotypesContext genotypesA = a.getGenotypes();
74+
final GenotypesContext genotypesB = b.getGenotypes();
75+
final Set<String> samples = new HashSet<>(SVUtils.hashMapCapacity(genotypesA.size() + genotypesB.size()));
76+
samples.addAll(genotypesA.getSampleNames());
77+
samples.addAll(genotypesB.getSampleNames());
78+
if (samples.isEmpty()) {
79+
return null;
80+
}
81+
int numMatches = 0;
82+
for (final String sample : samples) {
83+
final Genotype genotypeA = genotypesA.get(sample);
84+
final Genotype genotypeB = genotypesB.get(sample);
85+
// If one sample doesn't exist in the other set, assume reference copy state
86+
final int cnA = getCopyState(genotypeA, genotypeB);
87+
final int cnB = getCopyState(genotypeB, genotypeA);
88+
if (cnA == cnB) {
89+
numMatches++;
9490
}
95-
final int numSamples = samples.size();
96-
return (numMatches / (double) numSamples) >= minSampleOverlap;
97-
} else {
98-
// Non-CNV
99-
final Set<String> samplesA = a.getCarrierSampleSet();
100-
final Set<String> samplesB = b.getCarrierSampleSet();
101-
return hasSampleSetOverlap(samplesA, samplesB, minSampleOverlap);
10291
}
92+
final int numSamples = samples.size();
93+
return (numMatches / (double) numSamples);
10394
} else {
104-
return true;
95+
// Non-CNV
96+
final Set<String> samplesA = a.getCarrierSampleSet();
97+
final Set<String> samplesB = b.getCarrierSampleSet();
98+
return getSampleSetOverlap(samplesA, samplesB);
10599
}
106100
}
107101

@@ -121,4 +115,13 @@ private static int getCopyState(final Genotype genotype, final Genotype matchedS
121115
VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT, -1));
122116
}
123117
}
118+
119+
/**
120+
* Used for storing the result of a clustering check between two records and any additional metadata
121+
*/
122+
public static class LinkageResult {
123+
private final boolean result;
124+
public LinkageResult(final boolean result) { this.result = result; }
125+
public boolean getResult() { return result; }
126+
}
124127
}

0 commit comments

Comments
 (0)