@@ -21,7 +21,7 @@ public abstract class SVClusterLinkage<T extends SVLocatable> {
21
21
* @param a first item
22
22
* @param b second item
23
23
*/
24
- public abstract boolean areClusterable (final T a , final T b );
24
+ public abstract LinkageResult areClusterable (final T a , final T b );
25
25
26
26
/**
27
27
* Returns the maximum feasible starting position of any other item with the given item. That is, given item A and
@@ -46,62 +46,56 @@ public int getMaxClusterableStartingPosition(final Collection<T> items) {
46
46
}
47
47
48
48
/**
49
- * Checks for minimum fractional sample overlap of the two sets. Defaults to true if both sets are empty.
49
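+ * Returns the fraction of items in the first collection that are also in the second, relative to the size of the larger collection. Returns 1 if both are empty.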
50
50
*/
51
- protected static boolean hasSampleSetOverlap (final Set <String > samplesA , final Set <String > samplesB , final double minSampleOverlap ) {
52
- final int denom = Math .max (samplesA .size (), samplesB .size ());
51
+ protected static double getSampleSetOverlap (final Collection <String > a , final Set <String > b ) {
52
+ final double denom = Math .max (a .size (), b .size ());
53
53
if (denom == 0 ) {
54
- return true ;
54
+ return 1 ;
55
55
}
56
- final double sampleOverlap = getSampleSetOverlap (samplesA , samplesB ) / (double ) denom ;
57
- return sampleOverlap >= minSampleOverlap ;
56
+ return a .stream ().filter (b ::contains ).count () / denom ;
58
57
}
59
58
60
59
/**
61
- * Returns number of overlapping items
60
+ * Returns true if the overlap is null or is greater than or equal to the threshold.
62
61
*/
63
- protected static int getSampleSetOverlap (final Collection < String > a , final Set < String > b ) {
64
- return ( int ) a . stream (). filter ( b :: contains ). count () ;
62
+ protected static boolean testSampleOverlap (final Double sampleOverlap , final double threshold ) {
63
+ return sampleOverlap == null || sampleOverlap >= threshold ;
65
64
}
66
65
66
+
67
67
/**
68
- * Returns true if there is sufficient fractional carrier sample overlap in the two records. For CNVs, returns true
69
- * if sufficient fraction of copy number states match.
68
+ * Returns fractional carrier sample overlap in the two records. For CNVs, returns the fraction of samples whose copy number states match, or null if neither record has any samples.
70
69
*/
71
- protected static boolean hasSampleOverlap (final SVCallRecord a , final SVCallRecord b , final double minSampleOverlap ) {
72
- if (minSampleOverlap > 0 ) {
73
- if (a .getType () == GATKSVVCFConstants .StructuralVariantAnnotationType .CNV || b .getType () == GATKSVVCFConstants .StructuralVariantAnnotationType .CNV ) {
74
- // CNV sample overlap
75
- final GenotypesContext genotypesA = a .getGenotypes ();
76
- final GenotypesContext genotypesB = b .getGenotypes ();
77
- final Set <String > samples = new HashSet <>(SVUtils .hashMapCapacity (genotypesA .size () + genotypesB .size ()));
78
- samples .addAll (genotypesA .getSampleNames ());
79
- samples .addAll (genotypesB .getSampleNames ());
80
- if (samples .isEmpty ()) {
81
- // Empty case considered perfect overlap
82
- return true ;
83
- }
84
- int numMatches = 0 ;
85
- for (final String sample : samples ) {
86
- final Genotype genotypeA = genotypesA .get (sample );
87
- final Genotype genotypeB = genotypesB .get (sample );
88
- // If one sample doesn't exist in the other set, assume reference copy state
89
- final int cnA = getCopyState (genotypeA , genotypeB );
90
- final int cnB = getCopyState (genotypeB , genotypeA );
91
- if (cnA == cnB ) {
92
- numMatches ++;
93
- }
70
+ protected static Double computeSampleOverlap (final SVCallRecord a , final SVCallRecord b ) {
71
+ if (a .getType () == GATKSVVCFConstants .StructuralVariantAnnotationType .CNV || b .getType () == GATKSVVCFConstants .StructuralVariantAnnotationType .CNV ) {
72
+ // CNV sample overlap
73
+ final GenotypesContext genotypesA = a .getGenotypes ();
74
+ final GenotypesContext genotypesB = b .getGenotypes ();
75
+ final Set <String > samples = new HashSet <>(SVUtils .hashMapCapacity (genotypesA .size () + genotypesB .size ()));
76
+ samples .addAll (genotypesA .getSampleNames ());
77
+ samples .addAll (genotypesB .getSampleNames ());
78
+ if (samples .isEmpty ()) {
79
+ return null ;
80
+ }
81
+ int numMatches = 0 ;
82
+ for (final String sample : samples ) {
83
+ final Genotype genotypeA = genotypesA .get (sample );
84
+ final Genotype genotypeB = genotypesB .get (sample );
85
+ // If one sample doesn't exist in the other set, assume reference copy state
86
+ final int cnA = getCopyState (genotypeA , genotypeB );
87
+ final int cnB = getCopyState (genotypeB , genotypeA );
88
+ if (cnA == cnB ) {
89
+ numMatches ++;
94
90
}
95
- final int numSamples = samples .size ();
96
- return (numMatches / (double ) numSamples ) >= minSampleOverlap ;
97
- } else {
98
- // Non-CNV
99
- final Set <String > samplesA = a .getCarrierSampleSet ();
100
- final Set <String > samplesB = b .getCarrierSampleSet ();
101
- return hasSampleSetOverlap (samplesA , samplesB , minSampleOverlap );
102
91
}
92
+ final int numSamples = samples .size ();
93
+ return (numMatches / (double ) numSamples );
103
94
} else {
104
- return true ;
95
+ // Non-CNV
96
+ final Set <String > samplesA = a .getCarrierSampleSet ();
97
+ final Set <String > samplesB = b .getCarrierSampleSet ();
98
+ return getSampleSetOverlap (samplesA , samplesB );
105
99
}
106
100
}
107
101
@@ -121,4 +115,13 @@ private static int getCopyState(final Genotype genotype, final Genotype matchedS
121
115
VariantContextGetters .getAttributeAsInt (genotype , GATKSVVCFConstants .DEPTH_GENOTYPE_COPY_NUMBER_FORMAT , -1 ));
122
116
}
123
117
}
118
+
119
+ /**
120
+ * Used for storing the result of a clustering check between two records and any additional metadata
121
+ */
122
+ public static class LinkageResult {
123
+ private final boolean result ;
124
+ public LinkageResult (final boolean result ) { this .result = result ; }
125
+ public boolean getResult () { return result ; }
126
+ }
124
127
}
0 commit comments