Skip to content

Commit e22e016

Browse files
author
lrhodes
committed
Fixed bug in JaccardSimilarity.
Made ReservoirSize package private. It will eventually be removed entirely.
1 parent 6a1dfa8 commit e22e016

3 files changed

Lines changed: 36 additions & 10 deletions

File tree

src/main/java/com/yahoo/sketches/sampling/ReservoirSize.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
*
2626
* @author Jon Malkin
2727
*/
28-
public final class ReservoirSize {
28+
final class ReservoirSize {
2929
/**
3030
* Number of bins per power of two.
3131
*/
@@ -57,7 +57,7 @@ private ReservoirSize() {}
5757
* @return reservoir size as 16-bit encoded value
5858
*/
5959
public static short computeSize(final int k) {
60-
if (k < 1 || k > MAX_ABS_VALUE) {
60+
if ((k < 1) || (k > MAX_ABS_VALUE)) {
6161
throw new SketchesArgumentException("Can only encode strictly positive sketch sizes "
6262
+ "less than " + MAX_ABS_VALUE + ", found: " + k);
6363
}
@@ -70,11 +70,11 @@ public static short computeSize(final int k) {
7070
}
7171

7272
// mantissa is scalar in range [1,2); can reconstruct k as m * 2^p
73-
final double m = Math.pow(2.0, Math.log(k) * INV_LN_2 - p);
73+
final double m = Math.pow(2.0, (Math.log(k) * INV_LN_2) - p);
7474

7575
// Convert to index offset: ceil(m * BPO) - BPO
7676
// Typically in range range 0-(BINS_PER_OCTAVE-1) (but see note below)
77-
final int i = (int) Math.floor(m * BINS_PER_OCTAVE) - BINS_PER_OCTAVE + 1;
77+
final int i = ((int) Math.floor(m * BINS_PER_OCTAVE) - BINS_PER_OCTAVE) + 1;
7878

7979
// Due to ceiling, possible to overflow BINS_PER_OCTAVE
8080
// E.g., if BPO = 2048 then for k=32767 we have p=14. Except that 32767 > decodeValue
@@ -83,7 +83,7 @@ public static short computeSize(final int k) {
8383
return (short) ((((p + 1) & EXPONENT_MASK) << EXPONENT_SHIFT) & OUTPUT_MASK);
8484
}
8585

86-
return (short) (((p & EXPONENT_MASK) << EXPONENT_SHIFT) | (i & INDEX_MASK) & OUTPUT_MASK);
86+
return (short) (((p & EXPONENT_MASK) << EXPONENT_SHIFT) | ((i & INDEX_MASK) & OUTPUT_MASK));
8787
}
8888

8989
/**
@@ -103,6 +103,6 @@ public static int decodeValue(final short encodedSize) {
103103
final int p = (value >> EXPONENT_SHIFT) & EXPONENT_MASK;
104104
final int i = value & INDEX_MASK;
105105

106-
return (int) ((1 << p) * (i * INV_BINS_PER_OCTAVE + 1.0));
106+
return (int) ((1 << p) * ((i * INV_BINS_PER_OCTAVE) + 1.0));
107107
}
108108
}

src/main/java/com/yahoo/sketches/theta/JaccardSimilarity.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
import static com.yahoo.sketches.BoundsOnRatiosInThetaSketchedSets.getEstimateOfBoverA;
99
import static com.yahoo.sketches.BoundsOnRatiosInThetaSketchedSets.getLowerBoundForBoverA;
1010
import static com.yahoo.sketches.BoundsOnRatiosInThetaSketchedSets.getUpperBoundForBoverA;
11+
import static com.yahoo.sketches.Util.MAX_LG_NOM_LONGS;
12+
import static com.yahoo.sketches.Util.MIN_LG_NOM_LONGS;
1113
import static com.yahoo.sketches.Util.ceilingPowerOf2;
14+
import static java.lang.Math.max;
15+
import static java.lang.Math.min;
1216

1317
/**
1418
* Jaccard similarity of two Theta Sketches.
@@ -26,6 +30,9 @@ public final class JaccardSimilarity {
2630
* distinct from each other. A Jaccard of .95 means the overlap between the two
2731
* populations is 95% of the union of the two populations.
2832
*
33+
* <p>Note: For very large pairs of sketches, where the configured nominal entries of the sketches
34+
* are 2^25 or 2^26, this method may produce unpredictable results.
35+
*
2936
* @param sketchA given sketch A
3037
* @param sketchB given sketch B
3138
* @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard ratio.
@@ -42,8 +49,11 @@ public static double[] jaccard(final Sketch sketchA, final Sketch sketchB) {
4249
final int countB = sketchB.getRetainedEntries();
4350

4451
//Create the Union
52+
final int minK = 1 << MIN_LG_NOM_LONGS;
53+
final int maxK = 1 << MAX_LG_NOM_LONGS;
54+
final int newK = max(min(ceilingPowerOf2(countA + countB), maxK), minK);
4555
final Union union =
46-
SetOperation.builder().setNominalEntries(ceilingPowerOf2(countA + countB)).buildUnion();
56+
SetOperation.builder().setNominalEntries(newK).buildUnion();
4757
union.update(sketchA);
4858
union.update(sketchB);
4959
final Sketch unionAB = union.getResult();

src/test/java/com/yahoo/sketches/theta/JaccardSimilarityTest.java

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ public void checkExactMode() {
6969
UpdateSketch measured = UpdateSketch.builder().setNominalEntries(k).build();
7070
UpdateSketch expected = UpdateSketch.builder().setNominalEntries(k).build();
7171

72-
for (int i = 0; i < u-1; i++) { //one short
72+
for (int i = 0; i < (u-1); i++) { //one short
7373
measured.update(i);
7474
expected.update(i);
7575
}
@@ -118,7 +118,7 @@ public void checkEstMode() {
118118
state = exactlyEqual(measured, expected);
119119
assertTrue(state);
120120

121-
for (int i = u; i < u + 50; i++) { //empirically determined
121+
for (int i = u; i < (u + 50); i++) { //empirically determined
122122
measured.update(i);
123123
}
124124

@@ -200,7 +200,23 @@ private static String jaccardString(double[] jResults) {
200200
double lb = jResults[0];
201201
double est = jResults[1];
202202
double ub = jResults[2];
203-
return lb + "\t" + est + "\t" + ub + "\t" + (lb/est - 1.0) + "\t" + (ub/est - 1.0);
203+
return lb + "\t" + est + "\t" + ub + "\t" + ((lb/est) - 1.0) + "\t" + ((ub/est) - 1.0);
204+
}
205+
206+
@Test
207+
public void checkMinK() {
208+
UpdateSketch skA = UpdateSketch.builder().build(); //4096
209+
UpdateSketch skB = UpdateSketch.builder().build(); //4096
210+
skA.update(1);
211+
skB.update(1);
212+
double[] result = JaccardSimilarity.jaccard(skA, skB);
213+
println(result[0] + ", " + result[1] + ", " + result[2]);
214+
for (int i = 1; i < 4096; i++) {
215+
skA.update(i);
216+
skB.update(i);
217+
}
218+
result = JaccardSimilarity.jaccard(skA, skB);
219+
println(result[0] + ", " + result[1] + ", " + result[2]);
204220
}
205221

206222
@Test

0 commit comments

Comments
 (0)