Skip to content

Commit 5d91029

Browse files
authored
Merge pull request #441 from apache/improve_hll_docs
Restructured the Hll Package-info, HllSketch, Union class level
2 parents 1537759 + c603bdc commit 5d91029

31 files changed

+261
-147
lines changed

src/main/java/org/apache/datasketches/common/package-info.java

+3
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,7 @@
1717
* under the License.
1818
*/
1919

20+
/**
21+
* This package is for common classes that may be used across all the sketch families.
22+
*/
2023
package org.apache.datasketches.common;

src/main/java/org/apache/datasketches/cpc/package-info.java

+1-4
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,7 @@
1818
*/
1919

2020
/**
21-
* Compressed Probabilistic Counting
22-
*
23-
* @author Lee Rhodes
24-
* @author Kevin Lang
21+
* Compressed Probabilistic Counting sketch family
2522
*/
2623

2724
package org.apache.datasketches.cpc;

src/main/java/org/apache/datasketches/fdt/package-info.java

+3
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,7 @@
1717
* under the License.
1818
*/
1919

20+
/**
21+
* Frequent Distinct Tuples Sketch
22+
*/
2023
package org.apache.datasketches.fdt;

src/main/java/org/apache/datasketches/frequencies/package-info.java

+8-11
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,13 @@
1818
*/
1919

2020
/**
21-
* <p>This package is dedicated to streaming algorithms that enable estimation of the
22-
* frequency of occurence of items in a weighted multiset stream of items.
23-
* If the frequency distribution of items is sufficiently skewed, these algorithms are very
24-
* useful in identifying the "Heavy Hitters" that occured most frequently in the stream.
25-
* The accuracy of the estimation of the frequency of an item has well understood error
26-
* bounds that can be returned by the sketch.</p>
27-
*
28-
* <p>These sketches are mergable and can be serialized and deserialized to/from a compact
29-
* form.</p>
30-
*
31-
* @author Lee Rhodes
21+
* This package is dedicated to streaming algorithms that enable estimation of the
22+
* frequency of occurrence of items in a weighted multiset stream of items.
23+
* If the frequency distribution of items is sufficiently skewed, these algorithms are very
24+
* useful in identifying the "Heavy Hitters" that occurred most frequently in the stream.
25+
* The accuracy of the estimation of the frequency of an item has well understood error
26+
* bounds that can be returned by the sketch.
27+
*
28+
* <p>These algorithms are sometimes referred to as "TopN" algorithms.</p>
3229
*/
3330
package org.apache.datasketches.frequencies;

src/main/java/org/apache/datasketches/hash/package-info.java

+4-6
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,12 @@
1818
*/
1919

2020
/**
21-
* <p>The hash package contains a high-performing and extended Java implementation
22-
* of Austin Appleby's 128-bit MurmurHash3 hash function originally coded in C.
23-
* This core MurmurHash3.java class is used throughout all the sketch classes for consistentancy
21+
* <p>The hash package contains a high-performing and extended Java implementation
22+
* of Austin Appleby's 128-bit MurmurHash3 hash function originally coded in C.
23+
* This core MurmurHash3.java class is used throughout many of the sketch classes for consistency
2424
* and as long as the user specifies the same seed will result in coordinated hash operations.
25-
* This package also contains an adaptor class that extends the basic class with more functions
25+
* This package also contains an adaptor class that extends the basic class with more functions
2626
* commonly associated with hashing.
2727
* </p>
28-
*
29-
* @author Lee Rhodes
3028
*/
3129
package org.apache.datasketches.hash;

src/main/java/org/apache/datasketches/hll/BaseHllSketch.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -113,22 +113,22 @@ public static final int getSerializationVersion(final Memory mem) {
113113
* Gets the current (approximate) Relative Error (RE) asymptotic values given several
114114
* parameters. This is used primarily for testing.
115115
* @param upperBound return the RE for the Upper Bound, otherwise for the Lower Bound.
116-
* @param unioned set true if the sketch is the result of a union operation.
116+
* @param oooFlag set true if the sketch is the result of a non-qualifying union operation.
117117
* @param lgConfigK the configured value for the sketch.
118118
* @param numStdDev the given number of Standard Deviations. This must be an integer between
119119
* 1 and 3, inclusive.
120120
* <a href="{@docRoot}/resources/dictionary.html#numStdDev">Number of Standard Deviations</a>
121121
* @return the current (approximate) RelativeError
122122
*/
123-
public static double getRelErr(final boolean upperBound, final boolean unioned,
123+
public static double getRelErr(final boolean upperBound, final boolean oooFlag,
124124
final int lgConfigK, final int numStdDev) {
125125
HllUtil.checkLgK(lgConfigK);
126126
if (lgConfigK > 12) {
127-
final double rseFactor = unioned ? HLL_NON_HIP_RSE_FACTOR : HLL_HIP_RSE_FACTOR;
127+
final double rseFactor = oooFlag ? HLL_NON_HIP_RSE_FACTOR : HLL_HIP_RSE_FACTOR;
128128
final int configK = 1 << lgConfigK;
129129
return (numStdDev * rseFactor) / Math.sqrt(configK);
130130
}
131-
return Math.abs(RelativeErrorTables.getRelErr(upperBound, unioned, lgConfigK, numStdDev));
131+
return Math.abs(RelativeErrorTables.getRelErr(upperBound, oooFlag, lgConfigK, numStdDev));
132132
}
133133

134134
/**

src/main/java/org/apache/datasketches/hll/HllSketch.java

+61-23
Original file line numberDiff line numberDiff line change
@@ -36,33 +36,71 @@
3636
import org.apache.datasketches.memory.WritableMemory;
3737

3838
/**
39-
* This is a high performance implementation of Phillipe Flajolet&#8217;s HLL sketch but with
40-
* significantly improved error behavior. If the ONLY use case for sketching is counting
41-
* uniques and merging, the HLL sketch the HLL sketch is a reasonable choice, although the highest
42-
* performing in terms of accuracy for storage space consumed is CPC (Compressed Probabilistic Counting).
43-
* For large enough counts, this HLL version (with HLL_4) can be 2 to 16 times smaller than the
44-
* Theta sketch family for the same accuracy.
39+
* The HllSketch is actually a collection of compact implementations of Philippe Flajolet’s HyperLogLog (HLL)
40+
* sketch but with significantly improved error behavior and excellent speed performance.
4541
*
46-
* <p>This implementation offers three different types of HLL sketch, each with different
47-
* trade-offs with accuracy, space and performance. These types are specified with the
48-
* {@link TgtHllType} parameter.
42+
* <p>If the use case for sketching is primarily counting uniques and merging, the HLL sketch is the 2nd highest
43+
* performing in terms of accuracy for storage space consumed in the DataSketches library
44+
* (the new CPC sketch developed by Kevin J. Lang now beats HLL in terms of accuracy / space).
45+
* For large counts, HLL sketches can be 2 to 8 times smaller for the same accuracy than the DataSketches Theta
46+
* Sketches when serialized, but the Theta sketches can do set intersections and differences while HLL and CPC cannot.
47+
* The CPC sketch and HLL share similar use cases, but the CPC sketch is about 30 to 40% smaller than the HLL sketch
48+
* when serialized and larger than the HLL when active in memory. Choose your weapons!</p>
4949
*
50-
* <p>In terms of accuracy, all three types, for the same <i>lgConfigK</i>, have the same error
51-
* distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
52-
* The configuration parameter <i>lgConfigK</i> is the log-base-2 of <i>K</i>,
53-
* where <i>K</i> is the number of buckets or slots for the sketch.
50+
* <p>A new HLL sketch is created with a simple constructor:</p>
51+
* <pre>{@code
52+
* int lgK = 12; //This is log-base2 of k, so k = 4096. lgK can be from 4 to 21
53+
* HllSketch sketch = new HllSketch(lgK); //TgtHllType.HLL_4 is the default
54+
* //OR
55+
* HllSketch sketch = new HllSketch(lgK, TgtHllType.HLL_6);
56+
* //OR
57+
* HllSketch sketch = new HllSketch(lgK, TgtHllType.HLL_8);
58+
* }</pre>
5459
*
55-
* <p>During warmup, when the sketch has only received a small number of unique items
56-
* (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
57-
* algorithms with significantly better accuracy.
60+
* <p>All three different sketch types are targets in that the sketches start out in a warm-up mode that is small in
61+
* size and gradually grows as needed until the full HLL array is allocated. The HLL_4, HLL_6 and HLL_8 represent
62+
* different levels of compression of the final HLL array where the 4, 6 and 8 refer to the number of bits each
63+
* bucket of the HLL array is compressed down to.
64+
* The HLL_4 is the most compressed but generally slower than the other two, especially during union operations.</p>
5865
*
59-
* <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
60-
* created by the user, the sketch will perform all of its updates and internal phase transitions
61-
* in that object, which can actually reside either on-heap or off-heap based on how it is
62-
* configured. In large systems that must update and merge many millions of sketches, having the
63-
* sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
64-
* to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
65-
* delays.
66+
* <p>All three types share the same API. Updating the HllSketch is very simple:</p>
67+
*
68+
* <pre>{@code
69+
* long n = 1000000;
70+
* for (int i = 0; i < n; i++) {
71+
* sketch.update(i);
72+
* }
73+
* }</pre>
74+
*
75+
* <p>Each of the integers presented above is first hashed into a 128-bit hash value that is used by the sketch
76+
* HLL algorithm, so the above loop is essentially equivalent to using a random number generator initialized with a
77+
* seed so that the sequence is deterministic and random.</p>
78+
*
79+
* <p>Obtaining the cardinality results from the sketch is also simple:</p>
80+
*
81+
* <pre>{@code
82+
* double estimate = sketch.getEstimate();
83+
* double estUB = sketch.getUpperBound(1); //the upper bound at 1 standard deviation.
84+
* double estLB = sketch.getLowerBound(1); //the lower bound at 1 standard deviation.
85+
* //OR
86+
* System.out.println(sketch.toString()); //will output a summary of the sketch.
87+
* }</pre>
88+
*
89+
* <p>Which produces a console output something like this:</p>
90+
*
91+
* <pre>{@code
92+
* ### HLL SKETCH SUMMARY:
93+
* Log Config K : 12
94+
* Hll Target : HLL_4
95+
* Current Mode : HLL
96+
* LB : 977348.7024560181
97+
* Estimate : 990116.6007366662
98+
* UB : 1003222.5095308956
99+
* OutOfOrder Flag: false
100+
* CurMin : 5
101+
* NumAtCurMin : 1
102+
* HipAccum : 990116.6007366662
103+
* }</pre>
66104
*
67105
* @author Lee Rhodes
68106
* @author Kevin Lang

src/main/java/org/apache/datasketches/hll/TgtHllType.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,14 @@
3434
* sketches where lgConfigK is &gt; 8.</p>
3535
*
3636
* <ul>
37-
* <li><b>HLL_8</b> This uses an 8-bit byte per HLL bucket. It is generally the
37+
* <li><b>HLL 8</b> This uses an 8-bit byte per HLL bucket. It is generally the
3838
* fastest in terms of update time, but has the largest storage footprint of about
3939
* <i>K</i> bytes.</li>
4040
*
41-
* <li><b>HLL_6</b> This uses a 6-bit field per HLL bucket. It is the generally the next fastest
41+
* <li><b>HLL 6</b> This uses a 6-bit field per HLL bucket. It is generally the next fastest
4242
* in terms of update time with a storage footprint of about <i>3/4 * K</i> bytes.</li>
4343
*
44-
* <li><b>HLL_4</b> This uses a 4-bit field per HLL bucket and for large counts may require
44+
* <li><b>HLL 4</b> This uses a 4-bit field per HLL bucket and for large counts may require
4545
* the use of a small internal auxiliary array for storing statistical exceptions, which are rare.
4646
* For the values of <i>lgConfigK &gt; 13</i> (<i>K</i> = 8192),
4747
* this additional array adds about 3% to the overall storage. It is generally the slowest in

src/main/java/org/apache/datasketches/hll/Union.java

+12
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,18 @@
5151
* <p>Second, the user cannot specify the {@link TgtHllType} as an input parameter to the union.
5252
* Instead, it is specified for the sketch returned with {@link #getResult(TgtHllType)}.
5353
*
54+
* <p>The following graph illustrates the HLL Merge speed.</p>
55+
*
56+
* <p><img src="doc-files/HLL_UnionTime4_6_8_Java_CPP.png" width="500" alt="HLL LgK12 Union Speed"></p>
57+
* This graph illustrates the relative merging speed of the Java HLL 4, 6 and 8 sketches compared to
58+
* the DataSketches C++ implementations of the same sketches. With this particular test (merging 32 relatively large
59+
* sketches together), the Java HLL 8 is the fastest and the Java HLL 4 the slowest, with a mixed cluster in the middle.
60+
* Union / Merging speed is somewhat difficult to measure as the performance is very dependent on the mix of sketch
61+
* sizes (and types) you are merging. So your mileage will vary!
62+
*
63+
* <p>For a complete example of using the Union operator
64+
* see <a href="https://datasketches.apache.org/docs/HLL/HllJavaExample.html">Union Example</a>.</p>
65+
*
5466
* @author Lee Rhodes
5567
* @author Kevin Lang
5668
*/
Loading
Loading

0 commit comments

Comments
 (0)