apache
diff --git a/‎src/main/java/org/apache/datasketches/common/package-info.java
Lines changed: 3 additions & 0 deletions b/‎src/main/java/org/apache/datasketches/common/package-info.java
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/main/java/org/apache/datasketches/cpc/package-info.java
Lines changed: 1 addition & 4 deletions b/‎src/main/java/org/apache/datasketches/cpc/package-info.java
Lines changed: 1 addition & 4 deletions
diff --git a/‎src/main/java/org/apache/datasketches/fdt/package-info.java
Lines changed: 3 additions & 0 deletions b/‎src/main/java/org/apache/datasketches/fdt/package-info.java
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/main/java/org/apache/datasketches/frequencies/package-info.java
Lines changed: 8 additions & 11 deletions b/‎src/main/java/org/apache/datasketches/frequencies/package-info.java
Lines changed: 8 additions & 11 deletions
diff --git a/‎src/main/java/org/apache/datasketches/hash/package-info.java
Lines changed: 4 additions & 6 deletions b/‎src/main/java/org/apache/datasketches/hash/package-info.java
Lines changed: 4 additions & 6 deletions
diff --git a/‎src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
Lines changed: 4 additions & 4 deletions b/‎src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
Lines changed: 4 additions & 4 deletions
diff --git a/‎src/main/java/org/apache/datasketches/hll/HllSketch.java
Lines changed: 61 additions & 23 deletions b/‎src/main/java/org/apache/datasketches/hll/HllSketch.java
Lines changed: 61 additions & 23 deletions
diff --git a/‎src/main/java/org/apache/datasketches/hll/TgtHllType.java
Lines changed: 3 additions & 3 deletions b/‎src/main/java/org/apache/datasketches/hll/TgtHllType.java
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/main/java/org/apache/datasketches/hll/Union.java
Lines changed: 12 additions & 0 deletions b/‎src/main/java/org/apache/datasketches/hll/Union.java
Lines changed: 12 additions & 0 deletions
diff --git a/‎src/main/java/org/apache/datasketches/hll/doc-files/HLL_HIP_K12T20U20.png
152 KB b/‎src/main/java/org/apache/datasketches/hll/doc-files/HLL_HIP_K12T20U20.png
152 KB
diff --git a/‎src/main/java/org/apache/datasketches/hll/doc-files/HLL_UnionTime4_6_8_Java_CPP.png
314 KB b/‎src/main/java/org/apache/datasketches/hll/doc-files/HLL_UnionTime4_6_8_Java_CPP.png
314 KB
@@ -17,4 +17,7 @@
  * under the License.
  */
 
+/**
+ * This package is for common classes that may be used across all the sketch families.
+ */
 package org.apache.datasketches.common;
@@ -18,10 +18,7 @@
  */
 
 /**
- * Compressed Probabilistic Counting
- *
- * @author Lee Rhodes
- * @author Kevin Lang
+ * Compressed Probabilistic Counting sketch family
  */
 
 package org.apache.datasketches.cpc;
@@ -17,4 +17,7 @@
  * under the License.
  */
 
+/**
+ * Frequent Distinct Tuples Sketch
+ */
 package org.apache.datasketches.fdt;
@@ -18,16 +18,13 @@
  */
 
 /**
- * <p>This package is dedicated to streaming algorithms that enable estimation of the 
- * frequency of occurence of items in a weighted multiset stream of items.  
- * If the frequency distribution of items is sufficiently skewed, these algorithms are very 
- * useful in identifying the "Heavy Hitters" that occured most frequently in the stream.  
- * The accuracy of the estimation of the frequency of an item has well understood error 
- * bounds that can be returned by the sketch.</p>
- * 
- * <p>These sketches are mergable and can be serialized and deserialized to/from a compact 
- * form.</p>
- * 
- * @author Lee Rhodes
+ * This package is dedicated to streaming algorithms that enable estimation of the
+ * frequency of occurrence of items in a weighted multiset stream of items.
+ * If the frequency distribution of items is sufficiently skewed, these algorithms are very
+ * useful in identifying the "Heavy Hitters" that occurred most frequently in the stream.
+ * The accuracy of the estimation of the frequency of an item has well understood error
+ * bounds that can be returned by the sketch.
+ *
+ * <p>These algorithms are sometimes referred to as "TopN" algorithms.</p>
  */
 package org.apache.datasketches.frequencies;
@@ -18,14 +18,12 @@
  */
 
 /**
- * <p>The hash package contains a high-performing and extended Java implementation 
- * of Austin Appleby's 128-bit MurmurHash3 hash function originally coded in C. 
- * This core MurmurHash3.java class is used throughout all the sketch classes for consistentancy 
+ * <p>The hash package contains a high-performing and extended Java implementation
+ * of Austin Appleby's 128-bit MurmurHash3 hash function originally coded in C.
+ * This core MurmurHash3.java class is used throughout many of the sketch classes for consistency
  * and as long as the user specifies the same seed will result in coordinated hash operations.
- * This package also contains an adaptor class that extends the basic class with more functions 
+ * This package also contains an adaptor class that extends the basic class with more functions
  * commonly associated with hashing.
  * </p>
- * 
- * @author Lee Rhodes
  */
 package org.apache.datasketches.hash;
@@ -113,22 +113,22 @@ public static final int getSerializationVersion(final Memory mem) {
    * Gets the current (approximate) Relative Error (RE) asymptotic values given several
    * parameters. This is used primarily for testing.
    * @param upperBound return the RE for the Upper Bound, otherwise for the Lower Bound.
-   * @param unioned set true if the sketch is the result of a union operation.
+   * @param oooFlag set true if the sketch is the result of a non-qualifying union operation.
    * @param lgConfigK the configured value for the sketch.
    * @param numStdDev the given number of Standard Deviations. This must be an integer between
    * 1 and 3, inclusive.
    * <a href="{@docRoot}/resources/dictionary.html#numStdDev">Number of Standard Deviations</a>
    * @return the current (approximate) RelativeError
    */
-  public static double getRelErr(final boolean upperBound, final boolean unioned,
+  public static double getRelErr(final boolean upperBound, final boolean oooFlag,
       final int lgConfigK, final int numStdDev) {
     HllUtil.checkLgK(lgConfigK);
     if (lgConfigK > 12) {
-      final double rseFactor = unioned ? HLL_NON_HIP_RSE_FACTOR : HLL_HIP_RSE_FACTOR;
+      final double rseFactor = oooFlag ? HLL_NON_HIP_RSE_FACTOR : HLL_HIP_RSE_FACTOR;
       final int configK = 1 << lgConfigK;
       return (numStdDev * rseFactor) / Math.sqrt(configK);
     }
-    return Math.abs(RelativeErrorTables.getRelErr(upperBound, unioned, lgConfigK, numStdDev));
+    return Math.abs(RelativeErrorTables.getRelErr(upperBound, oooFlag, lgConfigK, numStdDev));
   }
 
   /**
 
@@ -36,33 +36,71 @@
 import org.apache.datasketches.memory.WritableMemory;
 
 /**
- * This is a high performance implementation of Phillipe Flajolet&#8217;s HLL sketch but with
- * significantly improved error behavior.  If the ONLY use case for sketching is counting
- * uniques and merging, the HLL sketch the HLL sketch is a reasonable choice, although the highest
- * performing in terms of accuracy for storage space consumed is CPC (Compressed Probabilistic Counting).
- * For large enough counts, this HLL version (with HLL_4) can be 2 to 16 times smaller than the
- * Theta sketch family for the same accuracy.
+ * The HllSketch is actually a collection of compact implementations of Philippe Flajolet’s HyperLogLog (HLL)
+ * sketch but with significantly improved error behavior and excellent speed performance.
  *
- * <p>This implementation offers three different types of HLL sketch, each with different
- * trade-offs with accuracy, space and performance. These types are specified with the
- * {@link TgtHllType} parameter.
+ * <p>If the use case for sketching is primarily counting uniques and merging, the HLL sketch is the 2nd highest
+ * performing in terms of accuracy for storage space consumed in the DataSketches library
+ * (the new CPC sketch developed by Kevin J. Lang now beats HLL in terms of accuracy / space).
+ * For large counts, HLL sketches can be 2 to 8 times smaller for the same accuracy than the DataSketches Theta
+ * Sketches when serialized, but the Theta sketches can do set intersections and differences while HLL and CPC cannot.
+ * The CPC sketch and HLL share similar use cases, but the CPC sketch is about 30 to 40% smaller than the HLL sketch
+ * when serialized and larger than the HLL when active in memory.  Choose your weapons!</p>
  *
- * <p>In terms of accuracy, all three types, for the same <i>lgConfigK</i>, have the same error
- * distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
- * The configuration parameter <i>lgConfigK</i> is the log-base-2 of <i>K</i>,
- * where <i>K</i> is the number of buckets or slots for the sketch.
+ * <p>A new HLL sketch is created with a simple constructor:</p>
+ * <pre>{@code
+ * int lgK = 12; //This is log-base2 of k, so k = 4096. lgK can be from 4 to 21
+ * HllSketch sketch = new HllSketch(lgK); //TgtHllType.HLL_4 is the default
+ * //OR
+ * HllSketch sketch = new HllSketch(lgK, TgtHllType.HLL_6);
+ * //OR
+ * HllSketch sketch = new HllSketch(lgK, TgtHllType.HLL_8);
+ * }</pre>
  *
- * <p>During warmup, when the sketch has only received a small number of unique items
- * (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
- * algorithms with significantly better accuracy.
+ * <p>All three different sketch types are targets in that the sketches start out in a warm-up mode that is small in
+ * size and gradually grows as needed until the full HLL array is allocated. The HLL_4, HLL_6 and HLL_8 represent
+ * different levels of compression of the final HLL array where the 4, 6 and 8 refer to the number of bits each
+ * bucket of the HLL array is compressed down to.
+ * The HLL_4 is the most compressed but generally slower than the other two, especially during union operations.</p>
  *
- * <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
- * created by the user, the sketch will perform all of its updates and internal phase transitions
- * in that object, which can actually reside either on-heap or off-heap based on how it is
- * configured. In large systems that must update and merge many millions of sketches, having the
- * sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
- * to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
- * delays.
+ * <p>All three types share the same API. Updating the HllSketch is very simple:</p>
+ *
+ * <pre>{@code
+ * long n = 1000000;
+ * for (int i = 0; i < n; i++) {
+ *   sketch.update(i);
+ * }
+ * }</pre>
+ *
+ * <p>Each of the integers presented above is first hashed into a 128-bit hash value that is used by the sketch
+ * HLL algorithm, so the above loop is essentially equivalent to using a random number generator initialized with a
+ * seed so that the sequence is deterministic and random.</p>
+ *
+ * <p>Obtaining the cardinality results from the sketch is also simple:</p>
+ *
+ * <pre>{@code
+ * double estimate = sketch.getEstimate();
+ * double estUB = sketch.getUpperBound(1.0); //the upper bound at 1 standard deviation.
+ * double estLB = sketch.getLowerBound(1.0); //the lower bound at 1 standard deviation.
+ * //OR
+ * System.out.println(sketch.toString()); //will output a summary of the sketch.
+ * }</pre>
+ *
+ * <p>Which produces a console output something like this:</p>
+ *
+ * <pre>{@code
+ * ### HLL SKETCH SUMMARY:
+ *   Log Config K   : 12
+ *   Hll Target     : HLL_4
+ *   Current Mode   : HLL
+ *   LB             : 977348.7024560181
+ *   Estimate       : 990116.6007366662
+ *   UB             : 1003222.5095308956
+ *   OutOfOrder Flag: false
+ *   CurMin         : 5
+ *   NumAtCurMin    : 1
+ *   HipAccum       : 990116.6007366662
+ * }</pre>
  *
  * @author Lee Rhodes
  * @author Kevin Lang
 
@@ -34,14 +34,14 @@
  * sketches where lgConfigK is &gt; 8.</p>
  *
  * <ul>
- * <li><b>HLL_8</b> This uses an 8-bit byte per HLL bucket. It is generally the
+ * <li><b>HLL 8</b> This uses an 8-bit byte per HLL bucket. It is generally the
  * fastest in terms of update time, but has the largest storage footprint of about
  * <i>K</i> bytes.</li>
  *
- * <li><b>HLL_6</b> This uses a 6-bit field per HLL bucket. It is the generally the next fastest
+ * <li><b>HLL 6</b> This uses a 6-bit field per HLL bucket. It is generally the next fastest
  * in terms of update time with a storage footprint of about <i>3/4 * K</i> bytes.</li>
  *
- * <li><b>HLL_4</b> This uses a 4-bit field per HLL bucket and for large counts may require
+ * <li><b>HLL 4</b> This uses a 4-bit field per HLL bucket and for large counts may require
  * the use of a small internal auxiliary array for storing statistical exceptions, which are rare.
  * For the values of <i>lgConfigK &gt; 13</i> (<i>K</i> = 8192),
  * this additional array adds about 3% to the overall storage. It is generally the slowest in
 
@@ -51,6 +51,18 @@
  * <p>Second, the user cannot specify the {@link TgtHllType} as an input parameter to the union.
  * Instead, it is specified for the sketch returned with {@link #getResult(TgtHllType)}.
  *
+ * <p>The following graph illustrates the HLL Merge speed.</p>
+ *
+ * <p><img src="doc-files/HLL_UnionTime4_6_8_Java_CPP.png" width="500" alt="HLL LgK12 Union Speed"></p>
+ * This graph illustrates the relative merging speed of the HLL 4,6,8 Java HLL sketches compared to
+ * the DataSketches C++ implementations of the same sketches. With this particular test (merging 32 relatively large
+ * sketches together), the Java HLL 8 is the fastest and the Java HLL 4 the slowest, with a mixed cluster in the middle.
+ * Union / Merging speed is somewhat difficult to measure as the performance is very dependent on the mix of sketch
+ * sizes (and types) you are merging. So your mileage will vary!
+ *
+ * <p>For a complete example of using the Union operator
+ * see <a href="https://datasketches.apache.org/docs/HLL/HllJavaExample.html">Union Example</a>.</p>
+ *
  * @author Lee Rhodes
  * @author Kevin Lang
  */
Original file line number	Diff line number	Diff line change
`@@ -34,14 +34,14 @@`
`34`	`34`	`* sketches where lgConfigK is > 8.</p>`
`35`	`35`	`*`
`36`	`36`	`* <ul>`
`37`		`- * <li><b>HLL_8</b> This uses an 8-bit byte per HLL bucket. It is generally the`
	`37`	`+ * <li><b>HLL 8</b> This uses an 8-bit byte per HLL bucket. It is generally the`
`38`	`38`	`* fastest in terms of update time, but has the largest storage footprint of about`
`39`	`39`	`* <i>K</i> bytes.</li>`
`40`	`40`	`*`
`41`		`- * <li><b>HLL_6</b> This uses a 6-bit field per HLL bucket. It is the generally the next fastest`
	`41`	`+ * <li><b>HLL 6</b> This uses a 6-bit field per HLL bucket. It is generally the next fastest`
`42`	`42`	`* in terms of update time with a storage footprint of about <i>3/4 * K</i> bytes.</li>`
`43`	`43`	`*`
`44`		`- * <li><b>HLL_4</b> This uses a 4-bit field per HLL bucket and for large counts may require`
	`44`	`+ * <li><b>HLL 4</b> This uses a 4-bit field per HLL bucket and for large counts may require`
`45`	`45`	`* the use of a small internal auxiliary array for storing statistical exceptions, which are rare.`
`46`	`46`	`* For the values of <i>lgConfigK > 13</i> (<i>K</i> = 8192),`
`47`	`47`	`* this additional array adds about 3% to the overall storage. It is generally the slowest in`