Skip to content

Commit 0dea9cd

Browse files
authored
Merge pull request #348 from apache/Prep_for_2.0.0-RC3
Made parallel APIs for SetOperations in Theta, and Tuple (generics).
2 parents ac00afa + 0cad491 commit 0dea9cd

22 files changed

Lines changed: 1232 additions & 492 deletions

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
=================
2727

28-
# DataSketches Core Java Library Component
28+
# Apache<sup>&reg;</sup> DataSketches&trade; Core Java Library Component
2929
This is the core Java component of the DataSketches library. It contains all of the sketching algorithms and can be accessed directly from user applications.
3030

3131
This component is also a dependency of other components of the library that create adaptors for target systems, such as Hadoop Pig and Hadoop Hive.

src/main/java/org/apache/datasketches/BinarySearch.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ public final class BinarySearch {
3030
/**
3131
* Binary Search for the index of the exact float value in the given search range.
3232
* If -1 is returned there are no values in the search range that equal the given value.
33-
* @param arr The given array to search.
33+
* @param arr The given ordered array to search.
3434
* @param low the index of the lowest value of the search range
3535
* @param high the index of the highest value of the search range
3636
* @param v the value to search for
@@ -53,7 +53,7 @@ public static int find(final float[] arr, final int low, final int high, final f
5353
/**
5454
* Binary Search for the index of the exact double value in the given search range.
5555
* If -1 is returned there are no values in the search range that equal the given value.
56-
* @param arr The given array to search.
56+
* @param arr The given ordered array to search.
5757
* @param low the index of the lowest value of the search range
5858
* @param high the index of the highest value of the search range
5959
* @param v the value to search for
@@ -76,7 +76,7 @@ public static int find(final double[] arr, final int low, final int high, final
7676
/**
7777
* Binary Search for the index of the exact long value in the given search range.
7878
* If -1 is returned there are no values in the search range that equal the given value.
79-
* @param arr The given array to search.
79+
* @param arr The given ordered array to search.
8080
* @param low the index of the lowest value of the search range
8181
* @param high the index of the highest value of the search range
8282
* @param v the value to search for

src/main/java/org/apache/datasketches/GenericInequalitySearch.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,6 @@ public enum Inequality {
9494
GT
9595
}
9696

97-
/**
98-
* Constructs this class
99-
*/
100-
public GenericInequalitySearch() { }
101-
10297
/**
10398
* Binary Search for the index of the generic value in the given search range that satisfies
10499
* the given inequality.

src/main/java/org/apache/datasketches/theta/Intersection.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ public CompactSketch getResult() {
8686
public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem);
8787

8888
/**
89-
* Returns true if there is an intersection result available
90-
* @return true if there is an intersection result available
89+
* Returns true if there is a valid intersection result available
90+
* @return true if there is a valid intersection result available
9191
*/
9292
public abstract boolean hasResult();
9393

src/main/java/org/apache/datasketches/theta/Union.java

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
import org.apache.datasketches.memory.WritableMemory;
2525

2626
/**
27-
* The API for Union operations
27+
* Compute the union of two or more theta sketches.
28+
* A new instance represents an empty set.
2829
*
2930
* @author Lee Rhodes
3031
*/
@@ -59,7 +60,7 @@ public Family getFamily() {
5960
public abstract CompactSketch getResult(boolean dstOrdered, WritableMemory dstMem);
6061

6162
/**
62-
* Resets this Union. The seed remains intact, otherwise reverts back to its virgin state.
63+
* Resets this Union. The seed remains intact, everything else reverts back to its virgin state.
6364
*/
6465
public abstract void reset();
6566

@@ -71,7 +72,7 @@ public Family getFamily() {
7172

7273
/**
7374
* This implements a stateless, pair-wise union operation. The returned sketch will be cut back to
74-
* k if required, similar to the regular Union operation.
75+
* the smaller of the two k values if required.
7576
*
7677
* <p>Nulls and empty sketches are ignored.</p>
7778
*
@@ -104,7 +105,8 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO
104105
* This method is not valid for the older SetSketch, which was prior to Open Source (August, 2015).
105106
*
106107
* <p>This method can be repeatedly called.
107-
* If the given sketch is null it is interpreted as an empty sketch.</p>
108+
*
109+
* <p>Nulls and empty sketches are ignored.</p>
108110
*
109111
* @param sketchIn The incoming sketch.
110112
*/
@@ -116,7 +118,8 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO
116118
* This method is not valid for the older SetSketch, which was prior to Open Source (August, 2015).
117119
*
118120
* <p>This method can be repeatedly called.
119-
* If the given sketch is null it is interpreted as an empty sketch.</p>
121+
*
122+
* <p>Nulls and empty sketches are ignored.</p>
120123
*
121124
* @param sketchIn The incoming sketch.
122125
* @deprecated 2.0.0. Use {@link #union(Sketch)} instead.
@@ -130,7 +133,8 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO
130133
* called the SetSketch (circa 2012), which was prior to Open Source and are compact and ordered.
131134
*
132135
* <p>This method can be repeatedly called.
133-
* If the given sketch is null it is interpreted as an empty sketch.</p>
136+
*
137+
* <p>Nulls and empty sketches are ignored.</p>
134138
*
135139
* @param mem Memory image of sketch to be merged
136140
*/
@@ -142,7 +146,8 @@ public abstract CompactSketch union(Sketch sketchA, Sketch sketchB, boolean dstO
142146
* called the SetSketch (circa 2012), which was prior to Open Source and are compact and ordered.
143147
*
144148
* <p>This method can be repeatedly called.
145-
* If the given sketch is null it is interpreted as an empty sketch.</p>
149+
*
150+
* <p>Nulls and empty sketches are ignored.</p>
146151
*
147152
* @param mem Memory image of sketch to be merged
148153
* @deprecated 2.0.0. Use {@link #union(Memory)} instead.

src/main/java/org/apache/datasketches/theta/UnionImpl.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -272,9 +272,12 @@ public byte[] toByteArray() {
272272
@Override
273273
public CompactSketch union(final Sketch sketchA, final Sketch sketchB, final boolean dstOrdered,
274274
final WritableMemory dstMem) {
275-
update(sketchA);
276-
update(sketchB);
277-
return getResult(dstOrdered, dstMem);
275+
reset();
276+
union(sketchA);
277+
union(sketchB);
278+
final CompactSketch csk = getResult(dstOrdered, dstMem);
279+
reset();
280+
return csk;
278281
}
279282

280283
@Deprecated
@@ -369,13 +372,13 @@ public void union(final Memory skMem) {
369372
if (serVer == 2) { //older Sketch, which is compact and ordered
370373
Util.checkSeedHashes(seedHash_, (short)extractSeedHash(skMem));
371374
final CompactSketch csk = ForwardCompatibility.heapify2to3(skMem, DEFAULT_UPDATE_SEED);
372-
update(csk);
375+
union(csk);
373376
return;
374377
}
375378

376379
if (serVer == 1) { //much older Sketch, which is compact and ordered
377380
final CompactSketch csk = ForwardCompatibility.heapify1to3(skMem, DEFAULT_UPDATE_SEED);
378-
update(csk);
381+
union(csk);
379382
return;
380383
}
381384

src/main/java/org/apache/datasketches/tuple/Intersection.java

Lines changed: 95 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@
3434

3535

3636
/**
37-
* Computes an intersection of two or more generic tuple sketches.
37+
* Computes an intersection of two or more generic tuple sketches or generic tuple sketches
38+
* combined with theta sketches.
3839
* A new instance represents the Universal Set. Because the Universal Set
3940
* cannot be realized a <i>getResult()</i> on a new instance will produce an error.
4041
* Every update() computes an intersection with the internal state, which will never
@@ -64,28 +65,74 @@ public Intersection(final SummarySetOperations<S> summarySetOps) {
6465
}
6566

6667
/**
67-
* Updates the internal state by intersecting it with the given sketch.
68-
* @param sketchIn input sketch to intersect with the internal state. It may not be null.
68+
* Performs a stateless intersect set operation on the two given tuple sketches and returns the
69+
* result as an unordered CompactSketch on the heap.
70+
* @param tupleSketchA The first sketch argument. It must not be null.
71+
* @param tupleSketchB The second sketch argument. It must not be null.
72+
* @return an unordered CompactSketch on the heap
6973
*/
70-
public void update(final Sketch<S> sketchIn) {
71-
if (sketchIn == null) { throw new SketchesArgumentException("Sketch may not be null"); }
74+
public CompactSketch<S> intersect(final Sketch<S> tupleSketchA, final Sketch<S> tupleSketchB) {
75+
reset();
76+
intersect(tupleSketchA);
77+
intersect(tupleSketchB);
78+
final CompactSketch<S> csk = getResult();
79+
reset();
80+
return csk;
81+
}
82+
83+
/**
84+
* Performs a stateless intersect set operation on a tuple sketch and a theta sketch and returns the
85+
* result as an unordered CompactSketch on the heap.
86+
* @param tupleSketch The first sketch argument. It must not be null.
87+
* @param thetaSketch The second sketch argument. It must not be null.
88+
* @param summary the given proxy summary for the theta sketch, which doesn't have one.
89+
* This must not be null.
90+
* @return an unordered CompactSketch on the heap
91+
*/
92+
public CompactSketch<S> intersect(final Sketch<S> tupleSketch,
93+
final org.apache.datasketches.theta.Sketch thetaSketch, final S summary) {
94+
reset();
95+
intersect(tupleSketch);
96+
intersect(thetaSketch, summary);
97+
final CompactSketch<S> csk = getResult();
98+
reset();
99+
return csk;
100+
}
101+
102+
/**
103+
* Performs a stateful intersection of the internal set with the given tupleSketch.
104+
* @param tupleSketch input sketch to intersect with the internal state. It must not be null.
105+
* @deprecated 2.0.0. Please use {@link #intersect(Sketch)}.
106+
*/
107+
@Deprecated
108+
public void update(final Sketch<S> tupleSketch) {
109+
intersect(tupleSketch);
110+
}
111+
112+
/**
113+
* Performs a stateful intersection of the internal set with the given tupleSketch.
114+
* @param tupleSketch input sketch to intersect with the internal state. It must not be null.
115+
*/
116+
public void intersect(final Sketch<S> tupleSketch) {
117+
if (tupleSketch == null) { throw new SketchesArgumentException("Sketch must not be null"); }
72118
final boolean firstCall = firstCall_;
73119
firstCall_ = false;
74120

75121
// input sketch could be first or next call
76-
final long thetaLongIn = sketchIn.getThetaLong();
77-
final int countIn = sketchIn.getRetainedEntries();
122+
final long thetaLongIn = tupleSketch.getThetaLong();
123+
final int countIn = tupleSketch.getRetainedEntries();
78124
thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
79125
// Empty rule extended in case incoming sketch does not have empty bit properly set
80-
empty_ |= countIn == 0 && thetaLongIn == Long.MAX_VALUE;
126+
final boolean emptyIn = countIn == 0 && thetaLongIn == Long.MAX_VALUE;
127+
empty_ |= emptyIn; //empty rule
81128
if (countIn == 0) {
82129
hashTables_.clear();
83130
return;
84131
}
85132
// input sketch will have valid entries > 0
86133

87134
if (firstCall) {
88-
final Sketch<S> firstSketch = sketchIn;
135+
final Sketch<S> firstSketch = tupleSketch;
89136
//Copy firstSketch data into local instance hashTables_
90137
hashTables_.fromSketch(firstSketch);
91138
}
@@ -95,7 +142,7 @@ public void update(final Sketch<S> sketchIn) {
95142
if (hashTables_.count_ == 0) {
96143
return;
97144
}
98-
final Sketch<S> nextSketch = sketchIn;
145+
final Sketch<S> nextSketch = tupleSketch;
99146
//Match nextSketch data with local instance data, filtering by theta
100147
final int maxMatchSize = min(hashTables_.count_, nextSketch.getRetainedEntries());
101148

@@ -126,29 +173,47 @@ public void update(final Sketch<S> sketchIn) {
126173
}
127174

128175
/**
129-
* Updates the internal set by intersecting it with the given Theta sketch.
130-
* @param sketchIn input Theta Sketch to intersect with the internal state. It may not be null.
131-
* @param summary the given proxy summary for the Theta Sketch, which doesn't have one.
132-
* It will be copied for each matching index. It may not be null.
176+
* Performs a stateful intersection of the internal set with the given thetaSketch by combining entries
177+
* using the hashes from the theta sketch and summary values from the given summary and rules
178+
* from the summarySetOps defined by the Intersection constructor.
179+
* @param thetaSketch input theta sketch to intersect with the internal state. It must not be null.
180+
* @param summary the given proxy summary for the theta sketch, which doesn't have one.
181+
* It will be copied for each matching index. It must not be null.
182+
* @deprecated 2.0.0. Please use intersect(org.apache.datasketches.theta.Sketch, S).
183+
*/
184+
@Deprecated //note the {at_link} does not work in the above
185+
public void update(final org.apache.datasketches.theta.Sketch thetaSketch, final S summary) {
186+
intersect(thetaSketch, summary);
187+
}
188+
189+
/**
190+
* Performs a stateful intersection of the internal set with the given thetaSketch by combining entries
191+
* using the hashes from the theta sketch and summary values from the given summary and rules
192+
* from the summarySetOps defined by the Intersection constructor.
193+
* @param thetaSketch input theta sketch to intersect with the internal state. It must not be null.
194+
* @param summary the given proxy summary for the theta sketch, which doesn't have one.
195+
* It will be copied for each matching index. It must not be null.
133196
*/
134-
public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S summary) {
135-
if (sketchIn == null) { throw new SketchesArgumentException("Sketch may not be null"); }
197+
public void intersect(final org.apache.datasketches.theta.Sketch thetaSketch, final S summary) {
198+
if (thetaSketch == null) { throw new SketchesArgumentException("Sketch must not be null"); }
136199
if (summary == null) { throw new SketchesArgumentException("Summary cannot be null."); }
137200
final boolean firstCall = firstCall_;
138201
firstCall_ = false;
202+
139203
// input sketch is not null, could be first or next call
140-
final long thetaLongIn = sketchIn.getThetaLong();
141-
final int countIn = sketchIn.getRetainedEntries(true);
204+
final long thetaLongIn = thetaSketch.getThetaLong();
205+
final int countIn = thetaSketch.getRetainedEntries(true);
142206
thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
143207
// Empty rule extended in case incoming sketch does not have empty bit properly set
144-
empty_ |= countIn == 0 && thetaLongIn == Long.MAX_VALUE;
208+
final boolean emptyIn = countIn == 0 && thetaLongIn == Long.MAX_VALUE;
209+
empty_ |= emptyIn; //empty rule
145210
if (countIn == 0) {
146211
hashTables_.clear();
147212
return;
148213
}
149214
// input sketch will have valid entries > 0
150215
if (firstCall) {
151-
final org.apache.datasketches.theta.Sketch firstSketch = sketchIn;
216+
final org.apache.datasketches.theta.Sketch firstSketch = thetaSketch;
152217
//Copy firstSketch data into local instance hashTables_
153218
hashTables_.fromSketch(firstSketch, summary);
154219
}
@@ -158,15 +223,15 @@ public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S
158223
if (hashTables_.count_ == 0) {
159224
return;
160225
}
161-
final org.apache.datasketches.theta.Sketch nextSketch = sketchIn;
226+
final org.apache.datasketches.theta.Sketch nextSketch = thetaSketch;
162227
//Match nextSketch data with local instance data, filtering by theta
163228
final int maxMatchSize = min(hashTables_.count_, nextSketch.getRetainedEntries(true));
164229

165230
final long[] matchHashArr = new long[maxMatchSize];
166231
S[] matchSummaries = null;
167232
int matchCount = 0;
168233

169-
final org.apache.datasketches.theta.HashIterator it = sketchIn.iterator();
234+
final org.apache.datasketches.theta.HashIterator it = thetaSketch.iterator();
170235
final Class<S> summaryType = (Class<S>) hashTables_.summaryTable_.getClass().getComponentType();
171236
while (it.next()) {
172237
final long hash = it.get();
@@ -221,6 +286,14 @@ public CompactSketch<S> getResult() {
221286
return new CompactSketch<>(hashArr, summaries, thetaLong_, empty_);
222287
}
223288

289+
/**
290+
* Returns true if there is a valid intersection result available
291+
* @return true if there is a valid intersection result available
292+
*/
293+
public boolean hasResult() {
294+
return !firstCall_;
295+
}
296+
224297
/**
225298
* Resets the internal set to the initial state, which represents the Universal Set
226299
*/

0 commit comments

Comments
 (0)