Skip to content

Commit 51aef3c

Browse files
committed
Updated class javadoc to include the following comment:
The code included here works well for moderate-sized partitioning tasks. As an example, using the test code in the test branch, the task of splitting a data set of 1 billion items into 324 partitions of 3M items each completed in under 3 minutes on a single CPU. For much larger partitioning tasks, it is recommended that this code be adapted to run in a parallelized systems environment. I made some minor tweaks to the test code examples.
1 parent 327d621 commit 51aef3c

4 files changed

Lines changed: 47 additions & 20 deletions

File tree

src/main/java/org/apache/datasketches/partitions/Partitioner.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@
4141
/**
4242
* A partitioning process that can partition very large data sets into thousands
4343
* of partitions of approximately the same size.
44+
*
45+
* <p>The code included here works well for moderate-sized partitioning tasks.
46+
* As an example, using the test code in the test branch, the task of splitting
47+
* a data set of 1 billion items into 324 partitions of 3M items each completed in under 3 minutes
48+
* on a single CPU. For much larger partitioning tasks, it is recommended that this code be adapted to run in a
49+
* parallelized systems environment.</p>
4450
* @param <T> the data type
4551
* @param <S> the quantiles sketch that implements both QuantilesGenericAPI and PartitioningFeature.
4652
*/

src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@
2020
package org.apache.datasketches.partitions;
2121

2222
import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH;
23+
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE;
2324
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
2425

2526
import java.util.List;
2627

2728
import org.apache.datasketches.common.SketchesArgumentException;
2829
import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow;
2930
import org.apache.datasketches.quantiles.ItemsSketch;
31+
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
3032
import org.testng.annotations.Test;
3133

3234
/**
@@ -44,42 +46,48 @@ public class ClassicPartitionsTest {
4446
* Launch the partitioner as an application with the following arguments as strings:
4547
* <ul>
4648
* <li>arg[0]: int k, the size of the sketch</li>
47-
* <li>arg[1]: long totalN, the total size, in elements, of the data set to parse.</li>
48-
* <li>arg[2]: long tgtPartitionSize, the target number of elements per resulting partition.</li>
49-
* <li>arg[3]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch</li>
49+
* <li>arg[1]: String INCLUSIVE or EXCLUSIVE, the search criteria.</li>
50+
* <li>arg[2]: long totalN, the total size, in elements, of the data set to parse.</li>
51+
* <li>arg[3]: long tgtPartitionSize, the target number of elements per resulting partition.</li>
52+
* <li>arg[4]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch</li>
5053
* </ul>
5154
* @param args input arguments as defined above.
5255
*/
5356
public void main(String[] args) {
5457
final int k, maxPartsPerSk;
5558
final long totalN, tgtPartitionSize;
59+
final QuantileSearchCriteria searchCrit;
5660
try {
5761
k = Integer.parseInt(args[0].trim());
58-
totalN = Long.parseLong(args[1].trim());
59-
tgtPartitionSize = Long.parseLong(args[2].trim());
60-
maxPartsPerSk = Integer.parseInt(args[3].trim());
62+
searchCrit = args[1].trim().equalsIgnoreCase("INCLUSIVE") ? INCLUSIVE : EXCLUSIVE;
63+
totalN = Long.parseLong(args[2].trim());
64+
tgtPartitionSize = Long.parseLong(args[3].trim());
65+
maxPartsPerSk = Integer.parseInt(args[4].trim());
6166
} catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); }
62-
classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk);
67+
classicPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk);
6368
}
6469

6570
//@Test //launch from TestNG
6671
public void checkClassicPartitioner() {
6772
final int k = 1 << 15;
68-
final long totalN = 1000_000_000L; //artificially set low so it will execute fast
73+
final QuantileSearchCriteria searchCrit = INCLUSIVE;
74+
final long totalN = 30_000_000L; //artificially set low so it will execute fast as a simple test
6975
final long tgtPartitionSize = 3_000_000L;
7076
final int maxPartsPerSk = 100;
71-
classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk);
77+
classicPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk);
7278
}
7379

7480
/**
7581
* Programmatic call to classic Partitioner
7682
* @param k the size of the sketch.
83+
* @param searchCrit the QuantileSearchCriteria: either INCLUSIVE or EXCLUSIVE.
7784
* @param totalN the total size, in elements, of the data set to parse.
7885
* @param tgtPartitionSize the target number of elements per resulting partition.
7986
* @param maxPartsPerSk the maximum number of partitions to be handled by any one sketch.
8087
*/
8188
public void classicPartitioner(
8289
final int k,
90+
final QuantileSearchCriteria searchCrit,
8391
final long totalN,
8492
final long tgtPartitionSize,
8593
final int maxPartsPerSk) {
@@ -92,7 +100,7 @@ public void classicPartitioner(
92100
tgtPartitionSize,
93101
maxPartsPerSk,
94102
fillReq,
95-
INCLUSIVE);
103+
searchCrit);
96104
final List<PartitionBoundsRow<String>> list = partitioner.partition(sk);
97105
final long endTime_mS = System.currentTimeMillis();
98106
final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS;
@@ -102,6 +110,7 @@ public void classicPartitioner(
102110
"Classic",
103111
list,
104112
k,
113+
searchCrit,
105114
totalN,
106115
tgtPartitionSize,
107116
maxPartsPerSk,

src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@
2020
package org.apache.datasketches.partitions;
2121

2222
import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH;
23+
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE;
2324
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
2425

2526
import java.util.List;
2627

2728
import org.apache.datasketches.common.SketchesArgumentException;
2829
import org.apache.datasketches.kll.KllItemsSketch;
2930
import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow;
31+
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
3032
import org.testng.annotations.Test;
3133

3234
/**
@@ -44,42 +46,48 @@ public class KllPartitionsTest {
4446
* Launch the partitioner as an application with the following arguments as strings:
4547
* <ul>
4648
* <li>arg[0]: int k, the size of the sketch</li>
47-
* <li>arg[1]: long totalN, the total size, in elements, of the data set to parse.</li>
48-
* <li>arg[2]: long tgtPartitionSize, the target number of elements per resulting partition.</li>
49-
* <li>arg[3]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch</li>
49+
* <li>arg[1]: String INCLUSIVE or EXCLUSIVE, the search criteria.</li>
50+
* <li>arg[2]: long totalN, the total size, in elements, of the data set to parse.</li>
51+
* <li>arg[3]: long tgtPartitionSize, the target number of elements per resulting partition.</li>
52+
* <li>arg[4]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch</li>
5053
* </ul>
5154
* @param args input arguments as defined above.
5255
*/
5356
public void main(String[] args) {
5457
final int k, maxPartsPerSk;
5558
final long totalN, tgtPartitionSize;
59+
final QuantileSearchCriteria searchCrit;
5660
try {
5761
k = Integer.parseInt(args[0].trim());
58-
totalN = Long.parseLong(args[1].trim());
59-
tgtPartitionSize = Long.parseLong(args[2].trim());
60-
maxPartsPerSk = Integer.parseInt(args[3].trim());
62+
searchCrit = args[1].trim().equalsIgnoreCase("INCLUSIVE") ? INCLUSIVE : EXCLUSIVE;
63+
totalN = Long.parseLong(args[2].trim());
64+
tgtPartitionSize = Long.parseLong(args[3].trim());
65+
maxPartsPerSk = Integer.parseInt(args[4].trim());
6166
} catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); }
62-
kllPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk);
67+
kllPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk);
6368
}
6469

6570
//@Test //launch from TestNG
6671
public void checkKllPartitioner() {
6772
final int k = 1 << 15;
68-
final long totalN = 30_000_000L; //artificially set low so it will execute fast
73+
final QuantileSearchCriteria searchCrit = INCLUSIVE;
74+
final long totalN = 30_000_000L; //artificially set low so it will execute fast as a simple test
6975
final long tgtPartitionSize = 3_000_000L;
7076
final int maxPartsPerSk = 100;
71-
kllPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk);
77+
kllPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk);
7278
}
7379

7480
/**
7581
* Programmatic call to KLL Partitioner
7682
* @param k the size of the sketch.
83+
* @param searchCrit the QuantileSearchCriteria: either INCLUSIVE or EXCLUSIVE.
7784
* @param totalN the total size, in elements, of the data set to parse.
7885
* @param tgtPartitionSize the target number of elements per resulting partition.
7986
* @param maxPartsPerSk the maximum number of partitions to be handled by any one sketch.
8087
*/
8188
public void kllPartitioner(
8289
final int k,
90+
final QuantileSearchCriteria searchCrit,
8391
final long totalN,
8492
final long tgtPartitionSize,
8593
final int maxPartsPerSk) {
@@ -92,7 +100,7 @@ public void kllPartitioner(
92100
tgtPartitionSize,
93101
maxPartsPerSk,
94102
fillReq,
95-
INCLUSIVE);
103+
searchCrit);
96104
final List<PartitionBoundsRow<String>> list = partitioner.partition(sk);
97105
final long endTime_mS = System.currentTimeMillis();
98106
final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS;
@@ -102,6 +110,7 @@ public void kllPartitioner(
102110
"KLL",
103111
list,
104112
k,
113+
searchCrit,
105114
totalN,
106115
tgtPartitionSize,
107116
maxPartsPerSk,

src/test/java/org/apache/datasketches/partitions/PartitionResults.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.util.List;
2828

2929
import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow;
30+
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
3031

3132
/**
3233
* Output partitioning results to console.
@@ -42,6 +43,7 @@ public static void output(
4243
final String sketchType,
4344
final List<PartitionBoundsRow<String>> list,
4445
final int k,
46+
final QuantileSearchCriteria searchCrit,
4547
final long totalN,
4648
final long tgtPartitionSize,
4749
final int maxPartsPerSk,
@@ -75,6 +77,7 @@ public static void output(
7577
println(LS + sketchType +" ItemsSketch Partitions Test");
7678
println(LS + "INPUT:");
7779
printf("Sketch K :%,20d\n", k);
80+
printf("Search Criteria :%20s\n", searchCrit.name());
7881
printf("Total N :%,20d\n", totalN);
7982
printf("Tgt Partition Size :%,20d\n", tgtPartitionSize);
8083
printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk);

0 commit comments

Comments
 (0)