Skip to content

Commit 7eab79b

Browse files
committed
[core] Refactor topN and minmax pushdown; they should not apply to primary-key tables
1 parent e2d02d2 commit 7eab79b

File tree

14 files changed

+261
-269
lines changed

14 files changed

+261
-269
lines changed

paimon-core/src/main/java/org/apache/paimon/operation/AbstractFileStoreScan.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,12 @@ public FileStoreScan dropStats() {
224224
return this;
225225
}
226226

227+
@Override
228+
public FileStoreScan keepStats() {
229+
this.dropStats = false;
230+
return this;
231+
}
232+
227233
@Nullable
228234
@Override
229235
public Integer parallelism() {

paimon-core/src/main/java/org/apache/paimon/operation/FileStoreScan.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ public interface FileStoreScan {
8585

8686
FileStoreScan dropStats();
8787

88+
FileStoreScan keepStats();
89+
8890
@Nullable
8991
Integer parallelism();
9092

paimon-core/src/main/java/org/apache/paimon/table/source/DataSplit.java

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -46,16 +46,13 @@
4646
import java.io.ObjectInputStream;
4747
import java.io.ObjectOutputStream;
4848
import java.util.ArrayList;
49-
import java.util.HashSet;
5049
import java.util.List;
5150
import java.util.Objects;
5251
import java.util.Optional;
5352
import java.util.OptionalLong;
54-
import java.util.Set;
5553
import java.util.stream.Collectors;
5654

5755
import static org.apache.paimon.io.DataFilePathFactory.INDEX_PATH_SUFFIX;
58-
import static org.apache.paimon.utils.ListUtils.isNullOrEmpty;
5956
import static org.apache.paimon.utils.Preconditions.checkArgument;
6057
import static org.apache.paimon.utils.Preconditions.checkState;
6158

@@ -158,21 +155,6 @@ public long mergedRowCount() {
158155
return partialMergedRowCount();
159156
}
160157

161-
public boolean statsAvailable(Set<String> columns) {
162-
if (isNullOrEmpty(columns)) {
163-
return false;
164-
}
165-
166-
return dataFiles.stream()
167-
.map(DataFileMeta::valueStatsCols)
168-
.allMatch(
169-
valueStatsCols ->
170-
// It means there are all column statistics when valueStatsCols ==
171-
// null
172-
valueStatsCols == null
173-
|| new HashSet<>(valueStatsCols).containsAll(columns));
174-
}
175-
176158
public Object minValue(int fieldIndex, DataField dataField, SimpleStatsEvolutions evolutions) {
177159
Object minValue = null;
178160
for (DataFileMeta dataFile : dataFiles) {

paimon-core/src/main/java/org/apache/paimon/table/source/DataTableBatchScan.java

Lines changed: 66 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,21 @@
2121
import org.apache.paimon.CoreOptions;
2222
import org.apache.paimon.manifest.PartitionEntry;
2323
import org.apache.paimon.predicate.Predicate;
24+
import org.apache.paimon.predicate.SortValue;
2425
import org.apache.paimon.predicate.TopN;
2526
import org.apache.paimon.schema.SchemaManager;
2627
import org.apache.paimon.schema.TableSchema;
2728
import org.apache.paimon.table.BucketMode;
2829
import org.apache.paimon.table.source.snapshot.SnapshotReader;
2930
import org.apache.paimon.table.source.snapshot.StartingScanner;
3031
import org.apache.paimon.table.source.snapshot.StartingScanner.ScannedResult;
32+
import org.apache.paimon.types.DataType;
3133

3234
import java.util.ArrayList;
3335
import java.util.List;
36+
import java.util.Optional;
37+
38+
import static org.apache.paimon.table.source.PushDownUtils.minmaxAvailable;
3439

3540
/** {@link TableScan} implementation for batch planning. */
3641
public class DataTableBatchScan extends AbstractDataTableScan {
@@ -93,10 +98,15 @@ public TableScan.Plan plan() {
9398

9499
if (hasNext) {
95100
hasNext = false;
96-
StartingScanner.Result result = startingScanner.scan(snapshotReader);
97-
result = applyPushDownLimit(result);
98-
result = applyPushDownTopN(result);
99-
return DataFilePlan.fromResult(result);
101+
Optional<StartingScanner.Result> pushed = applyPushDownLimit();
102+
if (pushed.isPresent()) {
103+
return DataFilePlan.fromResult(pushed.get());
104+
}
105+
pushed = applyPushDownTopN();
106+
if (pushed.isPresent()) {
107+
return DataFilePlan.fromResult(pushed.get());
108+
}
109+
return DataFilePlan.fromResult(startingScanner.scan(snapshotReader));
100110
} else {
101111
throw new EndOfScanException();
102112
}
@@ -110,51 +120,77 @@ public List<PartitionEntry> listPartitionEntries() {
110120
return startingScanner.scanPartitions(snapshotReader);
111121
}
112122

113-
private StartingScanner.Result applyPushDownLimit(StartingScanner.Result result) {
114-
if (pushDownLimit != null && result instanceof ScannedResult) {
115-
long scannedRowCount = 0;
116-
SnapshotReader.Plan plan = ((ScannedResult) result).plan();
117-
List<DataSplit> splits = plan.dataSplits();
118-
if (splits.isEmpty()) {
119-
return result;
120-
}
123+
private Optional<StartingScanner.Result> applyPushDownLimit() {
124+
if (pushDownLimit == null) {
125+
return Optional.empty();
126+
}
127+
128+
StartingScanner.Result result = startingScanner.scan(snapshotReader);
129+
if (!(result instanceof ScannedResult)) {
130+
return Optional.of(result);
131+
}
132+
133+
long scannedRowCount = 0;
134+
SnapshotReader.Plan plan = ((ScannedResult) result).plan();
135+
List<DataSplit> splits = plan.dataSplits();
136+
if (splits.isEmpty()) {
137+
return Optional.of(result);
138+
}
121139

122-
List<Split> limitedSplits = new ArrayList<>();
123-
for (DataSplit dataSplit : splits) {
124-
if (dataSplit.rawConvertible()) {
125-
long partialMergedRowCount = dataSplit.partialMergedRowCount();
126-
limitedSplits.add(dataSplit);
127-
scannedRowCount += partialMergedRowCount;
128-
if (scannedRowCount >= pushDownLimit) {
129-
SnapshotReader.Plan newPlan =
130-
new PlanImpl(plan.watermark(), plan.snapshotId(), limitedSplits);
131-
return new ScannedResult(newPlan);
132-
}
140+
List<Split> limitedSplits = new ArrayList<>();
141+
for (DataSplit dataSplit : splits) {
142+
if (dataSplit.rawConvertible()) {
143+
long partialMergedRowCount = dataSplit.partialMergedRowCount();
144+
limitedSplits.add(dataSplit);
145+
scannedRowCount += partialMergedRowCount;
146+
if (scannedRowCount >= pushDownLimit) {
147+
SnapshotReader.Plan newPlan =
148+
new PlanImpl(plan.watermark(), plan.snapshotId(), limitedSplits);
149+
return Optional.of(new ScannedResult(newPlan));
133150
}
134151
}
135152
}
136-
return result;
153+
return Optional.of(result);
137154
}
138155

139-
private StartingScanner.Result applyPushDownTopN(StartingScanner.Result result) {
156+
private Optional<StartingScanner.Result> applyPushDownTopN() {
140157
if (topN == null
141158
|| pushDownLimit != null
142-
|| !(result instanceof ScannedResult)
143159
|| !schema.primaryKeys().isEmpty()
144160
|| options().deletionVectorsEnabled()) {
145-
return result;
161+
return Optional.empty();
162+
}
163+
164+
List<SortValue> orders = topN.orders();
165+
if (orders.size() != 1) {
166+
return Optional.empty();
167+
}
168+
169+
if (topN.limit() > 100) {
170+
return Optional.empty();
171+
}
172+
173+
SortValue order = orders.get(0);
174+
DataType type = order.field().type();
175+
if (!minmaxAvailable(type)) {
176+
return Optional.empty();
177+
}
178+
179+
StartingScanner.Result result = startingScanner.scan(snapshotReader.keepStats());
180+
if (!(result instanceof ScannedResult)) {
181+
return Optional.of(result);
146182
}
147183

148184
SnapshotReader.Plan plan = ((ScannedResult) result).plan();
149185
List<DataSplit> splits = plan.dataSplits();
150186
if (splits.isEmpty()) {
151-
return result;
187+
return Optional.of(result);
152188
}
153189

154190
TopNDataSplitEvaluator evaluator = new TopNDataSplitEvaluator(schema, schemaManager);
155-
List<Split> topNSplits = new ArrayList<>(evaluator.evaluate(topN, splits));
191+
List<Split> topNSplits = new ArrayList<>(evaluator.evaluate(order, topN.limit(), splits));
156192
SnapshotReader.Plan newPlan = new PlanImpl(plan.watermark(), plan.snapshotId(), topNSplits);
157-
return new ScannedResult(newPlan);
193+
return Optional.of(new ScannedResult(newPlan));
158194
}
159195

160196
@Override

paimon-core/src/main/java/org/apache/paimon/stats/StatsUtils.java renamed to paimon-core/src/main/java/org/apache/paimon/table/source/PushDownUtils.java

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
* limitations under the License.
1717
*/
1818

19-
package org.apache.paimon.stats;
19+
package org.apache.paimon.table.source;
2020

21+
import org.apache.paimon.io.DataFileMeta;
2122
import org.apache.paimon.types.BigIntType;
2223
import org.apache.paimon.types.BooleanType;
2324
import org.apache.paimon.types.DataType;
@@ -28,8 +29,13 @@
2829
import org.apache.paimon.types.SmallIntType;
2930
import org.apache.paimon.types.TinyIntType;
3031

31-
/** Utils for Stats. */
32-
public class StatsUtils {
32+
import java.util.HashSet;
33+
import java.util.Set;
34+
35+
import static org.apache.paimon.utils.ListUtils.isNullOrEmpty;
36+
37+
/** Utils for pushing downs. */
38+
public class PushDownUtils {
3339

3440
public static boolean minmaxAvailable(DataType type) {
3541
// not push down complex type
@@ -48,4 +54,23 @@ public static boolean minmaxAvailable(DataType type) {
4854
|| type instanceof DoubleType
4955
|| type instanceof DateType;
5056
}
57+
58+
public static boolean minmaxAvailable(DataSplit split, Set<String> columns) {
59+
if (isNullOrEmpty(columns)) {
60+
return false;
61+
}
62+
63+
if (!split.rawConvertible()) {
64+
return false;
65+
}
66+
67+
return split.dataFiles().stream()
68+
.map(DataFileMeta::valueStatsCols)
69+
.allMatch(
70+
valueStatsCols ->
71+
// It means there are all column statistics when valueStatsCols ==
72+
// null
73+
valueStatsCols == null
74+
|| new HashSet<>(valueStatsCols).containsAll(columns));
75+
}
5176
}

0 commit comments

Comments (0)