Skip to content

Commit e5ea152

Browse files
Parquet: Fix incorrect skipping of RowGroups with NaNs (apache#6517)
1 parent 6ab1986 commit e5ea152

File tree

3 files changed

+70
-93
lines changed

3 files changed

+70
-93
lines changed

data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,26 @@ public void testNoNulls() {
300300
Assert.assertTrue("Should read: struct type is not skipped", shouldRead);
301301
}
302302

303+
@Test
304+
public void testFloatWithNan() {
305+
// NaNs break Parquet's min/max stats, so we should be reading in all cases.
306+
// Only ORC should be able to distinguish using min/max when NaN is present
307+
boolean shouldRead = shouldRead(greaterThan("some_nans", 1.0));
308+
Assert.assertTrue(shouldRead);
309+
310+
shouldRead = shouldRead(greaterThanOrEqual("some_nans", 1.0));
311+
Assert.assertTrue(shouldRead);
312+
313+
shouldRead = shouldRead(lessThan("some_nans", 3.0));
314+
Assert.assertTrue(shouldRead);
315+
316+
shouldRead = shouldRead(lessThanOrEqual("some_nans", 1.0));
317+
Assert.assertTrue(shouldRead);
318+
319+
shouldRead = shouldRead(equal("some_nans", 2.0));
320+
Assert.assertTrue(shouldRead);
321+
}
322+
303323
@Test
304324
public void testIsNaN() {
305325
boolean shouldRead = shouldRead(isNaN("all_nans"));

parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java

Lines changed: 46 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -212,12 +212,12 @@ public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
212212

213213
Statistics<?> colStats = stats.get(id);
214214
if (colStats != null && !colStats.isEmpty()) {
215-
if (hasNonNullButNoMinMax(colStats, valueCount)) {
216-
return ROWS_MIGHT_MATCH;
215+
if (allNulls(colStats, valueCount)) {
216+
return ROWS_CANNOT_MATCH;
217217
}
218218

219-
if (!colStats.hasNonNullValue()) {
220-
return ROWS_CANNOT_MATCH;
219+
if (minMaxUndefined(colStats)) {
220+
return ROWS_MIGHT_MATCH;
221221
}
222222

223223
T lower = min(colStats, id);
@@ -242,12 +242,12 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
242242

243243
Statistics<?> colStats = stats.get(id);
244244
if (colStats != null && !colStats.isEmpty()) {
245-
if (hasNonNullButNoMinMax(colStats, valueCount)) {
246-
return ROWS_MIGHT_MATCH;
245+
if (allNulls(colStats, valueCount)) {
246+
return ROWS_CANNOT_MATCH;
247247
}
248248

249-
if (!colStats.hasNonNullValue()) {
250-
return ROWS_CANNOT_MATCH;
249+
if (minMaxUndefined(colStats)) {
250+
return ROWS_MIGHT_MATCH;
251251
}
252252

253253
T lower = min(colStats, id);
@@ -272,12 +272,12 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
272272

273273
Statistics<?> colStats = stats.get(id);
274274
if (colStats != null && !colStats.isEmpty()) {
275-
if (hasNonNullButNoMinMax(colStats, valueCount)) {
276-
return ROWS_MIGHT_MATCH;
275+
if (allNulls(colStats, valueCount)) {
276+
return ROWS_CANNOT_MATCH;
277277
}
278278

279-
if (!colStats.hasNonNullValue()) {
280-
return ROWS_CANNOT_MATCH;
279+
if (minMaxUndefined(colStats)) {
280+
return ROWS_MIGHT_MATCH;
281281
}
282282

283283
T upper = max(colStats, id);
@@ -302,12 +302,12 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
302302

303303
Statistics<?> colStats = stats.get(id);
304304
if (colStats != null && !colStats.isEmpty()) {
305-
if (hasNonNullButNoMinMax(colStats, valueCount)) {
306-
return ROWS_MIGHT_MATCH;
305+
if (allNulls(colStats, valueCount)) {
306+
return ROWS_CANNOT_MATCH;
307307
}
308308

309-
if (!colStats.hasNonNullValue()) {
310-
return ROWS_CANNOT_MATCH;
309+
if (minMaxUndefined(colStats)) {
310+
return ROWS_MIGHT_MATCH;
311311
}
312312

313313
T upper = max(colStats, id);
@@ -339,12 +339,12 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
339339

340340
Statistics<?> colStats = stats.get(id);
341341
if (colStats != null && !colStats.isEmpty()) {
342-
if (hasNonNullButNoMinMax(colStats, valueCount)) {
343-
return ROWS_MIGHT_MATCH;
342+
if (allNulls(colStats, valueCount)) {
343+
return ROWS_CANNOT_MATCH;
344344
}
345345

346-
if (!colStats.hasNonNullValue()) {
347-
return ROWS_CANNOT_MATCH;
346+
if (minMaxUndefined(colStats)) {
347+
return ROWS_MIGHT_MATCH;
348348
}
349349

350350
T lower = min(colStats, id);
@@ -389,12 +389,12 @@ public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
389389

390390
Statistics<?> colStats = stats.get(id);
391391
if (colStats != null && !colStats.isEmpty()) {
392-
if (hasNonNullButNoMinMax(colStats, valueCount)) {
393-
return ROWS_MIGHT_MATCH;
392+
if (allNulls(colStats, valueCount)) {
393+
return ROWS_CANNOT_MATCH;
394394
}
395395

396-
if (!colStats.hasNonNullValue()) {
397-
return ROWS_CANNOT_MATCH;
396+
if (minMaxUndefined(colStats)) {
397+
return ROWS_MIGHT_MATCH;
398398
}
399399

400400
Collection<T> literals = literalSet;
@@ -448,12 +448,12 @@ public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
448448

449449
Statistics<Binary> colStats = (Statistics<Binary>) stats.get(id);
450450
if (colStats != null && !colStats.isEmpty()) {
451-
if (hasNonNullButNoMinMax(colStats, valueCount)) {
452-
return ROWS_MIGHT_MATCH;
451+
if (allNulls(colStats, valueCount)) {
452+
return ROWS_CANNOT_MATCH;
453453
}
454454

455-
if (!colStats.hasNonNullValue()) {
456-
return ROWS_CANNOT_MATCH;
455+
if (minMaxUndefined(colStats)) {
456+
return ROWS_MIGHT_MATCH;
457457
}
458458

459459
ByteBuffer prefixAsBytes = lit.toByteBuffer();
@@ -501,7 +501,7 @@ public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) {
501501
return ROWS_MIGHT_MATCH;
502502
}
503503

504-
if (hasNonNullButNoMinMax(colStats, valueCount)) {
504+
if (minMaxUndefined(colStats)) {
505505
return ROWS_MIGHT_MATCH;
506506
}
507507

@@ -560,24 +560,30 @@ private <T> T max(Statistics<?> statistics, int id) {
560560
}
561561

562562
/**
563-
* Checks against older versions of Parquet statistics which may have a null count but undefined
564-
* min and max statistics. Returns true if nonNull values exist in the row group but no further
565-
* statistics are available.
566-
*
567-
* <p>We can't use {@code statistics.hasNonNullValue()} because it is inaccurate with older files
568-
* and will return false if min and max are not set.
563+
* Older versions of Parquet statistics which may have a null count but undefined min and max
564+
* statistics. This is similar to the current behavior when NaN values are present.
569565
*
570566
* <p>This is specifically for 1.5.0-CDH Parquet builds and later which contain the different
571567
unusual hasNonNull behavior. OSS Parquet builds are not affected because PARQUET-251 prohibits
572568
* the reading of these statistics from versions of Parquet earlier than 1.8.0.
573569
*
574570
* @param statistics Statistics to check
575-
* @param valueCount Number of values in the row group
576-
* @return true if nonNull values exist and no other stats can be used
571+
* @return true if min and max statistics are null
572+
*/
573+
static boolean nullMinMax(Statistics statistics) {
574+
return statistics.getMaxBytes() == null || statistics.getMinBytes() == null;
575+
}
576+
577+
/**
578+
* The internal logic of Parquet-MR says that if numNulls is set but hasNonNullValue is false,
579+
* then the min/max of the column are undefined.
577580
*/
578-
static boolean hasNonNullButNoMinMax(Statistics statistics, long valueCount) {
579-
return statistics.getNumNulls() < valueCount
580-
&& (statistics.getMaxBytes() == null || statistics.getMinBytes() == null);
581+
static boolean minMaxUndefined(Statistics statistics) {
582+
return (statistics.isNumNullsSet() && !statistics.hasNonNullValue()) || nullMinMax(statistics);
583+
}
584+
585+
static boolean allNulls(Statistics statistics, long valueCount) {
586+
return statistics.isNumNullsSet() && valueCount == statistics.getNumNulls();
581587
}
582588

583589
private static boolean mayContainNull(Statistics statistics) {

parquet/src/test/java/org/apache/iceberg/parquet/TestCDHParquetStatistics.java

Lines changed: 4 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -27,67 +27,18 @@
2727

2828
/**
2929
* Tests for Parquet 1.5.0-Stats which cannot be evaluated like later versions of Parquet stats.
30-
* They are intercepted by the hasNonNullButNoMinMax function which always returns ROWS_MAY_MATCH
30+
* They are intercepted by the minMaxUndefined function, which always returns ROWS_MIGHT_MATCH
3131
*/
3232
public class TestCDHParquetStatistics {
3333

3434
@Test
3535
public void testCDHParquetStatistcs() {
3636
Statistics cdhBinaryColumnStats = mock(Statistics.class);
37+
when(cdhBinaryColumnStats.isEmpty()).thenReturn(false);
38+
when(cdhBinaryColumnStats.hasNonNullValue()).thenReturn(true);
3739
when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null);
3840
when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null);
3941
when(cdhBinaryColumnStats.getNumNulls()).thenReturn(0L);
40-
Assert.assertTrue(
41-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L));
42-
}
43-
44-
@Test
45-
public void testCDHParquetStatisticsNullNotSet() {
46-
Statistics cdhBinaryColumnStats = mock(Statistics.class);
47-
when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null);
48-
when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null);
49-
when(cdhBinaryColumnStats.getNumNulls()).thenReturn(-1L);
50-
Assert.assertTrue(
51-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L));
52-
}
53-
54-
@Test
55-
public void testCDHParquetStatistcsAllNull() {
56-
Statistics cdhBinaryColumnStats = mock(Statistics.class);
57-
when(cdhBinaryColumnStats.getMaxBytes()).thenReturn(null);
58-
when(cdhBinaryColumnStats.getMinBytes()).thenReturn(null);
59-
when(cdhBinaryColumnStats.getNumNulls()).thenReturn(50L);
60-
Assert.assertFalse(
61-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(cdhBinaryColumnStats, 50L));
62-
}
63-
64-
@Test
65-
public void testNonCDHParquetStatistics() {
66-
Statistics normalBinaryColumnStats = mock(Statistics.class);
67-
when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]);
68-
when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]);
69-
when(normalBinaryColumnStats.getNumNulls()).thenReturn(0L);
70-
Assert.assertFalse(
71-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 50L));
72-
}
73-
74-
@Test
75-
public void testNonCDHParquetStatisticsNullNotSet() {
76-
Statistics normalBinaryColumnStats = mock(Statistics.class);
77-
when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]);
78-
when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]);
79-
when(normalBinaryColumnStats.getNumNulls()).thenReturn(-1L);
80-
Assert.assertFalse(
81-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 50L));
82-
}
83-
84-
@Test
85-
public void testNonCDHParquetStatisticsAllNull() {
86-
Statistics normalBinaryColumnStats = mock(Statistics.class);
87-
when(normalBinaryColumnStats.getMaxBytes()).thenReturn(new byte[2]);
88-
when(normalBinaryColumnStats.getMinBytes()).thenReturn(new byte[2]);
89-
when(normalBinaryColumnStats.getNumNulls()).thenReturn(50L);
90-
Assert.assertFalse(
91-
ParquetMetricsRowGroupFilter.hasNonNullButNoMinMax(normalBinaryColumnStats, 50L));
42+
Assert.assertTrue(ParquetMetricsRowGroupFilter.minMaxUndefined(cdhBinaryColumnStats));
9243
}
9344
}

0 commit comments

Comments
 (0)