@@ -212,12 +212,12 @@ public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
212212
213213 Statistics <?> colStats = stats .get (id );
214214 if (colStats != null && !colStats .isEmpty ()) {
215- if (hasNonNullButNoMinMax (colStats , valueCount )) {
216- return ROWS_MIGHT_MATCH ;
215+ if (allNulls (colStats , valueCount )) {
216+ return ROWS_CANNOT_MATCH ;
217217 }
218218
219- if (! colStats . hasNonNullValue ( )) {
220- return ROWS_CANNOT_MATCH ;
219+ if (minMaxUndefined ( colStats )) {
220+ return ROWS_MIGHT_MATCH ;
221221 }
222222
223223 T lower = min (colStats , id );
@@ -242,12 +242,12 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
242242
243243 Statistics <?> colStats = stats .get (id );
244244 if (colStats != null && !colStats .isEmpty ()) {
245- if (hasNonNullButNoMinMax (colStats , valueCount )) {
246- return ROWS_MIGHT_MATCH ;
245+ if (allNulls (colStats , valueCount )) {
246+ return ROWS_CANNOT_MATCH ;
247247 }
248248
249- if (! colStats . hasNonNullValue ( )) {
250- return ROWS_CANNOT_MATCH ;
249+ if (minMaxUndefined ( colStats )) {
250+ return ROWS_MIGHT_MATCH ;
251251 }
252252
253253 T lower = min (colStats , id );
@@ -272,12 +272,12 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
272272
273273 Statistics <?> colStats = stats .get (id );
274274 if (colStats != null && !colStats .isEmpty ()) {
275- if (hasNonNullButNoMinMax (colStats , valueCount )) {
276- return ROWS_MIGHT_MATCH ;
275+ if (allNulls (colStats , valueCount )) {
276+ return ROWS_CANNOT_MATCH ;
277277 }
278278
279- if (! colStats . hasNonNullValue ( )) {
280- return ROWS_CANNOT_MATCH ;
279+ if (minMaxUndefined ( colStats )) {
280+ return ROWS_MIGHT_MATCH ;
281281 }
282282
283283 T upper = max (colStats , id );
@@ -302,12 +302,12 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
302302
303303 Statistics <?> colStats = stats .get (id );
304304 if (colStats != null && !colStats .isEmpty ()) {
305- if (hasNonNullButNoMinMax (colStats , valueCount )) {
306- return ROWS_MIGHT_MATCH ;
305+ if (allNulls (colStats , valueCount )) {
306+ return ROWS_CANNOT_MATCH ;
307307 }
308308
309- if (! colStats . hasNonNullValue ( )) {
310- return ROWS_CANNOT_MATCH ;
309+ if (minMaxUndefined ( colStats )) {
310+ return ROWS_MIGHT_MATCH ;
311311 }
312312
313313 T upper = max (colStats , id );
@@ -339,12 +339,12 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
339339
340340 Statistics <?> colStats = stats .get (id );
341341 if (colStats != null && !colStats .isEmpty ()) {
342- if (hasNonNullButNoMinMax (colStats , valueCount )) {
343- return ROWS_MIGHT_MATCH ;
342+ if (allNulls (colStats , valueCount )) {
343+ return ROWS_CANNOT_MATCH ;
344344 }
345345
346- if (! colStats . hasNonNullValue ( )) {
347- return ROWS_CANNOT_MATCH ;
346+ if (minMaxUndefined ( colStats )) {
347+ return ROWS_MIGHT_MATCH ;
348348 }
349349
350350 T lower = min (colStats , id );
@@ -389,12 +389,12 @@ public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
389389
390390 Statistics <?> colStats = stats .get (id );
391391 if (colStats != null && !colStats .isEmpty ()) {
392- if (hasNonNullButNoMinMax (colStats , valueCount )) {
393- return ROWS_MIGHT_MATCH ;
392+ if (allNulls (colStats , valueCount )) {
393+ return ROWS_CANNOT_MATCH ;
394394 }
395395
396- if (! colStats . hasNonNullValue ( )) {
397- return ROWS_CANNOT_MATCH ;
396+ if (minMaxUndefined ( colStats )) {
397+ return ROWS_MIGHT_MATCH ;
398398 }
399399
400400 Collection <T > literals = literalSet ;
@@ -448,12 +448,12 @@ public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
448448
449449 Statistics <Binary > colStats = (Statistics <Binary >) stats .get (id );
450450 if (colStats != null && !colStats .isEmpty ()) {
451- if (hasNonNullButNoMinMax (colStats , valueCount )) {
452- return ROWS_MIGHT_MATCH ;
451+ if (allNulls (colStats , valueCount )) {
452+ return ROWS_CANNOT_MATCH ;
453453 }
454454
455- if (! colStats . hasNonNullValue ( )) {
456- return ROWS_CANNOT_MATCH ;
455+ if (minMaxUndefined ( colStats )) {
456+ return ROWS_MIGHT_MATCH ;
457457 }
458458
459459 ByteBuffer prefixAsBytes = lit .toByteBuffer ();
@@ -501,7 +501,7 @@ public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) {
501501 return ROWS_MIGHT_MATCH ;
502502 }
503503
504- if (hasNonNullButNoMinMax (colStats , valueCount )) {
504+ if (minMaxUndefined (colStats )) {
505505 return ROWS_MIGHT_MATCH ;
506506 }
507507
@@ -560,24 +560,30 @@ private <T> T max(Statistics<?> statistics, int id) {
560560 }
561561
562562 /**
563- * Checks against older versions of Parquet statistics which may have a null count but undefined
564- * min and max statistics. Returns true if nonNull values exist in the row group but no further
565- * statistics are available.
566- *
567- * <p>We can't use {@code statistics.hasNonNullValue()} because it is inaccurate with older files
568- * and will return false if min and max are not set.
563+ * Older versions of Parquet statistics which may have a null count but undefined min and max
564+ * statistics. This is similar to the current behavior when NaN values are present.
569565 *
570566 * <p>This is specifically for 1.5.0-CDH Parquet builds and later which contain the different
571567 * unusual hasNonNull behavior. OSS Parquet builds are not effected because PARQUET-251 prohibits
572568 * the reading of these statistics from versions of Parquet earlier than 1.8.0.
573569 *
574570 * @param statistics Statistics to check
575- * @param valueCount Number of values in the row group
576- * @return true if nonNull values exist and no other stats can be used
571+ * @return true if min and max statistics are null
572+ */
573+ static boolean nullMinMax (Statistics statistics ) {
574+ return statistics .getMaxBytes () == null || statistics .getMinBytes () == null ;
575+ }
576+
577+ /**
578+ * The internal logic of Parquet-MR says that if numNulls is set but hasNonNull value is false,
579+ * then the min/max of the column are undefined.
577580 */
578- static boolean hasNonNullButNoMinMax (Statistics statistics , long valueCount ) {
579- return statistics .getNumNulls () < valueCount
580- && (statistics .getMaxBytes () == null || statistics .getMinBytes () == null );
581+ static boolean minMaxUndefined (Statistics statistics ) {
582+ return (statistics .isNumNullsSet () && !statistics .hasNonNullValue ()) || nullMinMax (statistics );
583+ }
584+
585+ static boolean allNulls (Statistics statistics , long valueCount ) {
586+ return statistics .isNumNullsSet () && valueCount == statistics .getNumNulls ();
581587 }
582588
583589 private static boolean mayContainNull (Statistics statistics ) {
0 commit comments