-
Notifications
You must be signed in to change notification settings - Fork 1.5k
GH-3142: Add strict validation for unsigned types in ExampleParquetWriter #3271
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,8 +30,10 @@ | |
import java.util.ArrayDeque; | ||
import java.util.Arrays; | ||
import java.util.Deque; | ||
import java.util.Optional; | ||
import org.apache.parquet.io.api.Binary; | ||
import org.apache.parquet.io.api.RecordConsumer; | ||
import org.apache.parquet.schema.LogicalTypeAnnotation; | ||
import org.apache.parquet.schema.MessageType; | ||
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; | ||
import org.apache.parquet.schema.Type; | ||
|
@@ -46,7 +48,11 @@ | |
public class ValidatingRecordConsumer extends RecordConsumer { | ||
private static final Logger LOG = LoggerFactory.getLogger(ValidatingRecordConsumer.class); | ||
|
||
private static final int UINT_8_MAX_VALUE = 255; | ||
private static final int UINT_16_MAX_VALUE = 65535; | ||
|
||
private final RecordConsumer delegate; | ||
private final boolean strictUnsignedIntegerValidation; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't it be validated at all times? This is already the ValidatingRecordConsumer so it might need to validate everything. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately there are several test breaking and existing code built with the assumption of no validation. Since we put the changes directly within There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you help investigate why existing test break? Are all of them failed because of unsigned integer overflow? If yes, they are actually bugs and should be fixed in this PR so we don't have to add a separate flag for unsigned integer. |
||
|
||
private Deque<Type> types = new ArrayDeque<>(); | ||
private Deque<Integer> fields = new ArrayDeque<>(); | ||
|
@@ -58,7 +64,18 @@ public class ValidatingRecordConsumer extends RecordConsumer { | |
* @param schema the schema to validate against | ||
*/ | ||
public ValidatingRecordConsumer(RecordConsumer delegate, MessageType schema) { | ||
this(delegate, schema, false); | ||
} | ||
|
||
/** | ||
* @param delegate the consumer to pass down the event to | ||
* @param schema the schema to validate against | ||
* @param strictUnsignedIntegerValidation whether to enable strict unsigned integer validation | ||
*/ | ||
public ValidatingRecordConsumer( | ||
RecordConsumer delegate, MessageType schema, boolean strictUnsignedIntegerValidation) { | ||
this.delegate = delegate; | ||
this.strictUnsignedIntegerValidation = strictUnsignedIntegerValidation; | ||
this.types.push(schema); | ||
} | ||
|
||
|
@@ -202,6 +219,9 @@ private void validate(PrimitiveTypeName... ptypes) { | |
@Override | ||
public void addInteger(int value) { | ||
validate(INT32); | ||
if (strictUnsignedIntegerValidation) { | ||
validateUnsignedInteger(value); | ||
} | ||
delegate.addInteger(value); | ||
} | ||
|
||
|
@@ -211,6 +231,9 @@ public void addInteger(int value) { | |
@Override | ||
public void addLong(long value) { | ||
validate(INT64); | ||
if (strictUnsignedIntegerValidation) { | ||
validateUnsignedLong(value); | ||
} | ||
delegate.addLong(value); | ||
} | ||
|
||
|
@@ -249,4 +272,66 @@ public void addDouble(double value) { | |
validate(DOUBLE); | ||
delegate.addDouble(value); | ||
} | ||
|
||
private void validateUnsignedInteger(int value) { | ||
Type currentType = types.peek().asGroupType().getType(fields.peek()); | ||
if (currentType != null && currentType.isPrimitive()) { | ||
LogicalTypeAnnotation logicalType = currentType.asPrimitiveType().getLogicalTypeAnnotation(); | ||
if (logicalType != null) { | ||
logicalType.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<Void>() { | ||
@Override | ||
public Optional<Void> visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intType) { | ||
if (!intType.isSigned()) { | ||
switch (intType.getBitWidth()) { | ||
case 8: | ||
if (value < 0 || value > UINT_8_MAX_VALUE) { | ||
throw new InvalidRecordException("Value " + value | ||
+ " is out of range for UINT_8 (0-" + UINT_8_MAX_VALUE + ") in field " | ||
+ currentType.getName()); | ||
} | ||
break; | ||
case 16: | ||
if (value < 0 || value > UINT_16_MAX_VALUE) { | ||
throw new InvalidRecordException("Value " + value | ||
+ " is out of range for UINT_16 (0-" + UINT_16_MAX_VALUE + ") in field " | ||
+ currentType.getName()); | ||
} | ||
break; | ||
case 32: | ||
case 64: | ||
if (value < 0) { | ||
throw new InvalidRecordException("Negative value " + value | ||
+ " is not allowed for unsigned integer type " | ||
+ currentType.getName()); | ||
} | ||
break; | ||
} | ||
} | ||
return Optional.empty(); | ||
} | ||
}); | ||
} | ||
} | ||
} | ||
|
||
private void validateUnsignedLong(long value) { | ||
Type currentType = types.peek().asGroupType().getType(fields.peek()); | ||
if (currentType != null && currentType.isPrimitive()) { | ||
LogicalTypeAnnotation logicalType = currentType.asPrimitiveType().getLogicalTypeAnnotation(); | ||
if (logicalType != null) { | ||
logicalType.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<Void>() { | ||
@Override | ||
public Optional<Void> visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intType) { | ||
if (!intType.isSigned()) { | ||
if (value < 0) { | ||
throw new InvalidRecordException("Negative value " + value | ||
+ " is not allowed for unsigned integer type " + currentType.getName()); | ||
} | ||
} | ||
return Optional.empty(); | ||
} | ||
}); | ||
} | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why only 8 and 16?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
UINT_8, UINT_16, and UINT_32 annotate an int32 value, UINT_64 annotatyes a i64 value. For UINT_8, and UINT16 valid values are in the range [0, UINT_8_MAX_VALUE] and [0, UINT_16_MAX_VALUE] respectively. For UINT_32 and UINT_64, all non negative values are valid.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes that was the reason thanks!