Skip to content

Commit 9536666

Browse files
kevinwilfongmeta-codesync[bot]
authored andcommitted
fix: Check for corrupted repeat/define lengths in Parquet headers (facebookincubator#16511)
Summary: Pull Request resolved: facebookincubator#16511 We currently do not validate the correctness of the repeat/define lengths we read in the Parquet header, this can lead us to access memory outside the data buffer. Add checks to validate we do not go off the end of the pageData_ buffer. Reviewed By: Yuhta Differential Revision: D94270200 fbshipit-source-id: 1e78b2c09748fddc52bc01cbda58c2c6cb6a0f1e
1 parent f447471 commit 9536666

File tree

3 files changed

+354
-3
lines changed

3 files changed

+354
-3
lines changed

velox/dwio/parquet/reader/PageReader.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,18 +232,42 @@ void PageReader::prepareDataPageV1(const PageHeader& pageHeader, int64_t row) {
232232
pageHeader.compressed_page_size,
233233
pageHeader.uncompressed_page_size);
234234
auto pageEnd = pageData_ + pageHeader.uncompressed_page_size;
235+
auto remainingBytes = pageHeader.uncompressed_page_size;
235236
if (maxRepeat_ > 0) {
237+
VELOX_CHECK_GE(
238+
remainingBytes,
239+
sizeof(int32_t),
240+
"Insufficient bytes for repetition level length (corrupt data page?)");
236241
uint32_t repeatLength = readField<int32_t>(pageData_);
242+
remainingBytes -= sizeof(int32_t);
243+
VELOX_CHECK_LE(
244+
repeatLength,
245+
remainingBytes,
246+
"Repetition level length {} exceeds remaining page size {} (corrupt data page?)",
247+
repeatLength,
248+
remainingBytes);
237249
repeatDecoder_ = std::make_unique<RleDecoder>(
238250
reinterpret_cast<const uint8_t*>(pageData_),
239251
repeatLength,
240252
::arrow::bit_util::NumRequiredBits(maxRepeat_));
241253

242254
pageData_ += repeatLength;
255+
remainingBytes -= repeatLength;
243256
}
244257

245258
if (maxDefine_ > 0) {
259+
VELOX_CHECK_GE(
260+
remainingBytes,
261+
sizeof(uint32_t),
262+
"Insufficient bytes for definition level length (corrupt data page?)");
246263
auto defineLength = readField<uint32_t>(pageData_);
264+
remainingBytes -= sizeof(uint32_t);
265+
VELOX_CHECK_LE(
266+
defineLength,
267+
remainingBytes,
268+
"Definition level length {} exceeds remaining page size {} (corrupt data page?)",
269+
defineLength,
270+
remainingBytes);
247271
if (maxDefine_ == 1) {
248272
defineDecoder_ = std::make_unique<RleBpDecoder>(
249273
pageData_,
@@ -288,6 +312,13 @@ void PageReader::prepareDataPageV2(const PageHeader& pageHeader, int64_t row) {
288312
pageHeader.data_page_header_v2.repetition_levels_byte_length;
289313

290314
auto bytes = pageHeader.compressed_page_size;
315+
VELOX_CHECK_LE(
316+
static_cast<uint64_t>(repeatLength) + defineLength,
317+
bytes,
318+
"Repetition and definition level lengths ({} + {}) exceed compressed page size {} (corrupt data page?)",
319+
repeatLength,
320+
defineLength,
321+
bytes);
291322
pageData_ = readBytes(bytes, pageBuffer_);
292323

293324
if (repeatLength) {

velox/dwio/parquet/reader/PageReader.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,13 @@ class PageReader {
6565
common::CompressionKind codec,
6666
int64_t chunkSize,
6767
dwio::common::ColumnReaderStatistics& stats,
68-
const tz::TimeZone* sessionTimezone = nullptr)
68+
const tz::TimeZone* sessionTimezone = nullptr,
69+
int32_t maxRepeat = 0,
70+
int32_t maxDefine = 1)
6971
: pool_(pool),
7072
inputStream_(std::move(stream)),
71-
maxRepeat_(0),
72-
maxDefine_(1),
73+
maxRepeat_(maxRepeat),
74+
maxDefine_(maxDefine),
7375
isTopLevel_(maxRepeat_ == 0 && maxDefine_ <= 1),
7476
codec_(codec),
7577
chunkSize_(chunkSize),

0 commit comments

Comments
 (0)