Skip to content

Commit 487f148

Browse files
authored
apacheGH-49837: [C++][Parquet] Avoid unbounded temporary std::vector in DELTA_(LENGTH_)BYTE_ARRAY decoder (apache#49838)
### Rationale for this change `DeltaLengthByteArrayDecoder::DecodeArrow` and `DeltaByteArrayDecoder::DecodeArrow` both allocate a temporary `std::vector` for the entire range of decoded values. This generates out-of-memory failures in the Parquet encoding fuzzer as this unbounded allocation bypasses the custom fuzzing memory pool. This issue was found by OSS-Fuzz: https://oss-fuzz.com/testcase-detail/6634166832922624 ### What changes are included in this PR? Use the given MemoryPool to allocate a temporary buffer, so that potential memory limits are applied. ### Are these changes tested? Yes, using existing tests. ### Are there any user-facing changes? No. * GitHub Issue: apache#49837 Authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 5633a18 commit 487f148

2 files changed

Lines changed: 20 additions & 6 deletions

File tree

cpp/src/arrow/util/logging_test.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include <gtest/gtest.h>
2525

26+
#include "arrow/util/config.h"
2627
#include "arrow/util/logging_internal.h"
2728

2829
// This code is adapted from
@@ -92,6 +93,9 @@ TEST(ArrowCheck, PayloadEvaluatedOnFailure) {
9293
}
9394

9495
TEST(ArrowLog, MultiThreadedLogging) {
96+
#ifndef ARROW_ENABLE_THREADING
97+
GTEST_SKIP() << "Test requires threading support";
98+
#endif
9599
// This is mostly a visual test that logging from multiple threads produces
96100
// a clean output without interleaving messages (GH-49433).
97101
constexpr int kNumThreads = 10;

cpp/src/parquet/decoder.cc

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1716,6 +1716,7 @@ class DeltaLengthByteArrayDecoder : public TypedDecoderImpl<ByteArrayType> {
17161716
explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr,
17171717
MemoryPool* pool = ::arrow::default_memory_pool())
17181718
: Base(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
1719+
pool_(pool),
17191720
len_decoder_(nullptr, pool),
17201721
buffered_length_(AllocateBuffer(pool, 0)) {}
17211722

@@ -1804,15 +1805,19 @@ class DeltaLengthByteArrayDecoder : public TypedDecoderImpl<ByteArrayType> {
18041805
int64_t valid_bits_offset,
18051806
typename EncodingTraits<ByteArrayType>::Accumulator* out,
18061807
int* out_num_values) {
1807-
std::vector<ByteArray> values(num_values - null_count);
1808-
const int num_valid_values = Decode(values.data(), num_values - null_count);
1808+
ARROW_ASSIGN_OR_RAISE(
1809+
auto values_buffer,
1810+
::arrow::AllocateBuffer(sizeof(ByteArray) * (num_values - null_count), pool_));
1811+
auto values_data = values_buffer->template mutable_data_as<ByteArray>();
1812+
1813+
const int num_valid_values = Decode(values_data, num_values - null_count);
18091814
if (ARROW_PREDICT_FALSE(num_values - null_count != num_valid_values)) {
18101815
throw ParquetException("Expected to decode ", num_values - null_count,
18111816
" values, but decoded ", num_valid_values, " values.");
18121817
}
18131818

18141819
auto visit_binary_helper = [&](auto* helper) {
1815-
auto values_ptr = values.data();
1820+
auto values_ptr = reinterpret_cast<const ByteArray*>(values_data);
18161821
int value_idx = 0;
18171822

18181823
RETURN_NOT_OK(
@@ -1837,6 +1842,7 @@ class DeltaLengthByteArrayDecoder : public TypedDecoderImpl<ByteArrayType> {
18371842
out, num_values, /*estimated_data_length=*/{}, visit_binary_helper);
18381843
}
18391844

1845+
MemoryPool* pool_;
18401846
std::shared_ptr<::arrow::bit_util::BitReader> decoder_;
18411847
DeltaBitPackDecoder<Int32Type> len_decoder_;
18421848
int num_valid_values_{0};
@@ -2140,15 +2146,19 @@ class DeltaByteArrayDecoderImpl : public TypedDecoderImpl<DType> {
21402146
int64_t valid_bits_offset,
21412147
typename EncodingTraits<DType>::Accumulator* out,
21422148
int* out_num_values) {
2143-
std::vector<ByteArray> values(num_values - null_count);
2144-
const int num_valid_values = GetInternal(values.data(), num_values - null_count);
2149+
ARROW_ASSIGN_OR_RAISE(
2150+
auto values_buffer,
2151+
::arrow::AllocateBuffer(sizeof(ByteArray) * (num_values - null_count), pool_));
2152+
auto values_data = values_buffer->template mutable_data_as<ByteArray>();
2153+
2154+
const int num_valid_values = GetInternal(values_data, num_values - null_count);
21452155
if (ARROW_PREDICT_FALSE(num_values - null_count != num_valid_values)) {
21462156
throw ParquetException("Expected to decode ", num_values - null_count,
21472157
" values, but decoded ", num_valid_values, " values.");
21482158
}
21492159

21502160
auto visit_binary_helper = [&](auto* helper) {
2151-
auto values_ptr = reinterpret_cast<const ByteArray*>(values.data());
2161+
auto values_ptr = reinterpret_cast<const ByteArray*>(values_data);
21522162
int value_idx = 0;
21532163

21542164
PARQUET_THROW_NOT_OK(

0 commit comments

Comments
 (0)