duckdb
diff --git a/‎src/duckdb/extension/parquet/column_reader.cpp‎
Lines changed: 32 additions & 147 deletions b/‎src/duckdb/extension/parquet/column_reader.cpp‎
Lines changed: 32 additions & 147 deletions
diff --git a/‎src/duckdb/extension/parquet/decoder/byte_stream_split_decoder.cpp‎
Lines changed: 47 additions & 0 deletions b/‎src/duckdb/extension/parquet/decoder/byte_stream_split_decoder.cpp‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎src/duckdb/extension/parquet/decoder/delta_binary_packed_decoder.cpp‎
Lines changed: 48 additions & 0 deletions b/‎src/duckdb/extension/parquet/decoder/delta_binary_packed_decoder.cpp‎
Lines changed: 48 additions & 0 deletions
@@ -108,8 +108,8 @@ const uint8_t ParquetDecodeUtils::BITPACK_DLEN = 8;
 ColumnReader::ColumnReader(ParquetReader &reader, LogicalType type_p, const SchemaElement &schema_p, idx_t file_idx_p,
                            idx_t max_define_p, idx_t max_repeat_p)
     : schema(schema_p), file_idx(file_idx_p), max_define(max_define_p), max_repeat(max_repeat_p), reader(reader),
-      type(std::move(type_p)), page_rows_available(0), dictionary_selection_vector(STANDARD_VECTOR_SIZE),
-      dictionary_size(0) {
+      type(std::move(type_p)), page_rows_available(0), dictionary_decoder(*this), delta_binary_packed_decoder(*this),
+      rle_decoder(*this), delta_byte_array_decoder(*this), byte_stream_split_decoder(*this) {
 
 	// dummies for Skip()
 	dummy_define.resize(reader.allocator, STANDARD_VECTOR_SIZE);
@@ -202,20 +202,7 @@ void ColumnReader::Plain(shared_ptr<ByteBuffer> plain_data, uint8_t *defines, id
 	throw NotImplementedException("Plain");
 }
 
-void ColumnReader::PrepareDeltaLengthByteArray(ResizeableBuffer &buffer) {
-	throw std::runtime_error("DELTA_LENGTH_BYTE_ARRAY encoding is only supported for text or binary data");
-}
-
-void ColumnReader::PrepareDeltaByteArray(ResizeableBuffer &buffer) {
-	throw std::runtime_error("DELTA_BYTE_ARRAY encoding is only supported for text or binary data");
-}
-
-void ColumnReader::DeltaByteArray(uint8_t *defines, idx_t num_values, // NOLINT
-                                  parquet_filter_t &filter, idx_t result_offset, Vector &result) {
-	throw NotImplementedException("DeltaByteArray");
-}
-
-void ColumnReader::PlainReference(shared_ptr<ByteBuffer>, Vector &result) { // NOLINT
+void ColumnReader::PlainReference(shared_ptr<ResizeableBuffer> &, Vector &result) { // NOLINT
 }
 
 void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns, TProtocol &protocol_p) {
@@ -239,9 +226,8 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
 }
 
 void ColumnReader::PrepareRead(parquet_filter_t &filter) {
-	dict_decoder.reset();
+	encoding = ColumnEncoding::INVALID;
 	defined_decoder.reset();
-	bss_decoder.reset();
 	block.reset();
 	PageHeader page_hdr;
 	reader.Read(page_hdr, *protocol);
@@ -261,22 +247,11 @@ void ColumnReader::PrepareRead(parquet_filter_t &filter) {
 		break;
 	case PageType::DICTIONARY_PAGE: {
 		PreparePage(page_hdr);
-		if (page_hdr.dictionary_page_header.num_values < 0) {
+		auto dictionary_size = page_hdr.dictionary_page_header.num_values;
+		if (dictionary_size < 0) {
 			throw std::runtime_error("Invalid dictionary page header (num_values < 0)");
 		}
-		auto old_dict_size = dictionary_size;
-		// we use the first value in the dictionary to keep a NULL
-		dictionary_size = page_hdr.dictionary_page_header.num_values;
-		if (!dictionary) {
-			dictionary = make_uniq<Vector>(type, dictionary_size + 1);
-		} else if (dictionary_size > old_dict_size) {
-			dictionary->Resize(old_dict_size, dictionary_size + 1);
-		}
-		dictionary_id = reader.file_name + "_" + schema.name + "_" + std::to_string(chunk_read_offset);
-		// we use the first entry as a NULL, dictionary vectors don't have a separate validity mask
-		FlatVector::Validity(*dictionary).SetInvalid(0);
-		PlainReference(block, *dictionary);
-		Plain(block, nullptr, dictionary_size, nullptr, 1, *dictionary);
+		dictionary_decoder.InitializeDictionary(dictionary_size);
 		break;
 	}
 	default:
@@ -459,75 +434,45 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
 	switch (page_encoding) {
 	case Encoding::RLE_DICTIONARY:
 	case Encoding::PLAIN_DICTIONARY: {
-		// where is it otherwise??
-		auto dict_width = block->read<uint8_t>();
-		// TODO somehow dict_width can be 0 ?
-		dict_decoder = make_uniq<RleBpDecoder>(block->ptr, block->len, dict_width);
-		block->inc(block->len);
+		encoding = ColumnEncoding::DICTIONARY;
+		dictionary_decoder.InitializePage();
 		break;
 	}
 	case Encoding::RLE: {
-		if (type.id() != LogicalTypeId::BOOLEAN) {
-			throw std::runtime_error("RLE encoding is only supported for boolean data");
-		}
-		block->inc(sizeof(uint32_t));
-		rle_decoder = make_uniq<RleBpDecoder>(block->ptr, block->len, 1);
+		encoding = ColumnEncoding::RLE;
+		rle_decoder.InitializePage();
 		break;
 	}
 	case Encoding::DELTA_BINARY_PACKED: {
-		dbp_decoder = make_uniq<DbpDecoder>(block->ptr, block->len);
-		block->inc(block->len);
+		encoding = ColumnEncoding::DELTA_BINARY_PACKED;
+		delta_binary_packed_decoder.InitializePage();
 		break;
 	}
 	case Encoding::DELTA_LENGTH_BYTE_ARRAY: {
-		PrepareDeltaLengthByteArray(*block);
+		encoding = ColumnEncoding::DELTA_LENGTH_BYTE_ARRAY;
+		delta_byte_array_decoder.InitializeDeltaLengthByteArray();
 		break;
 	}
 	case Encoding::DELTA_BYTE_ARRAY: {
-		PrepareDeltaByteArray(*block);
+		encoding = ColumnEncoding::DELTA_BYTE_ARRAY;
+		delta_byte_array_decoder.InitializeDeltaByteArray();
 		break;
 	}
 	case Encoding::BYTE_STREAM_SPLIT: {
-		// Subtract 1 from length as the block is allocated with 1 extra byte,
-		// but the byte stream split encoder needs to know the correct data size.
-		bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
-		block->inc(block->len);
+		encoding = ColumnEncoding::BYTE_STREAM_SPLIT;
+		byte_stream_split_decoder.InitializePage();
 		break;
 	}
 	case Encoding::PLAIN:
 		// nothing to do here, will be read directly below
+		encoding = ColumnEncoding::PLAIN;
 		break;
 
 	default:
 		throw std::runtime_error("Unsupported page encoding");
 	}
 }
 
-void ColumnReader::ConvertDictToSelVec(uint32_t *offsets, uint8_t *defines, parquet_filter_t &filter, idx_t read_now,
-                                       idx_t result_offset) {
-	D_ASSERT(read_now <= STANDARD_VECTOR_SIZE);
-	idx_t offset_idx = 0;
-	for (idx_t row_idx = 0; row_idx < read_now; row_idx++) {
-		if (HasDefines() && defines[row_idx + result_offset] != max_define) {
-			dictionary_selection_vector.set_index(row_idx, 0); // dictionary entry 0 is NULL
-			continue;                                          // we don't have a dict entry for NULLs
-		}
-		if (filter.test(row_idx + result_offset)) {
-			auto offset = offsets[offset_idx++];
-			if (offset >= dictionary_size) {
-				throw std::runtime_error("Parquet file is likely corrupted, dictionary offset out of range");
-			}
-			dictionary_selection_vector.set_index(row_idx, offset + 1);
-		} else {
-			dictionary_selection_vector.set_index(row_idx, 0); // just set NULL if the filter excludes this row
-			offset_idx++;
-		}
-	}
-#ifdef DEBUG
-	dictionary_selection_vector.Verify(read_now, dictionary_size + 1);
-#endif
-}
-
 idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr_t define_out, data_ptr_t repeat_out,
                          Vector &result) {
 	// we need to reset the location because multiple column readers share the same protocol
@@ -563,84 +508,24 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
 			defined_decoder->GetBatch<uint8_t>(define_out + result_offset, read_now);
 		}
 
-		idx_t null_count = 0;
-
-		if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
-			// we need the null count because the dictionary offsets have no entries for nulls
-			for (idx_t i = result_offset; i < result_offset + read_now; i++) {
-				null_count += (define_out[i] != max_define);
-			}
-		}
-
 		if (result_offset != 0 && result.GetVectorType() != VectorType::FLAT_VECTOR) {
 			result.Flatten(result_offset);
 			result.Resize(result_offset, STANDARD_VECTOR_SIZE);
 		}
 
-		if (dict_decoder) {
-			if ((!dictionary || dictionary_size == 0) && null_count < read_now) {
-				throw std::runtime_error("Parquet file is likely corrupted, missing dictionary");
-			}
-			offset_buffer.resize(reader.allocator, sizeof(uint32_t) * (read_now - null_count));
-			dict_decoder->GetBatch<uint32_t>(offset_buffer.ptr, read_now - null_count);
-			ConvertDictToSelVec(reinterpret_cast<uint32_t *>(offset_buffer.ptr),
-			                    reinterpret_cast<uint8_t *>(define_out), filter, read_now, result_offset);
-			if (result_offset == 0) {
-				result.Dictionary(*dictionary, dictionary_size + 1, dictionary_selection_vector, read_now);
-				DictionaryVector::SetDictionaryId(result, dictionary_id);
-				D_ASSERT(result.GetVectorType() == VectorType::DICTIONARY_VECTOR);
-			} else {
-				D_ASSERT(result.GetVectorType() == VectorType::FLAT_VECTOR);
-				VectorOperations::Copy(*dictionary, result, dictionary_selection_vector, read_now, 0, result_offset);
-			}
-		} else if (dbp_decoder) {
-			// TODO keep this in the state
-			auto read_buf = make_shared_ptr<ResizeableBuffer>();
-
-			switch (schema.type) {
-			case duckdb_parquet::Type::INT32:
-				read_buf->resize(reader.allocator, sizeof(int32_t) * (read_now - null_count));
-				dbp_decoder->GetBatch<int32_t>(read_buf->ptr, read_now - null_count);
-
-				break;
-			case duckdb_parquet::Type::INT64:
-				read_buf->resize(reader.allocator, sizeof(int64_t) * (read_now - null_count));
-				dbp_decoder->GetBatch<int64_t>(read_buf->ptr, read_now - null_count);
-				break;
-
-			default:
-				throw std::runtime_error("DELTA_BINARY_PACKED should only be INT32 or INT64");
-			}
-			// Plain() will put NULLs in the right place
-			Plain(read_buf, define_out, read_now, &filter, result_offset, result);
-		} else if (rle_decoder) {
-			// RLE encoding for boolean
-			D_ASSERT(type.id() == LogicalTypeId::BOOLEAN);
-			auto read_buf = make_shared_ptr<ResizeableBuffer>();
-			read_buf->resize(reader.allocator, sizeof(bool) * (read_now - null_count));
-			rle_decoder->GetBatch<uint8_t>(read_buf->ptr, read_now - null_count);
-			PlainTemplated<bool, TemplatedParquetValueConversion<bool>>(read_buf, define_out, read_now, &filter,
-			                                                            result_offset, result);
-		} else if (byte_array_data) {
+		auto define_ptr = HasDefines() ? static_cast<uint8_t *>(define_out) : nullptr;
+		if (encoding == ColumnEncoding::DICTIONARY) {
+			dictionary_decoder.Read(define_ptr, read_now, result, result_offset);
+		} else if (encoding == ColumnEncoding::DELTA_BINARY_PACKED) {
+			delta_binary_packed_decoder.Read(define_ptr, read_now, result, result_offset);
+		} else if (encoding == ColumnEncoding::RLE) {
+			rle_decoder.Read(define_ptr, read_now, result, result_offset);
+		} else if (encoding == ColumnEncoding::DELTA_LENGTH_BYTE_ARRAY ||
+		           encoding == ColumnEncoding::DELTA_BYTE_ARRAY) {
 			// DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
-			DeltaByteArray(define_out, read_now, filter, result_offset, result);
-		} else if (bss_decoder) {
-			auto read_buf = make_shared_ptr<ResizeableBuffer>();
-
-			switch (schema.type) {
-			case duckdb_parquet::Type::FLOAT:
-				read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
-				bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
-				break;
-			case duckdb_parquet::Type::DOUBLE:
-				read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
-				bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
-				break;
-			default:
-				throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
-			}
-
-			Plain(read_buf, define_out, read_now, &filter, result_offset, result);
+			delta_byte_array_decoder.Read(define_ptr, read_now, result, result_offset);
+		} else if (encoding == ColumnEncoding::BYTE_STREAM_SPLIT) {
+			byte_stream_split_decoder.Read(define_ptr, read_now, result, result_offset);
 		} else {
 			PlainReference(block, result);
 			Plain(block, define_out, read_now, &filter, result_offset, result);
 
@@ -0,0 +1,47 @@
+#include "decoder/byte_stream_split_decoder.hpp"
+#include "column_reader.hpp"
+#include "parquet_reader.hpp"
+
+namespace duckdb {
+
+ByteStreamSplitDecoder::ByteStreamSplitDecoder(ColumnReader &reader) : reader(reader) {
+}
+
+void ByteStreamSplitDecoder::InitializePage() {
+	auto &block = reader.block;
+	// Subtract 1 from length as the block is allocated with 1 extra byte,
+	// but the byte stream split encoder needs to know the correct data size.
+	bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
+	block->inc(block->len);
+}
+
+void ByteStreamSplitDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
+	idx_t null_count = 0;
+
+	if (defines) {
+		// we need the null count because the dictionary offsets have no entries for nulls
+		for (idx_t i = result_offset; i < result_offset + read_count; i++) {
+			null_count += defines[i] != reader.max_define;
+		}
+	}
+	idx_t valid_count = read_count - null_count;
+
+	auto read_buf = make_shared_ptr<ResizeableBuffer>();
+	auto &allocator = reader.reader.allocator;
+	switch (reader.schema.type) {
+	case duckdb_parquet::Type::FLOAT:
+		read_buf->resize(allocator, sizeof(float) * valid_count);
+		bss_decoder->GetBatch<float>(read_buf->ptr, valid_count);
+		break;
+	case duckdb_parquet::Type::DOUBLE:
+		read_buf->resize(allocator, sizeof(double) * valid_count);
+		bss_decoder->GetBatch<double>(read_buf->ptr, valid_count);
+		break;
+	default:
+		throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
+	}
+
+	reader.Plain(read_buf, defines, read_count, nullptr, result_offset, result);
+}
+
+} // namespace duckdb
@@ -0,0 +1,48 @@
+#include "decoder/delta_binary_packed_decoder.hpp"
+#include "column_reader.hpp"
+#include "parquet_reader.hpp"
+
+namespace duckdb {
+
+DeltaBinaryPackedDecoder::DeltaBinaryPackedDecoder(ColumnReader &reader) : reader(reader) {
+}
+
+void DeltaBinaryPackedDecoder::InitializePage() {
+	auto &block = reader.block;
+	dbp_decoder = make_uniq<DbpDecoder>(block->ptr, block->len);
+	block->inc(block->len);
+}
+
+void DeltaBinaryPackedDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
+	// TODO keep this in the state
+	auto read_buf = make_shared_ptr<ResizeableBuffer>();
+
+	idx_t null_count = 0;
+	if (defines) {
+		// we need the null count because the dictionary offsets have no entries for nulls
+		for (idx_t i = result_offset; i < result_offset + read_count; i++) {
+			null_count += (defines[i] != reader.max_define);
+		}
+	}
+	idx_t valid_count = read_count - null_count;
+
+	auto &allocator = reader.reader.allocator;
+	switch (reader.schema.type) {
+	case duckdb_parquet::Type::INT32:
+		read_buf->resize(allocator, sizeof(int32_t) * (valid_count));
+		dbp_decoder->GetBatch<int32_t>(read_buf->ptr, valid_count);
+
+		break;
+	case duckdb_parquet::Type::INT64:
+		read_buf->resize(allocator, sizeof(int64_t) * (valid_count));
+		dbp_decoder->GetBatch<int64_t>(read_buf->ptr, valid_count);
+		break;
+
+	default:
+		throw std::runtime_error("DELTA_BINARY_PACKED should only be INT32 or INT64");
+	}
+	// Plain() will put NULLs in the right place
+	reader.Plain(read_buf, defines, read_count, nullptr, result_offset, result);
+}
+
+} // namespace duckdb