
Commit 5794875 (parent 543796c)

vendor: Update vendored sources to duckdb/duckdb@2d530ad

* Use Adaptive Filters in the Parquet reader (duckdb/duckdb#16133)
* [Julia] Add support for named params in prepared statements (duckdb/duckdb#15621)
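Context for the two vendored changes: DuckDB's AdaptiveFilter measures how long filtering takes at runtime and permutes the order in which table filters are evaluated, so that cheap, selective predicates tend to run first; duckdb/duckdb#16133 wires this into the Parquet reader's scan loop, as the parquet_reader.cpp diff below shows. The [Julia] named-parameter change (duckdb/duckdb#15621) does not appear in the vendored files; this drop only carries the Parquet changes and the version-stamp update.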

File tree: 5 files changed (+40 / -23 lines)

src/duckdb/extension/parquet/include/parquet_reader.hpp

14 additions & 6 deletions

@@ -9,24 +9,19 @@
 #pragma once
 
 #include "duckdb.hpp"
-#ifndef DUCKDB_AMALGAMATION
 #include "duckdb/common/common.hpp"
 #include "duckdb/common/encryption_state.hpp"
 #include "duckdb/common/exception.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
 #include "duckdb/common/multi_file_reader_options.hpp"
 #include "duckdb/common/string_util.hpp"
 #include "duckdb/common/types/data_chunk.hpp"
-#include "duckdb/planner/filter/conjunction_filter.hpp"
-#include "duckdb/planner/filter/constant_filter.hpp"
-#include "duckdb/planner/filter/null_filter.hpp"
-#include "duckdb/planner/table_filter.hpp"
-#endif
 #include "column_reader.hpp"
 #include "parquet_file_metadata_cache.hpp"
 #include "parquet_rle_bp_decoder.hpp"
 #include "parquet_types.h"
 #include "resizable_buffer.hpp"
+#include "duckdb/execution/adaptive_filter.hpp"
 
 #include <exception>
 

@@ -48,6 +43,14 @@ struct ParquetReaderPrefetchConfig {
 	static constexpr double WHOLE_GROUP_PREFETCH_MINIMUM_SCAN = 0.95;
 };
 
+struct ParquetScanFilter {
+	ParquetScanFilter(idx_t filter_idx, TableFilter &filter) : filter_idx(filter_idx), filter(filter) {
+	}
+
+	idx_t filter_idx;
+	TableFilter &filter;
+};
+
 struct ParquetReaderScanState {
 	vector<idx_t> group_idx_list;
 	int64_t current_group;

@@ -64,6 +67,11 @@ struct ParquetReaderScanState {
 
 	bool prefetch_mode = false;
 	bool current_group_prefetched = false;
+
+	//! Adaptive filter
+	unique_ptr<AdaptiveFilter> adaptive_filter;
+	//! Table filter list
+	vector<ParquetScanFilter> scan_filters;
 };
 
 struct ParquetColumnDefinition {
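Note that the new adaptive_filter and scan_filters members are part of ParquetReaderScanState, so they live for the duration of a scan rather than a single chunk: as the parquet_reader.cpp diff below shows, they are built once (while scan_filters is still empty) and the learned filter ordering is then reused across ScanInternal calls.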

src/duckdb/extension/parquet/include/reader/string_column_reader.hpp

2 additions & 2 deletions

@@ -31,8 +31,8 @@ class StringColumnReader : public ColumnReader {
 	idx_t fixed_width_string_length;
 
 public:
-	static uint32_t VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
-	uint32_t VerifyString(const char *str_data, uint32_t str_len);
+	static void VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
+	void VerifyString(const char *str_data, uint32_t str_len);
 
 protected:
 	void Plain(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values, idx_t result_offset,

src/duckdb/extension/parquet/parquet_reader.cpp

15 additions & 5 deletions

@@ -854,7 +854,6 @@ void ParquetReader::InitializeScan(ClientContext &context, ParquetReaderScanStat
 
 		state.file_handle = fs.OpenFile(file_handle->path, flags);
 	}
-
 	state.thrift_file_proto = CreateThriftFileProtocol(allocator, *state.file_handle, state.prefetch_mode);
 	state.root_reader = CreateReader(context);
 	state.define_buf.resize(allocator, STANDARD_VECTOR_SIZE);

@@ -979,18 +978,28 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
 	vector<bool> need_to_read(reader_data.column_ids.size(), true);
 
 	state.sel.Initialize(nullptr);
+	if (state.scan_filters.empty()) {
+		state.adaptive_filter = make_uniq<AdaptiveFilter>(*reader_data.filters);
+		for (auto &entry : reader_data.filters->filters) {
+			state.scan_filters.emplace_back(entry.first, *entry.second);
+		}
+	}
+	D_ASSERT(state.scan_filters.size() == reader_data.filters->filters.size());
+
 	// first load the columns that are used in filters
-	for (auto &filter_col : reader_data.filters->filters) {
+	auto filter_state = state.adaptive_filter->BeginFilter();
+	for (idx_t i = 0; i < state.scan_filters.size(); i++) {
 		if (filter_count == 0) {
 			// if no rows are left we can stop checking filters
 			break;
 		}
-		auto filter_entry = reader_data.filter_map[filter_col.first];
+		auto &scan_filter = state.scan_filters[state.adaptive_filter->permutation[i]];
+		auto filter_entry = reader_data.filter_map[scan_filter.filter_idx];
 		if (filter_entry.is_constant) {
 			// this is a constant vector, look for the constant
 			auto &constant = reader_data.constant_map[filter_entry.index].value;
 			Vector constant_vector(constant);
-			ApplyFilter(constant_vector, *filter_col.second, scan_count, state.sel, filter_count);
+			ApplyFilter(constant_vector, scan_filter.filter, scan_count, state.sel, filter_count);
 		} else {
 			auto id = filter_entry.index;
 			auto file_col_idx = reader_data.column_ids[id];

@@ -999,10 +1008,11 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
 			auto &result_vector = result.data[result_idx];
 			auto &child_reader = root_reader.GetChildReader(file_col_idx);
 			child_reader.Read(scan_count, define_ptr, repeat_ptr, result_vector);
-			ApplyFilter(result_vector, *filter_col.second, scan_count, state.sel, filter_count);
+			ApplyFilter(result_vector, scan_filter.filter, scan_count, state.sel, filter_count);
 			need_to_read[id] = false;
 		}
 	}
+	state.adaptive_filter->EndFilter(filter_state);
 
 	// we still may have to read some cols
 	for (idx_t col_idx = 0; col_idx < reader_data.column_ids.size(); col_idx++) {
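The BeginFilter/EndFilter pair brackets the filter loop so the adaptive filter can time each pass and adjust permutation, the order in which the scan filters are applied. Below is a minimal, self-contained C++ sketch of this begin/end timing pattern; SimpleAdaptiveFilter and its swap heuristic are hypothetical stand-ins for illustration, not DuckDB's actual AdaptiveFilter (which lives in src/execution/adaptive_filter.cpp and differs in detail).

// SimpleAdaptiveFilter: hypothetical illustration of runtime filter reordering.
#include <chrono>
#include <functional>
#include <iostream>
#include <numeric>
#include <random>
#include <utility>
#include <vector>

using Clock = std::chrono::steady_clock;

struct SimpleAdaptiveFilter {
	explicit SimpleAdaptiveFilter(size_t filter_count) : permutation(filter_count) {
		std::iota(permutation.begin(), permutation.end(), 0); // start in original order
	}

	Clock::time_point BeginFilter() const {
		return Clock::now();
	}

	void EndFilter(Clock::time_point start) {
		double elapsed = std::chrono::duration<double>(Clock::now() - start).count();
		if (trying_swap) {
			if (elapsed > best_time) {
				// the experimental order was slower: swap back
				std::swap(permutation[swap_idx], permutation[swap_idx + 1]);
			} else {
				best_time = elapsed; // keep the improvement
			}
			trying_swap = false;
		} else {
			best_time = elapsed;
			if (permutation.size() > 1 && rng() % 4 == 0) {
				// occasionally experiment: swap two adjacent filters for the next pass
				swap_idx = rng() % (permutation.size() - 1);
				std::swap(permutation[swap_idx], permutation[swap_idx + 1]);
				trying_swap = true;
			}
		}
	}

	std::vector<size_t> permutation;

private:
	std::mt19937 rng{42};
	bool trying_swap = false;
	size_t swap_idx = 0;
	double best_time = 0.0;
};

int main() {
	// filter 0 is expensive, filter 1 is cheap and selective; adaptation
	// should tend to move the cheap filter to the front of the permutation
	std::vector<std::function<bool(int)>> filters = {
	    [](int v) {
		    volatile int x = v;
		    for (int i = 0; i < 256; i++) {
			    x += i;
		    }
		    return x % 3 == 0;
	    },
	    [](int v) { return v % 10 == 0; },
	};
	std::vector<int> chunk(4096);
	std::iota(chunk.begin(), chunk.end(), 0);

	SimpleAdaptiveFilter adaptive(filters.size());
	for (int pass = 0; pass < 200; pass++) {
		auto state = adaptive.BeginFilter();
		std::vector<int> selected = chunk;
		for (size_t i = 0; i < filters.size(); i++) {
			auto &filter = filters[adaptive.permutation[i]];
			std::vector<int> next;
			for (int v : selected) {
				if (filter(v)) {
					next.push_back(v);
				}
			}
			selected = std::move(next); // later filters see only surviving rows
		}
		adaptive.EndFilter(state);
	}
	std::cout << "filter evaluated first after adaptation: " << adaptive.permutation[0] << "\n";
	return 0;
}

The design point is that correctness never depends on the permutation: every filter is still applied, and only the order (and thus how many rows reach the expensive filters) changes.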

src/duckdb/extension/parquet/reader/string_column_reader.cpp

6 additions & 7 deletions

@@ -18,9 +18,9 @@ StringColumnReader::StringColumnReader(ParquetReader &reader, LogicalType type_p
 	}
 }
 
-uint32_t StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) {
+void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) {
 	if (!is_varchar) {
-		return str_len;
+		return;
 	}
 	// verify if a string is actually UTF8, and if there are no null bytes in the middle of the string
 	// technically Parquet should guarantee this, but reality is often disappointing

@@ -31,11 +31,10 @@ uint32_t StringColumnReader::VerifyString(const char *str_data, uint32_t str_len
 		throw InvalidInputException("Invalid string encoding found in Parquet file: value \"" +
 		                            Blob::ToString(string_t(str_data, str_len)) + "\" is not valid UTF8!");
 	}
-	return str_len;
 }
 
-uint32_t StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) {
-	return VerifyString(str_data, str_len, Type() == LogicalTypeId::VARCHAR);
+void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) {
+	VerifyString(str_data, str_len, Type().id() == LogicalTypeId::VARCHAR);
}
 
 class ParquetStringVectorBuffer : public VectorBuffer {

@@ -63,8 +62,8 @@ string_t StringParquetValueConversion::PlainRead(ByteBuffer &plain_data, ColumnR
 	uint32_t str_len = scr.fixed_width_string_length == 0 ? plain_data.read<uint32_t>() : scr.fixed_width_string_length;
 	plain_data.available(str_len);
 	auto plain_str = char_ptr_cast(plain_data.ptr);
-	auto actual_str_len = reader.Cast<StringColumnReader>().VerifyString(plain_str, str_len);
-	auto ret_str = string_t(plain_str, actual_str_len);
+	scr.VerifyString(plain_str, str_len);
+	auto ret_str = string_t(plain_str, str_len);
 	plain_data.inc(str_len);
 	return ret_str;
 }
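The VerifyString refactor is an independent cleanup in the same vendored drop: both overloads previously returned their str_len argument unchanged, so the return value carried no information, and PlainRead can build the string_t from str_len directly. The comparison also changes from Type() == LogicalTypeId::VARCHAR to Type().id() == LogicalTypeId::VARCHAR, making it explicit that only the type id is compared.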

src/duckdb/src/function/table/version/pragma_version.cpp

3 additions & 3 deletions

@@ -1,5 +1,5 @@
 #ifndef DUCKDB_PATCH_VERSION
-#define DUCKDB_PATCH_VERSION "1-dev128"
+#define DUCKDB_PATCH_VERSION "1-dev140"
 #endif
 #ifndef DUCKDB_MINOR_VERSION
 #define DUCKDB_MINOR_VERSION 2

@@ -8,10 +8,10 @@
 #define DUCKDB_MAJOR_VERSION 1
 #endif
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "v1.2.1-dev128"
+#define DUCKDB_VERSION "v1.2.1-dev140"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "9c2cce521f"
+#define DUCKDB_SOURCE_ID "2d530ad365"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
