
Commit e5ff8d0 (1 parent: 24d58a4)

vendor: Update vendored sources to duckdb/duckdb@c69efd5

MultiFileReader Rework (part 18): Replace file path with `OpenFileInfo` struct (duckdb/duckdb#17071)
update julia to v1.2.2 (duckdb/duckdb#17074)

40 files changed: +301 / -219 lines
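
Context for the diffs below: the `OpenFileInfo` struct that replaces plain file-path strings throughout this commit couples a path with optional per-file metadata. The diff itself only reveals a `path` member and construction from a string (`OpenFileInfo(path)` in file_system.cpp); the following is a hedged, self-contained sketch, and everything beyond `path` is an assumption rather than the vendored definition:

#include <memory>
#include <string>
#include <unordered_map>

// Assumed shape of extra per-file metadata (e.g. size, last-modified time);
// not visible anywhere in this diff.
struct ExtendedOpenFileInfo {
	std::unordered_map<std::string, std::string> options;
};

struct OpenFileInfo {
	OpenFileInfo() = default;
	// Implicit conversion from a path keeps old string-based call sites compiling,
	// matching the OpenFileInfo(path) construction visible in file_system.cpp below.
	OpenFileInfo(std::string path_p) : path(std::move(path_p)) {
	}

	std::string path;
	std::shared_ptr<ExtendedOpenFileInfo> extended_info; // assumption: optional metadata
};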

src/duckdb/extension/parquet/decoder/dictionary_decoder.cpp (1 addition, 1 deletion)

@@ -26,7 +26,7 @@ void DictionaryDecoder::InitializeDictionary(idx_t new_dictionary_size, optional
 		dictionary->Resize(old_dict_size, dictionary_size + 1);
 	}
 	dictionary_id =
-	    reader.reader.file_name + "_" + reader.Schema().name + "_" + std::to_string(reader.chunk_read_offset);
+	    reader.reader.GetFileName() + "_" + reader.Schema().name + "_" + std::to_string(reader.chunk_read_offset);
 	// we use the last entry as a NULL, dictionary vectors don't have a separate validity mask
 	auto &dict_validity = FlatVector::Validity(*dictionary);
 	dict_validity.Reset(dictionary_size + 1);
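
The `GetFileName()` accessor used above replaces direct reads of the removed `file_name` member. The diff shows only call sites; presumably the accessor forwards to the reader's `OpenFileInfo`, along these lines (an assumption for illustration, not the vendored implementation; `OpenFileInfo` as sketched above):

// Hedged sketch: BaseFileReader now stores an OpenFileInfo (see parquet_reader.cpp
// below, where the constructor does BaseFileReader(std::move(file_p))), so the
// accessor plausibly just exposes its path.
struct BaseFileReader {
	explicit BaseFileReader(OpenFileInfo file_p) : file(std::move(file_p)) {
	}
	const std::string &GetFileName() const {
		return file.path; // assumption: the logical file name is the path member
	}
	OpenFileInfo file; // publicly reachable in the diff: call sites use scan.file directly
};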

src/duckdb/extension/parquet/include/parquet_reader.hpp (2 additions, 2 deletions)

@@ -124,7 +124,7 @@ struct ParquetOptionsSerialization {
 };

 struct ParquetUnionData : public BaseUnionData {
-	explicit ParquetUnionData(string file_name_p) : BaseUnionData(std::move(file_name_p)) {
+	explicit ParquetUnionData(OpenFileInfo file_p) : BaseUnionData(std::move(file_p)) {
 	}
 	~ParquetUnionData() override;

@@ -138,7 +138,7 @@ class ParquetReader : public BaseFileReader {
 	static constexpr int32_t ORDINAL_FIELD_ID = 2147483645;

 public:
-	ParquetReader(ClientContext &context, string file_name, ParquetOptions parquet_options,
+	ParquetReader(ClientContext &context, OpenFileInfo file, ParquetOptions parquet_options,
 	              shared_ptr<ParquetFileMetadataCache> metadata = nullptr);
 	~ParquetReader() override;

src/duckdb/extension/parquet/parquet_extension.cpp (13 additions, 9 deletions)

@@ -107,9 +107,9 @@ struct ParquetMultiFileInfo {
 	static shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
 	                                               BaseUnionData &union_data, const MultiFileBindData &bind_data_p);
 	static shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
-	                                               const string &filename, idx_t file_idx,
+	                                               const OpenFileInfo &file, idx_t file_idx,
 	                                               const MultiFileBindData &bind_data);
-	static shared_ptr<BaseFileReader> CreateReader(ClientContext &context, const string &filename,
+	static shared_ptr<BaseFileReader> CreateReader(ClientContext &context, const OpenFileInfo &file,
 	                                               ParquetOptions &options, const MultiFileOptions &file_options);
 	static shared_ptr<BaseUnionData> GetUnionData(shared_ptr<BaseFileReader> scan_p, idx_t file_idx);
 	static void FinalizeReader(ClientContext &context, BaseFileReader &reader, GlobalTableFunctionState &);

@@ -306,7 +306,11 @@ class ParquetScanFunction {
 		auto &bind_data = bind_data_p->Cast<MultiFileBindData>();
 		auto &parquet_data = bind_data.bind_data->Cast<ParquetReadBindData>();

-		serializer.WriteProperty(100, "files", bind_data.file_list->GetAllFiles());
+		vector<string> files;
+		for (auto &file : bind_data.file_list->GetAllFiles()) {
+			files.emplace_back(file.path);
+		}
+		serializer.WriteProperty(100, "files", files);
 		serializer.WriteProperty(101, "types", bind_data.types);
 		serializer.WriteProperty(102, "names", bind_data.names);
 		ParquetOptionsSerialization serialization(parquet_data.parquet_options, bind_data.file_options);

@@ -508,24 +512,24 @@ shared_ptr<BaseFileReader> ParquetMultiFileInfo::CreateReader(ClientContext &con
                                                               BaseUnionData &union_data_p,
                                                               const MultiFileBindData &bind_data_p) {
 	auto &union_data = union_data_p.Cast<ParquetUnionData>();
-	return make_shared_ptr<ParquetReader>(context, union_data.file_name, union_data.options, union_data.metadata);
+	return make_shared_ptr<ParquetReader>(context, union_data.file, union_data.options, union_data.metadata);
 }

 shared_ptr<BaseFileReader> ParquetMultiFileInfo::CreateReader(ClientContext &context, GlobalTableFunctionState &,
-                                                              const string &filename, idx_t file_idx,
+                                                              const OpenFileInfo &file, idx_t file_idx,
                                                               const MultiFileBindData &multi_bind_data) {
 	auto &bind_data = multi_bind_data.bind_data->Cast<ParquetReadBindData>();
-	return make_shared_ptr<ParquetReader>(context, filename, bind_data.parquet_options);
+	return make_shared_ptr<ParquetReader>(context, file.path, bind_data.parquet_options);
 }

-shared_ptr<BaseFileReader> ParquetMultiFileInfo::CreateReader(ClientContext &context, const string &filename,
+shared_ptr<BaseFileReader> ParquetMultiFileInfo::CreateReader(ClientContext &context, const OpenFileInfo &file,
                                                               ParquetOptions &options, const MultiFileOptions &) {
-	return make_shared_ptr<ParquetReader>(context, filename, options);
+	return make_shared_ptr<ParquetReader>(context, file.path, options);
 }

 shared_ptr<BaseUnionData> ParquetMultiFileInfo::GetUnionData(shared_ptr<BaseFileReader> scan_p, idx_t file_idx) {
 	auto &scan = scan_p->Cast<ParquetReader>();
-	auto result = make_uniq<ParquetUnionData>(scan.file_name);
+	auto result = make_uniq<ParquetUnionData>(scan.file);
 	if (file_idx == 0) {
 		for (auto &column : scan.columns) {
 			result->names.push_back(column.name);
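
The serialization hunk above is the one place where the new type is deliberately flattened back to strings: `GetAllFiles()` now returns `vector<OpenFileInfo>`, but only the paths are written, so the serialized format remains a plain `vector<string>` as before. The same projection could be factored into a helper; a hypothetical sketch (`ExtractPaths` is not part of the commit, `OpenFileInfo` as sketched earlier):

#include <string>
#include <vector>

// Hypothetical helper mirroring the loop added in ParquetScanFunction's serializer:
// project OpenFileInfo entries down to their paths for serialization.
static std::vector<std::string> ExtractPaths(const std::vector<OpenFileInfo> &files) {
	std::vector<std::string> paths;
	paths.reserve(files.size());
	for (auto &file : files) {
		paths.push_back(file.path);
	}
	return paths;
}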

src/duckdb/extension/parquet/parquet_metadata.cpp (24 additions, 22 deletions)

@@ -42,7 +42,7 @@ struct ParquetMetaDataOperatorData : public GlobalTableFunctionState {
 	ColumnDataScanState scan_state;

 	MultiFileListScanData file_list_scan;
-	string current_file;
+	OpenFileInfo current_file;

 public:
 	static void BindMetaData(vector<LogicalType> &return_types, vector<string> &names);

@@ -51,11 +51,13 @@ struct ParquetMetaDataOperatorData : public GlobalTableFunctionState {
 	static void BindFileMetaData(vector<LogicalType> &return_types, vector<string> &names);
 	static void BindBloomProbe(vector<LogicalType> &return_types, vector<string> &names);

-	void LoadRowGroupMetadata(ClientContext &context, const vector<LogicalType> &return_types, const string &file_path);
-	void LoadSchemaData(ClientContext &context, const vector<LogicalType> &return_types, const string &file_path);
-	void LoadKeyValueMetaData(ClientContext &context, const vector<LogicalType> &return_types, const string &file_path);
-	void LoadFileMetaData(ClientContext &context, const vector<LogicalType> &return_types, const string &file_path);
-	void ExecuteBloomProbe(ClientContext &context, const vector<LogicalType> &return_types, const string &file_path,
+	void LoadRowGroupMetadata(ClientContext &context, const vector<LogicalType> &return_types,
+	                          const OpenFileInfo &file);
+	void LoadSchemaData(ClientContext &context, const vector<LogicalType> &return_types, const OpenFileInfo &file);
+	void LoadKeyValueMetaData(ClientContext &context, const vector<LogicalType> &return_types,
+	                          const OpenFileInfo &file);
+	void LoadFileMetaData(ClientContext &context, const vector<LogicalType> &return_types, const OpenFileInfo &file);
+	void ExecuteBloomProbe(ClientContext &context, const vector<LogicalType> &return_types, const OpenFileInfo &file,
 	                       const string &column_name, const Value &probe);
 };

@@ -209,10 +211,10 @@ Value ConvertParquetStats(const LogicalType &type, const ParquetColumnSchema &sc
 }

 void ParquetMetaDataOperatorData::LoadRowGroupMetadata(ClientContext &context, const vector<LogicalType> &return_types,
-                                                       const string &file_path) {
+                                                       const OpenFileInfo &file) {
 	collection.Reset();
 	ParquetOptions parquet_options(context);
-	ParquetReader reader(context, file_path, parquet_options);
+	ParquetReader reader(context, file.path, parquet_options);
 	idx_t count = 0;
 	DataChunk current_chunk;
 	current_chunk.Initialize(context, return_types);

@@ -242,7 +244,7 @@ void ParquetMetaDataOperatorData::LoadRowGroupMetadata(ClientContext &context, c
 	auto &column_type = column_schema.type;

 	// file_name, LogicalType::VARCHAR
-	current_chunk.SetValue(0, count, file_path);
+	current_chunk.SetValue(0, count, file.path);

 	// row_group_id, LogicalType::BIGINT
 	current_chunk.SetValue(1, count, Value::BIGINT(UnsafeNumericCast<int64_t>(row_group_idx)));

@@ -452,10 +454,10 @@ Value ParquetLogicalTypeToString(const duckdb_parquet::LogicalType &type, bool i
 }

 void ParquetMetaDataOperatorData::LoadSchemaData(ClientContext &context, const vector<LogicalType> &return_types,
-                                                 const string &file_path) {
+                                                 const OpenFileInfo &file) {
 	collection.Reset();
 	ParquetOptions parquet_options(context);
-	auto reader = make_uniq<ParquetReader>(context, file_path, parquet_options);
+	auto reader = make_uniq<ParquetReader>(context, file.path, parquet_options);
 	idx_t count = 0;
 	DataChunk current_chunk;
 	current_chunk.Initialize(context, return_types);

@@ -464,7 +466,7 @@ void ParquetMetaDataOperatorData::LoadSchemaData(ClientContext &context, const v
 	auto &column = meta_data->schema[col_idx];

 	// file_name, LogicalType::VARCHAR
-	current_chunk.SetValue(0, count, file_path);
+	current_chunk.SetValue(0, count, file.path);

 	// name, LogicalType::VARCHAR
 	current_chunk.SetValue(1, count, column.name);

@@ -526,10 +528,10 @@ void ParquetMetaDataOperatorData::BindKeyValueMetaData(vector<LogicalType> &retu
 }

 void ParquetMetaDataOperatorData::LoadKeyValueMetaData(ClientContext &context, const vector<LogicalType> &return_types,
-                                                       const string &file_path) {
+                                                       const OpenFileInfo &file) {
 	collection.Reset();
 	ParquetOptions parquet_options(context);
-	auto reader = make_uniq<ParquetReader>(context, file_path, parquet_options);
+	auto reader = make_uniq<ParquetReader>(context, file.path, parquet_options);
 	idx_t count = 0;
 	DataChunk current_chunk;
 	current_chunk.Initialize(context, return_types);

@@ -538,7 +540,7 @@ void ParquetMetaDataOperatorData::LoadKeyValueMetaData(ClientContext &context, c
 	for (idx_t col_idx = 0; col_idx < meta_data->key_value_metadata.size(); col_idx++) {
 		auto &entry = meta_data->key_value_metadata[col_idx];

-		current_chunk.SetValue(0, count, Value(file_path));
+		current_chunk.SetValue(0, count, Value(file.path));
 		current_chunk.SetValue(1, count, Value::BLOB_RAW(entry.key));
 		current_chunk.SetValue(2, count, Value::BLOB_RAW(entry.value));

@@ -583,16 +585,16 @@ void ParquetMetaDataOperatorData::BindFileMetaData(vector<LogicalType> &return_t
 }

 void ParquetMetaDataOperatorData::LoadFileMetaData(ClientContext &context, const vector<LogicalType> &return_types,
-                                                   const string &file_path) {
+                                                   const OpenFileInfo &file) {
 	collection.Reset();
 	ParquetOptions parquet_options(context);
-	auto reader = make_uniq<ParquetReader>(context, file_path, parquet_options);
+	auto reader = make_uniq<ParquetReader>(context, file.path, parquet_options);
 	DataChunk current_chunk;
 	current_chunk.Initialize(context, return_types);
 	auto meta_data = reader->GetFileMetadata();

 	// file_name
-	current_chunk.SetValue(0, 0, Value(file_path));
+	current_chunk.SetValue(0, 0, Value(file.path));
 	// created_by
 	current_chunk.SetValue(1, 0, ParquetElementStringVal(meta_data->created_by, meta_data->__isset.created_by));
 	// num_rows

@@ -628,11 +630,11 @@ void ParquetMetaDataOperatorData::BindBloomProbe(vector<LogicalType> &return_typ
 }

 void ParquetMetaDataOperatorData::ExecuteBloomProbe(ClientContext &context, const vector<LogicalType> &return_types,
-                                                    const string &file_path, const string &column_name,
+                                                    const OpenFileInfo &file, const string &column_name,
                                                     const Value &probe) {
 	collection.Reset();
 	ParquetOptions parquet_options(context);
-	auto reader = make_uniq<ParquetReader>(context, file_path, parquet_options);
+	auto reader = make_uniq<ParquetReader>(context, file.path, parquet_options);
 	idx_t count = 0;
 	DataChunk current_chunk;
 	current_chunk.Initialize(context, return_types);

@@ -646,7 +648,7 @@ void ParquetMetaDataOperatorData::ExecuteBloomProbe(ClientContext &context, cons
 	}

 	if (!probe_column_idx.IsValid()) {
-		throw InvalidInputException("Column %s not found in %s", column_name, file_path);
+		throw InvalidInputException("Column %s not found in %s", column_name, file.path);
 	}

 	auto &allocator = Allocator::DefaultAllocator();

@@ -664,7 +666,7 @@ void ParquetMetaDataOperatorData::ExecuteBloomProbe(ClientContext &context, cons

 	auto bloom_excludes =
 	    ParquetStatisticsUtils::BloomFilterExcludes(filter, column.meta_data, *protocol, allocator);
-	current_chunk.SetValue(0, count, Value(file_path));
+	current_chunk.SetValue(0, count, Value(file.path));
 	current_chunk.SetValue(1, count, Value::BIGINT(NumericCast<int64_t>(row_group_idx)));
 	current_chunk.SetValue(2, count, Value::BOOLEAN(bloom_excludes));

src/duckdb/extension/parquet/parquet_reader.cpp (8 additions, 8 deletions)

@@ -640,13 +640,13 @@ void ParquetReader::InitializeSchema(ClientContext &context) {
 	if (file_meta_data->__isset.encryption_algorithm) {
 		if (file_meta_data->encryption_algorithm.__isset.AES_GCM_CTR_V1) {
 			throw InvalidInputException("File '%s' is encrypted with AES_GCM_CTR_V1, but only AES_GCM_V1 is supported",
-			                            file_name);
+			                            GetFileName());
 		}
 	}
 	// check if we like this schema
 	if (file_meta_data->schema.size() < 2) {
 		throw InvalidInputException("Failed to read Parquet file '%s': Need at least one non-root column in the file",
-		                            file_name);
+		                            GetFileName());
 	}
 	root_schema = ParseSchema();
 	for (idx_t i = 0; i < root_schema->children.size(); i++) {

@@ -690,11 +690,11 @@ ParquetColumnDefinition ParquetColumnDefinition::FromSchemaValue(ClientContext &
 	return result;
 }

-ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, ParquetOptions parquet_options_p,
+ParquetReader::ParquetReader(ClientContext &context_p, OpenFileInfo file_p, ParquetOptions parquet_options_p,
                              shared_ptr<ParquetFileMetadataCache> metadata_p)
-    : BaseFileReader(std::move(file_name_p)), fs(FileSystem::GetFileSystem(context_p)),
+    : BaseFileReader(std::move(file_p)), fs(FileSystem::GetFileSystem(context_p)),
       allocator(BufferAllocator::Get(context_p)), parquet_options(std::move(parquet_options_p)) {
-	file_handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ);
+	file_handle = fs.OpenFile(file, FileFlags::FILE_FLAGS_READ);
 	if (!file_handle->CanSeek()) {
 		throw NotImplementedException(
 		    "Reading parquet files from a FIFO stream is not supported and cannot be efficiently supported since "

@@ -720,11 +720,11 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, Parqu
 		LoadMetadata(context_p, allocator, *file_handle, parquet_options.encryption_config, *encryption_util);
 	} else {
 		auto last_modify_time = fs.GetLastModifiedTime(*file_handle);
-		metadata = ObjectCache::GetObjectCache(context_p).Get<ParquetFileMetadataCache>(file_name);
+		metadata = ObjectCache::GetObjectCache(context_p).Get<ParquetFileMetadataCache>(file.path);
 		if (!metadata || (last_modify_time + 10 >= metadata->read_time)) {
 			metadata = LoadMetadata(context_p, allocator, *file_handle, parquet_options.encryption_config,
 			                        *encryption_util);
-			ObjectCache::GetObjectCache(context_p).Put(file_name, metadata);
+			ObjectCache::GetObjectCache(context_p).Put(file.path, metadata);
 		}
 	}
 } else {

@@ -1115,7 +1115,7 @@ bool ParquetReader::ScanInternal(ClientContext &context, ParquetReaderScanState
 		    "The parquet file '%s' seems to have incorrectly set page offsets. This interferes with DuckDB's "
 		    "prefetching optimization. DuckDB may still be able to scan this file by manually disabling the "
 		    "prefetching mechanism using: 'SET disable_parquet_prefetching=true'.",
-		    file_name);
+		    GetFileName());
 	}

 	if (!filters && scan_percentage > ParquetReaderPrefetchConfig::WHOLE_GROUP_PREFETCH_MINIMUM_SCAN) {

src/duckdb/src/common/file_system.cpp (24 additions, 2 deletions)

@@ -366,10 +366,32 @@ string FileSystem::ExpandPath(const string &path) {
 }

 // LCOV_EXCL_START
+unique_ptr<FileHandle> FileSystem::OpenFileExtended(const OpenFileInfo &path, FileOpenFlags flags,
+                                                    optional_ptr<FileOpener> opener) {
+	// for backwards compatibility purposes - default to OpenFile
+	throw NotImplementedException("%s: OpenFileExtended is not implemented!", GetName());
+}
+
 unique_ptr<FileHandle> FileSystem::OpenFile(const string &path, FileOpenFlags flags, optional_ptr<FileOpener> opener) {
+	if (SupportsOpenFileExtended()) {
+		return OpenFileExtended(OpenFileInfo(path), flags, opener);
+	}
 	throw NotImplementedException("%s: OpenFile is not implemented!", GetName());
 }

+unique_ptr<FileHandle> FileSystem::OpenFile(const OpenFileInfo &file, FileOpenFlags flags,
+                                            optional_ptr<FileOpener> opener) {
+	if (SupportsOpenFileExtended()) {
+		return OpenFileExtended(file, flags, opener);
+	} else {
+		return OpenFile(file.path, flags, opener);
+	}
+}
+
+bool FileSystem::SupportsOpenFileExtended() const {
+	return false;
+}
+
 void FileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
 	throw NotImplementedException("%s: Read (with location) is not implemented!", GetName());
 }

@@ -463,7 +485,7 @@ bool FileSystem::HasGlob(const string &str) {
 	return false;
 }

-vector<string> FileSystem::Glob(const string &path, FileOpener *opener) {
+vector<OpenFileInfo> FileSystem::Glob(const string &path, FileOpener *opener) {
 	throw NotImplementedException("%s: Glob is not implemented!", GetName());
 }

@@ -504,7 +526,7 @@ static string LookupExtensionForPattern(const string &pattern) {
 	return "";
 }

-vector<string> FileSystem::GlobFiles(const string &pattern, ClientContext &context, FileGlobOptions options) {
+vector<OpenFileInfo> FileSystem::GlobFiles(const string &pattern, ClientContext &context, FileGlobOptions options) {
 	auto result = Glob(pattern);
 	if (result.empty()) {
 		string required_extension = LookupExtensionForPattern(pattern);
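
The pattern added above is a backwards-compatible dispatch bridge: both `OpenFile` overloads funnel into `OpenFileExtended` when a subclass opts in via `SupportsOpenFileExtended()`, while legacy file systems that only override the string-based `OpenFile` keep working because the `OpenFileInfo` overload falls back to `file.path`. A hedged sketch of a subclass opting in (the class is hypothetical, and the virtual/override specifiers and access levels are assumptions, since the header declaring them is not part of this diff):

#include "duckdb/common/file_system.hpp" // assumes the vendored duckdb headers

namespace duckdb {

// Hypothetical file system that opts into the extended open path.
class MyRemoteFileSystem : public FileSystem {
public:
	string GetName() const override {
		return "MyRemoteFileSystem";
	}
	bool SupportsOpenFileExtended() const override {
		return true; // both OpenFile overloads now route through OpenFileExtended
	}

protected:
	unique_ptr<FileHandle> OpenFileExtended(const OpenFileInfo &file, FileOpenFlags flags,
	                                        optional_ptr<FileOpener> opener) override {
		// file.extended_info (when populated, e.g. during globbing) could carry a
		// known size or etag, saving a metadata round trip (assumption about its use)
		throw NotImplementedException("remote open elided in this sketch");
	}
};

} // namespace duckdb

Callers of `Glob`/`GlobFiles` likewise switch from iterating strings to iterating `OpenFileInfo` entries and reading `.path`, as the hive_partitioning.cpp hunk below shows.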

src/duckdb/src/common/hive_partitioning.cpp (3 additions, 3 deletions)

@@ -149,12 +149,12 @@ Value HivePartitioning::GetValue(ClientContext &context, const string &key, cons

 // TODO: this can still be improved by removing the parts of filter expressions that are true for all remaining files.
 // currently, only expressions that cannot be evaluated during pushdown are removed.
-void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<string> &files,
+void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<OpenFileInfo> &files,
                                               vector<unique_ptr<Expression>> &filters,
                                               const HivePartitioningFilterInfo &filter_info,
                                               MultiFilePushdownInfo &info) {

-	vector<string> pruned_files;
+	vector<OpenFileInfo> pruned_files;
 	vector<bool> have_preserved_filter(filters.size(), false);
 	vector<unique_ptr<Expression>> pruned_filters;
 	unordered_set<idx_t> filters_applied_to_files;

@@ -167,7 +167,7 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector<str
 	for (idx_t i = 0; i < files.size(); i++) {
 		auto &file = files[i];
 		bool should_prune_file = false;
-		auto known_values = GetKnownColumnValues(file, filter_info);
+		auto known_values = GetKnownColumnValues(file.path, filter_info);

 		for (idx_t j = 0; j < filters.size(); j++) {
 			auto &filter = filters[j];
