Skip to content

Commit 3e69762

Browse files
committed
vendor: Update vendored sources to duckdb/duckdb@db82e0b
RETURN_STATS: remove footer_offset, and emit written partition keys (duckdb/duckdb#16715)
1 parent 048d5f4 commit 3e69762

File tree

7 files changed

+77
-63
lines changed

7 files changed

+77
-63
lines changed

src/duckdb/extension/parquet/parquet_writer.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -844,8 +844,7 @@ void ParquetWriter::Finalize() {
844844
// gather written statistics from the metadata
845845
GatherWrittenStatistics();
846846
written_stats->file_size_bytes = writer->GetTotalWritten();
847-
written_stats->footer_offset = Value::UBIGINT(metadata_start_offset);
848-
written_stats->footer_size = Value::UBIGINT(footer_size);
847+
written_stats->footer_size_bytes = Value::UBIGINT(footer_size);
849848
}
850849

851850
// flush to disk

src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,8 @@ class FixedBatchCopyGlobalState : public GlobalSinkState {
9898
atomic<bool> any_finished;
9999
//! Minimum memory per thread
100100
idx_t minimum_memory_per_thread;
101-
//! File stats (for RETURN_STATS)
102-
unique_ptr<CopyFunctionFileStatistics> file_stats;
101+
//! Written file info (for RETURN_STATS)
102+
unique_ptr<CopyToFileInfo> written_file_info;
103103

104104
void AddBatchData(idx_t batch_index, unique_ptr<PreparedBatchData> new_batch, idx_t memory_usage) {
105105
// move the batch data to the set of prepared batch data
@@ -602,8 +602,10 @@ unique_ptr<GlobalSinkState> PhysicalBatchCopyToFile::GetGlobalSinkState(ClientCo
602602
auto result = make_uniq<FixedBatchCopyGlobalState>(
603603
context, function.copy_to_initialize_global(context, *bind_data, file_path), minimum_memory_per_thread);
604604
if (return_type == CopyFunctionReturnType::WRITTEN_FILE_STATISTICS) {
605-
result->file_stats = make_uniq<CopyFunctionFileStatistics>();
606-
function.copy_to_get_written_statistics(context, *bind_data, *result->global_state, *result->file_stats);
605+
result->written_file_info = make_uniq<CopyToFileInfo>(file_path);
606+
result->written_file_info->file_stats = make_uniq<CopyFunctionFileStatistics>();
607+
function.copy_to_get_written_statistics(context, *bind_data, *result->global_state,
608+
*result->written_file_info->file_stats);
607609
}
608610
result->batch_size = function.desired_batch_size ? function.desired_batch_size(context, *bind_data) : 0;
609611
return std::move(result);
@@ -627,7 +629,7 @@ SourceResultType PhysicalBatchCopyToFile::GetData(ExecutionContext &context, Dat
627629
break;
628630
}
629631
case CopyFunctionReturnType::WRITTEN_FILE_STATISTICS: {
630-
PhysicalCopyToFile::ReturnStatistics(chunk, 0, fp, *g.file_stats);
632+
PhysicalCopyToFile::ReturnStatistics(chunk, 0, *g.written_file_info);
631633
break;
632634
}
633635
default:

src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp

Lines changed: 52 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,6 @@ struct VectorOfValuesEquality {
4545
template <class T>
4646
using vector_of_value_map_t = unordered_map<vector<Value>, T, VectorOfValuesHashFunction, VectorOfValuesEquality>;
4747

48-
struct CopyToFileInfo {
49-
explicit CopyToFileInfo(string file_path_p) : file_path(std::move(file_path_p)) {
50-
}
51-
52-
string file_path;
53-
unique_ptr<CopyFunctionFileStatistics> file_stats;
54-
};
55-
5648
class CopyToFunctionGlobalState : public GlobalSinkState {
5749
public:
5850
explicit CopyToFunctionGlobalState(ClientContext &context, unique_ptr<GlobalFunctionData> global_state)
@@ -67,8 +59,8 @@ class CopyToFunctionGlobalState : public GlobalSinkState {
6759
unordered_set<string> created_directories;
6860
//! shared state for HivePartitionedColumnData
6961
shared_ptr<GlobalHivePartitionState> partition_state;
70-
//! File names
71-
vector<CopyToFileInfo> file_names;
62+
//! Written file info and stats
63+
vector<unique_ptr<CopyToFileInfo>> written_files;
7264
//! Max open files
7365
idx_t max_open_files;
7466

@@ -99,16 +91,16 @@ class CopyToFunctionGlobalState : public GlobalSinkState {
9991
return path;
10092
}
10193

102-
optional_ptr<CopyFunctionFileStatistics> AddFile(const StorageLockKey &l, const string &file_name,
103-
CopyFunctionReturnType return_type) {
94+
optional_ptr<CopyToFileInfo> AddFile(const StorageLockKey &l, const string &file_name,
95+
CopyFunctionReturnType return_type) {
10496
D_ASSERT(l.GetType() == StorageLockType::EXCLUSIVE);
105-
optional_ptr<CopyFunctionFileStatistics> result;
106-
CopyToFileInfo file_info(file_name);
97+
auto file_info = make_uniq<CopyToFileInfo>(file_name);
98+
optional_ptr<CopyToFileInfo> result;
10799
if (return_type == CopyFunctionReturnType::WRITTEN_FILE_STATISTICS) {
108-
file_info.file_stats = make_uniq<CopyFunctionFileStatistics>();
109-
result = file_info.file_stats.get();
100+
file_info->file_stats = make_uniq<CopyFunctionFileStatistics>();
101+
result = file_info.get();
110102
}
111-
file_names.push_back(std::move(file_info));
103+
written_files.push_back(std::move(file_info));
112104
return result;
113105
}
114106

@@ -172,15 +164,29 @@ class CopyToFunctionGlobalState : public GlobalSinkState {
172164
full_path = op.filename_pattern.CreateFilename(fs, hive_path, op.file_extension, offset);
173165
}
174166
}
175-
optional_ptr<CopyFunctionFileStatistics> file_stats;
167+
optional_ptr<CopyToFileInfo> written_file_info;
176168
if (op.return_type != CopyFunctionReturnType::CHANGED_ROWS) {
177-
file_stats = AddFile(*global_lock, full_path, op.return_type);
169+
written_file_info = AddFile(*global_lock, full_path, op.return_type);
178170
}
179171
// initialize writes
180172
auto info = make_uniq<PartitionWriteInfo>();
181173
info->global_state = op.function.copy_to_initialize_global(context.client, *op.bind_data, full_path);
182-
if (file_stats) {
183-
op.function.copy_to_get_written_statistics(context.client, *op.bind_data, *info->global_state, *file_stats);
174+
if (written_file_info) {
175+
// set up the file stats for the copy
176+
op.function.copy_to_get_written_statistics(context.client, *op.bind_data, *info->global_state,
177+
*written_file_info->file_stats);
178+
179+
// set the partition info
180+
vector<Value> partition_keys;
181+
vector<Value> partition_values;
182+
for (idx_t i = 0; i < op.partition_columns.size(); i++) {
183+
const auto &partition_col_name = op.names[op.partition_columns[i]];
184+
const auto &partition_value = values[i];
185+
partition_keys.emplace_back(partition_col_name);
186+
partition_values.push_back(partition_value.DefaultCastAs(LogicalType::VARCHAR));
187+
}
188+
written_file_info->partition_keys = Value::MAP(LogicalType::VARCHAR, LogicalType::VARCHAR,
189+
std::move(partition_keys), std::move(partition_values));
184190
}
185191
auto &result = *info;
186192
info->active_writes = 1;
@@ -308,13 +314,13 @@ unique_ptr<GlobalFunctionData> PhysicalCopyToFile::CreateFileState(ClientContext
308314
idx_t this_file_offset = g.last_file_offset++;
309315
auto &fs = FileSystem::GetFileSystem(context);
310316
string output_path(filename_pattern.CreateFilename(fs, file_path, file_extension, this_file_offset));
311-
optional_ptr<CopyFunctionFileStatistics> file_stats;
317+
optional_ptr<CopyToFileInfo> written_file_info;
312318
if (return_type != CopyFunctionReturnType::CHANGED_ROWS) {
313-
file_stats = g.AddFile(global_lock, output_path, return_type);
319+
written_file_info = g.AddFile(global_lock, output_path, return_type);
314320
}
315321
auto result = function.copy_to_initialize_global(context, *bind_data, output_path);
316-
if (file_stats) {
317-
function.copy_to_get_written_statistics(context, *bind_data, *result, *file_stats);
322+
if (written_file_info) {
323+
function.copy_to_get_written_statistics(context, *bind_data, *result, *written_file_info->file_stats);
318324
}
319325
return result;
320326
}
@@ -410,9 +416,10 @@ unique_ptr<GlobalSinkState> PhysicalCopyToFile::GetGlobalSinkState(ClientContext
410416
auto state = make_uniq<CopyToFunctionGlobalState>(
411417
context, function.copy_to_initialize_global(context, *bind_data, file_path));
412418
auto global_lock = state->lock.GetExclusiveLock();
413-
auto file_stats = state->AddFile(*global_lock, file_path, return_type);
414-
if (file_stats) {
415-
function.copy_to_get_written_statistics(context, *bind_data, *state->global_state, *file_stats);
419+
auto written_file_info = state->AddFile(*global_lock, file_path, return_type);
420+
if (written_file_info) {
421+
function.copy_to_get_written_statistics(context, *bind_data, *state->global_state,
422+
*written_file_info->file_stats);
416423
}
417424
return std::move(state);
418425
}
@@ -576,18 +583,17 @@ unique_ptr<GlobalSourceState> PhysicalCopyToFile::GetGlobalSourceState(ClientCon
576583
return make_uniq<CopyToFileGlobalSourceState>();
577584
}
578585

579-
void PhysicalCopyToFile::ReturnStatistics(DataChunk &chunk, idx_t row_idx, const string &file_name,
580-
CopyFunctionFileStatistics &file_stats) {
586+
void PhysicalCopyToFile::ReturnStatistics(DataChunk &chunk, idx_t row_idx, CopyToFileInfo &info) {
587+
auto &file_stats = *info.file_stats;
588+
581589
// filename VARCHAR
582-
chunk.SetValue(0, row_idx, file_name);
590+
chunk.SetValue(0, row_idx, info.file_path);
583591
// count BIGINT
584592
chunk.SetValue(1, row_idx, Value::UBIGINT(file_stats.row_count));
585593
// file size bytes BIGINT
586594
chunk.SetValue(2, row_idx, Value::UBIGINT(file_stats.file_size_bytes));
587-
// footer offset BIGINT
588-
chunk.SetValue(3, row_idx, file_stats.footer_offset);
589-
// footer size BIGINT
590-
chunk.SetValue(4, row_idx, file_stats.footer_size);
595+
// footer size bytes BIGINT
596+
chunk.SetValue(3, row_idx, file_stats.footer_size_bytes);
591597
// column statistics map(varchar, map(varchar, varchar))
592598
map<string, Value> stats;
593599
for (auto &entry : file_stats.column_statistics) {
@@ -612,25 +618,27 @@ void PhysicalCopyToFile::ReturnStatistics(DataChunk &chunk, idx_t row_idx, const
612618
values.emplace_back(std::move(entry.second));
613619
}
614620
auto map_val_type = LogicalType::MAP(LogicalType::VARCHAR, LogicalType::VARCHAR);
615-
chunk.SetValue(5, row_idx, Value::MAP(LogicalType::VARCHAR, map_val_type, std::move(keys), std::move(values)));
621+
chunk.SetValue(4, row_idx, Value::MAP(LogicalType::VARCHAR, map_val_type, std::move(keys), std::move(values)));
622+
623+
// partition_keys map(varchar, varchar)
624+
chunk.SetValue(5, row_idx, info.partition_keys);
616625
}
617626

618627
SourceResultType PhysicalCopyToFile::GetData(ExecutionContext &context, DataChunk &chunk,
619628
OperatorSourceInput &input) const {
620629
auto &g = sink_state->Cast<CopyToFunctionGlobalState>();
621630
if (return_type == CopyFunctionReturnType::WRITTEN_FILE_STATISTICS) {
622631
auto &source_state = input.global_state.Cast<CopyToFileGlobalSourceState>();
623-
idx_t next_end = MinValue<idx_t>(source_state.offset + STANDARD_VECTOR_SIZE, g.file_names.size());
632+
idx_t next_end = MinValue<idx_t>(source_state.offset + STANDARD_VECTOR_SIZE, g.written_files.size());
624633
idx_t count = next_end - source_state.offset;
625634
for (idx_t i = 0; i < count; i++) {
626-
auto &file_entry = g.file_names[source_state.offset + i];
627-
auto &file_stats = *file_entry.file_stats;
628-
ReturnStatistics(chunk, i, file_entry.file_path, file_stats);
635+
auto &file_entry = *g.written_files[source_state.offset + i];
636+
ReturnStatistics(chunk, i, file_entry);
629637
}
630638
chunk.SetCardinality(count);
631639
source_state.offset += count;
632-
return source_state.offset < g.file_names.size() ? SourceResultType::HAVE_MORE_OUTPUT
633-
: SourceResultType::FINISHED;
640+
return source_state.offset < g.written_files.size() ? SourceResultType::HAVE_MORE_OUTPUT
641+
: SourceResultType::FINISHED;
634642
}
635643

636644
chunk.SetCardinality(1);
@@ -641,8 +649,8 @@ SourceResultType PhysicalCopyToFile::GetData(ExecutionContext &context, DataChun
641649
case CopyFunctionReturnType::CHANGED_ROWS_AND_FILE_LIST: {
642650
chunk.SetValue(0, 0, Value::BIGINT(NumericCast<int64_t>(g.rows_copied.load())));
643651
vector<Value> file_name_list;
644-
for (auto &file_names : g.file_names) {
645-
file_name_list.emplace_back(file_names.file_path);
652+
for (auto &file_info : g.written_files) {
653+
file_name_list.emplace_back(file_info->file_path);
646654
}
647655
chunk.SetValue(1, 0, Value::LIST(LogicalType::VARCHAR, std::move(file_name_list)));
648656
break;

src/duckdb/src/function/copy_function.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ vector<string> GetCopyFunctionReturnNames(CopyFunctionReturnType return_type) {
99
case CopyFunctionReturnType::CHANGED_ROWS_AND_FILE_LIST:
1010
return {"Count", "Files"};
1111
case CopyFunctionReturnType::WRITTEN_FILE_STATISTICS:
12-
return {"filename", "count", "file_size_bytes", "footer_offset", "footer_size", "column_statistics"};
12+
return {"filename", "count", "file_size_bytes", "footer_size_bytes", "column_statistics", "partition_keys"};
1313
default:
1414
throw NotImplementedException("Unknown CopyFunctionReturnType");
1515
}
@@ -26,8 +26,8 @@ vector<LogicalType> GetCopyFunctionReturnLogicalTypes(CopyFunctionReturnType ret
2626
LogicalType::UBIGINT,
2727
LogicalType::UBIGINT,
2828
LogicalType::UBIGINT,
29-
LogicalType::UBIGINT,
30-
LogicalType::MAP(LogicalType::VARCHAR, LogicalType::MAP(LogicalType::VARCHAR, LogicalType::VARCHAR))};
29+
LogicalType::MAP(LogicalType::VARCHAR, LogicalType::MAP(LogicalType::VARCHAR, LogicalType::VARCHAR)),
30+
LogicalType::MAP(LogicalType::VARCHAR, LogicalType::VARCHAR)};
3131
default:
3232
throw NotImplementedException("Unknown CopyFunctionReturnType");
3333
}

src/duckdb/src/function/table/version/pragma_version.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#ifndef DUCKDB_PATCH_VERSION
2-
#define DUCKDB_PATCH_VERSION "0-dev1492"
2+
#define DUCKDB_PATCH_VERSION "0-dev1496"
33
#endif
44
#ifndef DUCKDB_MINOR_VERSION
55
#define DUCKDB_MINOR_VERSION 3
@@ -8,10 +8,10 @@
88
#define DUCKDB_MAJOR_VERSION 1
99
#endif
1010
#ifndef DUCKDB_VERSION
11-
#define DUCKDB_VERSION "v1.3.0-dev1492"
11+
#define DUCKDB_VERSION "v1.3.0-dev1496"
1212
#endif
1313
#ifndef DUCKDB_SOURCE_ID
14-
#define DUCKDB_SOURCE_ID "0431406ccc"
14+
#define DUCKDB_SOURCE_ID "db82e0bfcd"
1515
#endif
1616
#include "duckdb/function/table/system_functions.hpp"
1717
#include "duckdb/main/database.hpp"

src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,15 @@
1818

1919
namespace duckdb {
2020

21+
struct CopyToFileInfo {
22+
explicit CopyToFileInfo(string file_path_p) : file_path(std::move(file_path_p)) {
23+
}
24+
25+
string file_path;
26+
unique_ptr<CopyFunctionFileStatistics> file_stats;
27+
Value partition_keys;
28+
};
29+
2130
//! Copy the contents of a query into a table
2231
class PhysicalCopyToFile : public PhysicalOperator {
2332
public:
@@ -82,8 +91,7 @@ class PhysicalCopyToFile : public PhysicalOperator {
8291

8392
string GetTrimmedPath(ClientContext &context) const;
8493

85-
static void ReturnStatistics(DataChunk &chunk, idx_t row_idx, const string &file_name,
86-
CopyFunctionFileStatistics &file_stats);
94+
static void ReturnStatistics(DataChunk &chunk, idx_t row_idx, CopyToFileInfo &written_file_info);
8795

8896
private:
8997
unique_ptr<GlobalFunctionData> CreateFileState(ClientContext &context, GlobalSinkState &sink,

src/duckdb/src/include/duckdb/function/copy_function.hpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,7 @@ vector<LogicalType> GetCopyFunctionReturnLogicalTypes(CopyFunctionReturnType ret
134134
struct CopyFunctionFileStatistics {
135135
idx_t row_count = 0;
136136
idx_t file_size_bytes = 0;
137-
//! Footer offset in the file (in bytes)
138-
Value footer_offset;
139-
//! Footer size (in bytes)
140-
Value footer_size;
137+
Value footer_size_bytes;
141138
// map of column name -> statistics name -> statistics value
142139
case_insensitive_map_t<case_insensitive_map_t<Value>> column_statistics;
143140
};

0 commit comments

Comments
 (0)