Skip to content

Commit 038084c

Browse files
authored
Merge pull request duckdb#879 from pdet/merge
Merge
2 parents 62503ef + 4c9d73e commit 038084c

41 files changed

Lines changed: 1583 additions & 408 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

duckdb

Submodule duckdb updated 390 files

src/functions/ducklake_compaction_functions.cpp

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,17 @@ SourceResultType DuckLakeCompaction::GetDataInternal(ExecutionContext &context,
9393
}
9494
source_state.returned_result = true;
9595

96+
if (!this->sink_state) {
97+
throw InternalException("DuckLakeCompaction - missing sink state while producing result");
98+
}
99+
auto &gstate = this->sink_state->Cast<DuckLakeInsertGlobalState>();
100+
auto files_created = gstate.written_files.size();
101+
96102
chunk.SetCardinality(1);
97103
chunk.SetValue(0, 0, Value(table.schema.name));
98104
chunk.SetValue(1, 0, Value(table.name));
99105
chunk.SetValue(2, 0, Value::BIGINT(static_cast<int64_t>(source_files.size())));
100-
chunk.SetValue(3, 0, Value::BIGINT(1)); // Each compaction creates 1 output file
106+
chunk.SetValue(3, 0, Value::BIGINT(static_cast<int64_t>(files_created)));
101107
return SourceResultType::FINISHED;
102108
}
103109

@@ -121,7 +127,10 @@ SinkFinalizeType DuckLakeCompaction::Finalize(Pipeline &pipeline, Event &event,
121127
OperatorSinkFinalizeInput &input) const {
122128
auto &global_state = input.global_state.Cast<DuckLakeInsertGlobalState>();
123129

124-
if (global_state.written_files.size() != 1) {
130+
if (global_state.written_files.size() > 1) {
131+
throw InternalException("DuckLakeCompaction - expected at most a single output file");
132+
}
133+
if (global_state.written_files.empty() && type != CompactionType::REWRITE_DELETES) {
125134
throw InternalException("DuckLakeCompaction - expected a single output file");
126135
}
127136
// set the partition values correctly
@@ -137,7 +146,9 @@ SinkFinalizeType DuckLakeCompaction::Finalize(Pipeline &pipeline, Event &event,
137146
DuckLakeCompactionEntry compaction_entry;
138147
compaction_entry.row_id_start = row_id_start;
139148
compaction_entry.source_files = source_files;
140-
compaction_entry.written_file = global_state.written_files[0];
149+
if (!global_state.written_files.empty()) {
150+
compaction_entry.written_file = global_state.written_files[0];
151+
}
141152
compaction_entry.type = type;
142153

143154
auto &transaction = DuckLakeTransaction::Get(context, global_state.table.catalog);
@@ -305,6 +316,9 @@ void DuckLakeCompactor::GenerateCompactions(DuckLakeTableEntry &table,
305316
break;
306317
}
307318
}
319+
if (compacted_files >= options.max_files) {
320+
break;
321+
}
308322
}
309323
}
310324

@@ -560,7 +574,7 @@ DuckLakeCompactor::GenerateCompactionCommand(vector<DuckLakeCompactionFileEntry>
560574
copy->filename_pattern = std::move(copy_options.filename_pattern);
561575
copy->file_extension = std::move(copy_options.file_extension);
562576
copy->overwrite_mode = copy_options.overwrite_mode;
563-
copy->per_thread_output = copy_options.per_thread_output;
577+
copy->per_thread_output = false;
564578
copy->file_size_bytes = copy_options.file_size_bytes;
565579
copy->rotate = copy_options.rotate;
566580
copy->return_type = copy_options.return_type;

src/functions/ducklake_flush_inlined_data.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,8 +277,9 @@ DuckLakeDataFlusher::DuckLakeDataFlusher(ClientContext &context, DuckLakeCatalog
277277

278278
unique_ptr<LogicalOperator> DuckLakeDataFlusher::GenerateFlushCommand() {
279279
// get the table entry at the specified snapshot
280-
DuckLakeSnapshot snapshot(catalog.GetBeginSnapshotForTable(table_id, transaction), inlined_table.schema_version, 0,
281-
0);
280+
DuckLakeSnapshot snapshot(
281+
catalog.GetBeginSnapshotForSchemaVersion(table_id, inlined_table.schema_version, transaction),
282+
inlined_table.schema_version, 0, 0);
282283

283284
auto entry = catalog.GetEntryById(transaction, snapshot, table_id);
284285
if (!entry) {

src/include/common/index.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ namespace duckdb {
1515

1616
//! Numeric thresholds shared across DuckLake.
//! NOTE(review): presumably ids at or above these values denote transaction-local entries - confirm against callers
struct DuckLakeConstants {
1717
//! 2^63
static constexpr const idx_t TRANSACTION_LOCAL_ID_START = 9223372036854775808ULL;
18+
//! 10^18
static constexpr const idx_t TRANSACTION_LOCAL_ROW_ID_START = 1000000000000000000ULL;
19+
20+
//! Returns true when rid is non-negative and at or above TRANSACTION_LOCAL_ROW_ID_START.
//! The sign check is evaluated first, so a negative rid is never cast to idx_t.
static bool IsTransactionLocalRowId(int64_t rid) {
21+
return rid >= 0 && static_cast<idx_t>(rid) >= TRANSACTION_LOCAL_ROW_ID_START;
22+
}
1823
};
1924

2025
struct SchemaIndex {

src/include/metadata_manager/postgres_metadata_manager.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ class PostgresMetadataManager : public DuckLakeMetadataManager {
2727
}
2828

2929
string GetColumnTypeInternal(const LogicalType &type) override;
30+
shared_ptr<DuckLakeInlinedData> TransformInlinedData(QueryResult &result,
31+
const vector<LogicalType> &expected_types) override;
3032

3133
unique_ptr<QueryResult> Execute(DuckLakeSnapshot snapshot, string &query) override;
3234

src/include/storage/ducklake_catalog.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ struct DuckLakeConfigOption;
2525
struct DeleteFileMap;
2626
class LogicalGet;
2727

28+
//! Three-state result: EXISTS, DOES_NOT_EXIST, or UNKNOWN.
//! NOTE(review): presumably UNKNOWN means the cache holds no usable answer and the caller must check - confirm
enum class InlinedDeletionCacheResult { EXISTS, DOES_NOT_EXIST, UNKNOWN };
29+
2830
class DuckLakeCatalog : public Catalog {
2931
public:
3032
// default target file size: 512MB
@@ -57,6 +59,8 @@ class DuckLakeCatalog : public Catalog {
5759
return metadata_type;
5860
}
5961
idx_t DataInliningRowLimit(SchemaIndex schema_index, TableIndex table_index) const;
62+
//! Returns the inlining limit (0 if the table is not eligible)
63+
idx_t GetInliningLimit(ClientContext &context, DuckLakeTableEntry &table, const vector<LogicalType> &types);
6064
string &Separator() {
6165
return separator;
6266
}
@@ -164,12 +168,18 @@ class DuckLakeCatalog : public Catalog {
164168
optional_ptr<const DuckLakeNameMap> TryGetMappingById(DuckLakeTransaction &transaction, MappingIndex mapping_id);
165169
MappingIndex TryGetCompatibleNameMap(DuckLakeTransaction &transaction, const DuckLakeNameMap &name_map);
166170
idx_t GetBeginSnapshotForTable(TableIndex table_id, DuckLakeTransaction &transaction);
171+
idx_t GetBeginSnapshotForSchemaVersion(TableIndex table_id, idx_t schema_version, DuckLakeTransaction &transaction);
167172

168173
static unique_ptr<DuckLakeStats> ConstructStatsMap(vector<DuckLakeGlobalStatsInfo> &global_stats,
169174
DuckLakeCatalogSet &schema);
170175
//! Return the schema for the given snapshot - loading it if it is not yet loaded
171176
DuckLakeCatalogSet &GetSchemaForSnapshot(DuckLakeTransaction &transaction, DuckLakeSnapshot snapshot);
172177

178+
//! Check if an inlined deletion table is known to exist or not exist for the given table and snapshot
179+
InlinedDeletionCacheResult CheckInlinedDeletionTableCache(TableIndex table_id, DuckLakeSnapshot snapshot);
180+
//! Cache the result of an inlined deletion table existence check
181+
void CacheInlinedDeletionTableResult(TableIndex table_id, DuckLakeSnapshot snapshot, bool exists);
182+
173183
private:
174184
void DropSchema(ClientContext &context, DropInfo &info) override;
175185
unique_ptr<DuckLakeCatalogSet> LoadSchemaForSnapshot(DuckLakeTransaction &transaction, DuckLakeSnapshot snapshot);
@@ -200,6 +210,13 @@ class DuckLakeCatalog : public Catalog {
200210
string metadata_type;
201211
//! Whether or not the catalog is initialized
202212
bool initialized = false;
213+
//! Cache for inlined deletion table existence checks
214+
mutex inlined_deletion_cache_lock;
215+
//! Table IDs where the inlined deletion table is known to exist (permanent - never invalidated)
216+
unordered_set<idx_t> inlined_deletion_exists;
217+
//! Table IDs where the inlined deletion table is known to NOT exist, with the snapshot_id at which we checked
218+
//! Valid as long as current snapshot.snapshot_id <= cached snapshot_id
219+
unordered_map<idx_t, idx_t> inlined_deletion_not_exists;
203220
//! The id of the last committed snapshot, set at FlushChanges on a successful commit
204221
mutable mutex commit_lock;
205222
optional_idx last_committed_snapshot;

src/include/storage/ducklake_inlined_data.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,16 @@ namespace duckdb {
1717
//! Inlined data for a table: a column data collection, per-field column stats and optional row ids.
struct DuckLakeInlinedData {
1818
//! The inlined rows
unique_ptr<ColumnDataCollection> data;
1919
//! Column statistics keyed by field index
map<FieldIndex, DuckLakeColumnStats> column_stats;
20+
//! Row Ids for update inlining
21+
vector<int64_t> row_ids;
22+
23+
//! NOTE(review): presumably true when row_ids has been populated - confirm against the definition
bool HasPreservedRowIds() const;
24+
//! Get the row_id for a given position in the data collection
25+
idx_t GetRowId(idx_t position) const;
26+
//! Get the output row_id for a surviving (non-deleted) row
27+
int64_t GetOutputRowId(idx_t position) const;
28+
//! Merge preserved row_ids from update inlining
29+
void MergeRowIds(const DuckLakeInlinedData &new_data, idx_t new_data_count);
2030
};
2131

2232
struct DuckLakeInlinedDataDeletes {

src/include/storage/ducklake_metadata_manager.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ class DuckLakeMetadataManager {
145145
DuckLakeSnapshot snapshot,
146146
DuckLakeFileSizeOptions options);
147147
virtual idx_t GetBeginSnapshotForTable(TableIndex table_id);
148+
virtual idx_t GetBeginSnapshotForSchemaVersion(TableIndex table_id, idx_t schema_version);
148149
virtual idx_t GetNetDataFileRowCount(TableIndex table_id, DuckLakeSnapshot snapshot);
149150
virtual idx_t GetNetInlinedRowCount(const string &inlined_table_name, DuckLakeSnapshot snapshot);
150151
virtual vector<DuckLakeFileForCleanup> GetOldFilesForCleanup(const string &filter);
@@ -243,6 +244,7 @@ class DuckLakeMetadataManager {
243244

244245
string LoadPath(string path);
245246
string StorePath(string path);
247+
string GetPathSeparator(const string &path);
246248

247249
protected:
248250
virtual string GetLatestSnapshotQuery() const;
@@ -311,9 +313,10 @@ class DuckLakeMetadataManager {
311313
public:
312314
//! Read inlined file deletions for regular table scans (no snapshot info per row)
313315
map<idx_t, set<idx_t>> ReadInlinedFileDeletions(TableIndex table_id, DuckLakeSnapshot snapshot);
316+
//! Clear inlined table caches (needed after rollback so retry re-creates the tables)
317+
void ClearInlinedTableCaches();
314318

315319
private:
316-
unordered_map<idx_t, string> inlined_table_name_cache;
317320
static unordered_map<string /* name */, create_t> metadata_managers;
318321
static mutex metadata_managers_lock;
319322

src/include/storage/ducklake_multi_file_list.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ namespace duckdb {
2020
//! The DuckLakeMultiFileList implements the MultiFileList API to allow injecting it into the regular DuckDB parquet
2121
//! scan
2222
class DuckLakeMultiFileList : public MultiFileList {
23-
static constexpr const idx_t TRANSACTION_LOCAL_ID_START = 1000000000000000000ULL;
2423
static constexpr const char *DUCKLAKE_TRANSACTION_LOCAL_INLINED_FILENAME =
2524
"__ducklake_inlined_transaction_local_data";
2625

@@ -67,6 +66,8 @@ class DuckLakeMultiFileList : public MultiFileList {
6766
void GetFilesForTable() const;
6867
void GetTableInsertions() const;
6968
void GetTableDeletions() const;
69+
//! Get the row_id_start for transaction-local inlined data.
70+
idx_t GetTransactionLocalRowIdStart(idx_t transaction_row_start) const;
7071
void AddFilterToPushdownInfo(FilterPushdownInfo &pushdown_info, column_t column_id, unique_ptr<TableFilter> filter) const;
7172

7273
private:

src/include/storage/ducklake_stats.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ class BaseStatistics;
1515

1616
//! Returns true for types that require value-based (not lexicographic string) comparison for min/max stats
1717
inline bool RequiresValueComparison(const LogicalType &type) {
18-
// Removed side of the diff: only numeric and temporal types qualified.
return type.IsNumeric() || type.IsTemporal();
18+
// Added side of the diff: BOOLEAN now qualifies as well.
return type.IsNumeric() || type.IsTemporal() || type.id() == LogicalTypeId::BOOLEAN;
1919
}
2020

2121
struct DuckLakeColumnStats;

0 commit comments

Comments
 (0)