Skip to content

Commit 58e70bd

Browse files
authored
Merge pull request #12 from spiceai/peasee/20260118-duckdb-update
dependencies: Cherry-pick updates for DuckDB 1.4.3
2 parents d1dc88f + 14825ad commit 58e70bd

6 files changed

Lines changed: 250 additions & 73 deletions

File tree

src/execution/index/art/art.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
#include "duckdb/execution/index/art/art.hpp"
22

3+
#include "duckdb/common/assert.hpp"
4+
#include "duckdb/common/helper.hpp"
5+
#include "duckdb/common/typedefs.hpp"
36
#include "duckdb/common/types/conflict_manager.hpp"
47
#include "duckdb/common/unordered_map.hpp"
58
#include "duckdb/common/vector_operations/vector_operations.hpp"
@@ -39,6 +42,17 @@ struct ARTIndexScanState : public IndexScanState {
3942
set<row_t> row_ids;
4043
};
4144

45+
struct ARTIndexCompoundKeyScanState : public IndexScanState {
46+
//! The predicates to scan.
47+
//! A single predicate for each constituent key in a compound index.
48+
vector<Value> values;
49+
//! The expressions over the scan predicates.
50+
vector<ExpressionType> expressions;
51+
bool checked = false;
52+
//! All scanned row IDs.
53+
set<row_t> row_ids;
54+
};
55+
4256
//===--------------------------------------------------------------------===//
4357
// ART
4458
//===--------------------------------------------------------------------===//
@@ -142,6 +156,34 @@ static unique_ptr<IndexScanState> InitializeScanTwoPredicates(const Value &low_v
142156
return std::move(result);
143157
}
144158

159+
// Build compound scan state by building individual index scans and collecting their exprs/values
160+
unique_ptr<IndexScanState> ART::TryInitializeCompoundKeyScan(const vector<unique_ptr<Expression>> &index_exprs,
161+
vector<vector<unique_ptr<Expression>>> &exprs) {
162+
auto compound_scan_state = make_uniq<ARTIndexCompoundKeyScanState>();
163+
164+
for (idx_t i = 0; i < index_exprs.size(); ++i) {
165+
auto index_expr = &index_exprs[i];
166+
auto filter_exprs = &exprs[i];
167+
168+
for (const auto &filter_expr : *filter_exprs) {
169+
auto single_scan = ART::TryInitializeScan(**index_expr, *filter_expr);
170+
if (!single_scan) {
171+
return nullptr;
172+
}
173+
174+
auto single_scan_concrete = single_scan->Cast<ARTIndexScanState>();
175+
if (single_scan_concrete.expressions[0] != ExpressionType::COMPARE_EQUAL) {
176+
return nullptr;
177+
}
178+
179+
compound_scan_state->values.push_back(single_scan_concrete.values[0]);
180+
compound_scan_state->expressions.push_back(single_scan_concrete.expressions[0]);
181+
}
182+
}
183+
184+
return std::move(compound_scan_state);
185+
}
186+
145187
unique_ptr<IndexScanState> ART::TryInitializeScan(const Expression &expr, const Expression &filter_expr) {
146188
Value low_value, high_value, equal_value;
147189
ExpressionType low_comparison_type = ExpressionType::INVALID, high_comparison_type = ExpressionType::INVALID;
@@ -678,6 +720,30 @@ bool ART::SearchCloseRange(ARTKey &lower_bound, ARTKey &upper_bound, bool left_e
678720
return it.Scan(upper_bound, max_count, row_ids, right_equal);
679721
}
680722

723+
//! Equality lookup on a compound ART key.
//!
//! `state` must be an ARTIndexCompoundKeyScanState. Its values are turned into one ARTKey each
//! (using the index's per-column physical types), concatenated in order, and the resulting key is
//! looked up with SearchEqual while holding the index lock.
//!
//! Returns false without searching when the number of collected values differs from the number of
//! index key columns; otherwise returns the result of SearchEqual, which receives max_count and
//! row_ids unchanged.
bool ART::CompoundKeyScan(IndexScanState &state, const idx_t max_count, set<row_t> &row_ids) {
	auto &compound_state = state.Cast<ARTIndexCompoundKeyScanState>();
	auto &key_values = compound_state.values;
	const idx_t key_part_count = key_values.size();

	// A full compound key needs exactly one value per indexed column.
	if (key_part_count != types.size()) {
		return false;
	}

	for (idx_t part = 0; part < key_part_count; ++part) {
		D_ASSERT(key_values[part].type().InternalType() == types[part]);
	}

	ArenaAllocator arena_allocator(Allocator::Get(db));

	// Start from the key of the first column and append the keys of the remaining columns.
	auto compound_key = ARTKey::CreateKey(arena_allocator, types[0], key_values[0]);
	idx_t part = 1;
	while (part < key_part_count) {
		auto part_key = ARTKey::CreateKey(arena_allocator, types[part], key_values[part]);
		compound_key.Concat(arena_allocator, part_key);
		++part;
	}

	// Key construction happens before the lock is taken; only the tree search runs under it.
	lock_guard<mutex> l(lock);
	return SearchEqual(compound_key, max_count, row_ids);
}
746+
681747
bool ART::Scan(IndexScanState &state, const idx_t max_count, set<row_t> &row_ids) {
682748
auto &scan_state = state.Cast<ARTIndexScanState>();
683749
D_ASSERT(scan_state.values[0].type().InternalType() == types[0]);

src/function/table/table_scan.cpp

Lines changed: 124 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
#include "duckdb/planner/filter/conjunction_filter.hpp"
3131
#include "duckdb/common/types/value_map.hpp"
3232
#include "duckdb/main/settings.hpp"
33+
#include <limits>
3334
#include <list>
35+
#include <utility>
3436

3537
namespace duckdb {
3638

@@ -520,86 +522,158 @@ vector<unique_ptr<Expression>> ExtractFilterExpressions(const ColumnDefinition &
520522

521523
bool TryScanIndex(ART &art, const ColumnList &column_list, TableFunctionInitInput &input, TableFilterSet &filter_set,
522524
idx_t max_count, set<row_t> &row_ids) {
523-
// FIXME: No support for index scans on compound ARTs.
524-
// See note above on multi-filter support.
525-
if (art.unbound_expressions.size() > 1) {
526-
return false;
525+
vector<unique_ptr<Expression>> index_exprs;
526+
for (const auto &expr : art.unbound_expressions) {
527+
index_exprs.push_back(expr->Copy());
527528
}
528529

529-
auto index_expr = art.unbound_expressions[0]->Copy();
530+
// If this is a view, the column IDs are (may be?) relative to the view projection
530531
auto &indexed_columns = art.GetColumnIds();
531532

532-
// NOTE: We do not push down multi-column filters, e.g., 42 = a + b.
533-
if (indexed_columns.size() != 1) {
533+
// Allow composite ART scans
534+
if (indexed_columns.size() != index_exprs.size()) {
534535
return false;
535536
}
536537

537538
// Resolve bound column references in the index_expr against the current input projection
538-
column_t updated_index_column;
539-
bool found_index_column_in_input = false;
540-
541-
// Find the indexed column amongst the input columns
542-
for (idx_t i = 0; i < input.column_ids.size(); ++i) {
543-
if (input.column_ids[i] == indexed_columns[0]) {
544-
updated_index_column = i;
545-
found_index_column_in_input = true;
546-
break;
539+
bool rewrite_index_exprs = false;
540+
vector<column_t> index_column_to_input_pos;
541+
index_column_to_input_pos.resize(indexed_columns.size(), std::numeric_limits<idx_t>::max());
542+
543+
// Associate indexed columns to input columns
544+
for (idx_t i = 0; i < indexed_columns.size(); ++i) {
545+
for (idx_t j = 0; j < input.column_ids.size(); ++j) {
546+
if (indexed_columns[i] == input.column_ids[j]) {
547+
rewrite_index_exprs = i != j;
548+
index_column_to_input_pos.at(i) = j;
549+
break;
550+
}
547551
}
548552
}
549553

550-
// If found, update the bound column ref within index_expr
551-
if (found_index_column_in_input) {
552-
ExpressionIterator::EnumerateExpression(index_expr, [&](Expression &expr) {
553-
if (expr.GetExpressionClass() != ExpressionClass::BOUND_COLUMN_REF) {
554-
return;
555-
}
556-
557-
auto &bound_column_ref_expr = expr.Cast<BoundColumnRefExpression>();
554+
// Make sure that all indexed_columns were bound, or bail out
555+
for (auto col : index_column_to_input_pos) {
556+
if (col == std::numeric_limits<idx_t>::max()) {
557+
return false;
558+
}
559+
}
558560

559-
// If the bound column references the index column, use updated_index_column
560-
if (bound_column_ref_expr.binding.column_index == indexed_columns[0]) {
561-
bound_column_ref_expr.binding.column_index = updated_index_column;
561+
// Allow scan only if index expressions reference ONE column each, and that column
562+
// is associated with an indexed_column
563+
// NOTE: We do not push down multi-column filters, e.g., 42 = a + b.
564+
for (idx_t i = 0; i < index_exprs.size(); ++i) {
565+
unordered_set<column_t> referenced_columns;
566+
auto expr = &index_exprs[i];
567+
568+
// Walk the expr in case of nesting (e.g. function)
569+
ExpressionIterator::EnumerateExpression(*expr, [&](Expression &child_expr) {
570+
if (child_expr.GetExpressionClass() == ExpressionClass::BOUND_COLUMN_REF) {
571+
auto &col_ref = child_expr.Cast<BoundColumnRefExpression>();
572+
referenced_columns.insert(col_ref.binding.column_index);
562573
}
563574
});
575+
576+
if (referenced_columns.size() != 1) {
577+
return false;
578+
}
579+
580+
// Make sure the column reference can be looked up
581+
auto ref_col_idx = *referenced_columns.begin();
582+
if (ref_col_idx >= index_column_to_input_pos.size() || ref_col_idx >= input.column_ids.size()) {
583+
return false;
584+
}
585+
586+
// The column for this position matches the indexed_column ID for this position directly
587+
auto direct_match = input.column_ids[ref_col_idx] == indexed_columns[i];
588+
589+
// We should know if there is a different mapping for this reference.
590+
// If there is not, it won't match, so it is not worth trying.
591+
if (!direct_match && !rewrite_index_exprs) {
592+
return false;
593+
}
594+
595+
auto remapped_cid_position = index_column_to_input_pos[ref_col_idx];
596+
auto remapped_match = remapped_cid_position < input.column_ids.size() &&
597+
input.column_ids[remapped_cid_position] == indexed_columns[i];
598+
599+
if (!(direct_match || remapped_match)) {
600+
return false;
601+
}
564602
}
565603

566-
// Get ART column.
567-
auto &col = column_list.GetColumn(LogicalIndex(indexed_columns[0]));
604+
// If the position of the indexed_columns differs from the order of the input, remap the index expressions
605+
if (rewrite_index_exprs) {
606+
for (auto &index_expr : index_exprs) {
607+
ExpressionIterator::EnumerateExpression(index_expr, [&](Expression &expr) {
608+
if (expr.GetExpressionClass() != ExpressionClass::BOUND_COLUMN_REF) {
609+
return;
610+
}
611+
612+
auto &bound_column_ref_expr = expr.Cast<BoundColumnRefExpression>();
568613

569-
// The indexes of the filters match input.column_indexes, which are: i -> column_index.
570-
// Try to find a filter on the ART column.
571-
optional_idx storage_index;
572-
for (idx_t i = 0; i < input.column_indexes.size(); i++) {
573-
if (input.column_indexes[i].ToLogical() == col.Logical()) {
574-
storage_index = i;
575-
break;
614+
// If the bound column references an indexed column, update it
615+
for (idx_t i = 0; i < indexed_columns.size(); ++i) {
616+
auto remapped_index = index_column_to_input_pos[bound_column_ref_expr.binding.column_index];
617+
if (input.column_ids[remapped_index] == indexed_columns[i]) {
618+
bound_column_ref_expr.binding.column_index = index_column_to_input_pos[i];
619+
break;
620+
}
621+
}
622+
});
576623
}
577624
}
578625

579-
// No filter matches the ART column.
580-
if (!storage_index.IsValid()) {
581-
return false;
626+
// The indexes of the filters match input.column_indexes, which are: i -> column_index.
627+
// Reuse the index <-> projection mappings from index expr rebinding (which are canonical even if not rewriting)
628+
vector<vector<unique_ptr<Expression>>> index_filters;
629+
630+
for (idx_t i = 0; i < index_column_to_input_pos.size(); ++i) {
631+
auto column_def = &column_list.GetColumn(LogicalIndex(indexed_columns[i]));
632+
auto maybe_filter = filter_set.filters.find(index_column_to_input_pos[i]);
633+
if (maybe_filter != filter_set.filters.end()) {
634+
auto filter = &maybe_filter->second;
635+
auto filter_expressions = ExtractFilterExpressions(*column_def, *filter, index_column_to_input_pos[i]);
636+
637+
index_filters.push_back(std::move(filter_expressions));
638+
}
582639
}
583640

584-
// Try to find a matching filter for the column.
585-
auto filter = filter_set.filters.find(storage_index.GetIndex());
586-
if (filter == filter_set.filters.end()) {
641+
// Index filters must:
642+
// - Match ART column count 1:1
643+
// - Match filter expression set 1:1 (there may be filters on non-indexed columns, bail out if so)
644+
if (index_filters.size() != indexed_columns.size() || filter_set.filters.size() != index_filters.size() ||
645+
index_filters.empty()) {
587646
return false;
588647
}
589648

590-
auto expressions = ExtractFilterExpressions(col, filter->second, storage_index.GetIndex());
591-
for (const auto &filter_expr : expressions) {
592-
auto scan_state = art.TryInitializeScan(*index_expr, *filter_expr);
649+
// Do a compound scan if we have filter exprs bound for several columns
650+
if (index_filters.size() > 1) {
651+
auto scan_state = art.TryInitializeCompoundKeyScan(index_exprs, index_filters);
593652
if (!scan_state) {
594653
return false;
595654
}
596655

597-
// Check if we can use an index scan, and already retrieve the matching row ids.
598-
if (!art.Scan(*scan_state, max_count, row_ids)) {
656+
if (!art.CompoundKeyScan(*scan_state, max_count, row_ids)) {
599657
row_ids.clear();
600658
return false;
601659
}
602660
}
661+
// Original single column index scan
662+
else {
663+
for (const auto &filter_expr : index_filters[0]) {
664+
auto scan_state = art.TryInitializeScan(*index_exprs[0], *filter_expr);
665+
if (!scan_state) {
666+
return false;
667+
}
668+
669+
// Check if we can use an index scan, and already retrieve the matching row ids.
670+
if (!art.Scan(*scan_state, max_count, row_ids)) {
671+
row_ids.clear();
672+
return false;
673+
}
674+
}
675+
}
676+
603677
return true;
604678
}
605679

@@ -622,9 +696,9 @@ unique_ptr<GlobalTableFunctionState> TableScanInitGlobal(ClientContext &context,
622696
// 1.2. Find + scan one ART for b = 24.
623697
// 1.3. Return the intersecting row IDs.
624698
// 2. (Reorder and) scan a single ART with a compound key of (a, b).
625-
if (filter_set.filters.size() != 1) {
626-
return DuckTableScanInitGlobal(context, input, storage, bind_data);
627-
}
699+
// if (filter_set.filters.size() != 1) {
700+
// return DuckTableScanInitGlobal(context, input, storage, bind_data);
701+
// }
628702

629703
// The checkpoint lock ensures that we do not checkpoint while scanning this table.
630704
auto &transaction = DuckTransaction::Get(context, storage.db);

src/include/duckdb/execution/index/art/art.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "duckdb/execution/index/bound_index.hpp"
1212
#include "duckdb/execution/index/art/node.hpp"
1313
#include "duckdb/common/array.hpp"
14+
#include "duckdb/planner/expression.hpp"
1415

1516
namespace duckdb {
1617

@@ -24,6 +25,7 @@ class ARTKeySection;
2425
class FixedSizeAllocator;
2526

2627
struct ARTIndexScanState;
28+
struct ARTIndexCompoundKeyScanState;
2729

2830
class ART : public BoundIndex {
2931
public:
@@ -70,10 +72,20 @@ class ART : public BoundIndex {
7072
public:
7173
//! Try to initialize a scan on the ART with the given expression and filter.
7274
unique_ptr<IndexScanState> TryInitializeScan(const Expression &expr, const Expression &filter_expr);
75+
76+
//! Try to initialize a compound key scan on the ART, using the given index expr -> filter expr mappings.
77+
//! Supports equality comparisons only.
78+
unique_ptr<IndexScanState> TryInitializeCompoundKeyScan(const vector<unique_ptr<Expression>> &index_exprs,
79+
vector<vector<unique_ptr<Expression>>> &exprs);
80+
7381
//! Perform a lookup on the ART, fetching up to max_count row IDs.
7482
//! If all row IDs were fetched, it return true, else false.
7583
bool Scan(IndexScanState &state, idx_t max_count, set<row_t> &row_ids);
7684

85+
//! Like `ART::Scan`, but uses `ARTIndexCompoundKeyScanState` to concatenate multiple
86+
//! values for equality comparisons only.
87+
bool CompoundKeyScan(IndexScanState &state, idx_t max_count, set<row_t> &row_ids);
88+
7789
//! Appends data to the locked index.
7890
ErrorData Append(IndexLock &l, DataChunk &chunk, Vector &row_ids) override;
7991
//! Appends data to the locked index and verifies constraint violations.

0 commit comments

Comments
 (0)