Skip to content

Commit 573bb88

Browse files
committed
Merge pull request #7 from spiceai/mach/compound-art-scan
ART Index: Support compound key scans

Squashed commit of the following:

commit a22a430
Author: David Stancu <david@spice.ai>
Date: Wed Oct 29 16:37:30 2025 -0400

    simplify filter expression storage index bindings (just reuse the ones we made earlier), fix single-ref-per-expr predicate to correctly walk expr tree and yank refs (allowing nesting in fns, etc)

commit 9c8c1ed
Author: David Stancu <david@spice.ai>
Date: Wed Oct 29 15:11:23 2025 -0400

    copy index expressions before rewriting column refs

commit aff2c98
Author: David Stancu <david@spice.ai>
Date: Wed Oct 29 14:36:33 2025 -0400

    table scan: rebind projected columns in ALL index exprs
    do not bail out early if more than one index expr
    hook up composite key scan

commit bfc6f02
Author: David Stancu <david@spice.ai>
Date: Wed Oct 29 14:35:09 2025 -0400

    make specialized compound key scan state for eq compares, specialized scan using ARTKey::Concat
1 parent b390a7c commit 573bb88

3 files changed

Lines changed: 160 additions & 51 deletions

File tree

src/execution/index/art/art.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include "duckdb/execution/index/art/art.hpp"
22

3+
#include "duckdb/common/helper.hpp"
4+
#include "duckdb/common/typedefs.hpp"
35
#include "duckdb/common/types/conflict_manager.hpp"
46
#include "duckdb/common/unordered_map.hpp"
57
#include "duckdb/common/vector_operations/vector_operations.hpp"
@@ -39,6 +41,17 @@ struct ARTIndexScanState : public IndexScanState {
3941
set<row_t> row_ids;
4042
};
4143

44+
struct ARTIndexCompoundKeyScanState : public IndexScanState {
45+
//! The predicates to scan.
46+
//! A single predicate for each constituent key in a compound index.
47+
vector<Value> values;
48+
//! The expressions over the scan predicates.
49+
vector<ExpressionType> expressions;
50+
bool checked = false;
51+
//! All scanned row IDs.
52+
set<row_t> row_ids;
53+
};
54+
4255
//===--------------------------------------------------------------------===//
4356
// ART
4457
//===--------------------------------------------------------------------===//
@@ -142,6 +155,34 @@ static unique_ptr<IndexScanState> InitializeScanTwoPredicates(const Value &low_v
142155
return std::move(result);
143156
}
144157

158+
// Build compound scan state by building individual index scans and collecting their exprs/values
159+
unique_ptr<IndexScanState> ART::TryInitializeCompoundKeyScan(const vector<unique_ptr<Expression>> &index_exprs,
160+
vector<vector<unique_ptr<Expression>>> &exprs) {
161+
auto compound_scan_state = make_uniq<ARTIndexCompoundKeyScanState>();
162+
163+
for (idx_t i = 0; i < index_exprs.size(); ++i) {
164+
auto index_expr = &index_exprs[i];
165+
auto filter_exprs = &exprs[i];
166+
167+
for (const auto &filter_expr : *filter_exprs) {
168+
auto single_scan = ART::TryInitializeScan(**index_expr, *filter_expr);
169+
if (!single_scan) {
170+
return nullptr;
171+
}
172+
173+
auto single_scan_concrete = single_scan->Cast<ARTIndexScanState>();
174+
if (single_scan_concrete.expressions[0] != ExpressionType::COMPARE_EQUAL) {
175+
return nullptr;
176+
}
177+
178+
compound_scan_state->values.push_back(single_scan_concrete.values[0]);
179+
compound_scan_state->expressions.push_back(single_scan_concrete.expressions[0]);
180+
}
181+
}
182+
183+
return compound_scan_state;
184+
}
185+
145186
unique_ptr<IndexScanState> ART::TryInitializeScan(const Expression &expr, const Expression &filter_expr) {
146187
Value low_value, high_value, equal_value;
147188
ExpressionType low_comparison_type = ExpressionType::INVALID, high_comparison_type = ExpressionType::INVALID;
@@ -675,6 +716,23 @@ bool ART::SearchCloseRange(ARTKey &lower_bound, ARTKey &upper_bound, bool left_e
675716
return it.Scan(upper_bound, max_count, row_ids, right_equal);
676717
}
677718

719+
//! Performs an equality lookup on a compound key.
//!
//! `state` must be an ARTIndexCompoundKeyScanState. Its values are turned into one ARTKey each
//! (using the index type at the same position), concatenated in order, and looked up with
//! SearchEqual. Up to `max_count` row IDs are added to `row_ids`.
//!
//! Returns the result of SearchEqual, or false without touching `row_ids` if the state does not
//! hold exactly one value per key column.
bool ART::CompoundKeyScan(IndexScanState &state, const idx_t max_count, set<row_t> &row_ids) {
	auto &scan_state = state.Cast<ARTIndexCompoundKeyScanState>();
	auto &key_values = scan_state.values;

	// Checked at runtime rather than only via D_ASSERT: the loop below indexes `types` with the
	// value position, so a mismatching state would read past the key columns in release builds.
	if (key_values.empty() || key_values.size() != types.size()) {
		return false;
	}

	ArenaAllocator arena_allocator(Allocator::Get(db));

	// Make a compound key from the collected state values.
	auto compound_key = ARTKey::CreateKey(arena_allocator, types[0], key_values[0]);
	for (idx_t i = 1; i < key_values.size(); ++i) {
		auto part_key = ARTKey::CreateKey(arena_allocator, types[i], key_values[i]);
		compound_key.Concat(arena_allocator, part_key);
	}

	// The key is built before taking the lock; only the lookup itself runs under it.
	lock_guard<mutex> l(lock);
	return SearchEqual(compound_key, max_count, row_ids);
}
735+
678736
bool ART::Scan(IndexScanState &state, const idx_t max_count, set<row_t> &row_ids) {
679737
auto &scan_state = state.Cast<ARTIndexScanState>();
680738
D_ASSERT(scan_state.values[0].type().InternalType() == types[0]);

src/function/table/table_scan.cpp

Lines changed: 90 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
#include "duckdb/planner/filter/conjunction_filter.hpp"
3131
#include "duckdb/common/types/value_map.hpp"
3232
#include "duckdb/main/settings.hpp"
33+
#include <limits>
3334
#include <list>
35+
#include <utility>
3436

3537
namespace duckdb {
3638

@@ -486,86 +488,123 @@ vector<unique_ptr<Expression>> ExtractFilterExpressions(const ColumnDefinition &
486488

487489
bool TryScanIndex(ART &art, const ColumnList &column_list, TableFunctionInitInput &input, TableFilterSet &filter_set,
488490
idx_t max_count, set<row_t> &row_ids) {
489-
// FIXME: No support for index scans on compound ARTs.
490-
// See note above on multi-filter support.
491-
if (art.unbound_expressions.size() > 1) {
492-
return false;
491+
vector<unique_ptr<Expression>> index_exprs;
492+
for (const auto &expr : art.unbound_expressions) {
493+
index_exprs.push_back(expr->Copy());
493494
}
494495

495-
auto index_expr = art.unbound_expressions[0]->Copy();
496496
auto &indexed_columns = art.GetColumnIds();
497497

498-
// NOTE: We do not push down multi-column filters, e.g., 42 = a + b.
499-
if (indexed_columns.size() != 1) {
498+
// Allow composite ART scans
499+
if (indexed_columns.size() != index_exprs.size()) {
500500
return false;
501501
}
502502

503-
// Resolve bound column references in the index_expr against the current input projection
504-
column_t updated_index_column;
505-
bool found_index_column_in_input = false;
503+
// ...only if each expression has a single column reference, and
504+
// that single column reference positionally matches that of the index (this is guaranteed? paranoid?)
505+
// NOTE: We do not push down multi-column filters, e.g., 42 = a + b.
506+
for (idx_t i = 0; i < index_exprs.size(); ++i) {
507+
unordered_set<column_t> referenced_columns;
508+
auto expr = &index_exprs[i];
509+
510+
// Walk the expr in case of nesting (e.g. function)
511+
ExpressionIterator::EnumerateExpression(*expr, [&](Expression &child_expr) {
512+
if (child_expr.GetExpressionClass() == ExpressionClass::BOUND_COLUMN_REF) {
513+
auto &col_ref = child_expr.Cast<BoundColumnRefExpression>();
514+
referenced_columns.insert(col_ref.binding.column_index);
515+
}
516+
});
506517

507-
// Find the indexed column amongst the input columns
508-
for (idx_t i = 0; i < input.column_ids.size(); ++i) {
509-
if (input.column_ids[i] == indexed_columns[0]) {
510-
updated_index_column = i;
511-
found_index_column_in_input = true;
512-
break;
518+
if (referenced_columns.size() != 1 || *referenced_columns.begin() != indexed_columns[i]) {
519+
return false;
513520
}
514521
}
515522

516-
// If found, update the bound column ref within index_expr
517-
if (found_index_column_in_input) {
518-
ExpressionIterator::EnumerateExpression(index_expr, [&](Expression &expr) {
519-
if (expr.GetExpressionClass() != ExpressionClass::BOUND_COLUMN_REF) {
520-
return;
521-
}
523+
// Resolve bound column references in the index_expr against the current input projection
524+
vector<column_t> index_column_to_proj_pos;
525+
index_column_to_proj_pos.resize(indexed_columns.size(), std::numeric_limits<idx_t>::max());
522526

523-
auto &bound_column_ref_expr = expr.Cast<BoundColumnRefExpression>();
527+
bool found_index_column_in_input = false;
524528

525-
// If the bound column references the index column, use updated_index_column
526-
if (bound_column_ref_expr.binding.column_index == indexed_columns[0]) {
527-
bound_column_ref_expr.binding.column_index = updated_index_column;
529+
// Associate indexed columns to input columns
530+
for (idx_t i = 0; i < indexed_columns.size(); ++i) {
531+
for (idx_t j = 0; j < input.column_ids.size(); ++j) {
532+
if (indexed_columns[i] == input.column_ids[j]) {
533+
index_column_to_proj_pos.at(i) = j;
534+
found_index_column_in_input = true;
528535
}
529-
});
536+
}
530537
}
531538

532-
// Get ART column.
533-
auto &col = column_list.GetColumn(LogicalIndex(indexed_columns[0]));
534-
535-
// The indexes of the filters match input.column_indexes, which are: i -> column_index.
536-
// Try to find a filter on the ART column.
537-
optional_idx storage_index;
538-
for (idx_t i = 0; i < input.column_indexes.size(); i++) {
539-
if (input.column_indexes[i].ToLogical() == col.Logical()) {
540-
storage_index = i;
541-
break;
539+
// If found, update the bound column refs within all index_exprs
540+
if (found_index_column_in_input) {
541+
for (auto &index_expr : index_exprs) {
542+
ExpressionIterator::EnumerateExpression(index_expr, [&](Expression &expr) {
543+
if (expr.GetExpressionClass() != ExpressionClass::BOUND_COLUMN_REF) {
544+
return;
545+
}
546+
547+
auto &bound_column_ref_expr = expr.Cast<BoundColumnRefExpression>();
548+
549+
// If the bound column references an indexed column, update it
550+
for (idx_t i = 0; i < indexed_columns.size(); ++i) {
551+
if (bound_column_ref_expr.binding.column_index == indexed_columns[i]) {
552+
bound_column_ref_expr.binding.column_index = index_column_to_proj_pos[i];
553+
break;
554+
}
555+
}
556+
});
542557
}
543558
}
544559

545-
// No filter matches the ART column.
546-
if (!storage_index.IsValid()) {
547-
return false;
560+
// The indexes of the filters match input.column_indexes, which are: i -> column_index.
561+
// Reuse the index <-> projection mappings from index expr rebinding
562+
vector<vector<unique_ptr<Expression>>> filters;
563+
564+
for (idx_t i = 0; i < index_column_to_proj_pos.size(); ++i) {
565+
auto column_def = &column_list.GetColumn(LogicalIndex(indexed_columns[i]));
566+
auto maybe_filter = filter_set.filters.find(index_column_to_proj_pos[i]);
567+
if (maybe_filter != filter_set.filters.end()) {
568+
auto filter = &maybe_filter->second;
569+
auto filter_expressions = ExtractFilterExpressions(*column_def, *filter, index_column_to_proj_pos[i]);
570+
571+
filters.push_back(std::move(filter_expressions));
572+
}
548573
}
549574

550-
// Try to find a matching filter for the column.
551-
auto filter = filter_set.filters.find(storage_index.GetIndex());
552-
if (filter == filter_set.filters.end()) {
575+
// Filters must match ART columns 1:1
576+
if (filters.size() != indexed_columns.size() || filters.empty()) {
553577
return false;
554578
}
555579

556-
auto expressions = ExtractFilterExpressions(col, filter->second, storage_index.GetIndex());
557-
for (const auto &filter_expr : expressions) {
558-
auto scan_state = art.TryInitializeScan(*index_expr, *filter_expr);
580+
// Do a compound scan if we have filter exprs bound for several columns
581+
if (filters.size() > 1) {
582+
auto scan_state = art.TryInitializeCompoundKeyScan(index_exprs, filters);
559583
if (!scan_state) {
560584
return false;
561585
}
562586

563-
// Check if we can use an index scan, and already retrieve the matching row ids.
564-
if (!art.Scan(*scan_state, max_count, row_ids)) {
587+
if (!art.CompoundKeyScan(*scan_state, max_count, row_ids)) {
565588
row_ids.clear();
566589
return false;
567590
}
568591
}
592+
// Original single column index scan
593+
else {
594+
for (const auto &filter_expr : filters[0]) {
595+
auto scan_state = art.TryInitializeScan(*index_exprs[0], *filter_expr);
596+
if (!scan_state) {
597+
return false;
598+
}
599+
600+
// Check if we can use an index scan, and already retrieve the matching row ids.
601+
if (!art.Scan(*scan_state, max_count, row_ids)) {
602+
row_ids.clear();
603+
return false;
604+
}
605+
}
606+
}
607+
569608
return true;
570609
}
571610

@@ -588,9 +627,9 @@ unique_ptr<GlobalTableFunctionState> TableScanInitGlobal(ClientContext &context,
588627
// 1.2. Find + scan one ART for b = 24.
589628
// 1.3. Return the intersecting row IDs.
590629
// 2. (Reorder and) scan a single ART with a compound key of (a, b).
591-
if (filter_set.filters.size() != 1) {
592-
return DuckTableScanInitGlobal(context, input, storage, bind_data);
593-
}
630+
// if (filter_set.filters.size() != 1) {
631+
// return DuckTableScanInitGlobal(context, input, storage, bind_data);
632+
// }
594633

595634
// The checkpoint lock ensures that we do not checkpoint while scanning this table.
596635
auto &transaction = DuckTransaction::Get(context, storage.db);

src/include/duckdb/execution/index/art/art.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "duckdb/execution/index/bound_index.hpp"
1212
#include "duckdb/execution/index/art/node.hpp"
1313
#include "duckdb/common/array.hpp"
14+
#include "duckdb/planner/expression.hpp"
1415

1516
namespace duckdb {
1617

@@ -24,6 +25,7 @@ class ARTKeySection;
2425
class FixedSizeAllocator;
2526

2627
struct ARTIndexScanState;
28+
struct ARTIndexCompoundKeyScanState;
2729

2830
class ART : public BoundIndex {
2931
public:
@@ -70,10 +72,20 @@ class ART : public BoundIndex {
7072
public:
7173
//! Try to initialize a scan on the ART with the given expression and filter.
7274
unique_ptr<IndexScanState> TryInitializeScan(const Expression &expr, const Expression &filter_expr);
75+
76+
//! Try to initialize a compound key scan on the ART, using the given index expr -> filter expr mappings.
77+
//! Supports equality comparisons only.
78+
unique_ptr<IndexScanState> TryInitializeCompoundKeyScan(const vector<unique_ptr<Expression>> &index_exprs,
79+
vector<vector<unique_ptr<Expression>>> &exprs);
80+
7381
//! Perform a lookup on the ART, fetching up to max_count row IDs.
7482
//! If all row IDs were fetched, it return true, else false.
7583
bool Scan(IndexScanState &state, idx_t max_count, set<row_t> &row_ids);
7684

85+
//! Like `ART::Scan`, but uses `ARTIndexCompoundKeyScanState` to concatenate multiple
86+
//! values for equality comparisons only.
87+
bool CompoundKeyScan(IndexScanState &state, idx_t max_count, set<row_t> &row_ids);
88+
7789
//! Appends data to the locked index.
7890
ErrorData Append(IndexLock &l, DataChunk &chunk, Vector &row_ids) override;
7991
//! Appends data to the locked index and verifies constraint violations.

0 commit comments

Comments
 (0)