Skip to content

Commit e63d4bb

Browse files
mikewhb and arrowbowang authored
feat(resolver): add extensible Lance dataset resolver framework (#144)
* feat(resolver): add extensible Lance dataset resolver framework

  This commit introduces a modular and extensible architecture for resolving Lance dataset URIs from table names or file paths.

  - src/include/lance_resolver.hpp: Public API definitions
  - src/lance_resolver.cpp: Implementation
  - LanceResolvePolicy: Enum for resolution behavior (STRICT/FALLBACK_TO_PATH)
  - ILanceDatasetResolver: Interface for custom resolvers (third-party extensible)
  - LanceDatasetResolverRegistry: Singleton registry managing all resolvers
  - DefaultCatalogResolver: Default DuckDB catalog-based resolver
  - lance_metadata.cpp: Refactored to use ResolveLanceDatasetUri() with STRICT policy
  - lance_search.cpp: Refactored to use ResolveLanceDatasetUri() with FALLBACK_TO_PATH policy
    - lance_fts, lance_vector_search, lance_hybrid_search now support table names
  - lance_index.cpp: Added support for short-form table names in CREATE INDEX statements
    - Now supports: CREATE INDEX text_idx ON search_table (address) USING INVERTED;
    - Previously required the fully qualified name: lance_ns.main.search_table
    - Uses DatabaseManager::GetDefaultDatabase() to fill default catalog when not specified
    - Uses DEFAULT_SCHEMA ("main") when schema is not specified
  - CMakeLists.txt: Added lance_resolver.cpp to build sources
  - lance_resolver.hpp: Added explicit constructors for C++11 compatibility
  - Unified API for dataset path resolution across all Lance functions
  - Third-party extensions can implement ILanceDatasetResolver to customize table name resolution without modifying lance-duckdb source code
  - Supports priority-based resolver chaining (higher priority runs first)
  - CREATE INDEX syntax now more user-friendly with short table name support

* fix format-check
* update sqllogictests
* fix PR comments on: search_functions.test

---------

Co-authored-by: arrowbowang <arrowbowang@tencent.com>
1 parent 592b62a commit e63d4bb

9 files changed

Lines changed: 469 additions & 39 deletions

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ set(EXTENSION_SOURCES src/lance_extension.cpp src/lance_scan.cpp
2323
src/lance_update.cpp
2424
src/lance_write.cpp
2525
src/lance_truncate.cpp
26-
src/lance_index.cpp)
26+
src/lance_index.cpp
27+
src/lance_resolver.cpp)
2728

2829
build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
2930
build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})

src/include/lance_resolver.hpp

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#pragma once
2+
3+
#include "duckdb.hpp"
4+
#include "duckdb/common/mutex.hpp"
5+
6+
#include <memory>
7+
#include <string>
8+
#include <vector>
9+
10+
namespace duckdb {
11+
12+
// Forward declaration
13+
class LanceTableEntry;
14+
15+
//===--------------------------------------------------------------------===//
16+
// Policy for dataset path resolution behavior
17+
//===--------------------------------------------------------------------===//
18+
// Selects how an input (table name or file path) is turned into a dataset URI.
// Stored as a single byte (uint8_t underlying type).
// NOTE(review): the enforcement of each policy lives in the resolver
// implementation, which is not visible in this header.
enum class LanceResolvePolicy : uint8_t {
19+
// Must be a Lance table, throw exception if not found or not Lance table
20+
STRICT = 0,
21+
// First try catalog lookup, fallback to file path if failed
22+
FALLBACK_TO_PATH = 1,
23+
};
24+
25+
//===--------------------------------------------------------------------===//
26+
// Result of dataset resolution
27+
//===--------------------------------------------------------------------===//
28+
// Outcome of one resolution attempt. Carries a success flag plus either the
// resolved dataset URI or an error message; prefer the Success()/Failure()
// factories over filling the fields by hand.
struct LanceResolveResult {
29+
// True when resolution produced a dataset URI.
bool success;
30+
// Resolved URI; Failure() leaves this empty.
string dataset_uri;
31+
// Failure description; Success() leaves this empty.
string error_message;
32+
33+
// Explicit constructor for C++11 compatibility
// Default state is "not successful" with empty strings.
34+
LanceResolveResult() : success(false), dataset_uri(), error_message() {}
35+
36+
// Field-wise constructor; string arguments are taken by value and moved in.
LanceResolveResult(bool success_, string uri, string error)
37+
: success(success_), dataset_uri(std::move(uri)),
38+
error_message(std::move(error)) {}
39+
40+
// Builds a successful result holding `uri` and an empty error message.
static LanceResolveResult Success(string uri) {
41+
return LanceResolveResult(true, std::move(uri), "");
42+
}
43+
44+
// Builds a failed result holding `error` and an empty dataset URI.
static LanceResolveResult Failure(string error) {
45+
return LanceResolveResult(false, "", std::move(error));
46+
}
47+
};
48+
49+
//===--------------------------------------------------------------------===//
50+
// Interface for Lance dataset resolver
51+
// Third-party can implement this to provide custom resolution logic
52+
//===--------------------------------------------------------------------===//
53+
// Abstract resolver interface: implementers must provide TryResolve() and
// Name(); Priority() is optional and defaults to 0.
class ILanceDatasetResolver {
54+
public:
55+
// Virtual so implementations can be destroyed through a base pointer.
virtual ~ILanceDatasetResolver() = default;
56+
57+
//! Try to resolve the input string to a Lance dataset URI
58+
//! Returns LanceResolveResult indicating success/failure
59+
virtual LanceResolveResult TryResolve(ClientContext &context,
60+
const string &input) = 0;
61+
62+
//! Priority of this resolver (higher priority runs first)
63+
//! Default implementation has priority 0
64+
//! Third-party can use higher priority to override default behavior
65+
//! or use negative priority to serve as fallback
66+
virtual int Priority() const { return 0; }
67+
68+
//! Name of this resolver for debugging and unregistration
69+
virtual string Name() const = 0;
70+
};
71+
72+
//===--------------------------------------------------------------------===//
73+
// Registry for dataset resolvers (singleton pattern)
74+
//===--------------------------------------------------------------------===//
75+
// Holds the registered ILanceDatasetResolver instances and exposes the
// Resolve() entry point. Instances are obtained only through Get(): the
// constructor and destructor are private and copying is deleted.
class LanceDatasetResolverRegistry {
76+
public:
77+
//! Get the singleton instance
78+
static LanceDatasetResolverRegistry &Get();
79+
80+
//! Register a custom resolver
81+
//! Resolvers are sorted by priority (descending) after registration
82+
void RegisterResolver(shared_ptr<ILanceDatasetResolver> resolver);
83+
84+
//! Unregister a resolver by name
85+
//! Returns true if a resolver was removed
86+
bool UnregisterResolver(const string &name);
87+
88+
//! Get all registered resolver names (for debugging)
89+
vector<string> GetResolverNames() const;
90+
91+
//! Main resolution function
92+
//! This is the public API that should be called by all Lance functions
93+
//! @param context The client context
94+
//! @param input The input value (table name or file path)
95+
//! @param policy The resolution policy (STRICT or FALLBACK_TO_PATH)
96+
//! @param function_name The name of the calling function (for error messages)
97+
//! @return The resolved dataset URI
98+
string Resolve(ClientContext &context, const Value &input,
99+
LanceResolvePolicy policy, const string &function_name);
100+
101+
private:
102+
// Private: construction is reserved for the class itself (see Get()).
// NOTE(review): the constructor body is defined elsewhere; whether it
// pre-registers a default resolver cannot be confirmed from this header.
LanceDatasetResolverRegistry();
103+
~LanceDatasetResolverRegistry() = default;
104+
105+
// Non-copyable
106+
LanceDatasetResolverRegistry(const LanceDatasetResolverRegistry &) = delete;
107+
LanceDatasetResolverRegistry &
108+
operator=(const LanceDatasetResolverRegistry &) = delete;
109+
110+
//! Sort resolvers by priority (descending)
111+
void SortResolvers();
112+
113+
// `mutable` so const members such as GetResolverNames() can lock it.
// NOTE(review): presumably guards resolvers_ — confirm in the implementation.
mutable mutex lock_;
114+
// Registered resolvers, shared-owned by the registry.
vector<shared_ptr<ILanceDatasetResolver>> resolvers_;
115+
};
116+
117+
//===--------------------------------------------------------------------===//
118+
// Default catalog resolver implementation
119+
// Resolves table names through DuckDB's catalog system
120+
//===--------------------------------------------------------------------===//
121+
// Built-in ILanceDatasetResolver implementation. TryResolve() is only
// declared here; its body is defined outside this header.
class DefaultCatalogResolver : public ILanceDatasetResolver {
122+
public:
123+
LanceResolveResult TryResolve(ClientContext &context,
124+
const string &input) override;
125+
126+
// Same value as the interface default (0).
int Priority() const override { return 0; }
127+
128+
// Fixed identifier reported for this resolver.
string Name() const override { return "default_catalog"; }
129+
};
130+
131+
//===--------------------------------------------------------------------===//
132+
// Convenience function (wraps registry call)
133+
// This is the recommended API for resolving Lance dataset URIs
134+
//===--------------------------------------------------------------------===//
135+
//! @param context The client context
//! @param input The input value to resolve
//! @param policy The resolution policy (STRICT or FALLBACK_TO_PATH)
//! @param function_name Name of the calling function
//! @return The resolved dataset URI
//! NOTE(review): the definition is not visible here; exception behavior on
//! failure should be confirmed against the implementation.
string ResolveLanceDatasetUri(ClientContext &context, const Value &input,
136+
LanceResolvePolicy policy,
137+
const string &function_name);
138+
139+
} // namespace duckdb

src/lance_extension.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
namespace duckdb {
1313

1414
// Forward declaration
15+
void RegisterLanceMetadata(ExtensionLoader &loader);
1516
void RegisterLanceScan(ExtensionLoader &loader);
1617
void RegisterLanceSearch(ExtensionLoader &loader);
1718
void RegisterLanceReplacement(DBConfig &config);
@@ -26,6 +27,7 @@ static void LoadInternal(ExtensionLoader &loader) {
2627
RegisterLanceScan(loader);
2728
RegisterLanceSearch(loader);
2829
RegisterLanceWrite(loader);
30+
RegisterLanceMetadata(loader);
2931
}
3032

3133
void LanceExtension::Load(ExtensionLoader &loader) {

src/lance_index.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "duckdb/function/table/arrow.hpp"
1010
#include "duckdb/function/table_function.hpp"
1111
#include "duckdb/main/config.hpp"
12+
#include "duckdb/main/database_manager.hpp"
1213
#include "duckdb/main/extension/extension_loader.hpp"
1314
#include "duckdb/parser/expression/columnref_expression.hpp"
1415
#include "duckdb/parser/parser_extension.hpp"
@@ -1554,6 +1555,13 @@ LanceIndexPlan(ParserExtensionInfo *, ClientContext &context,
15541555
QualifiedName parsed;
15551556
if (!parse_data->target_is_path) {
15561557
parsed = QualifiedName::Parse(parse_data->target_sql);
1558+
// Fill in default catalog/schema if not specified
1559+
if (parsed.catalog.empty()) {
1560+
parsed.catalog = DatabaseManager::GetDefaultDatabase(context);
1561+
}
1562+
if (parsed.schema.empty()) {
1563+
parsed.schema = DEFAULT_SCHEMA;
1564+
}
15571565
qname = &parsed;
15581566
}
15591567
switch (parse_data->kind) {

src/lance_metadata.cpp

Lines changed: 9 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,12 @@
11
#include "duckdb.hpp"
22

3-
#include "duckdb/catalog/catalog.hpp"
4-
#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
53
#include "duckdb/common/string_util.hpp"
64
#include "duckdb/function/table_function.hpp"
75
#include "duckdb/main/extension/extension_loader.hpp"
8-
#include "duckdb/parser/qualified_name.hpp"
96

107
#include "lance_common.hpp"
118
#include "lance_ffi.hpp"
12-
#include "lance_table_entry.hpp"
9+
#include "lance_resolver.hpp"
1310

1411
namespace duckdb {
1512

@@ -21,29 +18,6 @@ enum class LanceKvTarget : uint8_t {
2118
INDICES = 5,
2219
};
2320

24-
static string ResolveLanceDatasetUriFromTableName(ClientContext &context,
25-
const Value &table_name) {
26-
if (table_name.IsNull()) {
27-
throw BinderException("table name cannot be NULL");
28-
}
29-
auto table_name_str = table_name.GetValue<string>();
30-
if (table_name_str.empty()) {
31-
throw BinderException("table name cannot be empty");
32-
}
33-
34-
auto qname = QualifiedName::Parse(table_name_str);
35-
auto &entry = Catalog::GetEntry(context, CatalogType::TABLE_ENTRY,
36-
qname.catalog, qname.schema, qname.name);
37-
auto &table_entry = entry.Cast<TableCatalogEntry>();
38-
auto *lance_entry = dynamic_cast<LanceTableEntry *>(&table_entry);
39-
if (!lance_entry) {
40-
throw NotImplementedException(
41-
"This operation is only supported for tables in ATTACH TYPE LANCE "
42-
"directory namespaces");
43-
}
44-
return lance_entry->DatasetUri();
45-
}
46-
4721
static vector<pair<string, string>> ParseTsvRows(const char *ptr) {
4822
if (!ptr) {
4923
throw IOException("Failed to read metadata from Lance dataset" +
@@ -197,8 +171,8 @@ LanceKvUpdateBind(ClientContext &context, TableFunctionBindInput &input,
197171
throw BinderException("invalid argument count");
198172
}
199173

200-
auto dataset_uri =
201-
ResolveLanceDatasetUriFromTableName(context, input.inputs[0]);
174+
auto dataset_uri = ResolveLanceDatasetUri(
175+
context, input.inputs[0], LanceResolvePolicy::STRICT, "lance_metadata");
202176

203177
string key;
204178
bool has_value = false;
@@ -337,8 +311,8 @@ LanceKvListBind(ClientContext &context, TableFunctionBindInput &input,
337311
throw BinderException("invalid argument count");
338312
}
339313

340-
auto dataset_uri =
341-
ResolveLanceDatasetUriFromTableName(context, input.inputs[0]);
314+
auto dataset_uri = ResolveLanceDatasetUri(
315+
context, input.inputs[0], LanceResolvePolicy::STRICT, "lance_metadata");
342316

343317
string field_path;
344318
if (target == LanceKvTarget::FIELD_METADATA) {
@@ -479,8 +453,8 @@ LanceCompactFilesBind(ClientContext &context, TableFunctionBindInput &input,
479453
if (input.inputs.size() != 1) {
480454
throw BinderException("lance_compact_files requires 1 argument");
481455
}
482-
auto dataset_uri =
483-
ResolveLanceDatasetUriFromTableName(context, input.inputs[0]);
456+
auto dataset_uri = ResolveLanceDatasetUri(
457+
context, input.inputs[0], LanceResolvePolicy::STRICT, "lance_metadata");
484458
return_types = {LogicalType::BIGINT};
485459
names = {"Count"};
486460
return make_uniq<LanceMaintenanceBindData>(std::move(dataset_uri), 0, false);
@@ -492,8 +466,8 @@ static unique_ptr<FunctionData> LanceCleanupOldVersionsBind(
492466
if (input.inputs.size() != 3) {
493467
throw BinderException("lance_cleanup_old_versions requires 3 arguments");
494468
}
495-
auto dataset_uri =
496-
ResolveLanceDatasetUriFromTableName(context, input.inputs[0]);
469+
auto dataset_uri = ResolveLanceDatasetUri(
470+
context, input.inputs[0], LanceResolvePolicy::STRICT, "lance_metadata");
497471

498472
int64_t older_than_seconds = 0;
499473
if (!input.inputs[1].IsNull()) {

0 commit comments

Comments
 (0)