Skip to content

Commit a663d53

Browse files
authored
feat(scanner): full-text search via scanner builder (Phase 2 PR 3/3) (#23)
Phase 2 PR 3 of 3 — completes Phase 2 of the lance-c roadmap. With this PR all six Phase 2 rows in the README are checked. ## Summary **`lance_scanner_full_text_search`** — set a BM25 full-text query on the scanner. Re-uses every existing iteration mechanism (`to_arrow_stream`, `next`, `poll_next`, `scan_async`); the `_score` column is automatically included in the output stream. ```c int32_t lance_scanner_full_text_search( LanceScanner* scanner, const char* query, /* search terms */ const char* const* columns, /* NULL → all FTS-indexed columns */ uint32_t max_fuzzy_distance /* 0 = exact; >0 = MatchQuery::with_fuzziness */ ); ``` **Mutual exclusion with vector search** is enforced symmetrically: calling `nearest` after `full_text_search` (or vice versa) returns -1 with a descriptive error message naming the conflict. Lance's scanner doesn't support combining the two — see the design spec. **C++ wrapper** (`include/lance.hpp`) - `Scanner::full_text_search(query, columns={}, max_fuzzy_distance=0)` — fluent. ## Test plan - [x] `cargo fmt` - [x] `cargo clippy --all-targets -- -D warnings` - [x] `cargo test` — 74 passed (70 from main + 4 new) - [x] `cargo test --test compile_and_run_test -- --ignored` — 2 passed (C + C++ smoke) New tests: - `test_scanner_full_text_search` — build inverted index on `name`, search for `"alice"`, assert `_score` column present + ≥1 hit. - `test_fts_fuzzy` — `max_fuzzy_distance=2` matches `"alise"` → `"alice"`. - `test_nearest_after_fts_is_rejected` — calling `nearest` after `full_text_search` returns -1 with mutually-exclusive error. - `test_fts_after_nearest_is_rejected` — opposite direction. 
## Spec / plan - Spec: [`docs/superpowers/specs/2026-04-23-phase2-vector-search-indexing-design.md`](docs/superpowers/specs/2026-04-23-phase2-vector-search-indexing-design.md) - Plan: [`docs/superpowers/plans/2026-04-23-phase2-vector-search-indexing.md`](docs/superpowers/plans/2026-04-23-phase2-vector-search-indexing.md) (PR 3 = Tasks 23–26) ## Phase 2 status after this PR | Status | Component | |--------|-----------| | [x] | Vector search | | [x] | Full-text search | | [x] | Vector index creation | | [x] | Scalar index creation | | [x] | Index management | | [x] | C++ wrappers | Next steps: hybrid search (vector + FTS combined retrieval with reranking) is being scoped — see https://docs.lancedb.com/search/hybrid-search. 🤖 Generated with [Claude Code](https://claude.com/claude-code)
1 parent c6b2daf commit a663d53

6 files changed

Lines changed: 337 additions & 2 deletions

File tree

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ Based on the [liblance RFC](https://github.com/lance-format/lance/discussions/60
3232
| Status | Component | Description |
3333
|--------|-----------|-------------|
3434
| [x] | Vector search | Nearest-neighbor via scanner with metric/k/nprobes |
35-
| [ ] | Full-text search | FTS queries through scanner interface |
35+
| [x] | Full-text search | FTS queries through scanner interface |
3636
| [x] | Vector index creation | IVF_PQ, IVF_FLAT, IVF_SQ, HNSW variants |
3737
| [x] | Scalar index creation | BTree, Bitmap, Inverted, Label-List indexes |
3838
| [x] | Index management | List and drop index operations |
39-
| [ ] | C++ wrappers | `create_vector_index()` and `create_scalar_index()` methods |
39+
| [x] | C++ wrappers | `create_vector_index()` and `create_scalar_index()` methods |
4040

4141
### Phase 3: Write Path & Mutations
4242

include/lance.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,27 @@ int32_t lance_scanner_set_metric(LanceScanner* scanner, LanceMetricType metric);
460460
int32_t lance_scanner_set_use_index(LanceScanner* scanner, bool enable);
461461
int32_t lance_scanner_set_prefilter(LanceScanner* scanner, bool enable);
462462

463+
/* ─── Full-text search (Phase 2) ─── */
464+
465+
/**
466+
* Set a BM25 full-text search query on the scanner.
467+
*
468+
* Mutually exclusive with lance_scanner_nearest: calling either after the
469+
* other fails with -1 (invalid-input error; see lance_last_error_message).
470+
*
471+
* @param query Query string (terms); must not be NULL.
472+
* @param columns NULL-terminated array of columns, or NULL for all
473+
* FTS-indexed columns.
474+
* @param max_fuzzy_distance 0 = exact match; >0 = fuzzy match with this max distance.
475+
* @return 0 on success, -1 on error.
476+
*/
477+
int32_t lance_scanner_full_text_search(
478+
LanceScanner* scanner,
479+
const char* query,
480+
const char* const* columns,
481+
uint32_t max_fuzzy_distance
482+
);
483+
463484
#ifdef __cplusplus
464485
} /* extern "C" */
465486
#endif

include/lance.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,23 @@ class Scanner {
367367
return *this;
368368
}
369369

370+
/// BM25 full-text search.
371+
/// `columns` empty → search all FTS-indexed columns.
372+
/// `max_fuzzy_distance` 0 = exact; >0 = MatchQuery::with_fuzziness.
373+
Scanner& full_text_search(const std::string& query,
374+
const std::vector<std::string>& columns = {},
375+
uint32_t max_fuzzy_distance = 0) {
376+
std::vector<const char*> col_ptrs;
377+
for (auto& c : columns) col_ptrs.push_back(c.c_str());
378+
col_ptrs.push_back(nullptr);
379+
const char* const* cols_c =
380+
columns.empty() ? nullptr : col_ptrs.data();
381+
if (lance_scanner_full_text_search(handle_.get(), query.c_str(),
382+
cols_c, max_fuzzy_distance) != 0)
383+
check_error();
384+
return *this;
385+
}
386+
370387
/// Access the underlying C handle.
371388
LanceScanner* c_handle() { return handle_.get(); }
372389
};

src/scanner.rs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use futures::{Stream, StreamExt};
1515
use lance::Dataset;
1616
use lance::dataset::scanner::DatasetRecordBatchStream;
1717
use lance_core::Result;
18+
use lance_index::scalar::FullTextSearchQuery;
1819
use lance_io::ffi::to_ffi_arrow_array_stream;
1920
use lance_io::stream::RecordBatchStream;
2021

@@ -53,6 +54,7 @@ pub struct LanceScanner {
5354
metric_override: Option<crate::index::LanceMetricType>,
5455
use_index: Option<bool>,
5556
prefilter: bool,
57+
fts_query: Option<FullTextSearchQuery>,
5658
// Materialized on first iteration call
5759
stream: Option<Pin<Box<DatasetRecordBatchStream>>>,
5860
#[allow(dead_code)]
@@ -102,6 +104,7 @@ impl LanceScanner {
102104
metric_override: None,
103105
use_index: None,
104106
prefilter: false,
107+
fts_query: None,
105108
stream: None,
106109
schema: None,
107110
}
@@ -162,6 +165,9 @@ impl LanceScanner {
162165
scanner.prefilter(true);
163166
}
164167
}
168+
if let Some(fts) = &self.fts_query {
169+
scanner.full_text_search(fts.clone())?;
170+
}
165171
let stream = block_on(scanner.try_into_stream())?;
166172
self.schema = Some(stream.schema());
167173
self.stream = Some(Box::pin(stream));
@@ -208,6 +214,9 @@ impl LanceScanner {
208214
scanner.prefilter(true);
209215
}
210216
}
217+
if let Some(fts) = &self.fts_query {
218+
scanner.full_text_search(fts.clone())?;
219+
}
211220
Ok(scanner)
212221
}
213222
}
@@ -776,6 +785,13 @@ unsafe fn scanner_nearest_inner(
776785
});
777786
}
778787
let s = unsafe { &mut *scanner };
788+
if s.fts_query.is_some() {
789+
return Err(lance_core::Error::InvalidInput {
790+
source: "cannot call nearest after full_text_search; they are mutually exclusive"
791+
.into(),
792+
location: snafu::location!(),
793+
});
794+
}
779795
let column_str = unsafe { helpers::parse_c_string(column)? }.unwrap();
780796

781797
let dtype = match element_type {
@@ -824,3 +840,75 @@ unsafe fn scanner_nearest_inner(
824840
});
825841
Ok(0)
826842
}
843+
844+
// ---------------------------------------------------------------------------
845+
// Full-text search (Phase 2)
846+
// ---------------------------------------------------------------------------
847+
848+
/// Set a BM25 full-text search query on the scanner.
849+
///
850+
/// - `query`: Query string (terms).
851+
/// - `columns`: NULL-terminated array of column names, or NULL to search all
852+
/// FTS-indexed columns.
853+
/// - `max_fuzzy_distance`: 0 = exact match; >0 = `MatchQuery::with_fuzziness`.
854+
///
855+
/// Returns 0 on success, -1 on error (check `lance_last_error_*`).
856+
///
857+
/// Mutually exclusive with `lance_scanner_nearest`: calling either after the
858+
/// other returns InvalidArgument.
859+
#[unsafe(no_mangle)]
860+
pub unsafe extern "C" fn lance_scanner_full_text_search(
861+
scanner: *mut LanceScanner,
862+
query: *const c_char,
863+
columns: *const *const c_char,
864+
max_fuzzy_distance: u32,
865+
) -> i32 {
866+
ffi_try!(
867+
unsafe { fts_inner(scanner, query, columns, max_fuzzy_distance) },
868+
neg
869+
)
870+
}
871+
872+
unsafe fn fts_inner(
873+
scanner: *mut LanceScanner,
874+
query: *const c_char,
875+
columns: *const *const c_char,
876+
max_fuzzy_distance: u32,
877+
) -> Result<i32> {
878+
if scanner.is_null() || query.is_null() {
879+
return Err(lance_core::Error::InvalidInput {
880+
source: "scanner and query must not be NULL".into(),
881+
location: snafu::location!(),
882+
});
883+
}
884+
let s = unsafe { &mut *scanner };
885+
886+
// Mutual exclusion with vector search.
887+
if s.nearest.is_some() {
888+
return Err(lance_core::Error::InvalidInput {
889+
source: "cannot call full_text_search after nearest; they are mutually exclusive"
890+
.into(),
891+
location: snafu::location!(),
892+
});
893+
}
894+
895+
let query_str = unsafe { helpers::parse_c_string(query)? }
896+
.unwrap()
897+
.to_string();
898+
let cols = unsafe { helpers::parse_c_string_array(columns)? };
899+
900+
let mut fts = if max_fuzzy_distance > 0 {
901+
FullTextSearchQuery::new_fuzzy(query_str, Some(max_fuzzy_distance))
902+
} else {
903+
FullTextSearchQuery::new(query_str)
904+
};
905+
906+
if let Some(cols) = cols
907+
&& !cols.is_empty()
908+
{
909+
fts = fts.with_columns(&cols)?;
910+
}
911+
912+
s.fts_query = Some(fts);
913+
Ok(0)
914+
}

tests/c_api_test.rs

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2572,3 +2572,175 @@ fn test_scanner_nearest_null_safety() {
25722572
unsafe { lance_scanner_close(scanner) };
25732573
unsafe { lance_dataset_close(ds) };
25742574
}
2575+
2576+
/// End-to-end FTS check: build an inverted index on `name`, search for
/// "alice", and verify the output stream carries a `_score` column and at
/// least one row.
#[test]
fn test_scanner_full_text_search() {
    // Only evaluated on a failing assertion, i.e. after an error was recorded.
    let last_error = || unsafe {
        std::ffi::CStr::from_ptr(lance_last_error_message())
            .to_string_lossy()
            .into_owned()
    };

    let (_tmp, uri) = create_test_dataset();
    let uri_c = c_str(&uri);
    let ds = unsafe { lance_dataset_open(uri_c.as_ptr(), ptr::null(), 0) };
    assert!(!ds.is_null(), "dataset open failed: {}", last_error());

    // Build inverted index on `name` first. Check the result so a failed index
    // build is reported here rather than as a confusing "no hits" later.
    let column = c_str("name");
    let inverted_params = c_str(r#"{"base_tokenizer":"simple","language":"English"}"#);
    let index_rc = unsafe {
        lance_dataset_create_scalar_index(
            ds,
            column.as_ptr(),
            ptr::null(),
            LanceScalarIndexType::Inverted as i32,
            inverted_params.as_ptr(),
            false,
        )
    };
    assert_eq!(index_rc, 0, "inverted index creation failed: {}", last_error());

    let scanner = unsafe { lance_scanner_new(ds, ptr::null(), ptr::null()) };
    assert!(!scanner.is_null(), "scanner creation failed: {}", last_error());

    let q = c_str("alice");
    let cols = [column.as_ptr(), ptr::null()];
    let rc = unsafe { lance_scanner_full_text_search(scanner, q.as_ptr(), cols.as_ptr(), 0) };
    assert_eq!(rc, 0, "{}", last_error());

    let mut stream = FFI_ArrowArrayStream::empty();
    assert_eq!(
        unsafe { lance_scanner_to_arrow_stream(scanner, &mut stream as *mut _) },
        0
    );
    let reader = unsafe { ArrowArrayStreamReader::from_raw(&mut stream as *mut _).unwrap() };
    let schema = reader.schema();
    assert!(
        schema.field_with_name("_score").is_ok(),
        "_score column missing from schema"
    );
    let mut total = 0;
    for b in reader {
        total += b.unwrap().num_rows();
    }
    assert!(total >= 1, "expected at least 1 hit for 'alice'");

    unsafe { lance_scanner_close(scanner) };
    unsafe { lance_dataset_close(ds) };
}
2621+
2622+
/// Fuzzy FTS check: with `max_fuzzy_distance = 2`, the misspelled query
/// "alise" must still produce at least one row.
#[test]
fn test_fts_fuzzy() {
    // Only evaluated on a failing assertion, i.e. after an error was recorded.
    let last_error = || unsafe {
        std::ffi::CStr::from_ptr(lance_last_error_message())
            .to_string_lossy()
            .into_owned()
    };

    let (_tmp, uri) = create_test_dataset();
    let uri_c = c_str(&uri);
    let ds = unsafe { lance_dataset_open(uri_c.as_ptr(), ptr::null(), 0) };
    assert!(!ds.is_null(), "dataset open failed: {}", last_error());

    // The search below depends on this index; fail fast if it cannot be built.
    let column = c_str("name");
    let inverted_params = c_str(r#"{"base_tokenizer":"simple","language":"English"}"#);
    let index_rc = unsafe {
        lance_dataset_create_scalar_index(
            ds,
            column.as_ptr(),
            ptr::null(),
            LanceScalarIndexType::Inverted as i32,
            inverted_params.as_ptr(),
            false,
        )
    };
    assert_eq!(index_rc, 0, "inverted index creation failed: {}", last_error());

    let scanner = unsafe { lance_scanner_new(ds, ptr::null(), ptr::null()) };
    assert!(!scanner.is_null(), "scanner creation failed: {}", last_error());

    // "alise" within edit distance 2 of "alice" (in the test fixture).
    let q = c_str("alise");
    let cols = [column.as_ptr(), ptr::null()];
    let rc = unsafe { lance_scanner_full_text_search(scanner, q.as_ptr(), cols.as_ptr(), 2) };
    assert_eq!(rc, 0, "{}", last_error());

    let mut stream = FFI_ArrowArrayStream::empty();
    assert_eq!(
        unsafe { lance_scanner_to_arrow_stream(scanner, &mut stream as *mut _) },
        0
    );
    let reader = unsafe { ArrowArrayStreamReader::from_raw(&mut stream as *mut _).unwrap() };
    let mut total = 0;
    for b in reader {
        total += b.unwrap().num_rows();
    }
    assert!(total >= 1, "expected fuzzy match for 'alise' → 'alice'");

    unsafe { lance_scanner_close(scanner) };
    unsafe { lance_dataset_close(ds) };
}
2663+
2664+
/// Mutual exclusion, direction 1: once a full-text query is set, a subsequent
/// `lance_scanner_nearest` call must fail with -1 and a message naming the
/// conflict.
#[test]
fn test_nearest_after_fts_is_rejected() {
    let (_tmp, uri) = create_vector_dataset(64, 8);
    let uri_c = c_str(&uri);
    let ds = unsafe { lance_dataset_open(uri_c.as_ptr(), ptr::null(), 0) };
    let scanner = unsafe { lance_scanner_new(ds, ptr::null(), ptr::null()) };

    // Set FTS first (no inverted index needed for this test — error happens
    // at the second call, before any stream materialization).
    let q = c_str("foo");
    let fts_rc = unsafe { lance_scanner_full_text_search(scanner, q.as_ptr(), ptr::null(), 0) };
    // The precondition must hold, otherwise the rejection below is not
    // actually exercising the mutual-exclusion path.
    assert_eq!(fts_rc, 0, "setup full_text_search failed: {}", unsafe {
        std::ffi::CStr::from_ptr(lance_last_error_message()).to_string_lossy()
    });

    let column = c_str("embedding");
    let query: Vec<f32> = vec![0.5; 8];
    let rc = unsafe {
        lance_scanner_nearest(
            scanner,
            column.as_ptr(),
            query.as_ptr() as *const std::ffi::c_void,
            8,
            LanceDataType::Float32 as i32,
            5,
        )
    };
    assert_eq!(rc, -1);
    let msg = unsafe {
        std::ffi::CStr::from_ptr(lance_last_error_message())
            .to_string_lossy()
            .into_owned()
    };
    let lower = msg.to_lowercase();
    assert!(
        lower.contains("full_text")
            || lower.contains("fts")
            || lower.contains("mutually exclusive"),
        "msg was: {}",
        msg
    );

    unsafe { lance_scanner_close(scanner) };
    unsafe { lance_dataset_close(ds) };
}
2708+
2709+
/// Mutual exclusion, direction 2: once a vector query is set via
/// `lance_scanner_nearest`, a subsequent `lance_scanner_full_text_search`
/// call must fail with -1 and a message naming the conflict.
#[test]
fn test_fts_after_nearest_is_rejected() {
    let (_tmp, uri) = create_vector_dataset(64, 8);
    let uri_c = c_str(&uri);
    let ds = unsafe { lance_dataset_open(uri_c.as_ptr(), ptr::null(), 0) };
    let scanner = unsafe { lance_scanner_new(ds, ptr::null(), ptr::null()) };

    let column = c_str("embedding");
    let query: Vec<f32> = vec![0.5; 8];
    let nearest_rc = unsafe {
        lance_scanner_nearest(
            scanner,
            column.as_ptr(),
            query.as_ptr() as *const std::ffi::c_void,
            8,
            LanceDataType::Float32 as i32,
            5,
        )
    };
    // The precondition must hold, otherwise the rejection below is not
    // actually exercising the mutual-exclusion path.
    assert_eq!(nearest_rc, 0, "setup nearest failed: {}", unsafe {
        std::ffi::CStr::from_ptr(lance_last_error_message()).to_string_lossy()
    });

    let q = c_str("foo");
    let rc = unsafe { lance_scanner_full_text_search(scanner, q.as_ptr(), ptr::null(), 0) };
    assert_eq!(rc, -1);
    let msg = unsafe {
        std::ffi::CStr::from_ptr(lance_last_error_message())
            .to_string_lossy()
            .into_owned()
    };
    let lower = msg.to_lowercase();
    assert!(
        lower.contains("nearest")
            || lower.contains("vector")
            || lower.contains("mutually exclusive"),
        "msg was: {}",
        msg
    );

    unsafe { lance_scanner_close(scanner) };
    unsafe { lance_dataset_close(ds) };
}

0 commit comments

Comments
 (0)