Skip to content

Commit 99e4ae0

Browse files
committed
Engine: bump DuckDB to v1.5.3 (latest) + two-step FTS to fix v1.5 visibility
DuckDB CLI v1.4.4 -> v1.5.3 (the actual latest stable). Upgrading required working around one v1.5 regression: the fts PRAGMA can't see tables created in the same -c invocation. The two-step pattern already runs in CI for upsert, so text_search now copies it:

- plan.rs: xf.ai.text_search now records a TextSearchSpec on Stage instead of emitting the whole multi-statement SQL. The executor picks it up.
- lib.rs: run_text_search makes two CLI calls sharing the temp DB. Phase 1 stages the upstream into a named table; Phase 2 builds the BM25 index and runs the final SELECT in a second invocation that sees the staged table on disk.
- The two-step path also works fine on v1.4 (just one extra CLI spawn).

Verified locally: 62/62 engine tests pass against both v1.5.3 and v1.4.4 with the new path. Avro stays in preview (the community extension still has no v1.5+ build; same on v1.4).
1 parent b2d7abe commit 99e4ae0

5 files changed

Lines changed: 121 additions & 57 deletions

File tree

.github/workflows/ci.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ jobs:
4646
shell: bash
4747
run: |
4848
set -e
49-
ver=1.4.4
49+
ver=1.5.3
5050
case "$RUNNER_OS" in
5151
Linux) asset=duckdb_cli-linux-amd64.zip; exe=duckdb ;;
5252
Windows) asset=duckdb_cli-windows-amd64.zip; exe=duckdb.exe ;;
@@ -108,7 +108,7 @@ jobs:
108108
- name: Install DuckDB CLI
109109
run: |
110110
set -e
111-
curl -L -o duckdb.zip https://github.com/duckdb/duckdb/releases/download/v1.4.4/duckdb_cli-linux-amd64.zip
111+
curl -L -o duckdb.zip https://github.com/duckdb/duckdb/releases/download/v1.5.3/duckdb_cli-linux-amd64.zip
112112
mkdir -p "$RUNNER_TEMP/duckdb"
113113
unzip -o duckdb.zip -d "$RUNNER_TEMP/duckdb"
114114
echo "DUCKLE_DUCKDB_BIN=$RUNNER_TEMP/duckdb/duckdb" >> "$GITHUB_ENV"
@@ -146,7 +146,7 @@ jobs:
146146
- name: Install DuckDB CLI
147147
run: |
148148
set -e
149-
curl -L -o duckdb.zip https://github.com/duckdb/duckdb/releases/download/v1.4.4/duckdb_cli-linux-amd64.zip
149+
curl -L -o duckdb.zip https://github.com/duckdb/duckdb/releases/download/v1.5.3/duckdb_cli-linux-amd64.zip
150150
mkdir -p "$RUNNER_TEMP/duckdb"
151151
unzip -o duckdb.zip -d "$RUNNER_TEMP/duckdb"
152152
echo "DUCKLE_DUCKDB_BIN=$RUNNER_TEMP/duckdb/duckdb" >> "$GITHUB_ENV"
@@ -191,7 +191,7 @@ jobs:
191191
- name: Install DuckDB CLI
192192
run: |
193193
set -e
194-
curl -L -o duckdb.zip https://github.com/duckdb/duckdb/releases/download/v1.4.4/duckdb_cli-linux-amd64.zip
194+
curl -L -o duckdb.zip https://github.com/duckdb/duckdb/releases/download/v1.5.3/duckdb_cli-linux-amd64.zip
195195
mkdir -p "$RUNNER_TEMP/duckdb"
196196
unzip -o duckdb.zip -d "$RUNNER_TEMP/duckdb"
197197
echo "DUCKLE_DUCKDB_BIN=$RUNNER_TEMP/duckdb/duckdb" >> "$GITHUB_ENV"

apps/desktop/src/engine_manager.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use serde::Serialize;
1010
use std::io::Read;
1111
use std::path::{Path, PathBuf};
1212

13-
pub const DUCKDB_VERSION: &str = "1.4.4";
13+
pub const DUCKDB_VERSION: &str = "1.5.3";
1414
pub const SLOTHDB_VERSION: &str = "0.2.7";
1515

1616
/// Static description of an installable engine.

apps/desktop/tauri.conf.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"$schema": "https://schema.tauri.app/config/2",
33
"productName": "Duckle",
4-
"version": "0.0.3",
4+
"version": "0.0.4",
55
"identifier": "io.duckle.app",
66
"build": {
77
"beforeDevCommand": "npm --prefix ../frontend run dev",

crates/duckdb-engine/src/lib.rs

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,11 @@ impl DuckdbEngine {
313313
// get the column list, then assemble INSERT ... ON
314314
// CONFLICT (Postgres) or ON DUPLICATE KEY UPDATE (MySQL).
315315
self.run_upsert(&db_path, &secret_prefix, spec)
316+
} else if let Some(spec) = stage.text_search.as_ref() {
317+
// FTS in DuckDB v1.5+ can't see tables created in the
318+
// same -c invocation, so we stage in one CLI call then
319+
// index + query in a second.
320+
self.run_text_search(&db_path, &secret_prefix, &stage.node_id, spec)
316321
} else if stage.sink_mode.as_deref() == Some("error")
317322
&& stage
318323
.sink_path
@@ -500,6 +505,75 @@ impl DuckdbEngine {
500505
self.run(Some(db), &exec_sql, false)
501506
}
502507

508+
/// Full-Text Search runs in two CLI invocations sharing the same
509+
/// temp DB file. The first stages the upstream into a permanent
510+
/// table; the second builds the BM25 index and the final node
511+
/// table. The split is needed for DuckDB v1.5+ where the fts
512+
/// PRAGMA can't see tables created in the same -c invocation; on
513+
/// v1.4 it just costs one extra CLI spawn.
514+
fn run_text_search(
515+
&self,
516+
db: &Path,
517+
secret_prefix: &str,
518+
node_id: &str,
519+
spec: &plan::TextSearchSpec,
520+
) -> Result<String, EngineError> {
521+
let staging = plan::quote_ident(&spec.staging_table);
522+
let upstream = plan::quote_ident(&spec.from_view);
523+
let node_q = plan::quote_ident(node_id);
524+
let id_col_q = plan::quote_ident(&spec.id_col);
525+
let output_q = plan::quote_ident(&spec.output_col);
526+
527+
// Phase 1: stage upstream into a named table that the next CLI
528+
// invocation will see.
529+
let stage_sql = format!(
530+
"{secret}INSTALL fts; LOAD fts; \
531+
DROP TABLE IF EXISTS {staging}; \
532+
CREATE TABLE {staging} AS SELECT * FROM {upstream};",
533+
secret = secret_prefix,
534+
staging = staging,
535+
upstream = upstream,
536+
);
537+
self.run(Some(db), &stage_sql, false)?;
538+
539+
// Phase 2: PRAGMA create_fts_index sees the staged table from
540+
// disk; the same invocation then runs the BM25 SELECT.
541+
let text_args = spec
542+
.text_cols
543+
.iter()
544+
.map(|c| format!("'{}'", c.replace('\'', "''")))
545+
.collect::<Vec<_>>()
546+
.join(", ");
547+
let index_schema = format!("fts_main_{}", spec.staging_table);
548+
let match_expr = format!(
549+
"{}.match_bm25({}, '{}')",
550+
index_schema,
551+
id_col_q,
552+
spec.query.replace('\'', "''")
553+
);
554+
let order_limit = match spec.top_k {
555+
Some(k) => format!(" ORDER BY {} DESC LIMIT {}", output_q, k),
556+
None => String::new(),
557+
};
558+
let index_sql = format!(
559+
"{secret}INSTALL fts; LOAD fts; \
560+
PRAGMA create_fts_index('{staging_raw}', '{id_col}', {text_args}); \
561+
CREATE OR REPLACE TABLE {node} AS \
562+
SELECT *, {match_expr} AS {output_q} FROM {staging} \
563+
WHERE {match_expr} IS NOT NULL{order_limit};",
564+
secret = secret_prefix,
565+
staging_raw = spec.staging_table.replace('\'', "''"),
566+
id_col = spec.id_col.replace('\'', "''"),
567+
text_args = text_args,
568+
node = node_q,
569+
match_expr = match_expr,
570+
output_q = output_q,
571+
staging = staging,
572+
order_limit = order_limit,
573+
);
574+
self.run(Some(db), &index_sql, false)
575+
}
576+
503577
fn count_rows(&self, db: &Path, name: &str) -> Result<u64, EngineError> {
504578
let sql = format!("SELECT COUNT(*) AS n FROM {};", plan::quote_ident(name));
505579
let rows = self.run_rows(Some(db), &sql)?;

crates/duckdb-engine/src/plan.rs

Lines changed: 41 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,25 @@ pub struct Stage {
4242
/// upstream (DESCRIBE) before assembling the final INSERT ... ON
4343
/// CONFLICT statement.
4444
pub upsert: Option<UpsertSpec>,
45+
/// For xf.ai.text_search: in DuckDB v1.5.x the fts PRAGMA can't see
46+
/// tables created in the same -c invocation. The planner records
47+
/// the spec; the executor runs two CLI calls (stage then index +
48+
/// query) so the PRAGMA sees committed state. Works unchanged on
49+
/// v1.4 too.
50+
pub text_search: Option<TextSearchSpec>,
51+
}
52+
53+
#[derive(Debug, Clone)]
54+
pub struct TextSearchSpec {
55+
pub from_view: String,
56+
pub id_col: String,
57+
pub text_cols: Vec<String>,
58+
pub query: String,
59+
pub top_k: Option<u64>,
60+
pub output_col: String,
61+
/// Sanitized staging table name (so PRAGMA can reference a valid
62+
/// SQL identifier even when the node id has special characters).
63+
pub staging_table: String,
4564
}
4665

4766
#[derive(Debug, Clone)]
@@ -341,6 +360,7 @@ fn build_stage(
341360
let mut sink_path: Option<String> = None;
342361
let mut sink_mode: Option<String> = None;
343362
let mut upsert: Option<UpsertSpec> = None;
363+
let mut text_search: Option<TextSearchSpec> = None;
344364
// ATTACH statements for external-DB nodes (DuckDB/SQLite). Each stage
345365
// runs in its own CLI process, so fixed aliases are collision-free.
346366
let attach = attach_prelude(component_id, &props);
@@ -408,15 +428,14 @@ fn build_stage(
408428
})?;
409429
(format!("{}{}", attach, sql), StageKind::View, None)
410430
} else if component_id == "xf.ai.text_search" {
411-
// Full-Text Search needs a stable named table for
412-
// create_fts_index, so we materialize the upstream into a temp
413-
// table, build the BM25 index on it, then SELECT through the
414-
// index. Multi-statement, so it bypasses the standard view
415-
// wrapping and emits its own CREATE TABLE for node.id.
416-
let sql = build_text_search(&node.id, inputs, &props).map_err(|e| {
431+
// Full-Text Search runs as a two-step path in the executor (the
432+
// v1.5 fts PRAGMA can't see tables created in the same -c
433+
// invocation). The planner records the spec; sql stays empty.
434+
let spec = build_text_search_spec(&node.id, inputs, &props).map_err(|e| {
417435
EngineError::Config(format!("{} ({} / {}): {}", node.data.label, component_id, node.id, e))
418436
})?;
419-
(format!("{}{}", attach, sql), StageKind::View, None)
437+
text_search = Some(spec);
438+
(String::new(), StageKind::View, None)
420439
} else {
421440
let body = build_view_sql(component_id, &props, inputs).map_err(|e| {
422441
EngineError::Config(format!("{} ({} / {}): {}", node.data.label, component_id, node.id, e))
@@ -453,6 +472,7 @@ fn build_stage(
453472
sink_path,
454473
sink_mode,
455474
upsert,
475+
text_search,
456476
})
457477
}
458478

@@ -2659,13 +2679,9 @@ fn build_avro_source(props: &JsonValue) -> String {
26592679
format!("SELECT * FROM read_avro('{}')", sql_escape(&path))
26602680
}
26612681

2662-
/// Full-Text Search via the DuckDB fts extension. The extension's
2663-
/// API (PRAGMA create_fts_index + a per-table fts_main_<table>.match_bm25
2664-
/// function) needs a named table to index, so this materializes the
2665-
/// upstream into a temp table, builds the BM25 index on it, then
2666-
/// produces `<node>` with the original columns + a score column.
2667-
/// Optionally limits to the top-K matches.
2668-
fn build_text_search(node_id: &str, inputs: &NodeInputs, props: &JsonValue) -> Result<String, String> {
2682+
/// Validate the text-search form and produce the spec the executor
2683+
/// uses to run the two CLI calls (stage table -> index + final query).
2684+
fn build_text_search_spec(node_id: &str, inputs: &NodeInputs, props: &JsonValue) -> Result<TextSearchSpec, String> {
26692685
let upstream = inputs
26702686
.main()
26712687
.ok_or_else(|| missing_input_msg("xf.ai.text_search"))?;
@@ -2683,49 +2699,23 @@ fn build_text_search(node_id: &str, inputs: &NodeInputs, props: &JsonValue) -> R
26832699
.get("topK")
26842700
.and_then(|v| v.as_u64())
26852701
.filter(|k| *k > 0);
2686-
let output = string_prop(props, "outputColumn")
2702+
let output_col = string_prop(props, "outputColumn")
26872703
.filter(|s| !s.is_empty())
26882704
.unwrap_or_else(|| "score".into());
2689-
2690-
// Sanitized temp-table suffix from the node id (table names must
2691-
// be valid SQL identifiers; the upstream node id may not be).
26922705
let suffix: String = node_id
26932706
.chars()
26942707
.map(|c| if c.is_ascii_alphanumeric() { c } else { '_' })
26952708
.collect();
2696-
let temp_table = format!("_fts_{}", suffix);
2697-
let index_schema = format!("fts_main_{}", temp_table);
2698-
2699-
let text_args = text_cols
2700-
.iter()
2701-
.map(|c| format!("'{}'", c.replace('\'', "''")))
2702-
.collect::<Vec<_>>()
2703-
.join(", ");
2704-
let match_expr = format!(
2705-
"{}.match_bm25({}, '{}')",
2706-
index_schema,
2707-
quote_ident(&id_col),
2708-
query.replace('\'', "''")
2709-
);
2710-
let order_limit = match top_k {
2711-
Some(k) => format!(" ORDER BY {} DESC LIMIT {}", quote_ident(&output), k),
2712-
None => String::new(),
2713-
};
2714-
Ok(format!(
2715-
"DROP TABLE IF EXISTS {temp}; \
2716-
CREATE TEMP TABLE {temp} AS SELECT * FROM {up}; \
2717-
PRAGMA create_fts_index('{temp_raw}', '{id_col}', {text_args}); \
2718-
CREATE OR REPLACE TABLE {node} AS \
2719-
SELECT *, {match} AS {out} FROM {temp} \
2720-
WHERE {match} IS NOT NULL{order_limit}",
2721-
temp = quote_ident(&temp_table),
2722-
temp_raw = temp_table,
2723-
up = quote_ident(upstream),
2724-
id_col = id_col,
2725-
node = quote_ident(node_id),
2726-
match = match_expr,
2727-
out = quote_ident(&output),
2728-
))
2709+
let staging_table = format!("_fts_{}", suffix);
2710+
Ok(TextSearchSpec {
2711+
from_view: upstream.to_string(),
2712+
id_col,
2713+
text_cols,
2714+
query,
2715+
top_k,
2716+
output_col,
2717+
staging_table,
2718+
})
27292719
}
27302720

27312721
/// Vector Similarity Search via the DuckDB vss extension. Adds a

0 commit comments

Comments
 (0)