Skip to content

Commit 4c96b09

Browse files
committed
Refactor and simplify search query and add BM25 ranking to boost SQLite FTS match quality.
1 parent d24e261 commit 4c96b09

File tree

1 file changed

+16
-15
lines changed

1 file changed

+16
-15
lines changed

static/sql/queries.sql

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,33 @@
11
-- name: search
2-
-- FTS5 search with length-based ranking (shorter content ranks higher).
2+
-- FTS5 search with BM25 + length-based ranking.
3+
-- BM25 rewards matching more query tokens. Shorter content_head breaks ties.
34
-- Exact content_head matches get extra boost via negative rank adjustment.
4-
-- Supports both FTS matches and direct content_head matches (for multi-word phrases
5-
-- where tokens may be incomplete). Uses UNION for better index utilization.
65
-- $1: lang, $2: raw query, $3: FTS query, $4: status, $5: offset, $6: limit
76
SELECT e.*,
87
JSON_ARRAY_LENGTH(e.content) AS content_length,
9-
-- Rank: weight - (20 - content_length). Shorter content = more negative = ranks first.
10-
-- 20 as that's the max length of the generated content_head column.
11-
-- Exact matches get extra -1000 boost to always rank highest.
12-
e.weight + (-1.0 * (20.0 - LENGTH(e.content_head)))
8+
-- Rank: BM25 (FTS match quality) + weight - (20 - LENGTH(content_head)) for tie-breaking.
9+
-- BM25 is negative (lower = better match), so adding it boosts better matches.
10+
-- Exact content_head matches get extra -1000 boost to always rank highest.
11+
MIN(matches.bm25_rank) + e.weight + (-1.0 * (20.0 - LENGTH(e.content_head)))
1312
+ CASE WHEN e.content_head = LOWER(SUBSTR($2, 1, 20)) THEN -1000.0 ELSE 0.0 END
1413
AS rank,
1514
COUNT(*) OVER() AS total
16-
FROM entries e
17-
WHERE e.id IN (
18-
-- FTS matches
19-
SELECT rowid FROM entries_fts WHERE entries_fts MATCH $3
20-
UNION
15+
FROM (
16+
-- FTS matches with BM25 scores
17+
SELECT rowid AS id, bm25(entries_fts) AS bm25_rank
18+
FROM entries_fts WHERE entries_fts MATCH $3
19+
UNION ALL
2120
-- Direct content_head matches (for multi-word phrases with incomplete tokens)
22-
SELECT id FROM entries
21+
SELECT id, 0.0 AS bm25_rank FROM entries
2322
WHERE content_head = LOWER(SUBSTR($2, 1, 20))
2423
AND ($1 = '' OR lang = $1)
2524
AND status != 'disabled'
26-
)
27-
AND EXISTS (SELECT 1 FROM relations r WHERE r.from_id = e.id)
25+
) matches
26+
INNER JOIN entries e ON e.id = matches.id
27+
WHERE EXISTS (SELECT 1 FROM relations r WHERE r.from_id = e.id)
2828
AND ($1 = '' OR e.lang = $1)
2929
AND ($4 = '' OR e.status = $4)
30+
GROUP BY e.id
3031
ORDER BY rank
3132
LIMIT $6 OFFSET $5;
3233

0 commit comments

Comments
 (0)