|
1 | 1 | -- name: search |
2 | | --- FTS5 search with length-based ranking (shorter content ranks higher). |
| 2 | +-- FTS5 search with BM25 + length-based ranking. |
| 3 | +-- BM25 rewards matching more query tokens. Shorter content_head breaks ties. |
3 | 4 | -- Exact content matches get extra boost via negative rank adjustment. |
4 | | --- Supports both FTS matches and direct content_head matches (for multi-word phrases |
5 | | --- where tokens may be incomplete). Uses UNION for better index utilization. |
6 | 5 | -- $1: lang, $2: raw query, $3: FTS query, $4: status, $5: offset, $6: limit |
7 | 6 | SELECT e.*, |
8 | 7 | JSON_ARRAY_LENGTH(e.content) AS content_length, |
9 | | - -- Rank: weight - (20 - content_length). Shorter content = more negative = ranks first. |
10 | | - -- 20 as that's the max length of the generated content_head column. |
11 | | - -- Exact matches get extra -1000 boost to always rank highest. |
12 | | - e.weight + (-1.0 * (20.0 - LENGTH(e.content_head))) |
| 8 | + -- Rank: BM25 (FTS match quality) + weight - (20 - LENGTH(content_head)); the length term breaks ties.
| 9 | + -- BM25 is negative (lower = better match), so adding it boosts better matches. |
| 10 | + -- Exact content_head matches get extra -1000 boost to always rank highest. |
| 11 | + MIN(matches.bm25_rank) + e.weight + (-1.0 * (20.0 - LENGTH(e.content_head))) |
13 | 12 | + CASE WHEN e.content_head = LOWER(SUBSTR($2, 1, 20)) THEN -1000.0 ELSE 0.0 END |
14 | 13 | AS rank, |
15 | 14 | COUNT(*) OVER() AS total |
16 | | -FROM entries e |
17 | | -WHERE e.id IN ( |
18 | | - -- FTS matches |
19 | | - SELECT rowid FROM entries_fts WHERE entries_fts MATCH $3 |
20 | | - UNION |
| 15 | +FROM ( |
| 16 | + -- FTS matches with BM25 scores |
| 17 | + SELECT rowid AS id, bm25(entries_fts) AS bm25_rank |
| 18 | + FROM entries_fts WHERE entries_fts MATCH $3 |
| 19 | + UNION ALL |
21 | 20 | -- Direct content_head matches (for multi-word phrases with incomplete tokens) |
22 | | - SELECT id FROM entries |
| 21 | + SELECT id, 0.0 AS bm25_rank FROM entries |
23 | 22 | WHERE content_head = LOWER(SUBSTR($2, 1, 20)) |
24 | 23 | AND ($1 = '' OR lang = $1) |
25 | 24 | AND status != 'disabled' |
26 | | -) |
27 | | - AND EXISTS (SELECT 1 FROM relations r WHERE r.from_id = e.id) |
| 25 | +) matches |
| 26 | +INNER JOIN entries e ON e.id = matches.id |
| 27 | +WHERE EXISTS (SELECT 1 FROM relations r WHERE r.from_id = e.id) |
28 | 28 | AND ($1 = '' OR e.lang = $1) |
29 | 29 | AND ($4 = '' OR e.status = $4) |
| 30 | +GROUP BY e.id |
30 | 31 | ORDER BY rank |
31 | 32 | LIMIT $6 OFFSET $5; |
32 | 33 |
|
|
0 commit comments