Skip to content

Commit 85ab4cb

Browse files
authored
fix partial-index search fallback recall
Stream every live leaf doc in the brute-force text search fallback until the requested number of matches is found. This fixes recall when automatic text indexing is intentionally partial.

Validation:
- `zig build -Doptimize=ReleaseSafe`
- `zig build -Dtarget=aarch64-linux-musl -Doptimize=ReleaseFast`
- `zig build test`
- `git diff --check`
- Apple container `recall_smoke.py`: Recall 5/5 (100%), PASS
- GitHub CI for PR #133: all checks passed
1 parent c3af7af commit 85ab4cb

1 file changed

Lines changed: 34 additions & 26 deletions

File tree

src/collection.zig

Lines changed: 34 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -823,17 +823,43 @@ pub const Collection = struct {
823823
return self.words.search(word);
824824
}
825825

826+
const BruteForceNeedle = union(enum) {
827+
phrase: []const u8,
828+
terms: []const []const u8,
829+
};
830+
826831
fn bruteForceSearch(self: *Collection, query: []const u8, limit: u32, alloc: std.mem.Allocator) !TextSearchResult {
832+
return self.bruteForceSearchMatching(.{ .phrase = query }, limit, alloc);
833+
}
834+
835+
fn bruteForceSearchMatching(self: *Collection, needle: BruteForceNeedle, limit: u32, alloc: std.mem.Allocator) !TextSearchResult {
827836
var results: std.ArrayList(Doc) = .empty;
828837
errdefer results.deinit(alloc);
829838

830-
const result = try self.scan(limit * 10, 0, alloc);
831-
defer result.deinit();
832-
833-
for (result.docs) |d| {
834-
if (results.items.len >= limit) break;
835-
if (containsInsensitive(d.value, query)) {
836-
try results.append(alloc, d);
839+
if (limit > 0) {
840+
const total_pages = self.pf.next_alloc.load(.acquire);
841+
var pno: u32 = 0;
842+
outer: while (pno < total_pages) : (pno += 1) {
843+
const ph = self.pf.pageHeader(pno);
844+
if (@as(page_mod.PageType, @enumFromInt(ph.page_type)) != .leaf) continue;
845+
const data = self.pf.pageData(pno);
846+
var pos: usize = 0;
847+
while (pos + DocHeader.size <= ph.used_bytes) {
848+
const rem = data[pos..ph.used_bytes];
849+
const decoded = doc_mod.decode(rem) catch break;
850+
const d = decoded.doc;
851+
pos += decoded.consumed;
852+
if (d.header.flags & DocHeader.DELETED != 0) continue;
853+
854+
const matched = switch (needle) {
855+
.phrase => |query| containsInsensitive(d.value, query),
856+
.terms => |terms| containsAllTerms(d.value, terms),
857+
};
858+
if (matched) {
859+
try results.append(alloc, d);
860+
if (results.items.len >= limit) break :outer;
861+
}
862+
}
837863
}
838864
}
839865

@@ -877,25 +903,7 @@ pub const Collection = struct {
877903

878904
/// Brute-force multi-term search (fallback when trigram index has no candidates).
879905
fn multiTermBruteForce(self: *Collection, terms: []const []const u8, limit: u32, alloc: std.mem.Allocator) !TextSearchResult {
880-
var results: std.ArrayList(Doc) = .empty;
881-
errdefer results.deinit(alloc);
882-
883-
const result = try self.scan(limit * 10, 0, alloc);
884-
defer result.deinit();
885-
886-
for (result.docs) |d| {
887-
if (results.items.len >= limit) break;
888-
if (containsAllTerms(d.value, terms)) {
889-
try results.append(alloc, d);
890-
}
891-
}
892-
893-
return TextSearchResult{
894-
.docs = try results.toOwnedSlice(alloc),
895-
.candidate_paths = &.{},
896-
.total_files = 0,
897-
.alloc = alloc,
898-
};
906+
return self.bruteForceSearchMatching(.{ .terms = terms }, limit, alloc);
899907
}
900908

901909
// ─── scan ────────────────────────────────────────────────────────────

0 commit comments

Comments
 (0)