10 changes: 6 additions & 4 deletions src/explore.zig
@@ -1401,8 +1401,9 @@ pub const Explorer = struct {
for (word_hits) |hit| {
const hit_path = self.word_index.hitPath(hit);
if (hit_path.len == 0) continue;
const cached = self.contents.get(hit_path) orelse continue;
const line_text = extractLineByNumber(cached, hit.line_num) orelse continue;
const ref = self.readContentForSearch(hit_path, allocator) orelse continue;
defer ref.deinit();
const line_text = extractLineByNumber(ref.data, hit.line_num) orelse continue;
if (indexOfCaseInsensitive(line_text, query) == null) continue;
const duped_text = try allocator.dupe(u8, line_text);
errdefer allocator.free(duped_text);
@@ -1429,8 +1430,9 @@ pub const Explorer = struct {
for (prefix_hits) |hit| {
const hit_path = self.word_index.hitPath(hit);
if (hit_path.len == 0) continue;
const cached = self.contents.get(hit_path) orelse continue;
const line_text = extractLineByNumber(cached, hit.line_num) orelse continue;
const ref = self.readContentForSearch(hit_path, allocator) orelse continue;
defer ref.deinit();
const line_text = extractLineByNumber(ref.data, hit.line_num) orelse continue;
if (indexOfCaseInsensitive(line_text, query) == null) continue;
const duped_text = try allocator.dupe(u8, line_text);
errdefer allocator.free(duped_text);
25 changes: 20 additions & 5 deletions src/index.zig
@@ -61,6 +61,12 @@ pub const WordIndex = struct {
}
self.file_words.deinit();

if (self.skip_file_words) {
for (self.id_to_path.items) |path| {
if (path.len > 0) self.allocator.free(path);
}
}

self.path_to_id.deinit();
self.id_to_path.deinit(self.allocator);
}
@@ -362,7 +368,11 @@ pub const WordIndex = struct {
}
}.lt);

const rand_suffix = @as(u64, blk: { var ts: std.c.timespec = undefined; _ = std.c.clock_gettime(std.c.CLOCK.REALTIME, &ts); break :blk @as(u64, @intCast(ts.nsec)) ^ (@as(u64, @intCast(ts.sec)) << 1); });
const rand_suffix = @as(u64, blk: {
var ts: std.c.timespec = undefined;
_ = std.c.clock_gettime(std.c.CLOCK.REALTIME, &ts);
break :blk @as(u64, @intCast(ts.nsec)) ^ (@as(u64, @intCast(ts.sec)) << 1);
});
const tmp_path = try std.fmt.allocPrint(self.allocator, "{s}/word.index.{x}.tmp", .{ dir_path, rand_suffix });
defer self.allocator.free(tmp_path);
const final_path = try std.fmt.allocPrint(self.allocator, "{s}/word.index", .{dir_path});
@@ -917,7 +927,9 @@ pub const TrigramIndex = struct {
}
if (is_new_doc) {
try idx_gop.value_ptr.items.append(self.allocator, .{
.doc_id = doc_id, .next_mask = mask.next_mask, .loc_mask = mask.loc_mask,
.doc_id = doc_id,
.next_mask = mask.next_mask,
.loc_mask = mask.loc_mask,
});
} else {
const posting = try idx_gop.value_ptr.getOrAddPosting(self.allocator, doc_id);
@@ -969,7 +981,9 @@ pub const TrigramIndex = struct {
idx_gop.value_ptr.* = .{ .path_to_id = &self.path_to_id };
}
try idx_gop.value_ptr.items.append(self.allocator, .{
.doc_id = doc_id, .next_mask = mask.next_mask, .loc_mask = mask.loc_mask,
.doc_id = doc_id,
.next_mask = mask.next_mask,
.loc_mask = mask.loc_mask,
});
try tri_list.append(self.allocator, tri);
}
@@ -988,7 +1002,9 @@
idx_gop.value_ptr.* = .{ .path_to_id = &self.path_to_id };
}
try idx_gop.value_ptr.items.append(self.allocator, .{
.doc_id = doc_id, .next_mask = te.mask.next_mask, .loc_mask = te.mask.loc_mask,
.doc_id = doc_id,
.next_mask = te.mask.next_mask,
.loc_mask = te.mask.loc_mask,
});
}
}
@@ -1982,7 +1998,6 @@ pub const MmapTrigramIndex = struct {
}
};


pub const AnyTrigramIndex = union(enum) {
heap: TrigramIndex,
mmap: MmapTrigramIndex,
102 changes: 98 additions & 4 deletions src/main.zig
@@ -538,13 +538,14 @@ fn mainImpl() !void {
};
const git_head = git_mod.getGitHead(abs_root, allocator) catch null;
loadWordIndexFromDiskIfPresent(io, &explorer, data_dir, git_head, allocator);
if (!explorer.wordIndexIsComplete()) {
explorer.rebuildWordIndex() catch |err| {
out.p("{s}\xe2\x9c\x97{s} word index rebuild failed: {}\n", .{ s.red, s.reset, err });
if (!wordIndexMatchesOutlines(&explorer)) {
persistWordIndexFromSource(io, &explorer, abs_root, data_dir, git_head, allocator) catch |err| {
out.p("{s}\xe2\x9c\x97{s} word index persist failed: {}\n", .{ s.red, s.reset, err });
std.process.exit(1);
};
} else {
persistWordIndexToDisk(io, &explorer, data_dir, git_head);
}
persistWordIndexToDisk(io, &explorer, data_dir, git_head);
const elapsed = cio.nanoTimestamp() - t0;
var dur_buf: [64]u8 = undefined;
out.p("{s}\xe2\x9c\x93{s} {s}snapshot{s} {s}{s}{s} {s}{d} files{s} {s}{s}{s}\n", .{
@@ -629,6 +630,7 @@ fn mainImpl() !void {
const startup_time_ms: u64 = @intCast(@max(cio.milliTimestamp() - startup_t0, 0));
loadTrigramFromDiskIfPresent(io, &explorer, data_dir, allocator);
telem.recordCodebaseStats(&explorer, startup_time_ms);
compactMcpReadyMemory(io, &explorer, data_dir, git_head, allocator);
mcp_server.setScanState(.ready);
}

@@ -736,6 +738,52 @@ fn loadWordIndexFromDiskIfPresent(
}
}

fn wordIndexDiskMatches(
io: std.Io,
explorer: *Explorer,
data_dir: []const u8,
current_git_head: ?[40]u8,
allocator: std.mem.Allocator,
) bool {
const header = WordIndex.readDiskHeader(io, data_dir, allocator) catch null orelse return false;

explorer.mu.lockShared();
const current_count = @as(u32, @intCast(explorer.outlines.count()));
explorer.mu.unlockShared();
if (header.file_count != current_count) return false;

if (current_git_head == null and header.git_head == null) return true;
if (current_git_head == null or header.git_head == null) return false;
return std.mem.eql(u8, &current_git_head.?, &header.git_head.?);
}

fn compactMcpReadyMemory(
io: std.Io,
explorer: *Explorer,
data_dir: []const u8,
current_git_head: ?[40]u8,
allocator: std.mem.Allocator,
) void {
explorer.mu.lockShared();
const file_count = explorer.outlines.count();
explorer.mu.unlockShared();

if (file_count <= 1000 and cio.posixGetenv("CODEDB_LOW_MEMORY") == null) return;

const can_release_contents =
explorer.wordIndexIsComplete() or
(explorer.wordIndexCanLoadFromDisk() and wordIndexDiskMatches(io, explorer, data_dir, current_git_head, allocator));

if (can_release_contents) {
explorer.releaseContents();
}
explorer.releaseSecondaryIndexes();

// Shrink index allocations to reclaim ArrayList over-allocation.
if (explorer.trigram_index.asHeap()) |heap| heap.shrinkPostingLists();
explorer.word_index.shrinkAllocations();
}

fn persistWordIndexToDisk(io: std.Io, explorer: *Explorer, data_dir: []const u8, git_head: ?[40]u8) void {
const generation = explorer.wordIndexGenerationToPersist() orelse return;

@@ -749,6 +797,52 @@ fn persistWordIndexToDisk(io: std.Io, explorer: *Explorer, data_dir: []const u8,
explorer.markWordIndexPersisted(generation);
}

fn wordIndexMatchesOutlines(explorer: *Explorer) bool {
explorer.mu.lockShared();
defer explorer.mu.unlockShared();
return explorer.word_index_complete and
explorer.word_index.id_to_path.items.len == explorer.outlines.count();
}

fn persistWordIndexFromSource(
io: std.Io,
explorer: *Explorer,
root_path: []const u8,
data_dir: []const u8,
git_head: ?[40]u8,
allocator: std.mem.Allocator,
) !void {
var paths: std.ArrayList([]const u8) = .empty;
defer paths.deinit(allocator);

{
explorer.mu.lockShared();
defer explorer.mu.unlockShared();
try paths.ensureTotalCapacity(allocator, explorer.outlines.count());
var path_iter = explorer.outlines.keyIterator();
while (path_iter.next()) |path_ptr| {
paths.appendAssumeCapacity(path_ptr.*);
}
}

var root_dir = try std.Io.Dir.cwd().openDir(io, root_path, .{});
defer root_dir.close(io);

var word_index = WordIndex.init(allocator);
defer word_index.deinit();
word_index.skip_file_words = true;

for (paths.items) |path| {
const content = root_dir.readFileAlloc(io, path, allocator, .limited(64 * 1024 * 1024)) catch continue;
errdefer allocator.free(content);
try word_index.indexFile(path, content);
allocator.free(content);
}

if (word_index.id_to_path.items.len == 0 and paths.items.len != 0) return error.NoWordIndexData;
try word_index.writeToDisk(io, data_dir, git_head);
}

fn saveProjectInfo(io: std.Io, allocator: std.mem.Allocator, data_dir: []const u8, abs_root: []const u8) !void {
const info_path = try std.fmt.allocPrint(allocator, "{s}/project.txt", .{data_dir});
defer allocator.free(info_path);
20 changes: 19 additions & 1 deletion src/mcp.zig
@@ -156,6 +156,24 @@ fn loadProjectWordIndexFromDiskIfPresent(io: std.Io, explorer: *Explorer, projec
}
}

fn shouldLoadWordIndexForSearch(args: *const std.json.ObjectMap) bool {
if (getBool(args, "regex")) return false;
const query = getStr(args, "query") orelse return false;
if (query.len < 2 or query.len > 256) return false;

var saw_word_char = false;
for (query) |c| {
const is_word_char =
(c >= 'a' and c <= 'z') or
(c >= 'A' and c <= 'Z') or
(c >= '0' and c <= '9') or
c == '_';
if (!is_word_char) return false;
if (c != '_') saw_word_char = true;
}
return saw_word_char;
}

const ProjectCache = struct {
const MAX_CACHED = 5;

@@ -811,7 +829,7 @@ fn dispatch(
return;
};

if (tool == .codedb_word) {
if (tool == .codedb_word or (tool == .codedb_search and shouldLoadWordIndexForSearch(args))) {
const effective_project = project_path orelse cache.default_path;
loadProjectWordIndexFromDiskIfPresent(io, ctx.explorer, effective_project, alloc);
}
67 changes: 46 additions & 21 deletions src/snapshot.zig
@@ -88,8 +88,10 @@ pub fn writeSnapshot(
defer buf.deinit(allocator);
const writer = cio.listWriter(&buf, allocator);
var total_bytes: u64 = 0;
var ct_iter = explorer.contents.valueIterator();
while (ct_iter.next()) |v| total_bytes += v.*.len;
var outline_size_iter = explorer.outlines.valueIterator();
while (outline_size_iter.next()) |outline| {
total_bytes += outline.byte_size;
}
var file_count_meta: u32 = 0;
var fc_iter = explorer.outlines.keyIterator();
while (fc_iter.next()) |k| {
@@ -227,20 +229,38 @@ pub fn writeSnapshot(
// ── Section: CONTENT ──
{
const offset = file_writer.logicalPos();
var ct_iter = explorer.contents.iterator();
while (ct_iter.next()) |entry| {
const path = entry.key_ptr.*;
var root_dir = std.Io.Dir.cwd().openDir(io, root_path, .{}) catch null;
defer if (root_dir) |*dir| dir.close(io);

var path_iter = explorer.outlines.keyIterator();
while (path_iter.next()) |path_ptr| {
const path = path_ptr.*;
// Skip sensitive files that may contain secrets
if (isSensitivePath(path)) continue;
const content = entry.value_ptr.*;
var pl_buf: [2]u8 = undefined;
std.mem.writeInt(u16, &pl_buf, @intCast(path.len), .little);
try fw.writeAll(&pl_buf);
try fw.writeAll(path);
var cl_buf: [4]u8 = undefined;
std.mem.writeInt(u32, &cl_buf, @intCast(content.len), .little);
try fw.writeAll(&cl_buf);
try fw.writeAll(content);
const cached_content = explorer.contents.get(path);
if (cached_content) |content| {
var pl_buf: [2]u8 = undefined;
std.mem.writeInt(u16, &pl_buf, @intCast(path.len), .little);
try fw.writeAll(&pl_buf);
try fw.writeAll(path);
var cl_buf: [4]u8 = undefined;
std.mem.writeInt(u32, &cl_buf, @intCast(content.len), .little);
try fw.writeAll(&cl_buf);
try fw.writeAll(content);
} else if (root_dir) |*dir| {
const disk_content = dir.readFileAlloc(io, path, allocator, .limited(64 * 1024 * 1024)) catch continue;
errdefer allocator.free(disk_content);

var pl_buf: [2]u8 = undefined;
std.mem.writeInt(u16, &pl_buf, @intCast(path.len), .little);
try fw.writeAll(&pl_buf);
try fw.writeAll(path);
var cl_buf: [4]u8 = undefined;
std.mem.writeInt(u32, &cl_buf, @intCast(disk_content.len), .little);
try fw.writeAll(&cl_buf);
try fw.writeAll(disk_content);
allocator.free(disk_content);
}
}
const end = file_writer.logicalPos();
try sections.append(allocator, .{ .id = @intFromEnum(SectionId.content), .offset = offset, .length = end - offset });
@@ -304,7 +324,6 @@ pub fn writeSnapshot(
};
}


/// Read section table from a `.codedb` file.
fn readSectionsFromFile(io: std.Io, file: std.Io.File, allocator: std.mem.Allocator) !?std.AutoHashMap(u32, SectionEntry) {
var magic_buf: [4]u8 = undefined;
@@ -680,7 +699,7 @@ fn insertRestoredFile(
content: []const u8,
outline: FileOutline,
allocator: std.mem.Allocator,
) !void {
) !bool {
var restored_outline = outline;
restored_outline.path = path;

@@ -689,12 +708,17 @@
outline_gop.key_ptr.* = path;
outline_gop.value_ptr.* = restored_outline;

const content_gop = try explorer.contents.getOrPut(path);
if (content_gop.found_existing) return error.InvalidData;
content_gop.key_ptr.* = path;
content_gop.value_ptr.* = content;
const content_cache_limit: u32 = 1000;
const should_cache = explorer.outlines.count() <= content_cache_limit;
Review comment (P1): Preserve rebuildable content when capping snapshot cache

This cap drops raw content for files after the first 1000 during fast snapshot restore, but Explorer.searchWord rebuilds the in-memory word index from explorer.contents when disk index loading fails. In repos larger than 1000 files where word.index is missing or unreadable (for example, on a fresh machine with only codedb.snapshot), the rebuild becomes partial yet is treated as complete, so codedb_word and identifier-style search paths silently miss hits in uncached files.
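A minimal sketch of the invariant this comment is pointing at, with hypothetical names that are not taken from this PR: a word index rebuilt from a capped content cache should only be reported as complete when every outlined file's content was actually available to the rebuild.

```zig
const std = @import("std");

/// Illustrative only — the name and signature are hypothetical, not from this PR.
/// A word index rebuilt from the in-memory content cache must not be reported as
/// complete when the cache was capped, otherwise codedb_word silently misses files.
fn rebuiltWordIndexIsComplete(outlined_files: usize, cached_files: usize) bool {
    return cached_files == outlined_files;
}

test "partial content cache never yields a 'complete' word index" {
    // 1500 outlined files, but only the first 1000 kept raw content after restore.
    try std.testing.expect(!rebuiltWordIndexIsComplete(1500, 1000));
    try std.testing.expect(rebuiltWordIndexIsComplete(800, 800));
}
```

In the diff above, main.zig appears to sidestep the cache entirely by rebuilding from source files via persistWordIndexFromSource; the check sketched here is the property the comment asks to preserve for the in-memory rebuild path.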

if (should_cache) {
const content_gop = try explorer.contents.getOrPut(path);
if (content_gop.found_existing) return error.InvalidData;
content_gop.key_ptr.* = path;
content_gop.value_ptr.* = content;
}

try rebuildDepsFromOutline(explorer, path, &restored_outline, allocator);
return should_cache;
}

fn loadSnapshotFast(
@@ -795,7 +819,7 @@ fn loadSnapshotFast(
allocator.free(content);
} else if (outline_states.fetchRemove(path_buf)) |removed| {
allocator.free(path_buf);
insertRestoredFile(explorer, removed.key, content, removed.value, allocator) catch {
const content_cached = insertRestoredFile(explorer, removed.key, content, removed.value, allocator) catch {
allocator.free(removed.key);
var bad_outline = removed.value;
bad_outline.deinit();
@@ -804,6 +828,7 @@
};
const hash = std.hash.Wyhash.hash(0, content);
_ = store.recordSnapshot(removed.key, content.len, hash) catch {};
if (!content_cached) allocator.free(content);
} else {
word_index_can_load_from_disk = false;
explorer.indexFileOutlineOnly(path_buf, content) catch {