10 changes: 6 additions & 4 deletions src/explore.zig
@@ -1401,8 +1401,9 @@ pub const Explorer = struct {
for (word_hits) |hit| {
const hit_path = self.word_index.hitPath(hit);
if (hit_path.len == 0) continue;
const cached = self.contents.get(hit_path) orelse continue;
const line_text = extractLineByNumber(cached, hit.line_num) orelse continue;
const ref = self.readContentForSearch(hit_path, allocator) orelse continue;
defer ref.deinit();
const line_text = extractLineByNumber(ref.data, hit.line_num) orelse continue;
if (indexOfCaseInsensitive(line_text, query) == null) continue;
const duped_text = try allocator.dupe(u8, line_text);
errdefer allocator.free(duped_text);
@@ -1429,8 +1430,9 @@ pub const Explorer = struct {
for (prefix_hits) |hit| {
const hit_path = self.word_index.hitPath(hit);
if (hit_path.len == 0) continue;
const cached = self.contents.get(hit_path) orelse continue;
const line_text = extractLineByNumber(cached, hit.line_num) orelse continue;
const ref = self.readContentForSearch(hit_path, allocator) orelse continue;
defer ref.deinit();
const line_text = extractLineByNumber(ref.data, hit.line_num) orelse continue;
if (indexOfCaseInsensitive(line_text, query) == null) continue;
const duped_text = try allocator.dupe(u8, line_text);
errdefer allocator.free(duped_text);
25 changes: 20 additions & 5 deletions src/index.zig
@@ -61,6 +61,12 @@ pub const WordIndex = struct {
}
self.file_words.deinit();

if (self.skip_file_words) {
for (self.id_to_path.items) |path| {
if (path.len > 0) self.allocator.free(path);
}
}

self.path_to_id.deinit();
self.id_to_path.deinit(self.allocator);
}
@@ -362,7 +368,11 @@ pub const WordIndex = struct {
}
}.lt);

const rand_suffix = @as(u64, blk: { var ts: std.c.timespec = undefined; _ = std.c.clock_gettime(std.c.CLOCK.REALTIME, &ts); break :blk @as(u64, @intCast(ts.nsec)) ^ (@as(u64, @intCast(ts.sec)) << 1); });
const rand_suffix = @as(u64, blk: {
var ts: std.c.timespec = undefined;
_ = std.c.clock_gettime(std.c.CLOCK.REALTIME, &ts);
break :blk @as(u64, @intCast(ts.nsec)) ^ (@as(u64, @intCast(ts.sec)) << 1);
});
const tmp_path = try std.fmt.allocPrint(self.allocator, "{s}/word.index.{x}.tmp", .{ dir_path, rand_suffix });
defer self.allocator.free(tmp_path);
const final_path = try std.fmt.allocPrint(self.allocator, "{s}/word.index", .{dir_path});
@@ -917,7 +927,9 @@ pub const TrigramIndex = struct {
}
if (is_new_doc) {
try idx_gop.value_ptr.items.append(self.allocator, .{
.doc_id = doc_id, .next_mask = mask.next_mask, .loc_mask = mask.loc_mask,
.doc_id = doc_id,
.next_mask = mask.next_mask,
.loc_mask = mask.loc_mask,
});
} else {
const posting = try idx_gop.value_ptr.getOrAddPosting(self.allocator, doc_id);
@@ -969,7 +981,9 @@ pub const TrigramIndex = struct {
idx_gop.value_ptr.* = .{ .path_to_id = &self.path_to_id };
}
try idx_gop.value_ptr.items.append(self.allocator, .{
.doc_id = doc_id, .next_mask = mask.next_mask, .loc_mask = mask.loc_mask,
.doc_id = doc_id,
.next_mask = mask.next_mask,
.loc_mask = mask.loc_mask,
});
try tri_list.append(self.allocator, tri);
}
@@ -988,7 +1002,9 @@
idx_gop.value_ptr.* = .{ .path_to_id = &self.path_to_id };
}
try idx_gop.value_ptr.items.append(self.allocator, .{
.doc_id = doc_id, .next_mask = te.mask.next_mask, .loc_mask = te.mask.loc_mask,
.doc_id = doc_id,
.next_mask = te.mask.next_mask,
.loc_mask = te.mask.loc_mask,
});
}
}
@@ -1982,7 +1998,6 @@ pub const MmapTrigramIndex = struct {
}
};


pub const AnyTrigramIndex = union(enum) {
heap: TrigramIndex,
mmap: MmapTrigramIndex,
102 changes: 98 additions & 4 deletions src/main.zig
@@ -538,13 +538,14 @@ fn mainImpl() !void {
};
const git_head = git_mod.getGitHead(abs_root, allocator) catch null;
loadWordIndexFromDiskIfPresent(io, &explorer, data_dir, git_head, allocator);
if (!explorer.wordIndexIsComplete()) {
explorer.rebuildWordIndex() catch |err| {
out.p("{s}\xe2\x9c\x97{s} word index rebuild failed: {}\n", .{ s.red, s.reset, err });
if (!wordIndexMatchesOutlines(&explorer)) {
persistWordIndexFromSource(io, &explorer, abs_root, data_dir, git_head, allocator) catch |err| {
out.p("{s}\xe2\x9c\x97{s} word index persist failed: {}\n", .{ s.red, s.reset, err });
std.process.exit(1);
};
} else {
persistWordIndexToDisk(io, &explorer, data_dir, git_head);
}
persistWordIndexToDisk(io, &explorer, data_dir, git_head);
const elapsed = cio.nanoTimestamp() - t0;
var dur_buf: [64]u8 = undefined;
out.p("{s}\xe2\x9c\x93{s} {s}snapshot{s} {s}{s}{s} {s}{d} files{s} {s}{s}{s}\n", .{
@@ -629,6 +630,7 @@ fn mainImpl() !void {
const startup_time_ms: u64 = @intCast(@max(cio.milliTimestamp() - startup_t0, 0));
loadTrigramFromDiskIfPresent(io, &explorer, data_dir, allocator);
telem.recordCodebaseStats(&explorer, startup_time_ms);
compactMcpReadyMemory(io, &explorer, data_dir, git_head, allocator);
mcp_server.setScanState(.ready);
}

@@ -736,6 +738,52 @@ fn loadWordIndexFromDiskIfPresent(
}
}

fn wordIndexDiskMatches(
io: std.Io,
explorer: *Explorer,
data_dir: []const u8,
current_git_head: ?[40]u8,
allocator: std.mem.Allocator,
) bool {
const header = WordIndex.readDiskHeader(io, data_dir, allocator) catch null orelse return false;

explorer.mu.lockShared();
const current_count = @as(u32, @intCast(explorer.outlines.count()));
explorer.mu.unlockShared();
if (header.file_count != current_count) return false;

if (current_git_head == null and header.git_head == null) return true;
if (current_git_head == null or header.git_head == null) return false;
return std.mem.eql(u8, &current_git_head.?, &header.git_head.?);
}

fn compactMcpReadyMemory(
io: std.Io,
explorer: *Explorer,
data_dir: []const u8,
current_git_head: ?[40]u8,
allocator: std.mem.Allocator,
) void {
explorer.mu.lockShared();
const file_count = explorer.outlines.count();
explorer.mu.unlockShared();

if (file_count <= 1000 and cio.posixGetenv("CODEDB_LOW_MEMORY") == null) return;

const can_release_contents =
explorer.wordIndexIsComplete() or
(explorer.wordIndexCanLoadFromDisk() and wordIndexDiskMatches(io, explorer, data_dir, current_git_head, allocator));

if (can_release_contents) {
explorer.releaseContents();
}
explorer.releaseSecondaryIndexes();

// Shrink index allocations to reclaim ArrayList over-allocation.
if (explorer.trigram_index.asHeap()) |heap| heap.shrinkPostingLists();
explorer.word_index.shrinkAllocations();
}

fn persistWordIndexToDisk(io: std.Io, explorer: *Explorer, data_dir: []const u8, git_head: ?[40]u8) void {
const generation = explorer.wordIndexGenerationToPersist() orelse return;

@@ -749,6 +797,52 @@ fn persistWordIndexToDisk(io: std.Io, explorer: *Explorer, data_dir: []const u8,
explorer.markWordIndexPersisted(generation);
}

fn wordIndexMatchesOutlines(explorer: *Explorer) bool {
explorer.mu.lockShared();
defer explorer.mu.unlockShared();
return explorer.word_index_complete and
explorer.word_index.id_to_path.items.len == explorer.outlines.count();
}

fn persistWordIndexFromSource(
io: std.Io,
explorer: *Explorer,
root_path: []const u8,
data_dir: []const u8,
git_head: ?[40]u8,
allocator: std.mem.Allocator,
) !void {
var paths: std.ArrayList([]const u8) = .empty;
defer paths.deinit(allocator);

{
explorer.mu.lockShared();
defer explorer.mu.unlockShared();
try paths.ensureTotalCapacity(allocator, explorer.outlines.count());
var path_iter = explorer.outlines.keyIterator();
while (path_iter.next()) |path_ptr| {
paths.appendAssumeCapacity(path_ptr.*);
}
}

var root_dir = try std.Io.Dir.cwd().openDir(io, root_path, .{});
defer root_dir.close(io);

var word_index = WordIndex.init(allocator);
defer word_index.deinit();
word_index.skip_file_words = true;

for (paths.items) |path| {
const content = root_dir.readFileAlloc(io, path, allocator, .limited(64 * 1024 * 1024)) catch continue;
errdefer allocator.free(content);
try word_index.indexFile(path, content);
allocator.free(content);
}

if (word_index.id_to_path.items.len == 0 and paths.items.len != 0) return error.NoWordIndexData;
try word_index.writeToDisk(io, data_dir, git_head);
}

fn saveProjectInfo(io: std.Io, allocator: std.mem.Allocator, data_dir: []const u8, abs_root: []const u8) !void {
const info_path = try std.fmt.allocPrint(allocator, "{s}/project.txt", .{data_dir});
defer allocator.free(info_path);
20 changes: 19 additions & 1 deletion src/mcp.zig
@@ -156,6 +156,24 @@ fn loadProjectWordIndexFromDiskIfPresent(io: std.Io, explorer: *Explorer, projec
}
}

fn shouldLoadWordIndexForSearch(args: *const std.json.ObjectMap) bool {
if (getBool(args, "regex")) return false;
const query = getStr(args, "query") orelse return false;
if (query.len < 2 or query.len > 256) return false;

var saw_word_char = false;
for (query) |c| {
const is_word_char =
(c >= 'a' and c <= 'z') or
(c >= 'A' and c <= 'Z') or
(c >= '0' and c <= '9') or
c == '_';
if (!is_word_char) return false;
if (c != '_') saw_word_char = true;
}
return saw_word_char;
}

const ProjectCache = struct {
const MAX_CACHED = 5;

@@ -811,7 +829,7 @@ fn dispatch(
return;
};

if (tool == .codedb_word) {
if (tool == .codedb_word or (tool == .codedb_search and shouldLoadWordIndexForSearch(args))) {
const effective_project = project_path orelse cache.default_path;
loadProjectWordIndexFromDiskIfPresent(io, ctx.explorer, effective_project, alloc);
}
67 changes: 46 additions & 21 deletions src/snapshot.zig
@@ -88,8 +88,10 @@ pub fn writeSnapshot(
defer buf.deinit(allocator);
const writer = cio.listWriter(&buf, allocator);
var total_bytes: u64 = 0;
var ct_iter = explorer.contents.valueIterator();
while (ct_iter.next()) |v| total_bytes += v.*.len;
var outline_size_iter = explorer.outlines.valueIterator();
while (outline_size_iter.next()) |outline| {
total_bytes += outline.byte_size;
}
var file_count_meta: u32 = 0;
var fc_iter = explorer.outlines.keyIterator();
while (fc_iter.next()) |k| {
@@ -227,20 +229,38 @@ pub fn writeSnapshot(
// ── Section: CONTENT ──
{
const offset = file_writer.logicalPos();
var ct_iter = explorer.contents.iterator();
while (ct_iter.next()) |entry| {
const path = entry.key_ptr.*;
var root_dir = std.Io.Dir.cwd().openDir(io, root_path, .{}) catch null;
defer if (root_dir) |*dir| dir.close(io);

var path_iter = explorer.outlines.keyIterator();
while (path_iter.next()) |path_ptr| {
const path = path_ptr.*;
// Skip sensitive files that may contain secrets
if (isSensitivePath(path)) continue;
const content = entry.value_ptr.*;
var pl_buf: [2]u8 = undefined;
std.mem.writeInt(u16, &pl_buf, @intCast(path.len), .little);
try fw.writeAll(&pl_buf);
try fw.writeAll(path);
var cl_buf: [4]u8 = undefined;
std.mem.writeInt(u32, &cl_buf, @intCast(content.len), .little);
try fw.writeAll(&cl_buf);
try fw.writeAll(content);
const cached_content = explorer.contents.get(path);
if (cached_content) |content| {
var pl_buf: [2]u8 = undefined;
std.mem.writeInt(u16, &pl_buf, @intCast(path.len), .little);
try fw.writeAll(&pl_buf);
try fw.writeAll(path);
var cl_buf: [4]u8 = undefined;
std.mem.writeInt(u32, &cl_buf, @intCast(content.len), .little);
try fw.writeAll(&cl_buf);
try fw.writeAll(content);
} else if (root_dir) |*dir| {
const disk_content = dir.readFileAlloc(io, path, allocator, .limited(64 * 1024 * 1024)) catch continue;
errdefer allocator.free(disk_content);

var pl_buf: [2]u8 = undefined;
std.mem.writeInt(u16, &pl_buf, @intCast(path.len), .little);
try fw.writeAll(&pl_buf);
try fw.writeAll(path);
var cl_buf: [4]u8 = undefined;
std.mem.writeInt(u32, &cl_buf, @intCast(disk_content.len), .little);
try fw.writeAll(&cl_buf);
try fw.writeAll(disk_content);
allocator.free(disk_content);
}
}
const end = file_writer.logicalPos();
try sections.append(allocator, .{ .id = @intFromEnum(SectionId.content), .offset = offset, .length = end - offset });
@@ -304,7 +324,6 @@ pub fn writeSnapshot(
};
}


/// Read section table from a `.codedb` file.
fn readSectionsFromFile(io: std.Io, file: std.Io.File, allocator: std.mem.Allocator) !?std.AutoHashMap(u32, SectionEntry) {
var magic_buf: [4]u8 = undefined;
@@ -680,7 +699,7 @@ fn insertRestoredFile(
content: []const u8,
outline: FileOutline,
allocator: std.mem.Allocator,
) !void {
) !bool {
var restored_outline = outline;
restored_outline.path = path;

@@ -689,12 +708,17 @@
outline_gop.key_ptr.* = path;
outline_gop.value_ptr.* = restored_outline;

const content_gop = try explorer.contents.getOrPut(path);
if (content_gop.found_existing) return error.InvalidData;
content_gop.key_ptr.* = path;
content_gop.value_ptr.* = content;
const content_cache_limit: u32 = 1000;
const should_cache = explorer.outlines.count() <= content_cache_limit;
Review comment (P1): Preserve rebuildable content when capping snapshot cache

This cap drops raw content for files after the first 1000 during fast snapshot restore, but Explorer.searchWord rebuilds the in-memory word index from explorer.contents when disk index loading fails. In repos larger than 1000 files where word.index is missing or unreadable (for example, on a fresh machine with only codedb.snapshot), the rebuild becomes partial yet is treated as complete, so codedb_word and identifier-style search paths silently miss hits in uncached files.
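A minimal sketch of the invariant this comment is pointing at, with hypothetical names that are not taken from this PR: a word index rebuilt from a capped content cache should only be reported as complete when every outlined file's content was actually available to the rebuild.

```zig
const std = @import("std");

/// Illustrative only — the name and signature are hypothetical, not from this PR.
/// A word index rebuilt from the in-memory content cache must not be reported as
/// complete when the cache was capped, otherwise codedb_word silently misses files.
fn rebuiltWordIndexIsComplete(outlined_files: usize, cached_files: usize) bool {
    return cached_files == outlined_files;
}

test "partial content cache never yields a 'complete' word index" {
    // 1500 outlined files, but only the first 1000 kept raw content after restore.
    try std.testing.expect(!rebuiltWordIndexIsComplete(1500, 1000));
    try std.testing.expect(rebuiltWordIndexIsComplete(800, 800));
}
```

In the diff above, main.zig appears to sidestep the cache entirely by rebuilding from source files via persistWordIndexFromSource; the check sketched here is the property the comment asks to preserve for the in-memory rebuild path.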

if (should_cache) {
const content_gop = try explorer.contents.getOrPut(path);
if (content_gop.found_existing) return error.InvalidData;
content_gop.key_ptr.* = path;
content_gop.value_ptr.* = content;
}

try rebuildDepsFromOutline(explorer, path, &restored_outline, allocator);
return should_cache;
}

fn loadSnapshotFast(
@@ -795,7 +819,7 @@ fn loadSnapshotFast(
allocator.free(content);
} else if (outline_states.fetchRemove(path_buf)) |removed| {
allocator.free(path_buf);
insertRestoredFile(explorer, removed.key, content, removed.value, allocator) catch {
const content_cached = insertRestoredFile(explorer, removed.key, content, removed.value, allocator) catch {
allocator.free(removed.key);
var bad_outline = removed.value;
bad_outline.deinit();
@@ -804,6 +828,7 @@
};
const hash = std.hash.Wyhash.hash(0, content);
_ = store.recordSnapshot(removed.key, content.len, hash) catch {};
if (!content_cached) allocator.free(content);
} else {
word_index_can_load_from_disk = false;
explorer.indexFileOutlineOnly(path_buf, content) catch {