diff --git a/.github/workflows/bench-regression.yml b/.github/workflows/bench-regression.yml
new file mode 100644
index 0000000..8200529
--- /dev/null
+++ b/.github/workflows/bench-regression.yml
@@ -0,0 +1,77 @@
+name: bench-regression
+
+on:
+  pull_request:
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  bench:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install Zig
+        run: |
+          curl -L https://ziglang.org/download/0.15.2/zig-x86_64-linux-0.15.2.tar.xz -o zig.tar.xz
+          tar -xf zig.tar.xz
+          echo "$PWD/zig-x86_64-linux-0.15.2" >> "$GITHUB_PATH"
+
+      - name: Benchmark head
+        run: python3 scripts/run-bench-json.py bench-head.json
+
+      - name: Benchmark base
+        run: |
+          git fetch origin "${{ github.event.pull_request.base.ref }}" --depth=1
+          git worktree add ../codedb-base FETCH_HEAD
+          cd ../codedb-base
+          if python3 "$GITHUB_WORKSPACE/scripts/run-bench-json.py" "$GITHUB_WORKSPACE/bench-base.json"; then
+            echo "have_base_json=true" >> "$GITHUB_ENV"
+          else
+            echo "have_base_json=false" >> "$GITHUB_ENV"
+          fi
+
+      - name: Compare
+        if: env.have_base_json == 'true'
+        run: |
+          python3 scripts/compare-bench.py bench-base.json bench-head.json --threshold-pct 10 --markdown-out bench-report.md
+
+      - name: Bootstrap report
+        if: env.have_base_json != 'true'
+        run: |
+          cat > bench-report.md <<'EOF'
+          ## Benchmark Regression Report
+
+          Skipped strict comparison because the base branch does not yet emit machine-readable benchmark JSON.
+          This PR introduces the JSON benchmark format that future PRs will compare against.
+          EOF
+
+      - name: Upload artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: bench-results
+          if-no-files-found: ignore
+          path: |
+            bench-base.json
+            bench-head.json
+            bench-report.md
+
+      - name: Comment PR
+        if: always() && github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            if (!fs.existsSync('bench-report.md')) return;
+            const body = fs.readFileSync('bench-report.md', 'utf8');
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body,
+            });
diff --git a/README.md b/README.md
index 8f374cc..0cedf4e 100644
--- a/README.md
+++ b/README.md
@@ -292,14 +292,26 @@ All threads share a `shutdown: atomic.Value(bool)` for graceful termination.
 
 ## 🔒 Data & Privacy
 
-codedb is **fully local** — no telemetry, no analytics, no network calls. Nothing leaves your machine.
+codedb keeps runtime data local by default. Telemetry, when enabled, is written to `~/.codedb/telemetry.ndjson` on the same machine and is not uploaded automatically.
 
 | Location | Contents | Purpose |
 |----------|----------|---------|
 | `~/.codedb/projects//` | Trigram index, frequency table, data log | Persistent index cache |
+| `~/.codedb/telemetry.ndjson` | Aggregate tool calls and startup stats | Local telemetry log |
 | `./codedb.snapshot` | File tree, outlines, content, frequency table | Portable snapshot for instant MCP startup |
 
-**Not stored:** No source code is sent anywhere. No network requests. No usage analytics. Sensitive files auto-excluded (`.env*`, `credentials.json`, `secrets.*`, `.pem`, `.key`, SSH keys, AWS configs).
+**Not stored:** No source code is sent anywhere. No file contents, file paths, or search queries are collected in telemetry. Sensitive files auto-excluded (`.env*`, `credentials.json`, `secrets.*`, `.pem`, `.key`, SSH keys, AWS configs).
+
+To disable the local telemetry log entirely, set `CODEDB_NO_TELEMETRY=1`.
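+
+The `mcp` subcommand also accepts an equivalent `--no-telemetry` flag. For example (assuming `codedb` is on your `PATH`):
+
+```bash
+CODEDB_NO_TELEMETRY=1 codedb mcp   # disable via environment
+codedb mcp --no-telemetry          # disable via flag, for this run only
+```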
+
+To sync the local NDJSON file into Postgres for analysis or dashboards, use [`scripts/sync-telemetry.py`](./scripts/sync-telemetry.py) with the schema in [`docs/telemetry/postgres-schema.sql`](./docs/telemetry/postgres-schema.sql). The data flow is documented in [`docs/telemetry.md`](./docs/telemetry.md).
 
 ```bash
 rm -rf ~/.codedb/  # clear all cached indexes
diff --git a/build.zig b/build.zig
index bac59e4..b622160 100644
--- a/build.zig
+++ b/build.zig
@@ -21,7 +21,6 @@ pub fn build(b: *std.Build) void {
         }),
     });
 
-    // ── mcp-zig dependency ──
     const mcp_dep = b.dependency("mcp_zig", .{});
     exe.root_module.addImport("mcp", mcp_dep.module("mcp"));
 
@@ -82,6 +81,8 @@ pub fn build(b: *std.Build) void {
         }),
     });
+    bench.root_module.addImport("mcp", mcp_dep.module("mcp"));
     const bench_run = b.addRunArtifact(bench);
+    if (b.args) |args| bench_run.addArgs(args);
     const bench_step = b.step("bench", "Run benchmarks");
     bench_step.dependOn(&bench_run.step);
     // Make module available so dependents don't need to wire it up manually
diff --git a/docs/telemetry.md b/docs/telemetry.md
new file mode 100644
index 0000000..d494e0e
--- /dev/null
+++ b/docs/telemetry.md
@@ -0,0 +1,42 @@
+# Telemetry Data Flow
+
+codedb writes local telemetry to `~/.codedb/telemetry.ndjson` unless `CODEDB_NO_TELEMETRY=1` is set. The file is append-only and stays on disk until an operator syncs it.
+
+The current on-disk format is compact:
+
+- `ts` / `timestamp_ms`
+- `ev` / `event_type`
+- `tool`, `ns` / `latency_ns`, `err` / `error`, `bytes` / `response_bytes`
+- `files` / `file_count`, `lines` / `total_lines`
+- optional `languages`, `index_size_bytes`, `startup_time_ms`, `version`, `platform`
+
+`scripts/sync-telemetry.py` normalizes those fields and loads them into Postgres with `COPY`.
+
+## Postgres schema
+
+Use [`docs/telemetry/postgres-schema.sql`](./telemetry/postgres-schema.sql) to create the destination table and indexes.
+
+## Sync
+
+```bash
+python3 scripts/sync-telemetry.py --dsn "$DATABASE_URL"
+```
+
+For a preview without touching Postgres:
+
+```bash
+python3 scripts/sync-telemetry.py --dry-run
+```
+
+The sync path stores aggregate usage and performance data only. It does not capture file contents, file paths, or search queries.
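+
+As a concrete example (illustrative values), here is one compact on-disk record and the CSV row that `--dry-run` would print for it:
+
+```bash
+$ tail -n 1 ~/.codedb/telemetry.ndjson
+{"ts":1700000000,"ev":"tool","tool":"codedb_search","ns":41000,"err":false,"bytes":512}
+$ python3 scripts/sync-telemetry.py --dry-run | tail -n 1
+1700000000000,tool_call,codedb_search,41000,False,512,,,,,,,
+```
+
+A `ts` given in whole seconds is promoted to `timestamp_ms`, the short event type `tool` becomes `tool_call`, and columns absent from the record are left empty and load as `NULL`.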
diff --git a/docs/telemetry/postgres-schema.sql b/docs/telemetry/postgres-schema.sql
new file mode 100644
index 0000000..343247d
--- /dev/null
+++ b/docs/telemetry/postgres-schema.sql
@@ -0,0 +1,24 @@
+CREATE TABLE IF NOT EXISTS codedb_events (
+    id BIGSERIAL PRIMARY KEY,
+    timestamp_ms BIGINT NOT NULL,
+    event_type TEXT NOT NULL,
+    tool TEXT,
+    latency_ns BIGINT,
+    error BOOLEAN,
+    response_bytes INTEGER,
+    file_count INTEGER,
+    total_lines BIGINT,
+    languages TEXT[],
+    index_size_bytes BIGINT,
+    startup_time_ms BIGINT,
+    version TEXT,
+    platform TEXT,
+    ingested_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_codedb_events_timestamp_ms
+    ON codedb_events(timestamp_ms);
+
+CREATE INDEX IF NOT EXISTS idx_codedb_events_tool
+    ON codedb_events(tool)
+    WHERE tool IS NOT NULL;
diff --git a/scripts/compare-bench.py b/scripts/compare-bench.py
new file mode 100644
index 0000000..aa3f246
--- /dev/null
+++ b/scripts/compare-bench.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Compare codedb benchmark JSON results.")
+    parser.add_argument("base", help="baseline benchmark JSON")
+    parser.add_argument("head", help="candidate benchmark JSON")
+    parser.add_argument("--threshold-pct", type=float, default=10.0, help="maximum allowed latency regression percentage")
+    parser.add_argument("--markdown-out", help="write markdown report to this path")
+    return parser.parse_args()
+
+
+def load_tools(path: str) -> dict[str, dict]:
+    data = json.loads(Path(path).read_text(encoding="utf-8"))
+    return {tool["tool"]: tool for tool in data["tools"]}
+
+
+def pct_change(base_ns: int, head_ns: int) -> float:
+    if base_ns == 0:
+        return 0.0
+    return ((head_ns - base_ns) / base_ns) * 100.0
+
+
+def render_markdown(rows: list[tuple[str, int, int, float]], threshold_pct: float) -> str:
+    lines = [
+        "## Benchmark Regression Report",
+        "",
+        f"Threshold: {threshold_pct:.2f}%",
+        "",
+        "| Tool | Base (ns) | Head (ns) | Delta | Status |",
+        "| --- | ---: | ---: | ---: | --- |",
+    ]
+    for tool, base_ns, head_ns, delta in rows:
+        status = "FAIL" if delta > threshold_pct else "OK"
+        lines.append(f"| `{tool}` | {base_ns} | {head_ns} | {delta:+.2f}% | {status} |")
+    return "\n".join(lines) + "\n"
+
+
+def main() -> int:
+    args = parse_args()
+    base = load_tools(args.base)
+    head = load_tools(args.head)
+
+    missing = sorted(set(base) ^ set(head))
+    if missing:
+        print(f"error: tool mismatch: {', '.join(missing)}", file=sys.stderr)
+        return 1
+
+    rows: list[tuple[str, int, int, float]] = []
+    failures: list[str] = []
+
+    for tool in sorted(base):
+        base_ns = int(base[tool]["avg_latency_ns"])
+        head_ns = int(head[tool]["avg_latency_ns"])
+        delta = pct_change(base_ns, head_ns)
+        rows.append((tool, base_ns, head_ns, delta))
+        if delta > args.threshold_pct:
+            failures.append(f"{tool} regressed by {delta:.2f}%")
+
+    report = render_markdown(rows, args.threshold_pct)
+    sys.stdout.write(report)
+
+    if args.markdown_out:
+        Path(args.markdown_out).write_text(report, encoding="utf-8")
+
+    if failures:
+        for failure in failures:
+            print(failure, file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/run-bench-json.py b/scripts/run-bench-json.py
new file mode 100644
index 0000000..f24c9dd
--- /dev/null
+++ b/scripts/run-bench-json.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import pathlib +import subprocess +import sys + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run `zig build bench -- --json` and persist the JSON payload.") + parser.add_argument("output", help="output JSON file") + return parser.parse_args() + + +def extract_json(stdout: str, stderr: str) -> str: + text = stdout.strip() + if text.startswith("{") and text.endswith("}"): + return text + "\n" + + for stream in (stdout, stderr): + for line in reversed(stream.splitlines()): + line = line.strip() + if line.startswith("{") and line.endswith("}"): + return line + "\n" + raise RuntimeError("benchmark command did not emit JSON") + + +def main() -> int: + args = parse_args() + proc = subprocess.run( + ["zig", "build", "bench", "--", "--json"], + capture_output=True, + text=True, + check=True, + ) + if proc.stderr: + sys.stderr.write(proc.stderr) + payload = extract_json(proc.stdout, proc.stderr) + pathlib.Path(args.output).write_text(payload, encoding="utf-8") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/sync-telemetry.py b/scripts/sync-telemetry.py new file mode 100755 index 0000000..87a61cc --- /dev/null +++ b/scripts/sync-telemetry.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import os +import subprocess +import sys +from pathlib import Path + + +DEFAULT_INPUT = Path(os.path.expanduser("~/.codedb/telemetry.ndjson")) +DEFAULT_SCHEMA = Path(__file__).resolve().parents[1] / "docs" / "telemetry" / "postgres-schema.sql" +COPY_COLUMNS = [ + "timestamp_ms", + "event_type", + "tool", + "latency_ns", + "error", + "response_bytes", + "file_count", + "total_lines", + "languages", + "index_size_bytes", + "startup_time_ms", + "version", + "platform", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Sync codedb telemetry NDJSON into Postgres.", + ) + parser.add_argument( + "input", + nargs="?", + default=str(DEFAULT_INPUT), + help="telemetry NDJSON file to ingest", + ) + parser.add_argument( + "--dsn", + default=os.environ.get("DATABASE_URL"), + help="Postgres DSN (defaults to DATABASE_URL)", + ) + parser.add_argument( + "--schema", + default=str(DEFAULT_SCHEMA), + help="schema SQL file to apply before loading", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="print normalized CSV rows instead of loading Postgres", + ) + return parser.parse_args() + + +def coerce_timestamp_ms(record: dict[str, object]) -> int: + value = record.get("timestamp_ms") + if value is None: + value = record.get("ts") + if value is None: + raise ValueError("missing timestamp") + value = int(value) + if value < 1_000_000_000_000: + value *= 1000 + return int(value) + + +def coerce_event_type(record: dict[str, object]) -> str: + value = record.get("event_type") or record.get("ev") or record.get("kind") + if value is None: + raise ValueError("missing event type") + value = str(value) + if value == "tool": + return "tool_call" + if value == "start": + return "session_start" + return value + + +def coerce_bool(record: dict[str, object], *keys: str) -> bool | None: + for key in keys: + if key in record: + value = record[key] + if value is None: + return None + return bool(value) + return None + + +def coerce_int(record: dict[str, object], *keys: str) -> int | None: + for key in keys: + if key in record: + value = 
record[key] + if value is None: + return None + return int(value) + return None + + +def format_pg_array(items: list[str]) -> str: + if not items: + return "{}" + + escaped = [] + for item in items: + safe = item.replace("\\", "\\\\").replace('"', '\\"') + if any(ch in item for ch in ',{}"\\ '): + escaped.append(f'"{safe}"') + else: + escaped.append(safe) + return "{" + ",".join(escaped) + "}" + + +def coerce_languages(record: dict[str, object]) -> str | None: + value = record.get("languages") + if value is None: + return None + if isinstance(value, list): + items = [str(item) for item in value if str(item)] + else: + items = [part.strip() for part in str(value).split(",") if part.strip()] + return format_pg_array(items) + + +def normalize(record: dict[str, object]) -> list[object | None]: + return [ + coerce_timestamp_ms(record), + coerce_event_type(record), + record.get("tool"), + coerce_int(record, "latency_ns", "ns"), + coerce_bool(record, "error", "err"), + coerce_int(record, "response_bytes", "bytes"), + coerce_int(record, "file_count", "files"), + coerce_int(record, "total_lines", "lines"), + coerce_languages(record), + coerce_int(record, "index_size_bytes"), + coerce_int(record, "startup_time_ms"), + record.get("version"), + record.get("platform"), + ] + + +def apply_schema(dsn: str, schema: str) -> None: + subprocess.run( + ["psql", dsn, "-v", "ON_ERROR_STOP=1", "-f", schema], + check=True, + ) + + +def load_rows(dsn: str, rows: list[list[object | None]]) -> None: + copy_sql = ( + "COPY codedb_events (" + + ", ".join(COPY_COLUMNS) + + ") FROM STDIN WITH (FORMAT csv, NULL '')" + ) + proc = subprocess.Popen( + ["psql", dsn, "-v", "ON_ERROR_STOP=1", "-c", copy_sql], + stdin=subprocess.PIPE, + text=True, + ) + assert proc.stdin is not None + writer = csv.writer(proc.stdin) + for row in rows: + writer.writerow(["" if value is None else value for value in row]) + proc.stdin.close() + code = proc.wait() + if code != 0: + raise subprocess.CalledProcessError(code, proc.args) + + +def main() -> int: + args = parse_args() + input_path = Path(args.input) + if not input_path.exists(): + print(f"error: input file not found: {input_path}", file=sys.stderr) + return 1 + + rows: list[list[object | None]] = [] + with input_path.open("r", encoding="utf-8") as fh: + for line_no, line in enumerate(fh, 1): + line = line.strip() + if not line: + continue + try: + record = json.loads(line) + except json.JSONDecodeError as err: + print(f"error: line {line_no}: invalid JSON: {err.msg}", file=sys.stderr) + return 1 + if not isinstance(record, dict): + print(f"error: line {line_no}: expected JSON object", file=sys.stderr) + return 1 + rows.append(normalize(record)) + + if args.dry_run: + writer = csv.writer(sys.stdout) + writer.writerow(COPY_COLUMNS) + for row in rows: + writer.writerow(["" if value is None else value for value in row]) + return 0 + + if not args.dsn: + print("error: --dsn or DATABASE_URL is required", file=sys.stderr) + return 1 + + apply_schema(args.dsn, args.schema) + load_rows(args.dsn, rows) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/bench.zig b/src/bench.zig index 8bb412a..90df2d1 100644 --- a/src/bench.zig +++ b/src/bench.zig @@ -1,239 +1,285 @@ const std = @import("std"); +const Store = @import("store.zig").Store; const Explorer = @import("explore.zig").Explorer; -const index = @import("index.zig"); -const WordIndex = index.WordIndex; -const TrigramIndex = index.TrigramIndex; -const Trigram = index.Trigram; -const PostingMask = 
index.PostingMask; -const packTrigram = index.packTrigram; -const normalizeChar = index.normalizeChar; -const FileEntry = struct { name: []const u8, content: []const u8 }; - -fn generateCode(allocator: std.mem.Allocator, num_files: usize, lines_per_file: usize) ![]const FileEntry { - var files: std.ArrayList(FileEntry) = .{}; - var prng = std.Random.DefaultPrng.init(42); - const rand = prng.random(); - - const words = [_][]const u8{ - "fn", "pub", "const", "var", "struct", "enum", "union", "return", - "if", "else", "while", "for", "switch", "break", "continue", "try", - "catch", "error", "void", "bool", "u8", "u32", "u64", "allocator", - "self", "result", "value", "index", "count", "size", "init", "deinit", - "append", "remove", "get", "put", "insert", "handleRequest", "processData", "validateInput", - "parseConfig", "readFile", "writeOutput", "createBuffer", "destroyBuffer", "AgentRegistry", "FileVersions", "TrigramIndex", - "WordIndex", "Explorer", "Store", "Version", "Symbol", "Outline", "Language", - }; +const AgentRegistry = @import("agent.zig").AgentRegistry; +const watcher = @import("watcher.zig"); +const mcp = @import("mcp.zig"); +const telemetry = @import("telemetry.zig"); - for (0..num_files) |i| { - var buf: std.ArrayList(u8) = .{}; - const w = buf.writer(allocator); - for (0..lines_per_file) |_| { - const num_words = 5 + rand.intRangeAtMost(usize, 0, 10); - for (0..num_words) |wi| { - if (wi > 0) w.writeByte(' ') catch {}; - const word = words[rand.intRangeAtMost(usize, 0, words.len - 1)]; - w.writeAll(word) catch {}; - } - w.writeByte('\n') catch {}; - } - var name_buf: [64]u8 = undefined; - const name = std.fmt.bufPrint(&name_buf, "src/gen_{d}.zig", .{i}) catch unreachable; - try files.append(allocator, .{ - .name = try allocator.dupe(u8, name), - .content = try buf.toOwnedSlice(allocator), - }); - } - return files.toOwnedSlice(allocator); -} +const ToolBench = struct { + tool: []const u8, + avg_latency_ns: u64, + response_bytes: usize, + ops_per_sec: f64, + telemetry_avg_ns: u64, + telemetry_delta_pct: f64, +}; + +const Case = struct { + tool: mcp.Tool, + name: []const u8, + args_json: []const u8, + iterations: usize, +}; + +const cases = [_]Case{ + .{ .tool = .codedb_tree, .name = "codedb_tree", .args_json = "{}", .iterations = 100 }, + .{ .tool = .codedb_outline, .name = "codedb_outline", .args_json = "{\"path\":\"src/main.zig\"}", .iterations = 100 }, + .{ .tool = .codedb_symbol, .name = "codedb_symbol", .args_json = "{\"name\":\"main\"}", .iterations = 100 }, + .{ .tool = .codedb_search, .name = "codedb_search", .args_json = "{\"query\":\"telemetry\",\"max_results\":10}", .iterations = 100 }, + .{ .tool = .codedb_word, .name = "codedb_word", .args_json = "{\"word\":\"Telemetry\"}", .iterations = 100 }, + .{ .tool = .codedb_hot, .name = "codedb_hot", .args_json = "{\"limit\":10}", .iterations = 100 }, + .{ .tool = .codedb_deps, .name = "codedb_deps", .args_json = "{\"path\":\"src/main.zig\"}", .iterations = 100 }, + .{ .tool = .codedb_read, .name = "codedb_read", .args_json = "{\"path\":\"src/main.zig\",\"line_start\":1,\"line_end\":20}", .iterations = 100 }, + .{ .tool = .codedb_edit, .name = "codedb_edit", .args_json = "{\"path\":\"src/bench_target.zig\",\"op\":\"replace\",\"range_start\":1,\"range_end\":1,\"content\":\"pub const bench_value = 2;\\n\"}", .iterations = 10 }, + .{ .tool = .codedb_changes, .name = "codedb_changes", .args_json = "{\"since\":0}", .iterations = 100 }, + .{ .tool = .codedb_status, .name = "codedb_status", .args_json = "{}", .iterations = 100 
}, + .{ .tool = .codedb_snapshot, .name = "codedb_snapshot", .args_json = "{}", .iterations = 20 }, + .{ .tool = .codedb_bundle, .name = "codedb_bundle", .args_json = "{\"ops\":[{\"tool\":\"codedb_outline\",\"arguments\":{\"path\":\"src/main.zig\"}},{\"tool\":\"codedb_search\",\"arguments\":{\"query\":\"telemetry\",\"max_results\":5}},{\"tool\":\"codedb_word\",\"arguments\":{\"word\":\"Telemetry\"}}]}", .iterations = 50 }, +}; pub fn main() !void { var gpa = std.heap.GeneralPurposeAllocator(.{}){}; defer _ = gpa.deinit(); const allocator = gpa.allocator(); - const num_files = 500; - const lines_per = 200; - const total_lines = num_files * lines_per; + const emit_json = blk: { + const args = try std.process.argsAlloc(allocator); + defer std.process.argsFree(allocator, args); + for (args[1..]) |arg| { + if (std.mem.eql(u8, arg, "--json")) break :blk true; + } + break :blk false; + }; + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + var tmp_path_buf: [std.fs.max_path_bytes]u8 = undefined; + const tmp_root = try tmp.dir.realpath(".", &tmp_path_buf); + + var repo_path_buf: [std.fs.max_path_bytes]u8 = undefined; + const repo_root = try std.fs.cwd().realpath(".", &repo_path_buf); + + try copyCorpus(allocator, repo_root, tmp_root); + try writeBenchTarget(tmp_root); + + var store = Store.init(allocator); + defer store.deinit(); + + var explorer = Explorer.init(allocator); + defer explorer.deinit(); - std.debug.print("Generating {d} files Γ— {d} lines = {d} total lines...\n", .{ num_files, lines_per, total_lines }); + var agents = AgentRegistry.init(allocator); + defer agents.deinit(); + _ = try agents.register("__filesystem__"); - const files = try generateCode(allocator, num_files, lines_per); + try watcher.initialScan(&store, &explorer, tmp_root, allocator, false); + + var bench_ctx = mcp.BenchContext.init(allocator, tmp_root); + defer bench_ctx.deinit(); + + var telem_off = telemetry.Telemetry{ .enabled = false }; + var telem_on = telemetry.Telemetry{ .enabled = true }; + + var args_store: [cases.len]std.json.Parsed(std.json.Value) = undefined; defer { - for (files) |f| { - allocator.free(f.name); - allocator.free(f.content); - } - allocator.free(files); + for (&args_store) |*parsed| parsed.deinit(); } - var total_bytes: usize = 0; - for (files) |f| total_bytes += f.content.len; - std.debug.print("Total content: {d} KB\n\n", .{total_bytes / 1024}); - - // ── Index directly into WordIndex + TrigramIndex ── - var wi = WordIndex.init(allocator); - defer wi.deinit(); - var ti = TrigramIndex.init(allocator); - defer ti.deinit(); - - // Also store content for brute force comparison - var contents = std.StringHashMap([]const u8).init(allocator); - defer contents.deinit(); - - var timer = try std.time.Timer.start(); - for (files) |f| { - try wi.indexFile(f.name, f.content); - try ti.indexFile(f.name, f.content); - try contents.put(f.name, f.content); + for (cases, 0..) 
|case, idx| { + args_store[idx] = try std.json.parseFromSlice(std.json.Value, allocator, case.args_json, .{}); } - const index_ns = timer.read(); - std.debug.print("Index {d} files: {d:.1} ms\n", .{ num_files, @as(f64, @floatFromInt(index_ns)) / 1_000_000.0 }); - - // ── Bench: raw word index lookup (zero-alloc) ── - const word_queries = [_][]const u8{ "handleRequest", "AgentRegistry", "allocator", "Explorer", "TrigramIndex" }; - - timer.reset(); - const word_iters: usize = 100_000; - var total_hits: usize = 0; - for (0..word_iters) |_| { - for (word_queries) |q| { - const hits = wi.search(q); - total_hits += hits.len; - } + + var results: [cases.len]ToolBench = undefined; + for (cases, 0..) |case, idx| { + const args = &args_store[idx].value.object; + const base = try runCase(allocator, &bench_ctx, &store, &explorer, &agents, case, args, &telem_off); + const with_telem = try runCase(allocator, &bench_ctx, &store, &explorer, &agents, case, args, &telem_on); + results[idx] = .{ + .tool = case.name, + .avg_latency_ns = base.avg_latency_ns, + .response_bytes = base.response_bytes, + .ops_per_sec = opsPerSec(base.avg_latency_ns), + .telemetry_avg_ns = with_telem.avg_latency_ns, + .telemetry_delta_pct = deltaPct(base.avg_latency_ns, with_telem.avg_latency_ns), + }; } - const word_ns = timer.read(); - const word_total = word_iters * word_queries.len; - std.debug.print("Word lookup Γ—{d}: {d:.1} ms total, {d:.0} ns/query ({d} hits)\n", .{ - word_total, - @as(f64, @floatFromInt(word_ns)) / 1_000_000.0, - @as(f64, @floatFromInt(word_ns)) / @as(f64, @floatFromInt(word_total)), - total_hits / word_iters, - }); - // ── Bench: trigram candidate lookup (with bloom filtering) ── - const tri_queries = [_][]const u8{ "handleRequest", "processData", "AgentRegistry", "pub fn init", "TrigramIndex" }; + const corpus = summarizeCorpus(&explorer); + try writeHumanSummary(allocator, std.fs.File.stderr(), corpus.files, corpus.bytes, &results); + if (emit_json) { + try writeJsonSummary(allocator, std.fs.File.stdout(), repo_root, tmp_root, corpus.files, corpus.bytes, &results); + } +} - timer.reset(); - const tri_iters: usize = 10_000; - for (0..tri_iters) |_| { - for (tri_queries) |q| { - const cands = ti.candidates(q, allocator); - if (cands) |c| allocator.free(c); +fn runCase( + allocator: std.mem.Allocator, + bench_ctx: *mcp.BenchContext, + store: *Store, + explorer: *Explorer, + agents: *AgentRegistry, + case: Case, + args: *const std.json.ObjectMap, + telem: *telemetry.Telemetry, +) !struct { avg_latency_ns: u64, response_bytes: usize } { + var total_ns: u64 = 0; + var response_bytes: usize = 0; + + for (0..case.iterations) |_| { + if (case.tool == .codedb_edit) { + try resetBenchTarget(explorer, store); } + + var timer = try std.time.Timer.start(); + response_bytes = bench_ctx.runToolCall(allocator, case.name, case.tool, args, store, explorer, agents, telem); + const elapsed = timer.read(); + total_ns +|= elapsed; } - const tri_ns = timer.read(); - const tri_total = tri_iters * tri_queries.len; - std.debug.print("Trigram candidates Γ—{d}: {d:.1} ms total, {d:.0} ns/query\n", .{ - tri_total, - @as(f64, @floatFromInt(tri_ns)) / 1_000_000.0, - @as(f64, @floatFromInt(tri_ns)) / @as(f64, @floatFromInt(tri_total)), - }); - // ── Bloom filter effectiveness: candidate set sizes ── - std.debug.print("\n── Bloom Filter Effectiveness ──\n", .{}); - for (tri_queries) |q| { - // Get candidate count with bloom filtering (current behavior) - const bloom_cands = ti.candidates(q, allocator); - const bloom_count = if 
(bloom_cands) |c| blk: { - defer allocator.free(c); - break :blk c.len; - } else num_files; - - // Count candidates from pure trigram intersection (no bloom) - // by counting files present in ALL trigram posting lists - var pure_count: usize = 0; - if (q.len >= 3) { - const tri_count = q.len - 2; - var unique = std.AutoHashMap(Trigram, void).init(allocator); - defer unique.deinit(); - for (0..tri_count) |j| { - const tri = packTrigram( - normalizeChar(q[j]), - normalizeChar(q[j + 1]), - normalizeChar(q[j + 2]), - ); - unique.put(tri, {}) catch {}; - } - - // Collect posting list pointers - var sets: std.ArrayList(*const std.StringHashMap(PostingMask)) = .{}; - defer sets.deinit(allocator); - var all_found = true; - var tri_iter = unique.keyIterator(); - while (tri_iter.next()) |tri_ptr| { - if (ti.index.getPtr(tri_ptr.*)) |file_set| { - sets.append(allocator, file_set) catch {}; - } else { - all_found = false; - break; - } - } - - if (all_found and sets.items.len > 0) { - // Find smallest set, intersect - var min_idx: usize = 0; - var min_count: usize = sets.items[0].count(); - for (sets.items[1..], 1..) |set, idx| { - if (set.count() < min_count) { - min_count = set.count(); - min_idx = idx; - } - } - var it = sets.items[min_idx].keyIterator(); - while (it.next()) |path_ptr| { - var ok = true; - for (sets.items, 0..) |set, idx| { - if (idx == min_idx) continue; - if (!set.contains(path_ptr.*)) { - ok = false; - break; - } - } - if (ok) pure_count += 1; - } - } - } + return .{ + .avg_latency_ns = @intCast(@divTrunc(total_ns, case.iterations)), + .response_bytes = response_bytes, + }; +} + +fn copyCorpus(allocator: std.mem.Allocator, repo_root: []const u8, tmp_root: []const u8) !void { + const files = [_][]const u8{ + "README.md", + "build.zig", + "build.zig.zon", + "src/agent.zig", + "src/bench.zig", + "src/edit.zig", + "src/explore.zig", + "src/git.zig", + "src/index.zig", + "src/lib.zig", + "src/main.zig", + "src/mcp.zig", + "src/root_policy.zig", + "src/server.zig", + "src/snapshot.zig", + "src/snapshot_json.zig", + "src/store.zig", + "src/style.zig", + "src/telemetry.zig", + "src/version.zig", + "src/watcher.zig", + }; + + for (files) |rel| { + const src = try std.fs.path.join(allocator, &.{ repo_root, rel }); + defer allocator.free(src); + const dst = try std.fs.path.join(allocator, &.{ tmp_root, rel }); + defer allocator.free(dst); - // Count actual matches via brute force - var actual_count: usize = 0; - var c_iter = contents.iterator(); - while (c_iter.next()) |entry| { - if (std.mem.indexOf(u8, entry.value_ptr.*, q) != null) { - actual_count += 1; - } + if (std.fs.path.dirname(dst)) |parent| { + try std.fs.cwd().makePath(parent); } - const reduction = if (pure_count > 0) @as(f64, @floatFromInt(pure_count - bloom_count)) / @as(f64, @floatFromInt(pure_count)) * 100.0 else 0.0; - std.debug.print(" \"{s}\":\n trigram-only={d} bloom={d} actual={d} reduction={d:.0}%\n", .{ - q, pure_count, bloom_count, actual_count, reduction, - }); + try std.fs.cwd().copyFile(src, std.fs.cwd(), dst, .{}); } +} - // ── Bench: brute force substring search ── - timer.reset(); - const brute_iters: usize = 1_000; - for (0..brute_iters) |_| { - for (tri_queries) |q| { - var iter = contents.iterator(); - while (iter.next()) |entry| { - _ = std.mem.indexOf(u8, entry.value_ptr.*, q); - } - } +fn writeBenchTarget(tmp_root: []const u8) !void { + var path_buf: [std.fs.max_path_bytes]u8 = undefined; + const path = try std.fmt.bufPrint(&path_buf, "{s}/src/bench_target.zig", .{tmp_root}); + const file = try 
std.fs.cwd().createFile(path, .{ .truncate = true }); + defer file.close(); + try file.writeAll("pub const bench_value = 1;\n"); +} + +fn resetBenchTarget(explorer: *Explorer, store: *Store) !void { + try explorer.indexFile("src/bench_target.zig", "pub const bench_value = 1;\n"); + _ = try store.recordSnapshot("src/bench_target.zig", "pub const bench_value = 1;\n".len, std.hash.Wyhash.hash(0, "pub const bench_value = 1;\n")); +} + +fn summarizeCorpus(explorer: *Explorer) struct { files: usize, bytes: u64 } { + explorer.mu.lockShared(); + defer explorer.mu.unlockShared(); + + var files: usize = 0; + var bytes: u64 = 0; + var iter = explorer.outlines.iterator(); + while (iter.next()) |entry| { + files += 1; + bytes +|= entry.value_ptr.byte_size; } - const brute_ns = timer.read(); - const brute_total = brute_iters * tri_queries.len; - std.debug.print("\nBrute force Γ—{d}: {d:.1} ms total, {d:.0} ns/query\n", .{ - brute_total, - @as(f64, @floatFromInt(brute_ns)) / 1_000_000.0, - @as(f64, @floatFromInt(brute_ns)) / @as(f64, @floatFromInt(brute_total)), + return .{ .files = files, .bytes = bytes }; +} + +fn writeHumanSummary(allocator: std.mem.Allocator, file: std.fs.File, file_count: usize, total_bytes: u64, results: []const ToolBench) !void { + var out: std.ArrayList(u8) = .{}; + defer out.deinit(allocator); + const writer = out.writer(allocator); + try writer.print("── E2E MCP Tool Benchmarks ({d} files, {d}KB) ──\n", .{ file_count, total_bytes / 1024 }); + try writer.writeAll("Tool Latency Size Ops/sec TelemetryΞ”\n"); + for (results) |result| { + var latency_buf: [32]u8 = undefined; + var delta_buf: [32]u8 = undefined; + try writer.print("{s:<17} {s:<10} {d:<8} {d:>8.0} {s}\n", .{ + result.tool, + formatNs(&latency_buf, result.avg_latency_ns), + result.response_bytes, + result.ops_per_sec, + formatPct(&delta_buf, result.telemetry_delta_pct), + }); + } + try file.writeAll(out.items); +} + +fn writeJsonSummary(allocator: std.mem.Allocator, file: std.fs.File, repo_root: []const u8, corpus_root: []const u8, file_count: usize, total_bytes: u64, results: []const ToolBench) !void { + var out: std.ArrayList(u8) = .{}; + defer out.deinit(allocator); + const writer = out.writer(allocator); + try writer.print("{{\"repo_root\":\"{s}\",\"corpus_root\":\"{s}\",\"file_count\":{d},\"total_bytes\":{d},\"tools\":[", .{ + repo_root, + corpus_root, + file_count, + total_bytes, }); + for (results, 0..) 
|result, idx| { + if (idx > 0) try writer.writeByte(','); + try writer.print("{{\"tool\":\"{s}\",\"avg_latency_ns\":{d},\"response_bytes\":{d},\"ops_per_sec\":{d:.3},\"telemetry_avg_ns\":{d},\"telemetry_delta_pct\":{d:.3}}}", .{ + result.tool, + result.avg_latency_ns, + result.response_bytes, + result.ops_per_sec, + result.telemetry_avg_ns, + result.telemetry_delta_pct, + }); + } + try writer.writeAll("]}\n"); + try file.writeAll(out.items); +} + +fn opsPerSec(avg_latency_ns: u64) f64 { + if (avg_latency_ns == 0) return 0; + return @as(f64, 1_000_000_000.0) / @as(f64, @floatFromInt(avg_latency_ns)); +} + +fn deltaPct(base_ns: u64, with_telem_ns: u64) f64 { + if (base_ns == 0) return 0; + const delta = @as(f64, @floatFromInt(with_telem_ns)) - @as(f64, @floatFromInt(base_ns)); + return (delta / @as(f64, @floatFromInt(base_ns))) * 100.0; +} + +fn formatNs(buf: []u8, ns: u64) []const u8 { + if (ns >= std.time.ns_per_ms) { + const whole = ns / std.time.ns_per_ms; + const frac = (ns % std.time.ns_per_ms) / 100_000; + return std.fmt.bufPrint(buf, "{d}.{d}ms", .{ whole, frac }) catch "0ms"; + } + if (ns >= std.time.ns_per_us) { + const whole = ns / std.time.ns_per_us; + const frac = (ns % std.time.ns_per_us) / 100; + return std.fmt.bufPrint(buf, "{d}.{d}us", .{ whole, frac }) catch "0us"; + } + return std.fmt.bufPrint(buf, "{d}ns", .{ns}) catch "0ns"; +} - std.debug.print("\n── Summary ({d} files, {d}K lines, {d} KB) ──\n", .{ num_files, total_lines / 1000, total_bytes / 1024 }); - std.debug.print("Word index: {d:.0} ns/query (zero-alloc hash lookup)\n", .{@as(f64, @floatFromInt(word_ns)) / @as(f64, @floatFromInt(word_total))}); - std.debug.print("Trigram: {d:.0} ns/query (candidate set + bloom filter)\n", .{@as(f64, @floatFromInt(tri_ns)) / @as(f64, @floatFromInt(tri_total))}); - std.debug.print("Brute force: {d:.0} ns/query (linear scan all content)\n", .{@as(f64, @floatFromInt(brute_ns)) / @as(f64, @floatFromInt(brute_total))}); - const speedup_word = @as(f64, @floatFromInt(brute_ns)) / @as(f64, @floatFromInt(brute_total)) / (@as(f64, @floatFromInt(word_ns)) / @as(f64, @floatFromInt(word_total))); - const speedup_tri = @as(f64, @floatFromInt(brute_ns)) / @as(f64, @floatFromInt(brute_total)) / (@as(f64, @floatFromInt(tri_ns)) / @as(f64, @floatFromInt(tri_total))); - std.debug.print("Word vs brute: {d:.0}Γ— faster\n", .{speedup_word}); - std.debug.print("Tri vs brute: {d:.1}Γ— faster\n", .{speedup_tri}); +fn formatPct(buf: []u8, pct: f64) []const u8 { + const abs_pct = @abs(pct); + return std.fmt.bufPrint(buf, "{d:.2}%", .{abs_pct}) catch "0.00%"; } diff --git a/src/main.zig b/src/main.zig index c28f6b9..6f57035 100644 --- a/src/main.zig +++ b/src/main.zig @@ -13,7 +13,6 @@ const snapshot_mod = @import("snapshot.zig"); const telemetry = @import("telemetry.zig"); const root_policy = @import("root_policy.zig"); - /// Thin wrapper: format + write to a File via allocator. 
const Out = struct { file: std.fs.File, @@ -105,7 +104,6 @@ pub fn main() !void { allocator.destroy(ft); }; - if (!std.mem.eql(u8, cmd, "mcp")) { const git_head = git_mod.getGitHead(abs_root, allocator) catch null; @@ -129,10 +127,11 @@ pub fn main() !void { var dur_buf: [64]u8 = undefined; const scan_elapsed = std.time.nanoTimestamp() - t_scan; out.p("{s}\xe2\x9c\x93{s} {s}loaded snapshot{s} {s}{d} files{s} {s}{s}{s}\n", .{ - s.green, s.reset, - s.bold, s.reset, - s.dim, explorer.outlines.count(), s.reset, - sty.durationColor(s, scan_elapsed), sty.formatDuration(&dur_buf, scan_elapsed), s.reset, + s.green, s.reset, + s.bold, s.reset, + s.dim, explorer.outlines.count(), + s.reset, sty.durationColor(s, scan_elapsed), + sty.formatDuration(&dur_buf, scan_elapsed), s.reset, }); } else { const disk_hdr = TrigramIndex.readDiskHeader(data_dir, allocator) catch null; @@ -152,9 +151,10 @@ pub fn main() !void { const scan_elapsed = std.time.nanoTimestamp() - t_scan; var dur_buf: [64]u8 = undefined; out.p("{s}\xe2\x9c\x93{s} {s}indexed{s} {s}{s}{s}\n", .{ - s.green, s.reset, - s.dim, s.reset, - sty.durationColor(s, scan_elapsed), sty.formatDuration(&dur_buf, scan_elapsed), s.reset, + s.green, s.reset, + s.dim, s.reset, + sty.durationColor(s, scan_elapsed), sty.formatDuration(&dur_buf, scan_elapsed), + s.reset, }); if (heads_match) { @@ -198,9 +198,6 @@ pub fn main() !void { } // end else (no snapshot) } - - - if (std.mem.eql(u8, cmd, "tree")) { const t0 = std.time.nanoTimestamp(); const tree = try explorer.getTree(allocator, use_color); @@ -211,7 +208,6 @@ pub fn main() !void { out.p("{s}{s}{s}\n", .{ sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), s.reset, }); - } else if (std.mem.eql(u8, cmd, "outline")) { const path = if (args.len > cmd_args_start) args[cmd_args_start] else { out.p("{s}\xe2\x9c\x97{s} usage: codedb [root] outline {s}{s}\n", .{ @@ -236,25 +232,26 @@ pub fn main() !void { var dur_buf: [64]u8 = undefined; const lang = @tagName(outline.language); out.p("{s}\xe2\x9c\x93{s} {s}{s}{s} {s}{s}{s} {s}{d} lines{s} {s}{s}{s}\n", .{ - s.green, s.reset, - s.bold, path, s.reset, - s.langColor(lang), lang, s.reset, - s.dim, outline.line_count, s.reset, - sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), s.reset, + s.green, s.reset, + s.bold, path, + s.reset, s.langColor(lang), + lang, s.reset, + s.dim, outline.line_count, + s.reset, sty.durationColor(s, elapsed), + sty.formatDuration(&dur_buf, elapsed), s.reset, }); for (outline.symbols.items) |sym| { const kind = @tagName(sym.kind); out.p(" {s}L{d:<5}{s} {s}{s:<14}{s} {s}{s}{s}", .{ - s.dim, sym.line_start, s.reset, - s.kindColor(kind), kind, s.reset, - s.bold, sym.name, s.reset, + s.dim, sym.line_start, s.reset, + s.kindColor(kind), kind, s.reset, + s.bold, sym.name, s.reset, }); if (sym.detail) |d| { out.p(" {s}{s}{s}", .{ s.dim, d, s.reset }); } out.p("\n", .{}); } - } else if (std.mem.eql(u8, cmd, "find")) { const name = if (args.len > cmd_args_start) args[cmd_args_start] else { out.p("{s}\xe2\x9c\x97{s} usage: codedb [root] find {s}{s}\n", .{ @@ -273,12 +270,15 @@ pub fn main() !void { var dur_buf: [64]u8 = undefined; const kind = @tagName(r.symbol.kind); out.p("{s}\xe2\x9c\x93{s} {s}{s}{s} {s}{s}{s} {s}{s}{s}:{s}{d}{s} {s}{s}{s}\n", .{ - s.green, s.reset, - s.kindColor(kind), kind, s.reset, - s.bold, name, s.reset, - s.dim, r.path, s.reset, - s.cyan, r.symbol.line_start, s.reset, - sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), s.reset, + s.green, s.reset, + 
s.kindColor(kind), kind, + s.reset, s.bold, + name, s.reset, + s.dim, r.path, + s.reset, s.cyan, + r.symbol.line_start, s.reset, + sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), + s.reset, }); if (r.symbol.detail) |d| { out.p(" {s}{s}{s}\n", .{ s.dim, d, s.reset }); @@ -288,7 +288,6 @@ pub fn main() !void { s.red, s.reset, s.bold, name, s.reset, }); } - } else if (std.mem.eql(u8, cmd, "search")) { var use_regex = false; var query_arg_start = cmd_args_start; @@ -308,7 +307,10 @@ pub fn main() !void { else try explorer.searchContent(query, allocator, 50); defer { - for (results) |r| { allocator.free(r.path); allocator.free(r.line_text); } + for (results) |r| { + allocator.free(r.path); + allocator.free(r.line_text); + } allocator.free(results); } const elapsed = std.time.nanoTimestamp() - t0; @@ -320,20 +322,21 @@ pub fn main() !void { } else { const mode_label: []const u8 = if (use_regex) " (regex)" else ""; out.p("{s}\xe2\x9c\x93{s} {s}{d}{s} results for {s}\"{s}\"{s}{s} {s}{s}{s}\n", .{ - s.green, s.reset, - s.bold, results.len, s.reset, - s.bold, query, s.reset, mode_label, - sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), s.reset, + s.green, s.reset, + s.bold, results.len, + s.reset, s.bold, + query, s.reset, + mode_label, sty.durationColor(s, elapsed), + sty.formatDuration(&dur_buf, elapsed), s.reset, }); for (results) |r| { out.p(" {s}{s}{s}:{s}{d}{s} {s}\n", .{ - s.cyan, r.path, s.reset, - s.dim, r.line_num, s.reset, + s.cyan, r.path, s.reset, + s.dim, r.line_num, s.reset, r.line_text, }); } } - } else if (std.mem.eql(u8, cmd, "word")) { const word = if (args.len > cmd_args_start) args[cmd_args_start] else { out.p("{s}\xe2\x9c\x97{s} usage: codedb [root] word {s}{s}\n", .{ @@ -352,19 +355,20 @@ pub fn main() !void { }); } else { out.p("{s}\xe2\x9c\x93{s} {s}{d}{s} hits for {s}'{s}'{s} {s}{s}{s}\n", .{ - s.green, s.reset, - s.bold, hits.len, s.reset, - s.bold, word, s.reset, - sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), s.reset, + s.green, s.reset, + s.bold, hits.len, + s.reset, s.bold, + word, s.reset, + sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), + s.reset, }); for (hits) |h| { out.p(" {s}{s}{s}:{s}{d}{s}\n", .{ - s.cyan, h.path, s.reset, - s.dim, h.line_num, s.reset, + s.cyan, h.path, s.reset, + s.dim, h.line_num, s.reset, }); } } - } else if (std.mem.eql(u8, cmd, "hot")) { const t0 = std.time.nanoTimestamp(); const hot = try explorer.getHotFiles(&store, allocator, 10); @@ -375,13 +379,14 @@ pub fn main() !void { const elapsed = std.time.nanoTimestamp() - t0; var dur_buf: [64]u8 = undefined; out.p("{s}\xe2\x9c\x93{s} {s}recently modified{s} {s}{s}{s}\n", .{ - s.green, s.reset, - s.bold, s.reset, - sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), s.reset, + s.green, s.reset, + s.bold, s.reset, + sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), + s.reset, }); for (hot, 1..) 
|path, i| { out.p(" {s}{d}{s} {s}{s}{s}\n", .{ - s.dim, i, s.reset, + s.dim, i, s.reset, s.cyan, path, s.reset, }); } @@ -395,13 +400,14 @@ pub fn main() !void { const elapsed = std.time.nanoTimestamp() - t0; var dur_buf: [64]u8 = undefined; out.p("{s}\xe2\x9c\x93{s} {s}snapshot{s} {s}{s}{s} {s}{d} files{s} {s}{s}{s}\n", .{ - s.green, s.reset, - s.bold, s.reset, - s.cyan, output, s.reset, - s.dim, explorer.outlines.count(), s.reset, - sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), s.reset, + s.green, s.reset, + s.bold, s.reset, + s.cyan, output, + s.reset, s.dim, + explorer.outlines.count(), s.reset, + sty.durationColor(s, elapsed), sty.formatDuration(&dur_buf, elapsed), + s.reset, }); - } else if (std.mem.eql(u8, cmd, "serve")) { const port: u16 = 7719; var agents = AgentRegistry.init(allocator); @@ -421,7 +427,6 @@ pub fn main() !void { std.log.info("codedb: {d} files indexed, listening on :{d}", .{ store.currentSeq(), port }); try server.serve(allocator, &store, &agents, &explorer, &queue, port); - } else if (std.mem.eql(u8, cmd, "mcp")) { var agents = AgentRegistry.init(allocator); defer agents.deinit(); @@ -443,13 +448,29 @@ pub fn main() !void { break :blk snapshot_mod.loadSnapshot("codedb.snapshot", &explorer, &store, allocator); }; + var telemetry_disabled = false; + for (args[cmd_args_start..]) |arg| { + if (std.mem.eql(u8, arg, "--no-telemetry")) { + telemetry_disabled = true; + break; + } + } + + var telem = telemetry.Telemetry.init(data_dir, allocator, telemetry_disabled); + defer telem.deinit(); + telem.recordSessionStart(); + var shutdown = std.atomic.Value(bool).init(false); var scan_done = std.atomic.Value(bool).init(snapshot_loaded); var queue = watcher.EventQueue{}; var scan_thread: ?std.Thread = null; + const startup_t0 = std.time.milliTimestamp(); if (!snapshot_loaded) { - scan_thread = try std.Thread.spawn(.{}, scanBg, .{ &store, &explorer, root, allocator, &scan_done, data_dir, abs_root }); + scan_thread = try std.Thread.spawn(.{}, scanBg, .{ &store, &explorer, root, allocator, &scan_done, data_dir, abs_root, &telem, startup_t0 }); + } else { + const startup_time_ms: u64 = @intCast(@max(std.time.milliTimestamp() - startup_t0, 0)); + telem.recordCodebaseStats(&explorer, startup_time_ms); } const watch_thread = try std.Thread.spawn(.{}, watcher.incrementalLoop, .{ &store, &explorer, &queue, root, &shutdown, &scan_done }); @@ -457,21 +478,6 @@ pub fn main() !void { std.log.info("codedb2 mcp: root={s} files={d} data={s}", .{ abs_root, store.currentSeq(), data_dir }); - var telem = telemetry.Telemetry.init(data_dir, allocator); - defer telem.deinit(); - - for (args[cmd_args_start..]) |arg| { - if (std.mem.eql(u8, arg, "--no-telemetry")) { - telem.enabled = false; - break; - } - } - - telem.record(.{ .session_start = .{ - .file_count = @intCast(@min(explorer.outlines.count(), std.math.maxInt(u32))), - .total_lines = 0, - } }); - mcp_server.run(allocator, &store, &explorer, &agents, abs_root, &telem); shutdown.store(true, .release); @@ -479,7 +485,6 @@ pub fn main() !void { watch_thread.join(); idle_thread.join(); if (scan_thread) |t| t.join(); - } else { out.p("{s}\xe2\x9c\x97{s} unknown command: {s}{s}{s}\n", .{ s.red, s.reset, s.bold, cmd, s.reset, @@ -545,10 +550,14 @@ fn printUsage(out: Out, s: sty.Style) void { s.dim, s.reset, s.dim, s.reset, s.cyan, s.reset, - s.cyan, s.reset, s.dim, s.reset, - s.cyan, s.reset, s.dim, s.reset, - s.cyan, s.reset, s.dim, s.reset, - s.cyan, s.reset, s.dim, s.reset, + s.cyan, s.reset, + s.dim, s.reset, + s.cyan, 
s.reset, + s.dim, s.reset, + s.cyan, s.reset, + s.dim, s.reset, + s.cyan, s.reset, + s.dim, s.reset, s.cyan, s.reset, s.cyan, s.reset, s.cyan, s.reset, @@ -575,7 +584,7 @@ fn reapLoop(agents: *AgentRegistry, shutdown: *std.atomic.Value(bool)) void { } } -fn scanBg(store: *Store, explorer: *Explorer, root: []const u8, allocator: std.mem.Allocator, scan_done: *std.atomic.Value(bool), data_dir: []const u8, abs_root: []const u8) void { +fn scanBg(store: *Store, explorer: *Explorer, root: []const u8, allocator: std.mem.Allocator, scan_done: *std.atomic.Value(bool), data_dir: []const u8, abs_root: []const u8, telem: *telemetry.Telemetry, startup_t0: i64) void { const git_head = git_mod.getGitHead(root, allocator) catch null; const disk_hdr = TrigramIndex.readDiskHeader(data_dir, allocator) catch null; const heads_match = blk: { @@ -598,6 +607,7 @@ fn scanBg(store: *Store, explorer: *Explorer, root: []const u8, allocator: std.m explorer.trigram_index = loaded; explorer.mu.unlock(); scan_done.store(true, .release); + telem.recordCodebaseStats(explorer, @intCast(@max(std.time.milliTimestamp() - startup_t0, 0))); // Auto-write snapshot after successful scan snapshot_mod.writeSnapshotDual(explorer, abs_root, "codedb.snapshot", allocator) catch |err| { std.log.warn("could not auto-write snapshot: {}", .{err}); @@ -614,6 +624,7 @@ fn scanBg(store: *Store, explorer: *Explorer, root: []const u8, allocator: std.m std.log.warn("could not persist trigram index: {}", .{err}); }; scan_done.store(true, .release); + telem.recordCodebaseStats(explorer, @intCast(@max(std.time.milliTimestamp() - startup_t0, 0))); // Auto-write snapshot after successful scan snapshot_mod.writeSnapshotDual(explorer, abs_root, "codedb.snapshot", allocator) catch |err| { diff --git a/src/mcp.zig b/src/mcp.zig index ba5d30d..26e421c 100644 --- a/src/mcp.zig +++ b/src/mcp.zig @@ -156,6 +156,90 @@ const ProjectCache = struct { } }; +pub const BenchContext = struct { + cache: ProjectCache, + + pub fn init(alloc: std.mem.Allocator, default_path: []const u8) BenchContext { + return .{ + .cache = ProjectCache.init(alloc, default_path), + }; + } + + pub fn deinit(self: *BenchContext) void { + self.cache.deinit(); + } + + pub fn runDispatch( + self: *BenchContext, + alloc: std.mem.Allocator, + tool: Tool, + args: *const std.json.ObjectMap, + out: *std.ArrayList(u8), + store: *Store, + explorer: *Explorer, + agents: *AgentRegistry, + ) void { + dispatch(alloc, tool, args, out, store, explorer, agents, &self.cache); + } + + pub fn runToolCall( + self: *BenchContext, + alloc: std.mem.Allocator, + name: []const u8, + tool: Tool, + args: *const std.json.ObjectMap, + store: *Store, + explorer: *Explorer, + agents: *AgentRegistry, + telem: *telemetry_mod.Telemetry, + ) usize { + var out: std.ArrayList(u8) = .{}; + defer out.deinit(alloc); + + const t0 = std.time.nanoTimestamp(); + dispatch(alloc, tool, args, &out, store, explorer, agents, &self.cache); + const elapsed = std.time.nanoTimestamp() - t0; + + const is_error = std.mem.startsWith(u8, out.items, "error:"); + telem.recordToolCall(name, elapsed, is_error, out.items.len); + + var summary: std.ArrayList(u8) = .{}; + defer summary.deinit(alloc); + summary.appendSlice(alloc, if (is_error) MCP_RED ++ MCP_CROSS ++ " " ++ MCP_RESET else MCP_GREEN ++ MCP_CHECK ++ " " ++ MCP_RESET) catch {}; + summary.appendSlice(alloc, mcpToolIcon(name)) catch {}; + mcpGenerateSummary(alloc, name, args, out.items, is_error, &summary); + var dur_buf: [96]u8 = undefined; + summary.appendSlice(alloc, 
mcpFormatDuration(&dur_buf, elapsed)) catch {}; + + var guidance: std.ArrayList(u8) = .{}; + defer guidance.deinit(alloc); + mcpGenerateGuidance(alloc, name, args, is_error, &guidance); + + var result: std.ArrayList(u8) = .{}; + defer result.deinit(alloc); + result.appendSlice(alloc, "{\"content\":[") catch return 0; + + if (summary.items.len > 0) { + result.appendSlice(alloc, "{\"type\":\"text\",\"text\":\"") catch return result.items.len; + writeEscaped(alloc, &result, summary.items); + result.appendSlice(alloc, "\"},") catch return result.items.len; + } + + result.appendSlice(alloc, "{\"type\":\"text\",\"text\":\"") catch return result.items.len; + writeEscaped(alloc, &result, out.items); + result.appendSlice(alloc, "\"}") catch return result.items.len; + + if (guidance.items.len > 0) { + result.appendSlice(alloc, ",{\"type\":\"text\",\"text\":\"") catch return result.items.len; + writeEscaped(alloc, &result, guidance.items); + result.appendSlice(alloc, "\"}") catch return result.items.len; + } + + result.appendSlice(alloc, if (is_error) "],\"isError\":true}" else "],\"isError\":false}") catch return result.items.len; + return result.items.len; + } +}; + // ── Tool definitions ──────────────────────────────────────────────────────── pub const Tool = enum { @@ -305,7 +389,6 @@ pub fn run( } else { if (!is_notification) writeError(alloc, stdout, id, -32601, "Method not found"); } - telem.flush(); } } @@ -442,7 +525,6 @@ fn handleCall( telem.recordToolCall(name, elapsed, is_error, out.items.len); if (is_notification) return; - // Block 1: Human-readable colored summary (ANSI β€” preview pane always renders it) var summary: std.ArrayList(u8) = .{}; defer summary.deinit(alloc); @@ -887,7 +969,6 @@ fn handleSnapshot(alloc: std.mem.Allocator, out: *std.ArrayList(u8), explorer: * out.appendSlice(alloc, snap) catch {}; } - fn handleBundle( alloc: std.mem.Allocator, args: *const std.json.ObjectMap, @@ -1176,7 +1257,6 @@ pub fn isPathSafe(path: []const u8) bool { return true; } - fn writeResult(alloc: std.mem.Allocator, stdout: std.fs.File, id: ?std.json.Value, result: []const u8) void { var buf: std.ArrayList(u8) = .{}; defer buf.deinit(alloc); @@ -1234,23 +1314,23 @@ fn appendId(alloc: std.mem.Allocator, buf: *std.ArrayList(u8), id: ?std.json.Val // ── MCP UX: 3-block response helpers ──────────────────────────────────────── // Colors are always on β€” MCP preview pane always renders ANSI. No TTY check. 
-const MCP_RESET = "\x1b[0m"; -const MCP_BOLD = "\x1b[1m"; -const MCP_DIM = "\x1b[2m"; -const MCP_GREEN = "\x1b[32m"; -const MCP_RED = "\x1b[31m"; -const MCP_CYAN = "\x1b[36m"; -const MCP_YELLOW = "\x1b[33m"; -const MCP_MAGENTA = "\x1b[35m"; -const MCP_BLUE = "\x1b[34m"; +const MCP_RESET = "\x1b[0m"; +const MCP_BOLD = "\x1b[1m"; +const MCP_DIM = "\x1b[2m"; +const MCP_GREEN = "\x1b[32m"; +const MCP_RED = "\x1b[31m"; +const MCP_CYAN = "\x1b[36m"; +const MCP_YELLOW = "\x1b[33m"; +const MCP_MAGENTA = "\x1b[35m"; +const MCP_BLUE = "\x1b[34m"; const MCP_BRIGHT_GREEN = "\x1b[92m"; -const MCP_CHECK = "\xe2\x9c\x93"; // βœ“ -const MCP_CROSS = "\xe2\x9c\x97"; // βœ— -const MCP_DASH = " \xe2\x80\x94 "; // β€” -const MCP_ARROW = "\xe2\x86\x92 "; // β†’ -const MCP_DOT = "\xe2\x80\xa2 "; // β€’ -const MCP_ZAP = "\xe2\x9a\xa1"; // ⚑ +const MCP_CHECK = "\xe2\x9c\x93"; // βœ“ +const MCP_CROSS = "\xe2\x9c\x97"; // βœ— +const MCP_DASH = " \xe2\x80\x94 "; // β€” +const MCP_ARROW = "\xe2\x86\x92 "; // β†’ +const MCP_DOT = "\xe2\x80\xa2 "; // β€’ +const MCP_ZAP = "\xe2\x9a\xa1"; // ⚑ fn mcpFormatDuration(buf: []u8, ns: i128) []const u8 { if (ns <= 0) return ""; @@ -1279,17 +1359,17 @@ fn mcpFormatDuration(buf: []u8, ns: i128) []const u8 { } fn mcpToolIcon(tool_name: []const u8) []const u8 { - if (eql(tool_name, "codedb_outline")) return MCP_BLUE ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_symbol")) return MCP_BLUE ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_read")) return MCP_BLUE ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_search")) return MCP_MAGENTA ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_word")) return MCP_CYAN ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_edit")) return MCP_YELLOW ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_tree")) return MCP_GREEN ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_hot")) return MCP_YELLOW ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_deps")) return MCP_CYAN ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_changes")) return MCP_YELLOW ++ MCP_DOT ++ MCP_RESET; - if (eql(tool_name, "codedb_bundle")) return MCP_MAGENTA ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_outline")) return MCP_BLUE ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_symbol")) return MCP_BLUE ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_read")) return MCP_BLUE ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_search")) return MCP_MAGENTA ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_word")) return MCP_CYAN ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_edit")) return MCP_YELLOW ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_tree")) return MCP_GREEN ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_hot")) return MCP_YELLOW ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_deps")) return MCP_CYAN ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_changes")) return MCP_YELLOW ++ MCP_DOT ++ MCP_RESET; + if (eql(tool_name, "codedb_bundle")) return MCP_MAGENTA ++ MCP_DOT ++ MCP_RESET; return MCP_DIM ++ MCP_DOT ++ MCP_RESET; } diff --git a/src/telemetry.zig b/src/telemetry.zig index ac70aaa..e91273e 100644 --- a/src/telemetry.zig +++ b/src/telemetry.zig @@ -1,10 +1,14 @@ const std = @import("std"); +const builtin = @import("builtin"); +const explore = @import("explore.zig"); +const index = @import("index.zig"); const RING_SIZE = 256; const CLOUD_URL = "https://codedb.codegraff.com/telemetry/ingest"; +const VERSION = "0.2.0"; +const PLATFORM = std.fmt.comptimePrint("{s}-{s}", 
.{ @tagName(builtin.os.tag), @tagName(builtin.cpu.arch) }); pub const Event = struct { - ts: i64, kind: Kind, pub const Kind = union(enum) { @@ -15,9 +19,13 @@ pub const Event = struct { err: bool, response_bytes: u32, }, - session_start: struct { + session_start: void, + codebase_stats: struct { file_count: u32, total_lines: u64, + language_mask: u16, + index_size_bytes: u64, + startup_time_ms: u64, }, }; }; @@ -32,10 +40,10 @@ pub const Telemetry = struct { path_buf: [std.fs.max_path_bytes]u8 = undefined, path_len: usize = 0, - pub fn init(data_dir: []const u8, allocator: std.mem.Allocator) Telemetry { + pub fn init(data_dir: []const u8, allocator: std.mem.Allocator, disabled: bool) Telemetry { var self = Telemetry{}; - if (std.process.hasEnvVarConstant("CODEDB_NO_TELEMETRY")) { + if (disabled or std.process.hasEnvVarConstant("CODEDB_NO_TELEMETRY")) { self.enabled = false; return self; } @@ -58,24 +66,23 @@ pub const Telemetry = struct { if (self.enabled) self.syncToCloud(); } - /// Hot path β€” no allocation, no syscall, no blocking. - /// Just copies into the next ring slot. pub fn record(self: *Telemetry, kind: Event.Kind) void { if (!self.enabled) return; - const slot = self.head.fetchAdd(1, .monotonic) % RING_SIZE; + const next = self.head.fetchAdd(1, .monotonic); + const slot = next % RING_SIZE; self.ring[slot] = .{ - .ts = std.time.timestamp(), .kind = kind, }; - // Advance tail if we wrapped (drop oldest) - const head = self.head.load(.monotonic); const tail = self.tail.load(.monotonic); - if (head -% tail > RING_SIZE) { - self.tail.store(head -% RING_SIZE, .monotonic); + if ((next + 1) -% tail > RING_SIZE) { + self.tail.store((next + 1) -% RING_SIZE, .monotonic); } } - /// Convenience for the handleCall hot path. + pub fn recordSessionStart(self: *Telemetry) void { + self.record(.{ .session_start = {} }); + } + pub fn recordToolCall(self: *Telemetry, tool_name: []const u8, latency_ns: i128, is_error: bool, response_bytes: usize) void { var tc: Event.Kind = .{ .tool_call = .{ .latency_ns = latency_ns, @@ -88,7 +95,33 @@ pub const Telemetry = struct { self.record(tc); } - /// Cold path β€” called on idle or shutdown. Drains ring to disk. + pub fn recordCodebaseStats(self: *Telemetry, explorer: *explore.Explorer, startup_time_ms: u64) void { + if (!self.enabled) return; + + explorer.mu.lockShared(); + defer explorer.mu.unlockShared(); + + var file_count: u32 = 0; + var total_lines: u64 = 0; + var language_mask: u16 = 0; + + var outline_iter = explorer.outlines.iterator(); + while (outline_iter.next()) |entry| { + file_count +|= 1; + total_lines +|= entry.value_ptr.line_count; + const bit_index: u4 = @intCast(@intFromEnum(entry.value_ptr.language)); + language_mask |= @as(u16, 1) << bit_index; + } + + self.record(.{ .codebase_stats = .{ + .file_count = file_count, + .total_lines = total_lines, + .language_mask = language_mask, + .index_size_bytes = approxIndexSizeBytes(explorer), + .startup_time_ms = startup_time_ms, + } }); + } + pub fn flush(self: *Telemetry) void { const f = self.file orelse return; const tail = self.tail.load(.monotonic); @@ -104,8 +137,6 @@ pub const Telemetry = struct { self.tail.store(head, .monotonic); } - /// Fire-and-forget: spawn detached shell to POST ndjson to cloud. - /// Runs on shutdown β€” never blocks the MCP loop. 
     fn syncToCloud(self: *Telemetry) void {
         if (!self.enabled or self.path_len == 0) return;
         const path = self.path_buf[0..self.path_len];
@@ -113,7 +144,6 @@
         const stat = std.fs.cwd().statFile(path) catch return;
         if (stat.size == 0) return;

-        // Build shell command: POST file, truncate on success
         var cmd_buf: [2048]u8 = undefined;
         const cmd = std.fmt.bufPrint(&cmd_buf, "curl -sf -X POST {s} -H 'Content-Type: application/json' --data-binary @{s} >/dev/null 2>&1 && : > {s}", .{ CLOUD_URL, path, path }) catch return;
@@ -127,21 +157,29 @@
     fn formatEvent(self: *Telemetry, ev: *const Event) !usize {
         var fbs = std.io.fixedBufferStream(&self.buf);
         const w = fbs.writer();
-        try w.print("{{\"ts\":{d}", .{ev.ts});
+        try w.print("{{\"timestamp_ms\":{d}", .{std.time.milliTimestamp()});
         switch (ev.kind) {
             .tool_call => |tc| {
                 const name = tc.tool[0..tc.tool_len];
-                try w.print(",\"ev\":\"tool\",\"tool\":\"{s}\",\"ns\":{d},\"err\":{s},\"bytes\":{d}", .{
+                try w.print(",\"event_type\":\"tool_call\",\"tool\":\"{s}\",\"latency_ns\":{d},\"error\":{s},\"response_bytes\":{d}", .{
                     name,
                     @as(i64, @intCast(@min(tc.latency_ns, std.math.maxInt(i64)))),
                     if (tc.err) "true" else "false",
                     tc.response_bytes,
                 });
             },
-            .session_start => |ss| {
-                try w.print(",\"ev\":\"start\",\"files\":{d},\"lines\":{d}", .{
-                    ss.file_count,
-                    ss.total_lines,
+            .session_start => {
+                try w.print(",\"event_type\":\"session_start\",\"version\":\"{s}\",\"platform\":\"{s}\"", .{ VERSION, PLATFORM });
+            },
+            .codebase_stats => |stats| {
+                try w.print(",\"event_type\":\"codebase_stats\",\"file_count\":{d},\"total_lines\":{d},\"languages\":[", .{
+                    stats.file_count,
+                    stats.total_lines,
+                });
+                try writeLanguages(w, stats.language_mask);
+                try w.print("],\"index_size_bytes\":{d},\"startup_time_ms\":{d}", .{
+                    stats.index_size_bytes,
+                    stats.startup_time_ms,
                 });
             },
         }
@@ -149,3 +187,67 @@
         return fbs.pos;
     }
 };
+
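+/// Writes the languages named by `language_mask` as a JSON string array.
+/// Bit positions follow the order of the `names` table below (bit 0 = "zig").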
+fn writeLanguages(writer: anytype, language_mask: u16) !void {
+    const names = [_][]const u8{
+        "zig",
+        "c",
+        "cpp",
+        "python",
+        "javascript",
+        "typescript",
+        "rust",
+        "go_lang",
+        "markdown",
+        "json",
+        "yaml",
+        "unknown",
+    };
+    var first = true;
+    for (names, 0..) |name, idx| {
+        const bit_index: u4 = @intCast(idx);
+        if ((language_mask & (@as(u16, 1) << bit_index)) == 0) continue;
+        if (!first) try writer.writeByte(',');
+        first = false;
+        try writer.print("\"{s}\"", .{name});
+    }
+}
+
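+/// Best-effort estimate of in-memory index size: sums key and posting bytes
+/// across the word, trigram, and sparse n-gram indexes. Hash-map overhead is
+/// not counted, so treat the result as a lower bound.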
+fn approxIndexSizeBytes(explorer: *const explore.Explorer) u64 {
+    var total: u64 = 0;
+
+    var word_iter = explorer.word_index.index.iterator();
+    while (word_iter.next()) |entry| {
+        total +|= entry.key_ptr.*.len;
+        total +|= entry.value_ptr.items.len * @sizeOf(@TypeOf(entry.value_ptr.items[0]));
+    }
+
+    var file_words_iter = explorer.word_index.file_words.iterator();
+    while (file_words_iter.next()) |entry| {
+        total +|= entry.value_ptr.count() * @sizeOf(usize);
+    }
+
+    var trigram_iter = explorer.trigram_index.index.iterator();
+    while (trigram_iter.next()) |entry| {
+        total +|= @sizeOf(@TypeOf(entry.key_ptr.*));
+        total +|= entry.value_ptr.count() * (@sizeOf(usize) + @sizeOf(index.PostingMask));
+    }
+
+    var file_trigrams_iter = explorer.trigram_index.file_trigrams.iterator();
+    while (file_trigrams_iter.next()) |entry| {
+        total +|= entry.value_ptr.items.len * @sizeOf(@TypeOf(entry.value_ptr.items[0]));
+    }
+
+    var sparse_iter = explorer.sparse_ngram_index.index.iterator();
+    while (sparse_iter.next()) |entry| {
+        total +|= @sizeOf(@TypeOf(entry.key_ptr.*));
+        total +|= entry.value_ptr.count() * @sizeOf(usize);
+    }
+
+    var file_sparse_iter = explorer.sparse_ngram_index.file_ngrams.iterator();
+    while (file_sparse_iter.next()) |entry| {
+        total +|= entry.value_ptr.items.len * @sizeOf(@TypeOf(entry.value_ptr.items[0]));
+    }
+
+    return total;
+}
diff --git a/src/tests.zig b/src/tests.zig
index 804cb21..c0b2616 100644
--- a/src/tests.zig
+++ b/src/tests.zig
@@ -30,6 +30,7 @@ const isCommentOrBlank = explore.isCommentOrBlank;
 const Language = explore.Language;
 const mcp_mod = @import("mcp.zig");
 const snapshot_mod = @import("snapshot.zig");
+const telemetry_mod = @import("telemetry.zig");

 // ── Store tests ─────────────────────────────────────────────

 test "store: record and retrieve snapshots" {
@@ -335,7 +336,6 @@ test "trigram index: re-index removes old trigrams" {
     try testing.expect(c3 != null and c3.?.len == 1);
 }

-
 // ── Sparse N-gram tests ─────────────────────────────────────

 test "pairWeight: deterministic" {
@@ -449,8 +449,6 @@ test "extractSparseNgrams: coverage with force-split remainder 2 (len=18)" {
     for (covered) |c| try testing.expect(c);
 }

-
-
 test "extractSparseNgrams: ngram length bounds" {
     const content = "abcdefghijklmnopqrstuvwxyz0123456789";
     const ng = try extractSparseNgrams(content, testing.allocator);
@@ -503,8 +501,6 @@ test "sparse ngram index: index and candidate lookup" {
     try testing.expect(!found_bar);
 }

-
-
 test "sparse ngram index: short query returns null" {
     var sni = SparseNgramIndex.init(testing.allocator);
     defer sni.deinit();
@@ -558,7 +554,9 @@ test "sparse ngram candidates: sliding window finds file with short n-gram" {

     var found_a = false;
     if (cands) |cs| {
-        for (cs) |p| if (std.mem.eql(u8, p, "a.zig")) { found_a = true; };
+        for (cs) |p| if (std.mem.eql(u8, p, "a.zig")) {
+            found_a = true;
+        };
     }
     try testing.expect(found_a);
 }
@@ -590,11 +588,12 @@ test "explorer: searchContent finds query embedded in longer identifier" {
     const results = try explorer.searchContent("record", arena.allocator(), 10);

     var found = false;
-    for (results) |r| if (std.mem.eql(u8, r.path, "alpha.zig")) { found = true; };
+    for (results) |r| if (std.mem.eql(u8, r.path, "alpha.zig")) {
+        found = true;
+    };
     try testing.expect(found);
 }

-
 // ── Frequency-weighted pairWeight tests ─────────────────────

 test "pairWeight: common pairs have lower weight than rare pairs" {
@@ -615,7 +614,7 @@ test "pairWeight: frequency-weighted produces fewer boundaries for common text"
     // (interior weights are low and similar), giving fewer n-grams than a
     // string of rare pairs.
     const common = "thehereinandonthere";
-    const rare = "qxzjvkqxzjvkqxzjvk";
+    const rare = "qxzjvkqxzjvkqxzjvk";
     const ng_common = try extractSparseNgrams(common, testing.allocator);
     defer testing.allocator.free(ng_common);
     const ng_rare = try extractSparseNgrams(rare, testing.allocator);
@@ -708,8 +707,8 @@ test "setFrequencyTable / resetFrequencyTable: pairWeight output changes" {
     setFrequencyTable(&custom);
     defer resetFrequencyTable();

-    const after_th = pairWeight('t', 'h');
-    const after_qx = pairWeight('q', 'x');
+    const after_th = pairWeight('t', 'h');
+    const after_qx = pairWeight('q', 'x');

     // After swap: 'th' should be lower (we set it to 0x1000 vs default table's 0x1000 — same).
     // What definitely changes: 'qx' base shifts from 0xFE00 to 0xFE00 (custom kept it high).
@@ -721,7 +720,6 @@ test "setFrequencyTable / resetFrequencyTable: pairWeight output changes" {
     _ = after_qx;
 }

-
 // ── Explorer tests ──────────────────────────────────────────

 test "explorer: index file and get outline" {
@@ -777,7 +775,10 @@ test "explorer: searchContent with trigram acceleration" {

     const results = try explorer.searchContent("recordSnapshot", testing.allocator, 50);
     defer {
-        for (results) |r| { testing.allocator.free(r.path); testing.allocator.free(r.line_text); }
+        for (results) |r| {
+            testing.allocator.free(r.path);
+            testing.allocator.free(r.line_text);
+        }
         testing.allocator.free(results);
     }
@@ -853,10 +854,20 @@ test "file versions: append and latest" {
     defer fv.deinit();

     try fv.versions.append(testing.allocator, .{
-        .seq = 1, .agent = 0, .timestamp = 0, .op = .snapshot, .hash = 0x11, .size = 100,
+        .seq = 1,
+        .agent = 0,
+        .timestamp = 0,
+        .op = .snapshot,
+        .hash = 0x11,
+        .size = 100,
     });
     try fv.versions.append(testing.allocator, .{
-        .seq = 2, .agent = 0, .timestamp = 0, .op = .replace, .hash = 0x22, .size = 150,
+        .seq = 2,
+        .agent = 0,
+        .timestamp = 0,
+        .op = .replace,
+        .hash = 0x22,
+        .size = 150,
     });

     const latest = fv.latest().?;
@@ -869,13 +880,28 @@ test "file versions: countSince" {
     defer fv.deinit();

     try fv.versions.append(testing.allocator, .{
-        .seq = 1, .agent = 0, .timestamp = 0, .op = .snapshot, .hash = 0, .size = 0,
+        .seq = 1,
+        .agent = 0,
+        .timestamp = 0,
+        .op = .snapshot,
+        .hash = 0,
+        .size = 0,
     });
     try fv.versions.append(testing.allocator, .{
-        .seq = 5, .agent = 0, .timestamp = 0, .op = .replace, .hash = 0, .size = 0,
+        .seq = 5,
+        .agent = 0,
+        .timestamp = 0,
+        .op = .replace,
+        .hash = 0,
+        .size = 0,
     });
     try fv.versions.append(testing.allocator, .{
-        .seq = 10, .agent = 0, .timestamp = 0, .op = .delete, .hash = 0, .size = 0,
+        .seq = 10,
+        .agent = 0,
+        .timestamp = 0,
+        .op = .delete,
+        .hash = 0,
+        .size = 0,
     });

     try testing.expect(fv.countSince(0) == 3);
@@ -905,7 +931,10 @@ test "explorer: reindex OOM keeps prior outline reachable" {
     // Old content should be replaced
     const old_results = try explorer.searchContent("oldName", testing.allocator, 10);
     defer {
-        for (old_results) |r| { testing.allocator.free(r.path); testing.allocator.free(r.line_text); }
+        for (old_results) |r| {
+            testing.allocator.free(r.path);
+            testing.allocator.free(r.line_text);
+        }
         testing.allocator.free(old_results);
     }
     try testing.expect(old_results.len == 0);
@@ -913,7 +942,10 @@ test "explorer: reindex OOM keeps prior outline reachable" {
     // New content should be searchable
     const new_results = try explorer.searchContent("newName", testing.allocator, 10);
     defer {
-        for (new_results) |r| { testing.allocator.free(r.path); testing.allocator.free(r.line_text); }
+        for (new_results) |r| {
+            testing.allocator.free(r.path);
+            testing.allocator.free(r.line_text);
+        }
         testing.allocator.free(new_results);
     }
     try testing.expect(new_results.len == 1);
@@ -1141,7 +1173,10 @@ test "regression #2: searchContent frees trigram candidate slice" {

     const results = try explorer.searchContent("recordSnapshot", testing.allocator, 50);
     defer {
-        for (results) |r| { testing.allocator.free(r.path); testing.allocator.free(r.line_text); }
+        for (results) |r| {
+            testing.allocator.free(r.path);
+            testing.allocator.free(r.line_text);
+        }
         testing.allocator.free(results);
     }
     try testing.expect(results.len == 1);
@@ -1159,7 +1194,10 @@ test "regression #2: searchContent no leak on zero results" {
     // "abcxyz" shares trigrams "abc" but won't match full text
     const results = try explorer.searchContent("abcxyz", testing.allocator, 50);
     defer {
-        for (results) |r| { testing.allocator.free(r.path); testing.allocator.free(r.line_text); }
+        for (results) |r| {
+            testing.allocator.free(r.path);
+            testing.allocator.free(r.line_text);
+        }
         testing.allocator.free(results);
     }
     try testing.expect(results.len == 0);
@@ -1174,7 +1212,10 @@ test "regression #2: searchContent short query skips trigrams" {

     const results = try explorer.searchContent("ab", testing.allocator, 50);
     defer {
-        for (results) |r| { testing.allocator.free(r.path); testing.allocator.free(r.line_text); }
+        for (results) |r| {
+            testing.allocator.free(r.path);
+            testing.allocator.free(r.line_text);
+        }
         testing.allocator.free(results);
     }
     try testing.expect(results.len == 1);
@@ -1369,7 +1410,10 @@ test "regression: searchContent frees empty trigram candidate slice" {

     const results = try explorer.searchContent("zzzzz", testing.allocator, 50);
     defer {
-        for (results) |r| { testing.allocator.free(r.path); testing.allocator.free(r.line_text); }
+        for (results) |r| {
+            testing.allocator.free(r.path);
+            testing.allocator.free(r.line_text);
+        }
         testing.allocator.free(results);
     }
     try testing.expect(results.len == 0);
@@ -2240,11 +2284,18 @@ test "regexMatch: alternation with many branches does not stack overflow" {
     var pos: usize = 0;
     var bi: usize = 0;
     while (bi < 300) : (bi += 1) {
-        if (bi > 0) { buf[pos] = '|'; pos += 1; }
-        buf[pos] = 'a'; pos += 1;
-        buf[pos] = @as(u8, @intCast('0' + bi / 100 % 10)); pos += 1;
-        buf[pos] = @as(u8, @intCast('0' + bi / 10 % 10)); pos += 1;
-        buf[pos] = @as(u8, @intCast('0' + bi % 10)); pos += 1;
+        if (bi > 0) {
+            buf[pos] = '|';
+            pos += 1;
+        }
+        buf[pos] = 'a';
+        pos += 1;
+        buf[pos] = @as(u8, @intCast('0' + bi / 100 % 10));
+        pos += 1;
+        buf[pos] = @as(u8, @intCast('0' + bi / 10 % 10));
+        pos += 1;
+        buf[pos] = @as(u8, @intCast('0' + bi % 10));
+        pos += 1;
     }
     const pattern = buf[0..pos];
     try testing.expect(regexMatch("a000", pattern));
@@ -2298,7 +2349,6 @@ test "explorer: searchContentRegex no match" {
     try testing.expectEqual(@as(usize, 0), results.len);
 }

-
 // ── Bloom filter correctness tests ──────────────────────────
 // These tests prove that the PostingMask (nextMask + locMask) bloom
 // filters are actually working — reducing false-positive candidates
@@ -2729,8 +2779,7 @@ test "perf regression: word index lookup under 100ns per query" {

     for (0..100) |i| {
         const name = try std.fmt.allocPrint(alloc, "src_{d}.zig", .{i});
-        const content = try std.fmt.allocPrint(alloc,
-            "pub fn handleRequest_{d}(ctx: *Context) void {{}}\nconst allocator = getDefaultAllocator();\n", .{i});
+        const content = try std.fmt.allocPrint(alloc, "pub fn handleRequest_{d}(ctx: *Context) void {{}}\nconst allocator = getDefaultAllocator();\n", .{i});
         try wi.indexFile(name, content);
     }
@@ -2759,8 +2808,7 @@ test "perf regression: bloom filter reduces scan work" {

     for (0..50) |i| {
         const name = try std.fmt.allocPrint(alloc, "f{d:0>2}.zig", .{i});
-        const content = try std.fmt.allocPrint(alloc,
-            "pub fn init_{d}(allocator: Allocator) void {{}}\nfn deinit_{d}() void {{}}\n", .{ i, i });
+        const content = try std.fmt.allocPrint(alloc, "pub fn init_{d}(allocator: Allocator) void {{}}\nfn deinit_{d}() void {{}}\n", .{ i, i });
         try ti.indexFile(name, content);
     }
@@ -2938,7 +2986,6 @@ test "disk index: fileCount matches after round-trip" {
     try testing.expectEqual(@as(u32, 3), loaded_ti.fileCount());
 }

-
 // ── Git HEAD + disk index tests ─────────────────────────────

 test "git: getGitHead returns 40-char hex SHA in a git repo" {
@@ -3197,7 +3244,10 @@ test "issue-44: snapshot stale after working tree changes cause stale query resu
     // Current (bug): results.len == 0 — stale snapshot content is never evicted.
     const results = try exp2.searchContent("newFunc", testing.allocator, 10);
     defer {
-        for (results) |r| { testing.allocator.free(r.path); testing.allocator.free(r.line_text); }
+        for (results) |r| {
+            testing.allocator.free(r.path);
+            testing.allocator.free(r.line_text);
+        }
         testing.allocator.free(results);
     }
     try testing.expect(results.len == 1);
@@ -3231,7 +3281,6 @@ test "issue-46: empty-repo snapshot rejected on load" {
     try testing.expect(loaded);
 }

-
 // ── Snapshot non-git tests ──────────────────────────────────

 test "issue-45: snapshot written in non-git directory cannot be loaded" {
@@ -3431,6 +3480,52 @@ test "issue-41: snapshot not validated against repo identity allows cross-projec
     try testing.expect(!loaded);
 }

+test "issue-59: telemetry writes session, tool, and codebase stats ndjson" {
+    var tmp = testing.tmpDir(.{});
+    defer tmp.cleanup();
+
+    var path_buf: [std.fs.max_path_bytes]u8 = undefined;
+    const dir_path = try tmp.dir.realpath(".", &path_buf);
+
+    var telem = telemetry_mod.Telemetry.init(dir_path, testing.allocator, false);
+    defer telem.deinit();
+
+    telem.recordSessionStart();
+    telem.recordToolCall("codedb_status", 1234, false, 56);
+
+    var explorer = Explorer.init(testing.allocator);
+    defer explorer.deinit();
+    try explorer.indexFile("src/main.zig", "pub fn main() void {}\n");
+    try explorer.indexFile("src/lib.py", "def run():\n    return 1\n");
+
+    telem.recordCodebaseStats(&explorer, 42);
+    telem.flush();
+
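+    // flush() drains the in-memory ring to telemetry.ndjson (one JSON object
+    // per line), so the assertions below can scan the raw file contents.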
+    const ndjson_path = try std.fmt.allocPrint(testing.allocator, "{s}/telemetry.ndjson", .{dir_path});
+    defer testing.allocator.free(ndjson_path);
+
+    const contents = try std.fs.cwd().readFileAlloc(testing.allocator, ndjson_path, 64 * 1024);
+    defer testing.allocator.free(contents);
+
+    try testing.expect(std.mem.indexOf(u8, contents, "\"event_type\":\"session_start\"") != null);
+    try testing.expect(std.mem.indexOf(u8, contents, "\"event_type\":\"tool_call\"") != null);
+    try testing.expect(std.mem.indexOf(u8, contents, "\"tool\":\"codedb_status\"") != null);
+    try testing.expect(std.mem.indexOf(u8, contents, "\"event_type\":\"codebase_stats\"") != null);
+    try testing.expect(std.mem.indexOf(u8, contents, "\"startup_time_ms\":42") != null);
+    try testing.expect(std.mem.indexOf(u8, contents, "\"languages\":[\"zig\",\"python\"]") != null);
+}
+
+test "issue-60: telemetry disabled path is a no-op" {
+    var telem = telemetry_mod.Telemetry.init("/tmp", testing.allocator, true);
+    defer telem.deinit();
+
+    telem.recordSessionStart();
+    telem.recordToolCall("codedb_search", 99, true, 10);
+    try testing.expect(!telem.enabled);
+    try testing.expect(telem.file == null);
+    try testing.expect(telem.head.load(.monotonic) == 0);
+}
+
 test "issue-77: mcp index accepts temporary-directory roots that cause pathological cache growth" {
     var tmp_name_buf: [128]u8 = undefined;
     const tmp_name = try std.fmt.bufPrint(&tmp_name_buf, "codedb-issue-77-{d}", .{std.time.microTimestamp()});