fix(git): cap and stream untracked-file line counting (#2218)

pauldambra · claude · web-flow · commit a017c555cb34 · 2026-05-19T11:49:21.000+01:00
Co-authored-by: Claude Opus 4.7 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/packages/git/src/queries.test.ts b/packages/git/src/queries.test.ts
@@ -6,6 +6,7 @@ import { createGitClient } from "./client";
 import {
   detectDefaultBranch,
   getBranchDiffPatchesByPath,
+  getChangedFilesDetailed,
   splitUnifiedDiffByFile,
 } from "./queries";
 
@@ -242,3 +243,72 @@ describe("getBranchDiffPatchesByPath", () => {
     }
   });
 });
+
+// Picked to land well past the default 64KB highWaterMark of
+// `createReadStream` so the regression test actually exercises the
+// across-chunk path of the streaming line counter.
+const LINE_COUNT_LARGER_THAN_READ_STREAM_CHUNK = 800_000;
+
+describe("getChangedFilesDetailed > untracked line counts", () => {
+  let repoDir: string;
+
+  afterEach(async () => {
+    if (repoDir) {
+      await rm(repoDir, { recursive: true, force: true });
+      repoDir = "";
+    }
+  });
+
+  it.each([
+    { name: "trailing newline", content: "a\nb\nc\n", expected: 3 },
+    { name: "no trailing newline", content: "a\nb\nc", expected: 3 },
+    { name: "single byte, no newline", content: "a", expected: 1 },
+    { name: "lone newline", content: "\n", expected: 1 },
+    { name: "consecutive newlines", content: "\n\n", expected: 2 },
+    // CRLF: legacy `split("\n")` counted only `\n` separators, so
+    // `"a\r\nb\r\n"` -> 2 lines. Byte-counter matches.
+    { name: "CRLF endings", content: "a\r\nb\r\n", expected: 2 },
+  ])("counts $name as $expected line(s)", async ({ content, expected }) => {
+    repoDir = await setupRepo();
+    await writeFile(path.join(repoDir, "f.txt"), content);
+
+    const files = await getChangedFilesDetailed(repoDir);
+    const f = files.find((file) => file.path === "f.txt");
+
+    expect(f).toMatchObject({ status: "untracked", linesAdded: expected });
+  });
+
+  it("reports 0 lines for empty untracked files", async () => {
+    repoDir = await setupRepo();
+    await writeFile(path.join(repoDir, "empty.txt"), "");
+
+    const files = await getChangedFilesDetailed(repoDir);
+    const empty = files.find((f) => f.path === "empty.txt");
+
+    expect(empty).toMatchObject({ status: "untracked", linesAdded: 0 });
+  });
+
+  // Regression guard for the OOM in #2218. Before the fix `countFileLines`
+  // read each untracked file's full content into memory via
+  // `fs.readFile(..., "utf-8")`, 16-way concurrent against every untracked
+  // path returned by `streamGitStatus` (up to 50k). On a monorepo with
+  // multi-MB build artifacts this exhausted the main-process V8 heap
+  // (`16 * file_bytes * 2` for V8's UTF-16) and froze the renderer waiting
+  // on the dead tRPC call. The fix stream-counts via `createReadStream`,
+  // so peak per-stream memory is ~64KB regardless of file size — the
+  // multi-MB case below would have OOM'd pre-fix and must still report an
+  // accurate line count.
+  it("stream-counts untracked files larger than the streaming chunk size", async () => {
+    repoDir = await setupRepo();
+    const content = "a\n".repeat(LINE_COUNT_LARGER_THAN_READ_STREAM_CHUNK);
+    await writeFile(path.join(repoDir, "huge.txt"), content);
+
+    const files = await getChangedFilesDetailed(repoDir);
+    const huge = files.find((f) => f.path === "huge.txt");
+
+    expect(huge).toMatchObject({
+      status: "untracked",
+      linesAdded: LINE_COUNT_LARGER_THAN_READ_STREAM_CHUNK,
+    });
+  });
+});
diff --git a/packages/git/src/queries.ts b/packages/git/src/queries.ts
@@ -1,3 +1,4 @@
+import { createReadStream } from "node:fs";
 import * as fs from "node:fs/promises";
 import * as path from "node:path";
 import type { CreateGitClientOptions } from "./client";
@@ -413,11 +414,55 @@ function matchesExcludePattern(filePath: string, patterns: string[]): boolean {
   });
 }
 
-async function countFileLines(filePath: string): Promise<number> {
+async function countFileLines(
+  filePath: string,
+  options?: { signal?: AbortSignal },
+): Promise<number> {
   try {
-    const content = await fs.readFile(filePath, "utf-8");
-    if (!content) return 0;
-    return content.split("\n").length - (content.endsWith("\n") ? 1 : 0);
+    // `lstat` instead of `stat` so an untracked symlink (rare, but legal)
+    // pointing at /dev/zero or a path outside the workdir doesn't stream
+    // forever — symlinks fail `isFile()` and short-circuit to 0.
+    const stat = await fs.lstat(filePath);
+    if (!stat.isFile() || stat.size === 0) return 0;
+    return await new Promise<number>((resolve) => {
+      let newlines = 0;
+      let lastByte = -1;
+      const stream = createReadStream(filePath, { signal: options?.signal });
+      stream.on("data", (rawChunk) => {
+        // Node types stream chunks as `string | Buffer`; without an
+        // `encoding` option `createReadStream` always emits `Buffer`,
+        // so the cast is for the type checker, not the runtime.
+        const chunk = rawChunk as Buffer;
+        // Native `Buffer.indexOf` — ~10x faster than a per-byte JS loop
+        // on multi-MB buffers, which is the workload this whole function
+        // exists to handle.
+        for (
+          let idx = chunk.indexOf(0x0a);
+          idx !== -1;
+          idx = chunk.indexOf(0x0a, idx + 1)
+        ) {
+          newlines++;
+        }
+        if (chunk.length > 0) lastByte = chunk[chunk.length - 1];
+      });
+      stream.on("end", () => {
+        // Guards against TOCTOU truncation between lstat and read —
+        // size > 0 at stat time, zero bytes by the time we open.
+        if (lastByte === -1) {
+          resolve(0);
+          return;
+        }
+        resolve(lastByte === 0x0a ? newlines : newlines + 1);
+      });
+      stream.on("error", (err) => {
+        // Don't propagate — caller already treats any failure as 0 lines.
+        // But log so the next time a "shows 0 lines" mystery shows up
+        // there's a breadcrumb (the original OOM in #2218 hid behind the
+        // same silent-zero return).
+        console.warn(`countFileLines failed for ${filePath}:`, err);
+        resolve(0);
+      });
+    });
   } catch {
     return 0;
   }
@@ -427,12 +472,14 @@ async function mapWithConcurrency<T, R>(
   items: readonly T[],
   concurrency: number,
   mapper: (item: T) => Promise<R>,
+  options?: { signal?: AbortSignal },
 ): Promise<R[]> {
   if (items.length === 0) return [];
   const results = new Array<R>(items.length);
   let index = 0;
   const worker = async () => {
     while (index < items.length) {
+      if (options?.signal?.aborted) return;
       const i = index++;
       results[i] = await mapper(items[i]);
     }
@@ -521,7 +568,11 @@ export async function getChangedFilesDetailed(
         const untrackedLineCounts = await mapWithConcurrency(
           untrackedToCount,
           16,
-          (file) => countFileLines(path.join(baseDir, file)),
+          (file) =>
+            countFileLines(path.join(baseDir, file), {
+              signal: gitOptions?.abortSignal,
+            }),
+          { signal: gitOptions?.abortSignal },
         );
         for (let i = 0; i < untrackedToCount.length; i++) {
           files.push({