Skip to content

Commit a017c55

Browse files
pauldambraclaude
andauthored
fix(git): cap and stream untracked-file line counting (#2218)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0ed1d0b commit a017c55

2 files changed

Lines changed: 126 additions & 5 deletions

File tree

packages/git/src/queries.test.ts

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { createGitClient } from "./client";
66
import {
77
detectDefaultBranch,
88
getBranchDiffPatchesByPath,
9+
getChangedFilesDetailed,
910
splitUnifiedDiffByFile,
1011
} from "./queries";
1112

@@ -242,3 +243,72 @@ describe("getBranchDiffPatchesByPath", () => {
242243
}
243244
});
244245
});
246+
247+
// Picked to land well past the default 64KB highWaterMark of
248+
// `createReadStream` so the regression test actually exercises the
249+
// across-chunk path of the streaming line counter.
250+
const LINE_COUNT_LARGER_THAN_READ_STREAM_CHUNK = 800_000;
251+
252+
describe("getChangedFilesDetailed > untracked line counts", () => {
253+
let repoDir: string;
254+
255+
afterEach(async () => {
256+
if (repoDir) {
257+
await rm(repoDir, { recursive: true, force: true });
258+
repoDir = "";
259+
}
260+
});
261+
262+
it.each([
263+
{ name: "trailing newline", content: "a\nb\nc\n", expected: 3 },
264+
{ name: "no trailing newline", content: "a\nb\nc", expected: 3 },
265+
{ name: "single byte, no newline", content: "a", expected: 1 },
266+
{ name: "lone newline", content: "\n", expected: 1 },
267+
{ name: "consecutive newlines", content: "\n\n", expected: 2 },
268+
// CRLF: legacy `split("\n")` counted only `\n` separators, so
269+
// `"a\r\nb\r\n"` -> 2 lines. Byte-counter matches.
270+
{ name: "CRLF endings", content: "a\r\nb\r\n", expected: 2 },
271+
])("counts $name as $expected line(s)", async ({ content, expected }) => {
272+
repoDir = await setupRepo();
273+
await writeFile(path.join(repoDir, "f.txt"), content);
274+
275+
const files = await getChangedFilesDetailed(repoDir);
276+
const f = files.find((file) => file.path === "f.txt");
277+
278+
expect(f).toMatchObject({ status: "untracked", linesAdded: expected });
279+
});
280+
281+
it("reports 0 lines for empty untracked files", async () => {
282+
repoDir = await setupRepo();
283+
await writeFile(path.join(repoDir, "empty.txt"), "");
284+
285+
const files = await getChangedFilesDetailed(repoDir);
286+
const empty = files.find((f) => f.path === "empty.txt");
287+
288+
expect(empty).toMatchObject({ status: "untracked", linesAdded: 0 });
289+
});
290+
291+
// Regression guard for the OOM in #2218. Before the fix `countFileLines`
292+
// read each untracked file's full content into memory via
293+
// `fs.readFile(..., "utf-8")`, 16-way concurrent against every untracked
294+
// path returned by `streamGitStatus` (up to 50k). On a monorepo with
295+
// multi-MB build artifacts this exhausted the main-process V8 heap
296+
// (`16 * file_bytes * 2` for V8's UTF-16) and froze the renderer waiting
297+
// on the dead tRPC call. The fix stream-counts via `createReadStream`,
298+
// so peak per-stream memory is ~64KB regardless of file size — the
299+
// multi-MB case below would have OOM'd pre-fix and must still report an
300+
// accurate line count.
301+
it("stream-counts untracked files larger than the streaming chunk size", async () => {
302+
repoDir = await setupRepo();
303+
const content = "a\n".repeat(LINE_COUNT_LARGER_THAN_READ_STREAM_CHUNK);
304+
await writeFile(path.join(repoDir, "huge.txt"), content);
305+
306+
const files = await getChangedFilesDetailed(repoDir);
307+
const huge = files.find((f) => f.path === "huge.txt");
308+
309+
expect(huge).toMatchObject({
310+
status: "untracked",
311+
linesAdded: LINE_COUNT_LARGER_THAN_READ_STREAM_CHUNK,
312+
});
313+
});
314+
});

packages/git/src/queries.ts

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { createReadStream } from "node:fs";
12
import * as fs from "node:fs/promises";
23
import * as path from "node:path";
34
import type { CreateGitClientOptions } from "./client";
@@ -413,11 +414,55 @@ function matchesExcludePattern(filePath: string, patterns: string[]): boolean {
413414
});
414415
}
415416

416-
async function countFileLines(filePath: string): Promise<number> {
417+
async function countFileLines(
418+
filePath: string,
419+
options?: { signal?: AbortSignal },
420+
): Promise<number> {
417421
try {
418-
const content = await fs.readFile(filePath, "utf-8");
419-
if (!content) return 0;
420-
return content.split("\n").length - (content.endsWith("\n") ? 1 : 0);
422+
// `lstat` instead of `stat` so an untracked symlink (rare, but legal)
423+
// pointing at /dev/zero or a path outside the workdir doesn't stream
424+
// forever — symlinks fail `isFile()` and short-circuit to 0.
425+
const stat = await fs.lstat(filePath);
426+
if (!stat.isFile() || stat.size === 0) return 0;
427+
return await new Promise<number>((resolve) => {
428+
let newlines = 0;
429+
let lastByte = -1;
430+
const stream = createReadStream(filePath, { signal: options?.signal });
431+
stream.on("data", (rawChunk) => {
432+
// Node types stream chunks as `string | Buffer`; without an
433+
// `encoding` option `createReadStream` always emits `Buffer`,
434+
// so the cast is for the type checker, not the runtime.
435+
const chunk = rawChunk as Buffer;
436+
// Native `Buffer.indexOf` — ~10x faster than a per-byte JS loop
437+
// on multi-MB buffers, which is the workload this whole function
438+
// exists to handle.
439+
for (
440+
let idx = chunk.indexOf(0x0a);
441+
idx !== -1;
442+
idx = chunk.indexOf(0x0a, idx + 1)
443+
) {
444+
newlines++;
445+
}
446+
if (chunk.length > 0) lastByte = chunk[chunk.length - 1];
447+
});
448+
stream.on("end", () => {
449+
// Guards against TOCTOU truncation between lstat and read —
450+
// size > 0 at stat time, zero bytes by the time we open.
451+
if (lastByte === -1) {
452+
resolve(0);
453+
return;
454+
}
455+
resolve(lastByte === 0x0a ? newlines : newlines + 1);
456+
});
457+
stream.on("error", (err) => {
458+
// Don't propagate — caller already treats any failure as 0 lines.
459+
// But log so the next time a "shows 0 lines" mystery shows up
460+
// there's a breadcrumb (the original OOM in #2218 hid behind the
461+
// same silent-zero return).
462+
console.warn(`countFileLines failed for ${filePath}:`, err);
463+
resolve(0);
464+
});
465+
});
421466
} catch {
422467
return 0;
423468
}
@@ -427,12 +472,14 @@ async function mapWithConcurrency<T, R>(
427472
items: readonly T[],
428473
concurrency: number,
429474
mapper: (item: T) => Promise<R>,
475+
options?: { signal?: AbortSignal },
430476
): Promise<R[]> {
431477
if (items.length === 0) return [];
432478
const results = new Array<R>(items.length);
433479
let index = 0;
434480
const worker = async () => {
435481
while (index < items.length) {
482+
if (options?.signal?.aborted) return;
436483
const i = index++;
437484
results[i] = await mapper(items[i]);
438485
}
@@ -521,7 +568,11 @@ export async function getChangedFilesDetailed(
521568
const untrackedLineCounts = await mapWithConcurrency(
522569
untrackedToCount,
523570
16,
524-
(file) => countFileLines(path.join(baseDir, file)),
571+
(file) =>
572+
countFileLines(path.join(baseDir, file), {
573+
signal: gitOptions?.abortSignal,
574+
}),
575+
{ signal: gitOptions?.abortSignal },
525576
);
526577
for (let i = 0; i < untrackedToCount.length; i++) {
527578
files.push({

0 commit comments

Comments
 (0)