perf(core): Add newline pre-filter to base64 run detection

claude · claude · commit 5ff9b1faac2f · 2026-06-01T19:12:22.000Z
Skip the per-character `hasLongBase64Run` scan for files whose lines are
all shorter than the 256-char standalone-base64 threshold.

Why:
  - `truncateBase64Content` runs on the main thread for every collected
    file (no worker pool on the default pack path), so its CPU cost is
    fully on the serial critical path. With `truncateBase64: true` (set in
    this repo's own repomix.config.json, the benchmark target) it is the
    dominant cost of the file-processing phase.
  - `hasLongBase64Run` previously charCodeAt-scanned every byte of every
    file (~5.5 MB across ~1.1k files) just to gate the standalone-base64
    regex.

What:
  - A 256-char base64 run cannot contain a newline (`\n` is not a base64
    character and resets the run), so it must fit inside a single line.
    Before the byte scan, walk newline offsets with the native
    `String.prototype.indexOf`; if no line reaches the threshold, no run is
    possible and we return early. Files with a long line fall through to
    the unchanged full scan.

Behavior-preserving:
  - The pre-filter can only short-circuit when a long run is provably
    absent; any file with a >=256-char line still runs the authoritative
    byte scan, so results are identical. CLI output verified byte-identical
    across xml/markdown/json/plain. Isolated run over all 1127 repo files:
    0 mismatches vs the previous implementation.

Benchmark (this container, `node bin/repomix.cjs`, warm cache):
  - Isolated `truncateBase64Content` over the full repo file set
    (interleaved, JIT-warmed median): 42.5ms -> 17.3ms, -25.2ms.
  - Whole-process wall clock (interleaved, noise floor):
    min   -32.5ms (-3.73%)
    p25   -24.3ms (-2.53%)
    Comfortably above the 2%-of-total improvement bar.

Tests:
  - Added newline-split / many-short-lines / CRLF / no-newline cases to
    tests/core/file/truncateBase64.test.ts. Full suite: 1345 passing.
diff --git a/src/core/file/truncateBase64.ts b/src/core/file/truncateBase64.ts
@@ -24,6 +24,25 @@ const standaloneBase64Pattern = new RegExp(`([A-Za-z0-9+/]{${MIN_BASE64_LENGTH_S
 const hasLongBase64Run = (content: string): boolean => {
   const len = content.length;
   if (len < MIN_BASE64_LENGTH_STANDALONE) return false;
+  // Newline pre-filter: `\n` is not a base64 character, so it always resets the
+  // run below. A run of `MIN_BASE64_LENGTH_STANDALONE` therefore has to fit
+  // inside a single line. If every line is shorter than that threshold no run is
+  // possible, and we can bail out before the per-character scan. `indexOf` is a
+  // native (memchr-style) scan, far cheaper than the charCodeAt loop, and the
+  // vast majority of source files have no such long line, so this skips the hot
+  // loop entirely for them.
+  let lineStart = 0;
+  let newlineIndex = content.indexOf('\n');
+  while (newlineIndex !== -1) {
+    if (newlineIndex - lineStart >= MIN_BASE64_LENGTH_STANDALONE) break;
+    lineStart = newlineIndex + 1;
+    newlineIndex = content.indexOf('\n', lineStart);
+  }
+  // The final segment (after the last newline, or the whole content when there
+  // is none) also needs the length check before we can rule out a long run.
+  if (newlineIndex === -1 && len - lineStart < MIN_BASE64_LENGTH_STANDALONE) {
+    return false;
+  }
   let run = 0;
   for (let i = 0; i < len; i++) {
     const c = content.charCodeAt(i);
diff --git a/tests/core/file/truncateBase64.test.ts b/tests/core/file/truncateBase64.test.ts
@@ -127,6 +127,43 @@ describe('truncateBase64Content', () => {
     expect(result).toBe(input);
   });
 
+  it('should not truncate a base64-like run split across a newline', () => {
+    // A 320-char base64 body interrupted by a newline: neither line segment
+    // reaches 256, and `\n` resets the run, so nothing should be truncated.
+    // Guards the newline pre-filter in `hasLongBase64Run`.
+    const half = longBase64.slice(0, 160);
+    const input = `const data = "${half}\n${half}";`;
+    const result = truncateBase64Content(input);
+    expect(result).toBe(input);
+  });
+
+  it('should truncate a long base64 run that follows many short lines', () => {
+    // Many short lines (each < 256) precede the real run, so the newline
+    // pre-filter must fall through to the full scan and still truncate.
+    const shortLines = 'const a = 1;\n'.repeat(50);
+    const input = `${shortLines}const data = "${longBase64}";`;
+    const result = truncateBase64Content(input);
+    expect(result).toContain('DTJXfKHG6xA1Wn+kye4TOF2Cp8zxFjtg...');
+    expect(result.startsWith(shortLines)).toBe(true);
+  });
+
+  it('should truncate a base64 run on a CRLF-terminated line', () => {
+    // The `\r` before `\n` is also non-base64; the long line must still be
+    // detected by the pre-filter and truncated by the full scan.
+    const input = `const data = "${longBase64}";\r\nconst next = 2;\r\n`;
+    const result = truncateBase64Content(input);
+    expect(result).toContain('DTJXfKHG6xA1Wn+kye4TOF2Cp8zxFjtg...');
+    expect(result).toContain('const next = 2;');
+  });
+
+  it('should truncate a long base64 run with no newline at all', () => {
+    // Single-line content (no `\n`): the pre-filter treats the whole string as
+    // one segment and must fall through to the full scan.
+    const input = `prefix-${longBase64}-suffix`;
+    const result = truncateBase64Content(input);
+    expect(result).toContain('DTJXfKHG6xA1Wn+kye4TOF2Cp8zxFjtg...');
+  });
+
   it('should leave non-base64 data URIs untouched', () => {
     // `data:text/plain,hello` has no `;base64,` literal, so the dataUriPattern
     // cannot match. Verifies the `includes(';base64,')` guard short-circuits