perf(file): Sample-scan base64 run detection in truncateBase64

claude · claude · commit 2d0a45ab1110 · 2026-06-12T10:07:25.000Z
intent(file-process): automated perf tuning pass — single highest-impact, behavior-preserving change against a ~865ms default pack run; truncateBase64 is enabled in this repo's own config so its precondition scan runs on every packed file in the benchmark workload
learned(base64-scan): hasLongBase64Run walked every character of every file (~5.5MB per pack, 23ms main-thread self time in CPU profiles, 35ms isolated) even though it almost always returns false — the per-character loop was itself the previous optimization over the regex it gates
decision(sampled-scan): sample one character every MIN_BASE64_LENGTH_STANDALONE (256) positions — any qualifying run occupies 256 consecutive indices, so it must contain a sample point; only a sampled base64-class hit triggers a bounded outward expansion to measure the surrounding run, and the sampling phase resets cleanly after each short-run skip (next possible run from hi+1 always covers sample hi+256)
constraint(equivalence): differential-tested against the per-character reference on the full repo corpus (1096 files, 0 mismatches) plus 20k randomized fuzz cases; a deterministic-LCG differential test now pins both false-positive and false-negative directions in the suite
rejected(regex-precheck): /[A-Za-z0-9+/]{256}/.test() measured 4.5x SLOWER than the per-character loop (155ms vs 35ms on the corpus) — bounded-repetition re-scanning at each start position, not a viable replacement
rejected(early-git-token-dispatch): pre-dispatching git diff/log token counts from the packager — with a warm token cache they resolve while calculateMetrics awaits outputPromise (Promise.all resolves in ~0ms; the 63-67ms wall time is main-thread-busy completion latency, not queue wait), e2e median +15ms under noise, unproven
rejected(collect-concurrency): FILE_COLLECT_CONCURRENCY 50 -> 128/256 — identical medians over 40 quiet interleaved runs; libuv's 4-thread pool is saturated at depth 50, queue depth adds nothing
rejected(startup-lazy-imports): module-level import() prefetches of tinypool/fast-glob/handlebars all measure 0 to -3ms — ESM already fetches/compiles the static graph in parallel; the budget is sequential module evaluation (~255 modules), only bundling would cut it
rejected(lazy-render-context): skipping fileLineCounts + markdownCodeBlockDelimiter on the XML path re-measured at ~11ms p50 quiet (6.2 + 4.7) — still below the 2% threshold, matching the previous pass's rejection

Benchmark (repomix repo itself, ~1100 files, 20 interleaved warm pairs, quiet 4-core Linux, default pack, pristine HEAD worktree build vs patched build):
- end-to-end median 865ms -> 820.5ms (paired delta median -26.5ms, -3.1%), paired mean -37.5ms (t = 5.14), 18/20 pairs improved
- isolated scan cost over the packed corpus: 35.6ms -> 1.6ms p50 (~22x)
- output byte-identical (cmp) vs the base build on the same tree
- 6 new tests: stride alignments 0-511, run ending at EOF, whole-content run, phase reset after short-run skips, near-threshold non-matches, and the seeded differential fuzz

npm run test: 1385/1385 pass.
npm run lint: clean (3 pre-existing warnings in unrelated files).

https://claude.ai/code/session_01Ea6eConhLEQFKZsVkJz1zE
diff --git a/src/core/file/truncateBase64.ts b/src/core/file/truncateBase64.ts
@@ -12,30 +12,52 @@ const dataUriPattern = new RegExp(
 );
 const standaloneBase64Pattern = new RegExp(`([A-Za-z0-9+/]{${MIN_BASE64_LENGTH_STANDALONE},}={0,2})`, 'g');
 
+// [A-Z]:65-90, [a-z]:97-122, [0-9]:48-57, '+':43, '/':47
+const isBase64CharCode = (c: number): boolean =>
+  (c >= 65 && c <= 90) || (c >= 97 && c <= 122) || (c >= 48 && c <= 57) || c === 43 || c === 47;
+
 /**
- * Cheap precondition for `standaloneBase64Pattern`: scans for any run of
- * `[A-Za-z0-9+/]` reaching `MIN_BASE64_LENGTH_STANDALONE`, the smallest body
+ * Cheap precondition for `standaloneBase64Pattern`: detects whether any run of
+ * `[A-Za-z0-9+/]` reaches `MIN_BASE64_LENGTH_STANDALONE`, the smallest body
  * the regex can match. When this returns false, the regex provably has zero
  * matches, so we can skip the much more expensive backtracking scan over the
- * whole content. The hot loop avoids regex engine overhead and runs ~4x faster
- * than the original `replace`, which dominated `applyLightweightTransforms`
- * CPU on profiles of repos with `truncateBase64: true`.
+ * whole content.
+ *
+ * Instead of testing every character, the scan samples one character every
+ * `MIN_BASE64_LENGTH_STANDALONE` positions: a run of that length occupies
+ * `MIN_BASE64_LENGTH_STANDALONE` consecutive indices, so it necessarily
+ * contains a sample point — no qualifying run can slip between two samples.
+ * Only when a sampled character is in the base64 class does the scan expand
+ * outward to measure the surrounding run (bounded by the run itself, typically
+ * a handful of characters in source code). This visits ~1/64th of the input on
+ * typical text and replaced a per-character loop that dominated
+ * `applyLightweightTransforms` CPU on profiles of repos with
+ * `truncateBase64: true`.
  */
 const hasLongBase64Run = (content: string): boolean => {
   const len = content.length;
   if (len < MIN_BASE64_LENGTH_STANDALONE) return false;
-  let run = 0;
-  for (let i = 0; i < len; i++) {
-    const c = content.charCodeAt(i);
-    // [A-Z]:65-90, [a-z]:97-122, [0-9]:48-57, '+':43, '/':47
-    if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122) || (c >= 48 && c <= 57) || c === 43 || c === 47) {
-      run++;
-      if (run >= MIN_BASE64_LENGTH_STANDALONE) return true;
-    } else {
-      run = 0;
+  let i = MIN_BASE64_LENGTH_STANDALONE - 1;
+  while (true) {
+    // Defensive clamp for an overshooting sample. It cannot change the result:
+    // fewer than 256 chars follow the last non-base64 index, so no run fits.
+    if (i >= len) i = len - 1;
+    if (isBase64CharCode(content.charCodeAt(i))) {
+      // Sample hit: measure the maximal run containing it.
+      let lo = i - 1;
+      while (lo >= 0 && isBase64CharCode(content.charCodeAt(lo))) lo--;
+      let hi = i + 1;
+      while (hi < len && isBase64CharCode(content.charCodeAt(hi))) hi++;
+      if (hi - lo - 1 >= MIN_BASE64_LENGTH_STANDALONE) return true;
+      // The run around this sample is too short. Resume sampling after it:
+      // `hi` is a non-base64 index (or `len`), so the next possible run starts
+      // at `hi + 1` and any qualifying run from there contains index
+      // `hi + MIN_BASE64_LENGTH_STANDALONE`.
+      i = hi;
     }
+    if (i >= len - 1) return false;
+    i += MIN_BASE64_LENGTH_STANDALONE;
   }
-  return false;
 };
 
 /**
diff --git a/tests/core/file/truncateBase64.test.ts b/tests/core/file/truncateBase64.test.ts
@@ -135,4 +135,106 @@ describe('truncateBase64Content', () => {
     const result = truncateBase64Content(input);
     expect(result).toBe(input);
   });
+
+  describe('sampled run detection (hasLongBase64Run precondition)', () => {
+    // The precondition samples one character every 256 positions instead of
+    // scanning every character. These cases pin the sampling-specific edges:
+    // runs at arbitrary offsets, runs in the trailing partial window, and
+    // sampling-phase resets after short-run expansions.
+
+    it('should detect a qualifying run at every alignment relative to the sample stride', () => {
+      // A 256-char run starting at offset k occupies [k, k+255], which must
+      // contain a sample regardless of k. Exercise alignments around the
+      // first two sample points (indices 255 and 511).
+      for (const offset of [0, 1, 127, 254, 255, 256, 257, 300, 511]) {
+        const input = `${'-'.repeat(offset)}${longBase64.slice(0, 256)}#tail`;
+        const result = truncateBase64Content(input);
+        expect(result, `offset ${offset}`).toContain('...');
+      }
+    });
+
+    it('should detect a run that ends exactly at the end of content', () => {
+      // The trailing window is shorter than the sampling stride, but a regular
+      // sample (index 511 here) still lands inside the run at [300, 555].
+      const input = `${'x '.repeat(150)}${longBase64.slice(0, 256)}`;
+      const result = truncateBase64Content(input);
+      expect(result).toContain('...');
+    });
+
+    it('should detect a run when the whole content is exactly one run of threshold length', () => {
+      const input = longBase64.slice(0, 256);
+      const result = truncateBase64Content(input);
+      expect(result).toBe(`${longBase64.slice(0, 32)}...`);
+    });
+
+    it('should detect a qualifying run that follows many short runs', () => {
+      // Every sample before the real run lands inside a short base64-like word,
+      // forcing repeated expand-and-skip steps that reset the sampling phase.
+      const shortWords = 'word1 path/to2 abc3 '.repeat(60); // 1200 chars of short runs
+      const input = `${shortWords}${longBase64.slice(0, 256)} end`;
+      const result = truncateBase64Content(input);
+      expect(result).toContain('...');
+    });
+
+    it('should preserve content of many near-threshold runs separated by breaks', () => {
+      // 255-char runs (one below threshold) back to back with separators must
+      // never match, even though almost every sample hits a base64 character.
+      const nearRun = longBase64.slice(0, 255).replace(/[+/]/g, 'a');
+      const input = Array.from({ length: 10 }, () => nearRun).join('\n');
+      const result = truncateBase64Content(input);
+      expect(result).toBe(input);
+    });
+
+    it('should match the per-character reference scan on randomized content', () => {
+      // Differential check: the sampled precondition must agree with a
+      // straightforward per-character reference on generated inputs.
+      const referenceHasLongRun = (content: string): boolean => {
+        let run = 0;
+        for (let i = 0; i < content.length; i++) {
+          if (/[A-Za-z0-9+/]/.test(content[i])) {
+            run++;
+            if (run >= 256) return true;
+          } else {
+            run = 0;
+          }
+        }
+        return false;
+      };
+      // Deterministic LCG so failures are reproducible.
+      let seed = 0x2f6e2b1;
+      const rand = () => {
+        seed = (seed * 1103515245 + 12345) & 0x7fffffff;
+        return seed / 0x7fffffff;
+      };
+      const alphabet = 'Aa0+/ .,\n=-_';
+      for (let trial = 0; trial < 500; trial++) {
+        let s = '';
+        const length = Math.floor(rand() * 700);
+        for (let j = 0; j < length; j++) {
+          s += alphabet[Math.floor(rand() * alphabet.length)];
+        }
+        let injectedQualifyingRun = false;
+        if (rand() < 0.3) {
+          const runLength = 200 + Math.floor(rand() * 120);
+          const pos = Math.floor(rand() * (s.length + 1));
+          // Repetitions of this diverse base64 prefix always pass isLikelyBase64.
+          const run = longBase64
+            .slice(0, 32)
+            .repeat(Math.ceil(runLength / 32))
+            .slice(0, runLength);
+          s = s.slice(0, pos) + run + s.slice(pos);
+          injectedQualifyingRun = runLength >= 256;
+        }
+        if (injectedQualifyingRun) {
+          // False-negative direction: a diverse run of >= 256 chars exists, so
+          // the sampled precondition must let the truncation happen.
+          expect(truncateBase64Content(s), `trial ${trial}`).not.toBe(s);
+        } else if (!referenceHasLongRun(s)) {
+          // False-positive direction: no qualifying run anywhere, so content
+          // must come back untouched.
+          expect(truncateBase64Content(s), `trial ${trial}`).toBe(s);
+        }
+      }
+    });
+  });
 });