perf(metrics): Cache gpt-tokenizer BPE ranks as JSON to cut tokenizer init

claude · claude · commit ade7f34fe635 · 2026-05-31T16:09:37.000Z
The metrics worker's TokenCounter initialization is the single largest cost on
a warm-cache CLI run. gpt-tokenizer ships each BPE merge-rank table as a ~2 MB
CommonJS module of inline array literals; `resolveEncodingAsync` `require`s it,
forcing V8 to lex/parse/execute the file and allocate a ~200k-element array on
every cold worker thread (~120 ms), before `GptEncoding.getEncodingApi` then
builds the rank Map (~90 ms). A verbose trace showed this landing at ~211 ms on
the metrics critical path that gates output generation.

The value returned by `resolveEncodingAsync` is a plain JSON-serializable array
(strings, plus byte arrays of 1–19 bytes for ranks whose token bytes are not
valid UTF-8). New `bpeRanksCache.ts` persists it once as JSON under
`$TMPDIR/repomix/cache/bpe-ranks/` and, on later runs, reloads it via
`readFileSync` + `JSON.parse` (~40 ms) — a restricted-grammar parse V8 handles
in native C++, ~3x faster than re-executing the JS module. `getEncodingApi`
receives a byte-identical ranks array, so token counts are unchanged.

intent(metrics): cut the ~211ms tokenizer init that dominates warm-cache runs
decision(bpe-cache): runtime JSON disk cache via gpt-tokenizer's public resolveEncodingAsync output — no build step, no bundled data, no internal imports
rejected(bpe-cache): bundling a pre-built ranks JSON in lib/ — adds ~2MB to the published package and reaches into gpt-tokenizer's internal cjs/bpeRanks path, fragile across upgrades
decision(module-split): extract the cache to bpeRanksCache.ts mirroring tokenCountCache.ts — keeps TokenCounter focused and makes the cache unit-testable
decision(cache-key): key the file by gpt-tokenizer version + format version so an upgrade auto-invalidates stale tables (miss → rebuild)
constraint(bpe-cache): runs inside worker threads — sync fs + crypto-random tmp name + atomic rename so a concurrent worker (which shares this pid) never reads a partial file; all read/write errors fall back to resolveEncodingAsync (pure optimization, never a correctness signal)
constraint(shape-guard): readBpeRanksCache rejects any non-array/empty JSON as a clean miss, so a structurally-valid-but-wrong file is rebuilt rather than silently producing zero counts
constraint(opt-out): shares the REPOMIX_TOKEN_CACHE=0 switch with the token-count cache; REPOMIX_BPE_RANKS_CACHE_PATH redirects the dir for tests

Correctness:
- Library-level equivalence verified: JSON round-trip yields identical encode()
  output and countTokens() across source code, unicode, emoji, special tokens
  (<|endoftext|>) and invalid UTF-8; all 1571 byte-array rank entries (lengths
  1–19) preserved for o200k_base and cl100k_base.
- Full CLI token totals identical with the cache enabled vs REPOMIX_TOKEN_CACHE=0,
  and the generated output file is byte-identical.
- 1332 tests pass (12 new for the cache: hit/miss/disabled/corrupt/malformed-shape
  fallback); `tsgo --noEmit`, biome and oxlint clean.

Benchmark — `node bin/repomix.cjs --include src -o <tmp> --quiet`, warm cache,
25 runs/round, baseline (no cache) and patched rebuilt and run interleaved
(two rounds each, to control for environment drift):
  base:    773.1 / 742.6 ms  (mean 757.9)
  patched: 722.0 / 688.1 ms  (mean 705.1)
  delta:   -52.8 ms (7.0%)   — minimums ~703 → ~665 ms move in lockstep
The delta exceeds the 2% target and sits outside the run-to-run noise band
(sd ~25 ms on the patched runs); its direction is consistent across both rounds.
diff --git a/src/core/metrics/TokenCounter.ts b/src/core/metrics/TokenCounter.ts
@@ -1,12 +1,32 @@
 import { GptEncoding } from 'gpt-tokenizer/GptEncoding';
 import { resolveEncodingAsync } from 'gpt-tokenizer/resolveEncodingAsync';
 import { logger } from '../../shared/logger.js';
+import { readBpeRanksCache, writeBpeRanksCache } from './bpeRanksCache.js';
 import { TOKEN_ENCODINGS, type TokenEncoding } from './tokenEncodings.js';
 
 // Re-export for backward compatibility with existing
 // `import { TOKEN_ENCODINGS, TokenEncoding } from './TokenCounter.js'` call sites.
 export { TOKEN_ENCODINGS, type TokenEncoding };
 
+// Resolved BPE merge-rank table, as returned by `resolveEncodingAsync`. A plain
+// JSON-serializable array of strings, plus byte arrays (1–19 bytes in
+// o200k_base) for ranks whose token bytes are not valid UTF-8.
+type BpeRanks = Awaited<ReturnType<typeof resolveEncodingAsync>>;
+
+// Cache-first loader for the BPE merge-rank table: a hit in the on-disk JSON
+// cache is returned as-is; on a miss the table is resolved from gpt-tokenizer
+// and handed to the cache writer so the next run can skip the resolve. The
+// rationale and timings are documented in bpeRanksCache.ts.
+const resolveBpeRanks = async (encodingName: TokenEncoding): Promise<BpeRanks> => {
+  const fromDisk = readBpeRanksCache(encodingName);
+  if (fromDisk === undefined) {
+    const resolved = await resolveEncodingAsync(encodingName);
+    writeBpeRanksCache(encodingName, resolved);
+    return resolved;
+  }
+  return fromDisk as BpeRanks;
+};
+
 interface CountTokensOptions {
   disallowedSpecial?: Set<string>;
 }
@@ -30,9 +50,10 @@ const loadEncoding: LoadEncodingFn = async (encodingName) => {
 
   const startTime = process.hrtime.bigint();
 
-  // Use resolveEncodingAsync to lazily load BPE rank data, then create a GptEncoding instance.
-  // resolveEncodingAsync uses static import paths internally, so bundlers (rolldown) can resolve them.
-  const bpeRanks = await resolveEncodingAsync(encodingName);
+  // Load BPE rank data (from the on-disk JSON cache when available, else from
+  // gpt-tokenizer), then create a GptEncoding instance. resolveEncodingAsync
+  // uses static import paths internally, so bundlers (rolldown) can resolve them.
+  const bpeRanks = await resolveBpeRanks(encodingName);
   const encoder = GptEncoding.getEncodingApi(encodingName, () => bpeRanks);
   const countFn = encoder.countTokens.bind(encoder) as CountTokensFn;
   encodingModules.set(encodingName, countFn);
diff --git a/src/core/metrics/bpeRanksCache.ts b/src/core/metrics/bpeRanksCache.ts
@@ -0,0 +1,133 @@
+import { randomBytes } from 'node:crypto';
+import fs from 'node:fs';
+import { createRequire } from 'node:module';
+import path from 'node:path';
+import { logger } from '../../shared/logger.js';
+import { getRepomixTmpDir } from '../../shared/tmpDir.js';
+import { isCacheDisabled } from './tokenCountCache.js';
+import type { TokenEncoding } from './tokenEncodings.js';
+
+// On-disk JSON cache for gpt-tokenizer's BPE merge-rank tables.
+//
+// gpt-tokenizer ships each table as a ~2 MB CommonJS module of inline array
+// literals. `resolveEncodingAsync` `require`s it, forcing V8 to lex/parse/
+// execute the file and allocate a ~200k-element array on every cold worker
+// thread (~120 ms) — the single largest cost on a warm-cache CLI run. The
+// resolved value is a plain JSON-serializable array, so we persist it once and
+// reload it with `readFileSync` + `JSON.parse` (~40 ms) on later runs: a
+// restricted-grammar parse V8 handles in native code, ~3x faster than
+// re-executing the JS module. The reloaded array is byte-identical to the
+// resolved one (same encode output), so token counts are unchanged.
+//
+// This is a pure optimization: every read/write failure is swallowed and the
+// caller falls back to `resolveEncodingAsync`. The cache shares the
+// `REPOMIX_TOKEN_CACHE=0` opt-out and the `$TMPDIR/repomix/cache/` umbrella
+// with the token-count cache (see tokenCountCache.ts).
+
+const cjsRequire = createRequire(import.meta.url);
+
+// On-disk serialization format version. Bump only if the persisted JSON shape
+// changes in a way incompatible with files written by older repomix versions.
+const BPE_RANKS_CACHE_FORMAT = 1;
+// Shares the `cache/` umbrella with the token-count cache; the per-encoding
+// files live in a `bpe-ranks/` subdirectory beneath it.
+const CACHE_SUBDIR_NAME = 'cache';
+const BPE_RANKS_SUBDIR_NAME = 'bpe-ranks';
+
+// gpt-tokenizer version keys the cache file name so a dependency upgrade that
+// changes a table automatically invalidates the stale file (different name →
+// cache miss → rebuild). `gpt-tokenizer/package.json` is exported by the
+// package, so this resolves the same way `gpt-tokenizer/GptEncoding` does.
+//
+// Files from superseded versions are not swept; they live under $TMPDIR (which
+// the OS may evict) and amount to a few MB per version, so the simplicity is
+// worth more than reclaiming the space.
+const getGptTokenizerVersion = (): string => {
+  try {
+    return (cjsRequire('gpt-tokenizer/package.json') as { version: string }).version;
+  } catch {
+    return 'unknown';
+  }
+};
+
+/**
+ * Absolute path of the cached BPE-ranks file for an encoding.
+ *
+ * `REPOMIX_BPE_RANKS_CACHE_PATH` overrides the parent directory for tests and
+ * explicit user configuration (mirrors `REPOMIX_TOKEN_CACHE_PATH`).
+ */
+export const getBpeRanksCachePath = (encodingName: TokenEncoding): string => {
+  const fileName = `${encodingName}-${getGptTokenizerVersion()}-v${BPE_RANKS_CACHE_FORMAT}.json`;
+  const override = process.env.REPOMIX_BPE_RANKS_CACHE_PATH;
+  if (override) {
+    return path.join(override, fileName);
+  }
+  return path.join(getRepomixTmpDir(), CACHE_SUBDIR_NAME, BPE_RANKS_SUBDIR_NAME, fileName);
+};
+
+/**
+ * Read and parse the cached BPE ranks for `encodingName`. Returns `undefined`
+ * on a cache miss, an unreadable/corrupt file, or when caching is disabled
+ * (`REPOMIX_TOKEN_CACHE=0`). Never throws.
+ *
+ * A shape check rejects any structurally-valid-but-wrong file (e.g. an object,
+ * a number, or an empty array left by an incompatible writer) so it is treated
+ * as a clean miss and rebuilt, rather than handed to the tokenizer where it
+ * would silently produce zero token counts.
+ */
+export const readBpeRanksCache = (encodingName: TokenEncoding): unknown | undefined => {
+  if (isCacheDisabled()) {
+    return undefined;
+  }
+  try {
+    const parsed = JSON.parse(fs.readFileSync(getBpeRanksCachePath(encodingName), 'utf8'));
+    if (!Array.isArray(parsed) || parsed.length === 0) {
+      logger.trace(`Ignoring malformed BPE ranks cache for ${encodingName}`);
+      return undefined;
+    }
+    logger.trace(`Loaded BPE ranks for ${encodingName} from cache`);
+    return parsed;
+  } catch {
+    // Cache miss or unreadable/corrupt file — caller resolves from gpt-tokenizer.
+    return undefined;
+  }
+};
+
+/**
+ * Persist `bpeRanks` for `encodingName` as JSON. Best-effort and never throws.
+ *
+ * A unique tmp name (pid + crypto-random suffix) written then atomically
+ * renamed means a concurrent reader never observes a partial file, even when
+ * several worker threads (which share this process's pid) resolve the same
+ * encoding at once. No-ops when caching is disabled. All errors (read-only FS,
+ * permission denied, races) are swallowed — the cache is optional.
+ *
+ * If the write or rename fails, the tmp file is removed again (also
+ * best-effort) so failed attempts do not pile up multi-MB `*.tmp` files in the
+ * cache directory.
+ */
+export const writeBpeRanksCache = (encodingName: TokenEncoding, bpeRanks: unknown): void => {
+  if (isCacheDisabled()) {
+    return;
+  }
+  const cachePath = getBpeRanksCachePath(encodingName);
+  // Assigned just before the first write, so the catch block only cleans up a
+  // file this call may actually have created.
+  let tmpPath: string | undefined;
+  try {
+    fs.mkdirSync(path.dirname(cachePath), { recursive: true });
+    tmpPath = `${cachePath}.${process.pid}.${randomBytes(4).toString('hex')}.tmp`;
+    fs.writeFileSync(tmpPath, JSON.stringify(bpeRanks), { mode: 0o600 });
+    fs.renameSync(tmpPath, cachePath);
+  } catch (error) {
+    logger.trace(`Failed to persist BPE ranks cache for ${encodingName}:`, error);
+    if (tmpPath !== undefined) {
+      try {
+        // `force` makes a tmp file that was never created a no-op.
+        fs.rmSync(tmpPath, { force: true });
+      } catch {
+        // Cleanup is best-effort too (e.g. read-only FS); never surface it.
+      }
+    }
+  }
+};
diff --git a/tests/core/metrics/bpeRanksCache.test.ts b/tests/core/metrics/bpeRanksCache.test.ts
@@ -0,0 +1,106 @@
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+import { afterEach, beforeEach, describe, expect, test } from 'vitest';
+import {
+  getBpeRanksCachePath,
+  readBpeRanksCache,
+  writeBpeRanksCache,
+} from '../../../src/core/metrics/bpeRanksCache.js';
+
+// A small stand-in for the real BPE ranks: a mix of strings and short byte
+// arrays, mirroring the shape gpt-tokenizer returns (and the JSON
+// round-trip that token correctness depends on).
+const SAMPLE_RANKS: unknown = ['!', '"', '#', [161], [194, 162]];
+
+describe('bpeRanksCache', () => {
+  let tmpDir: string;
+  let prevPathEnv: string | undefined;
+  let prevDisableEnv: string | undefined;
+
+  beforeEach(() => {
+    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'repomix-bpe-test-'));
+    prevPathEnv = process.env.REPOMIX_BPE_RANKS_CACHE_PATH;
+    prevDisableEnv = process.env.REPOMIX_TOKEN_CACHE;
+    process.env.REPOMIX_BPE_RANKS_CACHE_PATH = tmpDir;
+    // The global test setup disables caching (REPOMIX_TOKEN_CACHE=0); clear it so these tests hit the real read/write paths.
+    delete process.env.REPOMIX_TOKEN_CACHE;
+  });
+
+  afterEach(() => {
+    fs.rmSync(tmpDir, { recursive: true, force: true });
+    if (prevPathEnv === undefined) {
+      delete process.env.REPOMIX_BPE_RANKS_CACHE_PATH;
+    } else {
+      process.env.REPOMIX_BPE_RANKS_CACHE_PATH = prevPathEnv;
+    }
+    if (prevDisableEnv === undefined) {
+      delete process.env.REPOMIX_TOKEN_CACHE;
+    } else {
+      process.env.REPOMIX_TOKEN_CACHE = prevDisableEnv;
+    }
+  });
+
+  test('getBpeRanksCachePath honors the path override and includes the encoding name', () => {
+    const cachePath = getBpeRanksCachePath('o200k_base');
+    expect(path.dirname(cachePath)).toBe(tmpDir);
+    expect(path.basename(cachePath)).toMatch(/^o200k_base-.*\.json$/);
+  });
+
+  test('returns undefined on a cache miss', () => {
+    expect(readBpeRanksCache('o200k_base')).toBeUndefined();
+  });
+
+  test('write then read round-trips the ranks (including byte-array entries)', () => {
+    writeBpeRanksCache('o200k_base', SAMPLE_RANKS);
+    expect(fs.existsSync(getBpeRanksCachePath('o200k_base'))).toBe(true);
+    expect(readBpeRanksCache('o200k_base')).toEqual(SAMPLE_RANKS);
+  });
+
+  test('a corrupt cache file falls back to a miss without throwing', () => {
+    fs.writeFileSync(getBpeRanksCachePath('o200k_base'), 'not valid json {{{');
+    expect(readBpeRanksCache('o200k_base')).toBeUndefined();
+  });
+
+  test.each([
+    ['an object', '{}'],
+    ['a number', '42'],
+    ['null', 'null'],
+    ['an empty array', '[]'],
+  ])('rejects structurally-valid-but-wrong cache content (%s) as a miss', (_label, content) => {
+    fs.writeFileSync(getBpeRanksCachePath('o200k_base'), content);
+    expect(readBpeRanksCache('o200k_base')).toBeUndefined();
+  });
+
+  test('leaves no stray temp files after a write', () => {
+    writeBpeRanksCache('o200k_base', SAMPLE_RANKS);
+    const leftovers = fs.readdirSync(tmpDir).filter((f) => f.endsWith('.tmp'));
+    expect(leftovers).toEqual([]);
+  });
+
+  test('different encodings use distinct cache files', () => {
+    writeBpeRanksCache('o200k_base', ['a']);
+    writeBpeRanksCache('cl100k_base', ['b']);
+    expect(readBpeRanksCache('o200k_base')).toEqual(['a']);
+    expect(readBpeRanksCache('cl100k_base')).toEqual(['b']);
+  });
+
+  describe('when caching is disabled via REPOMIX_TOKEN_CACHE=0', () => {
+    beforeEach(() => {
+      process.env.REPOMIX_TOKEN_CACHE = '0';
+    });
+
+    test('read returns undefined and write is a no-op', () => {
+      writeBpeRanksCache('o200k_base', SAMPLE_RANKS);
+      // getBpeRanksCachePath ignores the disable flag, so this checks the real target: no file may have been written.
+      expect(fs.existsSync(getBpeRanksCachePath('o200k_base'))).toBe(false);
+      expect(readBpeRanksCache('o200k_base')).toBeUndefined();
+    });
+
+    test('read returns undefined even when a cache file exists', () => {
+      // Plant a valid cache file by hand, since writeBpeRanksCache is a no-op while caching is disabled.
+      fs.writeFileSync(getBpeRanksCachePath('o200k_base'), JSON.stringify(SAMPLE_RANKS));
+      expect(readBpeRanksCache('o200k_base')).toBeUndefined();
+    });
+  });
+});
diff --git a/tests/testing/vitestSetup.ts b/tests/testing/vitestSetup.ts
@@ -1,3 +1,6 @@
+import os from 'node:os';
+import path from 'node:path';
+
 // Disable the token-count disk cache by default for the entire test suite so
 // that (a) test runs do not read or write the developer's real cache file in
 // $TMPDIR and (b) tests asserting on worker dispatch behavior are not skewed
@@ -6,3 +9,11 @@
 if (process.env.REPOMIX_TOKEN_CACHE === undefined) {
   process.env.REPOMIX_TOKEN_CACHE = '0';
 }
+
+// Point the BPE-ranks disk cache at an isolated per-process temp directory
+// (unless one is already configured), so a test that clears REPOMIX_TOKEN_CACHE
+// never touches the real $TMPDIR/repomix cache. Cache tests set their own dir.
+const isolatedBpeRanksDir = path.join(os.tmpdir(), `repomix-test-bpe-ranks-${process.pid}`);
+if (process.env.REPOMIX_BPE_RANKS_CACHE_PATH === undefined) {
+  process.env.REPOMIX_BPE_RANKS_CACHE_PATH = isolatedBpeRanksDir;
+}