Skip to content

Commit 4f8d27c

Browse files
committed
fix: scale cache-bypass prefix to prompt size
The fixed ~4 KB prefix doubled a 1K-token input. Now the prefix adapts: ~5% of prompt length, clamped to [128, 4096] chars (~32–1024 tokens). Short prompts get a small prefix, while long prompts get a prefix that still spans enough KV-cache blocks to reliably miss.
1 parent 1b7ada8 commit 4f8d27c

2 files changed

Lines changed: 16 additions & 9 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/), and this project adheres to [Semantic Versioning](https://semver.org/).
66

7+
## [Unreleased]
8+
9+
### Fixed
10+
- Cache hit rate: prefix size now adapts to prompt length (~5%, clamped 128–4096 chars) to avoid inflating short prompts — previously a fixed ~4 KB prefix would double a 1K-token input
11+
712
## [2.4.1] - 2026-04-18
813

914
### Fixed

backend/src/services/benchmarkEngine.ts

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -198,16 +198,17 @@ function sleep(ms: number, benchmarkId: string): Promise<void> {
198198
});
199199
}
200200

201-
// Generate a random prefix of ~1024 tokens (~4 KB) to bust KV-cache blocks.
202-
// Uses base62 characters separated by spaces every 4-6 chars to form
203-
// realistic token boundaries. Each call produces a unique, non-repeating string.
201+
// Generate a random prefix sized to the prompt to bust KV-cache blocks.
202+
// The prefix scales with input size: ~5% of prompt length, clamped to
203+
// [128 chars, 4096 chars] (~32–1024 tokens). Uses base62 characters with
204+
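// spaces inserted at irregular intervals (wherever the index is divisible by a per-byte divisor of 4–6; these are added on top of the clamped length) to form realistic token boundaries.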
204205
const BASE62 = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
205-
function generateRandomPrefix(): string {
206-
const bytes = randomBytes(4096);
206+
function generateRandomPrefix(promptLength: number): string {
207+
const targetChars = Math.min(4096, Math.max(128, Math.round(promptLength * 0.05)));
208+
const bytes = randomBytes(targetChars);
207209
let result = '';
208210
for (let i = 0; i < bytes.length; i++) {
209211
result += BASE62[bytes[i] % 62];
210-
// Insert a space every 4–6 characters to create word-like token boundaries
211212
if (i > 0 && i % (4 + (bytes[i] % 3)) === 0) result += ' ';
212213
}
213214
return result;
@@ -341,8 +342,9 @@ async function runProviderBenchmark(
341342

342343
// Build random-prefix variant pool for cache hit rate control.
343344
// K unique prefixes → target hit rate ≈ (N - K) / N.
344-
// Each prefix is ~1024 tokens (~4 KB of random base62 text) so it spans
345-
// multiple KV-cache blocks and reliably prevents prefix-cache reuse.
345+
// Prefix size adapts to prompt length (~5%, clamped 128–4096 chars)
346+
// so short prompts aren't bloated while long prompts still bust
347+
// block-level KV cache reliably.
346348
//
347349
// We pre-build a shuffled schedule of length N so that cache misses
348350
// (first appearance of each variant) are spread evenly across the run,
@@ -351,7 +353,7 @@ async function runProviderBenchmark(
351353
let variantSchedule: string[] | null = null;
352354
if (config.targetCacheHitRate !== undefined && config.targetCacheHitRate < 1) {
353355
const K = Math.max(1, Math.round(totalIterations * (1 - config.targetCacheHitRate)));
354-
const prefixes = Array.from({ length: K }, () => generateRandomPrefix());
356+
const prefixes = Array.from({ length: K }, () => generateRandomPrefix(config.prompt.length));
355357
// Build schedule: assign each iteration a variant, then shuffle
356358
const schedule: string[] = [];
357359
for (let i = 0; i < totalIterations; i++) {

0 commit comments

Comments
 (0)