start setup

pythongiant · pythongiant · commit b8f1f8f29fc2 · 2026-06-02T10:46:18.000+05:30
diff --git a/benchmarks_and_experiments/coding_vs_vllm/README.md b/benchmarks_and_experiments/coding_vs_vllm/README.md
@@ -41,13 +41,13 @@ start one server, benchmark it (save with `--out`), stop it, start the other,
 benchmark it — then combine the two saved files into the side-by-side report
 with `--compare` (no server needed for that step).
 
-**Step 1 — kvboost.** Start its server:
+**Step 1 — kvboost.** Start its server (best setup — see `start_kvboost.sh`):
 ```bash
-python -m kvboost.server --model Qwen/Qwen2.5-3B-Instruct --dtype float16 \
-    --recompute-strategy cacheblend_sparse --kv-cache-bits 8 \
-    --max-cache-bytes 4e9 --max-batch-size 1 --port 9000
+./start_kvboost.sh            # MODEL=... PORT=... MAX_CACHE_BYTES=... to override
 ```
-Then in another shell:
+That runs kvboost with `cacheblend_sparse` (faithful selective recompute),
+int8 KV, and the OOM planner — the features the benchmark measures. Then in
+another shell:
 ```bash
 python bench_coding.py --backend kvboost --url http://localhost:9000 \
     --model Qwen/Qwen2.5-3B-Instruct \
diff --git a/benchmarks_and_experiments/coding_vs_vllm/start_kvboost.sh b/benchmarks_and_experiments/coding_vs_vllm/start_kvboost.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# Launch kvboost in its BEST setup for the coding benchmark — showcases the
+# features the benchmark measures: KV reuse (faster TTFT) + OOM recovery, with
+# the recent correctness/perf fixes all active.
+#
+# Run this, then in another shell:
+#   python bench_coding.py --backend kvboost --url http://localhost:9000 \
+#       --model "$MODEL" --mode both --out kvboost.json
+# Stop it (Ctrl-C) before launching vLLM — one model fits the GPU at a time.
+#
+# Override via env: MODEL=... PORT=... MAX_CACHE_BYTES=... SAFETY_MARGIN=... ./start_kvboost.sh
+
+set -euo pipefail
+
+MODEL="${MODEL:-Qwen/Qwen2.5-3B-Instruct}"
+PORT="${PORT:-9000}"
+# KV-cache budget for cross-request chunk reuse. Size it to the VRAM left free
+# after the weights load. On a 14.6 GiB card with a 3B fp16 model (~6 GiB),
+# ~4 GiB leaves headroom for prefill activations + the live request. Lower it
+# for the OOM-stress run to make the planner's adaptation more visible (e.g. 1.5e9).
+MAX_CACHE_BYTES="${MAX_CACHE_BYTES:-4e9}"
+SAFETY_MARGIN="${SAFETY_MARGIN:-0.15}"
+
+echo "kvboost (best setup)"
+echo "  model:            $MODEL"
+echo "  port:             $PORT"
+echo "  recompute:        cacheblend_sparse  (faithful selective recompute)"
+echo "  kv-cache-bits:    8                  (int8 KV → 2× reuse capacity)"
+echo "  max-cache-bytes:  $MAX_CACHE_BYTES"
+echo "  oom planning:     on (safety_margin=$SAFETY_MARGIN)"
+echo
+
+# Why each flag:
+#   --recompute-strategy cacheblend_sparse
+#       Faithful CacheBlend: recompute only high-deviation tokens layer-by-
+#       layer (the paper's 2.2-3.3× TTFT speedup), not the full-forward
+#       variant. This is the "faster TTFT on reused context" feature. Falls
+#       back to plain cacheblend automatically on unsupported architectures.
+#   --kv-cache-bits 8
+#       int8 KV cache: ~2× the cached-chunk capacity (more cross-request
+#       reuse) and lower memory pressure, negligible quality cost.
+#   --max-cache-bytes
+#       Cross-request chunk-cache budget — bigger = more reuse, bounded by VRAM.
+#   OOM planner (on by default) + --planner-safety-margin
+#       Per-request peak prediction → picks chunk_size/kv_bits that fit, or a
+#       clean HTTP 413. This is the "OOM recovery" feature. Add --auto-truncate
+#       to truncate-and-complete oversized prompts instead of 413.
+#   --max-batch-size 1
+#       The benchmark replays sequentially (single GPU worker); 1 avoids
+#       pointless batch-window latency. Raise for concurrent throughput tests.
+#   (automatic, no flag: O(n) incremental detok, chunked CacheBlend forward,
+#    streaming usage emission for input-throughput, planner cost probe.)
+exec python -m kvboost.server \
+    --model "$MODEL" \
+    --dtype float16 \
+    --recompute-strategy cacheblend_sparse \
+    --kv-cache-bits 8 \
+    --max-cache-bytes "$MAX_CACHE_BYTES" \
+    --planner-safety-margin "$SAFETY_MARGIN" \
+    --max-batch-size 1 \
+    --host 0.0.0.0 \
+    --port "$PORT"
+
+# ── Optional add-ons (add to the exec command above) ─────────────────────────
+# Speculative decoding to lift DECODE throughput (where vLLM's continuous
+# batching otherwise leads). Needs a small same-family draft model and ~1 GiB
+# extra VRAM; --speculative-tree turns on the SpecBlock-inspired tree variant
+# with cost-aware per-request mode selection:
+#     --speculative-draft-model Qwen/Qwen2.5-0.5B-Instruct \
+#     --speculative-tree \
+# (Nothing after `exec` runs, so uncommenting these lines in place does nothing.)
+# Oversized-prompt policy for the OOM ramp: complete-by-truncation instead of
+# a clean 413 reject:
+#     --auto-truncate
diff --git a/benchmarks_and_experiments/coding_vs_vllm/start_vllm.sh b/benchmarks_and_experiments/coding_vs_vllm/start_vllm.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Launch vLLM in its USUAL serving setup for the coding benchmark — the
+# standard OpenAI server with prefix caching (vLLM's cross-request reuse) and
+# continuous batching (its default). Matched model + dtype to kvboost so the
+# comparison is fair.
+#
+# Run this AFTER stopping the kvboost server (one model fits the GPU at a
+# time), then in another shell:
+#   python bench_coding.py --backend vllm --url http://localhost:8001 \
+#       --model "$MODEL" --mode both --out vllm.json
+#   # ... use the SAME --dataset/--n/--n-files/--contexts/--corpus-size as the
+#   #     kvboost run so both backends see identical prompts.
+#
+# Override via env: MODEL=... PORT=... GPU_MEM_UTIL=... MAX_MODEL_LEN=...
+
+set -euo pipefail
+
+MODEL="${MODEL:-Qwen/Qwen2.5-3B-Instruct}"
+PORT="${PORT:-8001}"
+# vLLM pre-allocates this fraction of total VRAM for weights + its paged KV
+# pool. 0.85 is a common production value (vLLM's own default is 0.9).
+GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.85}"
+# Max admitted context. 32768 covers the throughput/TTFT workload. For the OOM
+# ramp: a HIGH value (e.g. 131072) admits long prompts so they hit the runtime
+# KV ceiling (real OOM); a LOW value makes vLLM reject over-length prompts with
+# a graceful 400 instead (the benchmark scores that as success, not failure).
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
+
+echo "vLLM (usual setup)"
+echo "  model:                 $MODEL"
+echo "  port:                  $PORT"
+echo "  prefix caching:        on        (vLLM cross-request reuse)"
+echo "  gpu-memory-utilization: $GPU_MEM_UTIL"
+echo "  max-model-len:         $MAX_MODEL_LEN"
+echo
+
+# Why each flag:
+#   --enable-prefix-caching  vLLM's reuse mechanism — the matched counterpart
+#                            to kvboost's chunk-reuse/CacheBlend (reuses an
+#                            exact shared *prefix* across requests).
+#   --gpu-memory-utilization standard memory budget; matched to leave the same
+#                            class of headroom kvboost gets.
+#   --max-model-len          admitted context length (see note above re: OOM).
+#   --dtype float16          matched to kvboost.
+# Continuous batching is vLLM's default and stays on — it's why vLLM usually
+# leads raw decode throughput; the benchmark reports that honestly.
+exec vllm serve "$MODEL" \
+    --dtype float16 \
+    --enable-prefix-caching \
+    --gpu-memory-utilization "$GPU_MEM_UTIL" \
+    --max-model-len "$MAX_MODEL_LEN" \
+    --host 0.0.0.0 \
+    --port "$PORT"