#!/usr/bin/env bash
# Launch kvboost in its BEST setup for the coding benchmark — showcases the
# features the benchmark measures: KV reuse (faster TTFT) + OOM recovery, with
# the recent correctness/perf fixes all active.
#
# Run this, then in another shell (adjust the URL if you override PORT):
#   python bench_coding.py --backend kvboost --url http://localhost:9000 \
#     --model "$MODEL" --mode both --out kvboost.json
# Stop it (Ctrl-C) before launching vLLM — one model fits the GPU at a time.
#
# Override via env:
#   MODEL=... PORT=... HOST=... MAX_CACHE_BYTES=... SAFETY_MARGIN=... \
#     ./start_kvboost.sh
#
# Extra command-line arguments are appended verbatim to the server command,
# which is how the optional add-ons documented near the bottom are enabled:
#   ./start_kvboost.sh --auto-truncate

set -euo pipefail

MODEL="${MODEL:-Qwen/Qwen2.5-3B-Instruct}"
PORT="${PORT:-9000}"
# Bind address. The default listens on every interface; set HOST=127.0.0.1
# to keep the server local-only on a shared machine.
HOST="${HOST:-0.0.0.0}"
# KV-cache budget for cross-request chunk reuse. Size it to the VRAM left free
# after the weights are loaded. On a 14.6 GiB card with a 3B fp16 model
# (~6 GiB), ~4 GiB leaves headroom for prefill activations + the live request.
# Lower it for the OOM-stress run to make the planner's adaptation more
# visible (e.g. 1.5e9).
# NOTE(review): assumes the server accepts scientific notation for this flag —
# confirm against kvboost.server's argument parser.
MAX_CACHE_BYTES="${MAX_CACHE_BYTES:-4e9}"
SAFETY_MARGIN="${SAFETY_MARGIN:-0.15}"

echo "kvboost (best setup)"
echo " model: $MODEL"
echo " port: $PORT"
echo " recompute: cacheblend_sparse (faithful selective recompute)"
echo " kv-cache-bits: 8 (int8 KV → 2× reuse capacity)"
echo " max-cache-bytes: $MAX_CACHE_BYTES"
echo " oom planning: on (safety_margin=$SAFETY_MARGIN)"
# Only shown when the caller actually forwarded extra flags, so the default
# banner is unchanged.
if (( $# > 0 )); then
  echo " extra args: $*"
fi
echo

# Why each flag:
#   --recompute-strategy cacheblend_sparse
#       Faithful CacheBlend: recompute only high-deviation tokens layer-by-
#       layer (paper's 2.2-3.3× TTFT), not the full-forward variant. This is
#       the "faster TTFT on reused context" feature. Falls back to plain
#       cacheblend automatically on unsupported architectures.
#   --kv-cache-bits 8
#       int8 KV cache: ~2× the cached-chunk capacity (more cross-request
#       reuse) and lower memory pressure, negligible quality cost.
#   --max-cache-bytes
#       Cross-request chunk-cache budget — bigger = more reuse, bounded by VRAM.
#   OOM planner (on by default) + --planner-safety-margin
#       Per-request peak prediction → picks chunk_size/kv_bits that fit, or a
#       clean HTTP 413. This is the "OOM recovery" feature.
#   --max-batch-size 1
#       The benchmark replays sequentially (single GPU worker); 1 avoids
#       pointless batch-window latency. Raise for concurrent throughput tests.
#   (automatic, no flag: O(n) incremental detok, chunked CacheBlend forward,
#   streaming usage emission for input-throughput, planner cost probe.)
#
# ── Optional add-ons (pass as extra arguments to this script) ────────────────
# Speculative decoding to lift DECODE throughput (where vLLM's continuous
# batching otherwise leads). Needs a small same-family draft model and ~1 GiB
# extra VRAM; --speculative-tree turns on the SpecBlock-inspired tree variant
# with cost-aware per-request mode selection:
#   ./start_kvboost.sh \
#     --speculative-draft-model Qwen/Qwen2.5-0.5B-Instruct \
#     --speculative-tree
#
# Oversized-prompt policy for the OOM ramp: complete-by-truncation instead of
# a clean 413 reject:
#   ./start_kvboost.sh --auto-truncate

# exec replaces this shell with the server process, so nothing placed after
# this command ever runs. "$@" comes last so caller-supplied flags are
# appended after the defaults above.
exec python -m kvboost.server \
  --model "$MODEL" \
  --dtype float16 \
  --recompute-strategy cacheblend_sparse \
  --kv-cache-bits 8 \
  --max-cache-bytes "$MAX_CACHE_BYTES" \
  --planner-safety-margin "$SAFETY_MARGIN" \
  --max-batch-size 1 \
  --host "$HOST" \
  --port "$PORT" \
  "$@"
0 commit comments