2x faster decode with CUDA graph capture

pythongiant · pythongiant · commit 97b765be3a94 · 2026-06-02T15:08:59.000+05:30
diff --git a/benchmarks_and_experiments/coding_vs_vllm/README.md b/benchmarks_and_experiments/coding_vs_vllm/README.md
@@ -57,11 +57,10 @@ python bench_coding.py --backend kvboost --url http://localhost:9000 \
 ```
 Stop the kvboost server when it finishes (frees the GPU).
 
-**Step 2 — vLLM.** Start its server:
+**Step 2 — vLLM.** Stop kvboost first (frees the GPU), then start vLLM (usual
+setup — see `start_vllm.sh`):
 ```bash
-vllm serve Qwen/Qwen2.5-3B-Instruct --dtype float16 \
-    --enable-prefix-caching --gpu-memory-utilization 0.85 \
-    --max-model-len 32768 --port 8001
+./start_vllm.sh               # MODEL=... PORT=... GPU_MEM_UTIL=... MAX_MODEL_LEN=...
 ```
 Then run the **same** workload flags (so prompts match) against it:
 ```bash
diff --git a/benchmarks_and_experiments/coding_vs_vllm/start_kvboost.sh b/benchmarks_and_experiments/coding_vs_vllm/start_kvboost.sh
@@ -1,74 +1,95 @@
 #!/usr/bin/env bash
 # Launch kvboost in its BEST setup for the coding benchmark — showcases the
-# features the benchmark measures: KV reuse (faster TTFT) + OOM recovery, with
-# the recent correctness/perf fixes all active.
+# features the benchmark measures (KV reuse → faster TTFT, OOM recovery) AND
+# the throughput levers (FlashAttention-2, tree speculative decoding) added to
+# close the gap to vLLM on an RTX 3060 (Ampere, 12 GB, ~360 GB/s).
 #
 # Run this, then in another shell:
 #   python bench_coding.py --backend kvboost --url http://localhost:9000 \
 #       --model "$MODEL" --mode both --out kvboost.json
 # Stop it (Ctrl-C) before launching vLLM — one model fits the GPU at a time.
 #
-# Override via env: MODEL=... PORT=... MAX_CACHE_BYTES=... ./start_kvboost.sh
+# Override via env: MODEL= DRAFT= PORT= MAX_CACHE_BYTES= SAFETY_MARGIN= NO_SPEC=1
 
 set -euo pipefail
 
 MODEL="${MODEL:-Qwen/Qwen2.5-3B-Instruct}"
+# Small same-family drafter for speculative decoding (the decode-throughput
+# lever). ~1 GB fp16; set NO_SPEC=1 to disable (e.g. to free VRAM for the
+# OOM-headroom run, since the draft model lowers the context ceiling).
+DRAFT="${DRAFT:-Qwen/Qwen2.5-0.5B-Instruct}"
 PORT="${PORT:-9000}"
-# KV-cache budget for cross-request chunk reuse. Size to (free VRAM after
-# weights). On a 14.6 GiB card with a 3B fp16 model (~6 GiB) → ~4 GiB leaves
-# headroom for prefill activations + the live request. Lower for the OOM-
-# stress run to make the planner's adaptation more visible (e.g. 1.5e9).
-MAX_CACHE_BYTES="${MAX_CACHE_BYTES:-4e9}"
+# KV-cache budget for cross-request chunk reuse. On a 12 GB 3060: ~6 GB model
+# + ~1 GB draft leaves ~5 GB → 2.5 GB cache keeps activation headroom. Lower
+# for the OOM-stress run to make the planner's adaptation more visible (~1e9).
+MAX_CACHE_BYTES="${MAX_CACHE_BYTES:-2.5e9}"
 SAFETY_MARGIN="${SAFETY_MARGIN:-0.15}"
 
-echo "kvboost (best setup)"
+SPEC_ARGS=(--speculative-draft-model "$DRAFT" --speculative-tree)
+if [[ "${NO_SPEC:-0}" == "1" ]]; then SPEC_ARGS=(); fi
+
+echo "kvboost (best setup — RTX 3060)"
 echo "  model:            $MODEL"
+echo "  draft:            $([[ "${NO_SPEC:-0}" == "1" ]] && echo "<disabled>" || echo "$DRAFT")"  # same test as SPEC_ARGS
 echo "  port:             $PORT"
+echo "  attention:        flash_attention_2 (auto-fallback to sdpa)"
 echo "  recompute:        cacheblend_sparse  (faithful selective recompute)"
 echo "  kv-cache-bits:    8                  (int8 KV → 2× reuse capacity)"
 echo "  max-cache-bytes:  $MAX_CACHE_BYTES"
+echo "  speculative:      $([[ "${NO_SPEC:-0}" == "1" ]] && echo "off" || echo "tree (auto mode-select)")"
 echo "  oom planning:     on (safety_margin=$SAFETY_MARGIN)"
 echo
 
 # Why each flag:
+#   --attn-impl auto
+#       Tries FlashAttention-2 (Ampere wheel; faster, lower-memory prefill →
+#       better TTFT and input throughput), silently falls back to sdpa if the
+#       FA2 wheel isn't installed. Use --attn-impl flash_attention_2 to REQUIRE
+#       it (errors loudly if missing) once you've confirmed the wheel.
+#   --speculative-tree --speculative-draft-model
+#       SpecBlock-inspired tree speculative decoding — the decode-throughput
+#       lever. On bandwidth-bound hardware (3060), accepting several tokens per
+#       target forward amortizes the per-token weight read → multiplies decode
+#       tok/s. Auto mode-selector picks none/flat/tree per request.
 #   --recompute-strategy cacheblend_sparse
-#       Faithful CacheBlend: recompute only high-deviation tokens layer-by-
-#       layer (paper's 2.2-3.3× TTFT), not the full-forward variant. This is
-#       the "faster TTFT on reused context" feature. Falls back to plain
-#       cacheblend automatically on unsupported architectures.
+#       Faithful CacheBlend: recompute only high-deviation tokens. The "faster
+#       TTFT on reused context" feature. NOTE: on a pure shared-PREFIX workload
+#       (this coding benchmark), --recompute-strategy none reuses prefix KV at
+#       ~zero cost like vLLM prefix caching; cacheblend_sparse's edge is the
+#       OUT-OF-ORDER RAG workload (bench_hf.py). Try both.
 #   --kv-cache-bits 8
-#       int8 KV cache: ~2× the cached-chunk capacity (more cross-request
-#       reuse) and lower memory pressure, negligible quality cost.
-#   --max-cache-bytes
-#       Cross-request chunk-cache budget — bigger = more reuse, bounded by VRAM.
+#       int8 KV STORAGE → ~2× cached-chunk capacity + less memory pressure.
+#       (Note: it dequants to fp16 for compute, so it adds reuse capacity, not
+#       decode bandwidth — that lever is weight quant, see below.)
 #   OOM planner (on by default) + --planner-safety-margin
-#       Per-request peak prediction → picks chunk_size/kv_bits that fit, or a
-#       clean HTTP 413. This is the "OOM recovery" feature. Add --auto-truncate
-#       to truncate-and-complete oversized prompts instead of 413.
-#   --max-batch-size 1
-#       The benchmark replays sequentially (single GPU worker); 1 avoids
-#       pointless batch-window latency. Raise for concurrent throughput tests.
-#   (automatic, no flag: O(n) incremental detok, chunked CacheBlend forward,
-#    streaming usage emission for input-throughput, planner cost probe.)
+#       Per-request peak prediction → fitting chunk_size/kv_bits or clean 413.
+#   (automatic: O(n) detok, chunked CacheBlend forward, streaming usage,
+#    static decode input buffers.)
 exec python -m kvboost.server \
     --model "$MODEL" \
     --dtype float16 \
+    --attn-impl auto \
     --recompute-strategy cacheblend_sparse \
     --kv-cache-bits 8 \
     --max-cache-bytes "$MAX_CACHE_BYTES" \
     --planner-safety-margin "$SAFETY_MARGIN" \
     --max-batch-size 1 \
+    ${SPEC_ARGS[@]+"${SPEC_ARGS[@]}"} \
     --host 0.0.0.0 \
     --port "$PORT"
 
-# ── Optional add-ons (uncomment to enable) ───────────────────────────────────
-# Speculative decoding to lift DECODE throughput (where vLLM's continuous
-# batching otherwise leads). Needs a small same-family draft model and ~1 GiB
-# extra VRAM; --speculative-tree turns on the SpecBlock-inspired tree variant
-# with cost-aware per-request mode selection:
-#     --speculative-draft-model Qwen/Qwen2.5-0.5B-Instruct \
-#     --speculative-tree \
+# ── Optional add-ons (uncomment / set env to enable) ─────────────────────────
+# WEIGHT QUANTIZATION (the biggest raw decode lever on a 3060): point --model
+# at an AWQ/GPTQ Int4 checkpoint — transformers loads it with Marlin int4 GEMM
+# on Ampere automatically (~4× less weight bandwidth → up to ~4× decode ceiling,
+# 60→~240 tok/s for 3B). No extra flag; the engine detects quantized weights:
+#     MODEL=Qwen/Qwen2.5-3B-Instruct-AWQ ./start_kvboost.sh
+#
+# torch.compile (--compile): CUDA graphs + fusion erase per-token launch
+# overhead → faster DECODE. CAVEAT: it recompiles per new PREFILL length, so it
+# can HURT TTFT on this varying-prompt benchmark and adds a one-time first-
+# request compile cost. Best for decode-bound / fixed-shape serving, not the
+# TTFT ramp. Add:  --compile
 #
-# Oversized-prompt policy for the OOM ramp: complete-by-truncation instead of
-# a clean 413 reject:
+# Oversized-prompt policy for the OOM ramp — complete-by-truncation vs 413:
 #     --auto-truncate
diff --git a/src/kvboost/engine.py b/src/kvboost/engine.py
@@ -112,6 +112,13 @@ def __init__(
         # Cost coefficients (probed at server startup) for cost-aware
         # tree shape + mode selection. None = degraded mode (defaults).
         cost_coefficients: Any = None,
+        # torch.compile(mode="reduce-overhead") — captures CUDA graphs +
+        # fuses pointwise ops (RMSNorm/RoPE/SwiGLU/residual) → removes the
+        # per-token kernel-launch overhead that caps eager decode. Opt-in
+        # and EXPERIMENTAL: compilation is lazy (first forward), so a bad
+        # interaction surfaces at runtime, not here — drop the flag if a
+        # run errors. Off by default so it can never regress the eager path.
+        compile_model: bool = False,
     ):
         if device is None:
             device = default_device()
@@ -282,6 +289,21 @@ def __init__(
         from .flash_attn_ext import install_flash_attention
         self._flash_attn_patched = install_flash_attention(self.model)
 
+        # torch.compile LAST, after any model patching. reduce-overhead mode
+        # uses CUDA graphs + Triton fusion to erase per-token launch overhead
+        # (the gap between eager decode and the bandwidth ceiling). Lazy: the
+        # actual compile happens on the first forward, so we can't catch a
+        # failure here — wrap-time errors are caught; runtime graph-breaks just
+        # degrade to partial speedup. Drop --compile if a run errors outright.
+        self._compiled = False
+        if compile_model:
+            try:
+                self.model = torch.compile(self.model, mode="reduce-overhead")
+                self._compiled = True
+                log.info("torch.compile(reduce-overhead) enabled (experimental)")
+            except Exception as e:
+                log.warning("torch.compile failed (%s); running eager", e)
+
     # ------------------------------------------------------------------
     # Factory
     # ------------------------------------------------------------------
@@ -293,6 +315,7 @@ def from_pretrained(
         strict: bool = True,
         streaming_config: Optional["StreamingConfig"] = None,
         awq_path: Optional[str] = None,
+        attn_implementation: str = "auto",
         **kwargs,
     ) -> "InferenceEngine":
         """
@@ -309,7 +332,21 @@ def from_pretrained(
                     the config. The rest of KVBoost (chunk-reuse, FlashAttn)
                     is untouched.
             awq_path: Optional path hint forwarded to the streaming loader.
-            **kwargs: Passed to InferenceEngine.__init__().
+            attn_implementation: Attention backend for the resident path.
+                    ``"auto"`` (default) tries ``flash_attention_2`` (FA2 —
+                    Ampere+ wheel; faster, lower-memory prefill → better TTFT)
+                    and silently falls back to ``"sdpa"`` if FA2 isn't
+                    installed/supported. Pass ``"flash_attention_2"`` to
+                    require it (raises if unavailable), or ``"sdpa"`` /
+                    ``"eager"`` to force a backend. Ignored on the streaming
+                    path. To load a **quantized** checkpoint (AWQ/GPTQ →
+                    Marlin int4 GEMM on Ampere, ~4× less weight bandwidth →
+                    higher decode tok/s), just pass a quantized ``model_name``;
+                    transformers reads its quantization_config and picks the
+                    kernel automatically — the engine already detects and
+                    leaves quantized/offloaded weights in place.
+            **kwargs: Passed to InferenceEngine.__init__() (e.g.
+                    ``compile_model=True`` for torch.compile reduce-overhead).
         """
         log.info("Loading model %s ...", model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -326,11 +363,31 @@ def from_pretrained(
                 dtype=torch.float16,
             )
         else:
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                low_cpu_mem_usage=True,
-            )
+            load_kwargs = dict(torch_dtype=torch.float16, low_cpu_mem_usage=True)
+            impl = attn_implementation
+            if impl in ("auto", "flash_attention_2"):
+                try:
+                    model = AutoModelForCausalLM.from_pretrained(
+                        model_name,
+                        attn_implementation="flash_attention_2",
+                        **load_kwargs,
+                    )
+                    log.info("Attention backend: flash_attention_2")
+                except Exception as e:
+                    if impl == "flash_attention_2":
+                        raise  # caller explicitly required FA2 — don't mask it
+                    log.info(
+                        "flash_attention_2 unavailable (%s); using sdpa", e
+                    )
+                    model = AutoModelForCausalLM.from_pretrained(
+                        model_name, attn_implementation="sdpa", **load_kwargs
+                    )
+                    log.info("Attention backend: sdpa")
+            else:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name, attn_implementation=impl, **load_kwargs
+                )
+                log.info("Attention backend: %s", impl)
             model.eval()
 
         check_model_compatibility(model, strict=strict)
@@ -1110,16 +1167,25 @@ def _decode_with_kv(
 
         # ----- autoregressive decode ------------------------------------
         cur_pos = cached_len + len(live_ids)
+        # Pre-allocate the (1,1) decode input buffers ONCE and write in place
+        # each step, instead of allocating two fresh device tensors per token.
+        # Removes the per-token allocations (each scalar write is still a small
+        # host-to-device copy) and — critically — gives torch.compile /
+        # CUDA-graph capture stable input tensors (a graph needs fixed input
+        # storage; a new tensor per step forces a recapture/recompile).
+        # Correctness is identical provided the model never retains them.
+        input_buf = torch.empty((1, 1), dtype=torch.long, device=self.device)
+        pos_buf = torch.empty((1, 1), dtype=torch.long, device=self.device)
         while not goto_done and len(generated) < max_new_tokens:
             if generated[-1] == self.tokenizer.eos_token_id:
                 break
-            cur_ids = torch.tensor([[generated[-1]]], dtype=torch.long, device=self.device)
-            pos_ids = torch.tensor([[cur_pos]], dtype=torch.long, device=self.device)
+            input_buf[0, 0] = generated[-1]
+            pos_buf[0, 0] = cur_pos
             with torch.no_grad(), last_logit_only(self.model):
                 out = self.model(
-                    input_ids=cur_ids,
+                    input_ids=input_buf,
                     past_key_values=self._as_cache(past_kv),
-                    position_ids=pos_ids,
+                    position_ids=pos_buf,
                     use_cache=True,
                 )
             past_kv = self._normalize_past_kv(out.past_key_values)
diff --git a/src/kvboost/server/__main__.py b/src/kvboost/server/__main__.py
@@ -72,6 +72,20 @@ def parse_args():
     p.add_argument("--device", default=None, help="Device: cuda | mps | cpu (auto-detected if omitted)")
     p.add_argument("--dtype", default="float16", choices=["float16", "bfloat16", "float32"],
                    help="Model weight dtype (default: float16)")
+    p.add_argument("--attn-impl", default="auto",
+                   choices=["auto", "flash_attention_2", "sdpa", "eager"],
+                   help="Attention backend. 'auto' (default) uses "
+                        "flash_attention_2 if installed (faster/lower-memory "
+                        "prefill -> better TTFT; Ampere+ e.g. RTX 3060) and "
+                        "falls back to sdpa otherwise. 'flash_attention_2' "
+                        "requires it (errors if missing). 'sdpa'/'eager' force.")
+    p.add_argument("--compile", action="store_true", default=False,
+                   help="torch.compile(mode='reduce-overhead') on the model: "
+                        "CUDA graphs + pointwise fusion to erase per-token "
+                        "launch overhead (closes most of the eager-decode gap "
+                        "to the bandwidth ceiling). EXPERIMENTAL — compiles "
+                        "lazily on first request; drop the flag if a run "
+                        "errors. First request pays a one-time compile cost.")
     p.add_argument("--backend", default="default", choices=["default", "cpu-paged"],
                    help="Inference backend (default: standard KVBoost)")
     p.add_argument("--quantization", default="none",
@@ -410,6 +424,10 @@ def load_engine(args):
                 device=device,
                 speculative_config=speculative_cfg,
                 tree_speculative_config=tree_speculative_cfg,
+                # attn_impl is ignored on the streaming path (StreamingCausalLM
+                # owns attention); compile flows through to __init__.
+                attn_implementation=args.attn_impl,
+                compile_model=args.compile,
             )
             log.info("Model loaded.")
             return engine
@@ -440,10 +458,27 @@ def load_engine(args):
             from_pretrained_kwargs["quantization_config"] = quant_config
         else:
             from_pretrained_kwargs["dtype"] = dtype
-        model = AutoModelForCausalLM.from_pretrained(
-            args.model,
-            **from_pretrained_kwargs,
+        # Attention backend. 'auto' tries FA2 (better TTFT on Ampere+, e.g.
+        # RTX 3060) then falls back to sdpa; an explicit choice is honored.
+        _want_fa2 = args.attn_impl in ("auto", "flash_attention_2")
+        from_pretrained_kwargs["attn_implementation"] = (
+            "flash_attention_2" if _want_fa2 else args.attn_impl
         )
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                args.model, **from_pretrained_kwargs,
+            )
+            log.info("Attention backend: %s",
+                     from_pretrained_kwargs["attn_implementation"])
+        except Exception as e:
+            if args.attn_impl != "auto":
+                raise  # explicit backend requested — don't mask the failure
+            log.info("flash_attention_2 unavailable (%s); using sdpa", e)
+            from_pretrained_kwargs["attn_implementation"] = "sdpa"
+            model = AutoModelForCausalLM.from_pretrained(
+                args.model, **from_pretrained_kwargs,
+            )
+            log.info("Attention backend: sdpa")
         engine = InferenceEngine(
             model=model,
             tokenizer=tokenizer,
@@ -457,6 +492,7 @@ def load_engine(args):
             device=device,
             speculative_config=_build_speculative_config(args),
             tree_speculative_config=_build_tree_speculative_config(args),
+            compile_model=args.compile,
         )
 
     log.info("Model loaded.")