#!/usr/bin/env bash
# Launch kvboost in its FASTEST setup on the RTX 3060 — speed over fairness.
3- # This stacks every working throughput/latency lever for the coding benchmark:
4- # * Marlin int4 weight quant (AWQ model) — biggest decode-bandwidth lever
5- # * Tree speculative decoding (draft model) — multi-token/step decode lever
6- # * INT8 SageAttention prefill (Triton 'sage') — faster TTFT, self-checks → SDPA
7- # * recompute=none — zero-cost shared-prefix reuse
8- # * int8 KV storage + OOM planner — more reuse capacity, no crashes
3+ # Stacks every working throughput/latency lever for the coding benchmark:
4+ # * Marlin int4 weight quant (AWQ model) — biggest decode-bandwidth lever (~4×)
5+ # * Tree speculative decoding (draft model) — multi-token/step decode lever
6+ # * recompute=none — zero-cost shared-prefix reuse (TTFT)
7+ # * int8 KV storage + OOM planner — more reuse capacity, no crashes
98#
10- # NOTE this is NOT a fair vs-vLLM config (int4 + spec). For an apples-to-apples
11- # run, point vLLM at the SAME AWQ checkpoint (it also uses Marlin) — see below.
9+ # AWQ LOADING (important on this box): the plain *resident* AWQ load goes through
10+ # transformers' AWQ quantizer, which pulls in a `gptqmodel` that's mismatched
11+ # with the installed transformers ("module 'transformers.utils.hub' has no
12+ # attribute 'create_repo'") and crashes on import. kvboost's OWN AWQ loader
13+ # sidesteps it: `--awq-streaming --streaming-mode full_resident` keeps all int4
14+ # weights ON the GPU (no DMA) and uses the Marlin int4 GEMM, never touching
15+ # transformers' AWQ path. So for an AWQ/GPTQ MODEL this script auto-uses that
16+ # loader. Tradeoff: the streaming path owns attention, so --attn-impl (incl. our
17+ # Triton 'sage' kernel) is IGNORED there → SDPA prefill. int4 ≫ sage, so this is
18+ # still the fastest config that actually launches. An fp16 MODEL uses the
19+ # resident path + sage instead. (To regain sage WITH int4, repair the env — see
20+ # the foot of this file — and force RESIDENT=1.)
1221#
1322# Run this, then in another shell:
1423# python bench_coding.py --backend kvboost --url http://localhost:9000 \
1726#
1827# Override via env:
1928# MODEL=... PORT=... MAX_CACHE_BYTES=... SPEC=0 (disable spec) DRAFT=...
20- # ATTN=flashinfer (decode-attn instead of sage) RECOMPUTE=cacheblend_sparse
#   ATTN=flashinfer (resident path only — ignored by the AWQ streaming loader)
#   RECOMPUTE=cacheblend_sparse  STREAMING_MODE=...  QUANT_KERNEL=marlin
#   RESIDENT=1 (force the transformers resident load even for an AWQ model)
2131
2232set -euo pipefail
2333
@@ -32,6 +42,8 @@ MAX_CACHE_BYTES="${MAX_CACHE_BYTES:-5e9}"
# Tunables, each overridable from the environment. The value after ':-' is the
# default used when the variable is unset or empty.
SAFETY_MARGIN="${SAFETY_MARGIN:-0.15}"   # NOTE(review): consumer is outside this view
ATTN="${ATTN:-sage}"                     # handed to --attn-impl on the resident path
RECOMPUTE="${RECOMPUTE:-none}"           # handed to --recompute-strategy
STREAMING_MODE="${STREAMING_MODE:-full_resident}"  # all weights on GPU, no DMA
QUANT_KERNEL="${QUANT_KERNEL:-auto}"               # auto = probe Marlin first
3547
3648# Tree speculative decoding is ON by default here (it's a speed setup). Needs
3749# the ~1 GB fp16 draft model + VRAM. Disable with SPEC=0 for a no-spec run.
4456 SPEC_DESC=" off (SPEC=0)"
4557fi
4658
# Choose the load path. AWQ/GPTQ → kvboost's streaming loader (bypasses the
# broken transformers AWQ quantizer); fp16 → resident path + sage.
#
# Globals read:    MODEL, ATTN, STREAMING_MODE, QUANT_KERNEL, RESIDENT (optional)
# Globals written: LOAD_ARGS  (array of loader flags for the server command)
#                  ATTN_DESC  (human-readable attention summary for the banner)
select_load_path() {
  case "$MODEL" in
    *AWQ*|*awq*|*GPTQ*|*gptq*|*Int4*|*int4*|*INT4*)
      # Quantized checkpoint: the streaming loader is used unless the caller
      # explicitly forces the resident load with RESIDENT=1.
      if [[ "${RESIDENT:-0}" == "1" ]]; then
        LOAD_ARGS=(--attn-impl "$ATTN")
        ATTN_DESC="$ATTN (resident AWQ — needs a repaired gptqmodel/transformers env)"
      else
        LOAD_ARGS=(
          --awq-streaming
          --streaming-mode "$STREAMING_MODE"
          --streaming-quant-kernel "$QUANT_KERNEL"
        )
        ATTN_DESC="SDPA (AWQ streaming owns attention; --attn-impl ignored)"
      fi
      ;;
    *)
      # Anything else is treated as an fp16 model: resident load, ATTN honoured.
      LOAD_ARGS=(--attn-impl "$ATTN")
      ATTN_DESC="$ATTN (INT8 SageAttention prefill; self-check → SDPA)"
      ;;
  esac
}
select_load_path
77+
# Startup banner: print the effective configuration before launching the server.
# LOAD_ARGS and ATTN_DESC come from the load-path selection earlier in the script.
echo "kvboost (FASTEST setup — RTX 3060, speed over fairness)"
echo "  model:           $MODEL  (int4 Marlin GEMM if AWQ/GPTQ)"
echo "  port:            $PORT"
echo "  load path:       ${LOAD_ARGS[*]}"
echo "  attention:       $ATTN_DESC"
echo "  recompute:       $RECOMPUTE  (zero-cost shared-prefix reuse = fastest TTFT)"
echo "  kv-cache-bits:   8  (int8 KV → 2× reuse capacity)"
echo "  max-cache-bytes: $MAX_CACHE_BYTES"
5789
5890# Why each flag (impact order on a 3060):
5991# MODEL=...-AWQ (the #1 raw lever)
60- # int4 weight quant → transformers loads the AWQ/Marlin int4 GEMM CUDA
61- # kernels on Ampere automatically (~4× less weight bandwidth → up to ~4×
62- # the decode ceiling). The 3 GB→~2 GB model also frees VRAM for KV cache.
92+ # int4 weight quant → Marlin int4 GEMM on Ampere (~4× less weight bandwidth
93+ # → up to ~4× the decode ceiling). Loaded here via --awq-streaming
94+ # --streaming-mode full_resident (kvboost's own AWQ→Marlin loader, all
95+ # weights resident on GPU) to dodge the broken transformers AWQ quantizer.
6396# --speculative-tree (+ draft) (the #2 decode lever)
6497# SpecBlock-inspired tree speculative decoding — verifies several drafted
6598# tokens per target step; auto mode-select per request. Decode throughput.
66- # --attn-impl sage (prefill lever; pairs with spec)
67- # INT8 SageAttention prefill via Triton (INT8 tensor-core QK^T on sm_86;
68- # no nvcc/flash-attn build). Decode (q_len==1) delegates to SDPA. One-time
69- # numerical self-check vs SDPA → permanent SDPA fallback on mismatch, so
70- # worst case is the SDPA baseline, never wrong. Watch the log for
71- # "sage self-check passed". Set ATTN=flashinfer instead to accelerate
72- # single-token DECODE attention (better when SPEC=0 + long context).
7399# --recompute-strategy none (fastest TTFT on shared prefix)
74100# Reuses prefix KV at ~zero cost (like vLLM prefix caching) — lossless on
75- # this coding benchmark's shared prefix. Set RECOMPUTE=cacheblend_sparse
76- # for the OUT-OF-ORDER multiturn/RAG workload (faithful selective recompute
77- # where moved chunks would otherwise go stale).
101+ # this benchmark's shared prefix. Set RECOMPUTE=cacheblend_sparse for the
102+ # OUT-OF-ORDER multiturn/RAG workload (faithful selective recompute).
78103# --kv-cache-bits 8
79104# int8 KV STORAGE → ~2× cached-chunk capacity. (Dequants to fp16 for
80105# compute — adds reuse capacity, not decode bandwidth; that's weight quant.)
83108exec python -m kvboost.server \
84109 --model " $MODEL " \
85110 --dtype float16 \
86- --attn-impl " $ATTN " \
111+ " ${LOAD_ARGS[@]} " \
87112 --recompute-strategy " $RECOMPUTE " \
88113 --chunk-boundary-window 32 \
89114 --kv-cache-bits 8 \
@@ -96,31 +121,36 @@ exec python -m kvboost.server \
96121
97122
98123# ── Optional add-ons / alternatives ──────────────────────────────────────────
99- # FAIR int4-vs-int4 comparison: run vLLM on the SAME AWQ checkpoint (it also
100- # uses Marlin) so the only difference is the engine, not the weights:
101- # MODEL=Qwen/Qwen2.5-3B-Instruct-AWQ ./start_vllm.sh
# FASTEST-that-launches (default): AWQ int4 via kvboost's streaming loader. If
# the streaming AWQ load itself errors (rare on a pure-transformer fp16 AWQ like
# Qwen2.5), fall back to the fp16 model — spec stays on and sage prefill becomes
# active (fp16 uses the resident path); you lose only int4:
#   MODEL=Qwen/Qwen2.5-3B-Instruct ./start_kvboost.sh
102128#
103- # DISABLE the aggressive levers (toward the old fair-vs-vLLM baseline):
104- # MODEL=Qwen/Qwen2.5-3B-Instruct SPEC=0 ATTN=auto RECOMPUTE=cacheblend_sparse ./start_kvboost.sh
129+ # REGAIN sage WITH int4 (resident AWQ): repair the env so transformers' AWQ path
130+ # works, then force the resident load:
131+ # pip install -U "gptqmodel" "transformers" # realign the two, OR
132+ # pip uninstall -y gptqmodel && pip install autoawq # AWQ via autoawq, not gptqmodel
133+ # RESIDENT=1 ./start_kvboost.sh
134+ # (Confirm in the logs you see "sage self-check passed".)
135+ #
136+ # FAIR int4-vs-int4 comparison: run vLLM on the SAME AWQ checkpoint (Marlin too):
137+ # MODEL=Qwen/Qwen2.5-3B-Instruct-AWQ ./start_vllm.sh
105138#
106- # FLASHINFER decode-attention (use instead of sage when SPEC=0): ATTN=flashinfer.
107- # Routes only the single-token DECODE step through FlashInfer's CUDA kernel
108- # (SDPA prefill + fallback, one-time self-check). Helps most at long context
109- # where KV reads dominate. Needs `pip install flashinfer-python`.
139+ # FLASHINFER decode-attention (fp16 model, use with SPEC=0): ATTN=flashinfer.
140+ # Routes only single-token DECODE through FlashInfer's CUDA kernel (SDPA prefill
141+ # + fallback, one-time self-check). Helps most at long context.
110142#
111143# CUDA-GRAPH DECODE (--cuda-graph-decode): LEFT OFF here on purpose — it caused
112- # recompile thrash on this box and was removed from this setup (commit
113- # "Remove cuda graph decode"). It targets the per-token launch overhead (~36 of
114- # ~56 ms/token) and stacks with int4, so if you've since fixed the re-capture
115- # thrash it's a big decode-latency win — add it back and validate output vs a
116- # run without it:
144+ # recompile thrash on this box and was removed from this setup (commit "Remove
145+ # cuda graph decode"). It targets per-token launch overhead (~36 of ~56 ms/token)
146+ # and stacks with int4, so if you've fixed the re-capture thrash, add it back and
147+ # validate output vs a run without it:
117148# ... --cuda-graph-decode
118149#
119150# MULTI-TURN CacheBlend run (where CacheBlend beats vLLM prefix caching): the
120151# --mode multiturn workload reshuffles in-context files each turn (same files,
121- # OUT OF ORDER) — prefix caching misses, CacheBlend reuses. Use the faithful
122- # recompute path + content-aligned chunking (already on via --chunk-boundary-
123- # window 32 so a moved file still chunks identically):
152+ # OUT OF ORDER). Use the faithful recompute path (content-aligned chunking is
153+ # already on via --chunk-boundary-window 32):
124154# RECOMPUTE=cacheblend_sparse ./start_kvboost.sh
125155# then: python bench_coding.py --backend kvboost --url http://localhost:9000 \
126156# --model "$MODEL" --mode multiturn --out kvboost_mt.json
0 commit comments