pythongiant
diff --git a/benchmarks_and_experiments/coding_vs_vllm/start_kvboost.sh b/benchmarks_and_experiments/coding_vs_vllm/start_kvboost.sh
Lines changed: 0 additions & 1 deletion
diff --git a/install_deps.sh b/install_deps.sh
Lines changed: 42 additions & 15 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 5 additions & 0 deletions
diff --git a/src/kvboost/engine.py b/src/kvboost/engine.py
Lines changed: 8 additions & 1 deletion
diff --git a/src/kvboost/kernels/__init__.py b/src/kvboost/kernels/__init__.py
Lines changed: 51 additions & 8 deletions
@@ -65,7 +65,6 @@ echo
 exec python -m kvboost.server \
     --model "$MODEL" \
     --dtype float16 \
-    --cuda-graph-decode \
     --attn-impl auto \
     --recompute-strategy cacheblend_sparse \
     --chunk-boundary-window 32\
 
@@ -6,12 +6,16 @@
 #   * GPU, no nvcc                     -> CUDA torch + flash-attn (prebuilt wheel) + FlashInfer
 #   * GPU + nvcc (CUDA 12.x / 13.x)    -> full path incl. bundled kernel + flash-attn
 #
-# flash-attn is REQUIRED on any GPU box (it's the prefill backend you want): a
-# matching prebuilt wheel is installed when available (no nvcc needed), with a
-# source build as fallback. The install fails loudly if it can't be installed,
-# so you never silently end up on SDPA. The bundled kernel and FlashInfer stay
-# best-effort (the repo falls back to SDPA for those). Use --skip-flash-attn to
-# opt out. Every build is time-boxed and logged to install_deps.log.
+# The primary accelerated prefill path on a GPU box is now Triton: the 'sage'
+# (INT8 SageAttention) and 'triton_flash' (FP16 flash) backends JIT-compile
+# through the CUDA driver — no nvcc, no prebuilt-wheel matching, no multi-arch
+# source build. Triton ships with the CUDA torch wheel on Linux; we just verify
+# it imports. flash-attn is now OPTIONAL and best-effort (NEVER fatal): a
+# matching prebuilt wheel is installed when available, with a source build as
+# fallback, but if neither works the runtime simply uses SDPA / the Triton
+# kernels. The bundled CUDA kernel and FlashInfer are also best-effort. Use
+# --skip-flash-attn to opt out. Every build is time-boxed and logged to
+# install_deps.log.
 #
 # Usage
 # -----
@@ -288,19 +292,34 @@ if (( CAN_BUILD_EXT == 1 )); then
 fi
 
 if [[ "${MODE}" == "cuda" ]]; then
-    # FlashAttention-2 — REQUIRED (the prefill backend you want). Prebuilt wheel
-    # first (no nvcc needed), source build as fallback. Fatal if it can't go in.
+    # Triton — the PRIMARY accelerated kernel path (SageAttention INT8 prefill +
+    # FP16 'triton_flash'). JIT-compiles via the CUDA driver: no nvcc, no wheel
+    # matching, no multi-arch source build. Ships with the CUDA torch wheel on
+    # Linux; verify it imports and install best-effort if somehow missing.
+    if python -c 'import triton' 2>/dev/null; then
+        log "Triton present ($(python -c 'import triton; print(triton.__version__)')) — 'sage' / 'triton_flash' backends enabled."
+    else
+        warn "Triton not importable (unexpected with a CUDA torch wheel); installing best-effort."
+        python -m pip install -q triton \
+            || warn "triton install failed; 'sage'/'triton_flash' will fall back to SDPA."
+    fi
+
+    # FlashAttention-2 — OPTIONAL / best-effort now (Triton 'sage'/'triton_flash'
+    # is the recommended prefill path on Ampere). Prebuilt wheel first (no nvcc),
+    # source build as fallback. NEVER fatal: on failure the runtime uses the
+    # Triton kernels or SDPA.
     if (( SKIP_FLASH_ATTN == 1 )); then
-        warn "Skipping flash-attn at your request (--skip-flash-attn); prefill uses torch SDPA."
+        warn "Skipping flash-attn at your request (--skip-flash-attn); use --attn-impl sage / triton_flash, or SDPA."
     elif install_flash_attn; then
         log "FlashAttention-2 ready ($(python -c 'import flash_attn; print(flash_attn.__version__)'))"
     else
-        fail "flash-attn could not be installed (you asked for it explicitly).
-  See ${BUILD_LOG} for the exact build error. Most common cause: the torch
-  pulled from ${TORCH_CUDA_TAG:-the CUDA index} is newer than any published
-  flash-attn wheel. Fixes:
-    * pin torch to a release that has wheels:  TORCH_SPEC=torch==2.7.1 ./install_deps.sh
-    * or pin a flash-attn version:             FLASH_ATTN_SPEC=flash-attn==2.7.4.post1 ./install_deps.sh"
+        warn "flash-attn could not be installed (optional). See ${BUILD_LOG}.
+  This is fine — the recommended path no longer needs it: run the server with
+  --attn-impl sage (INT8 SageAttention prefill via Triton) or --attn-impl
+  triton_flash (FP16). To install flash-attn anyway, the usual fixes are:
+    * pin torch to a release with wheels:  TORCH_SPEC=torch==2.7.1 ./install_deps.sh
+    * or pin a flash-attn version:         FLASH_ATTN_SPEC=flash-attn==2.7.4.post1 ./install_deps.sh
+    * or limit the source build to Ampere: FLASH_ATTN_CUDA_ARCHS=80 ./install_deps.sh"
     fi
 
     # FlashInfer (decode attention) — JIT, best-effort, works without nvcc.
@@ -355,8 +374,16 @@ def have(mod):
 
 fa2  = have("flash_attn")
 fi   = have("flashinfer")
+tri  = have("triton")
 kern = have("kvboost._flash_attn_cuda")
+try:
+    from kvboost.kernels import sage_available
+    sage = sage_available()
+except Exception:
+    sage = False
 print(f"  info prefill backend  : {'flash_attention_2' if fa2 else 'torch SDPA (flash-attn not installed)'}")
+print(f"  info sage/triton flash: {'available (--attn-impl sage | triton_flash)' if sage else 'unavailable (triton missing → SDPA)'}")
+print(f"  info triton           : {'present' if tri else 'absent'}")
 print(f"  info decode  backend  : {'flashinfer' if fi else 'torch SDPA (flashinfer not installed)'}")
 print(f"  info bundled kernel   : {'kvboost._flash_attn_cuda' if kern else 'not built (SDPA patch path)'}")
 
 
@@ -40,6 +40,11 @@ dev = [
 ]
 cuda = [
     "ninja>=1.11",
+    # Triton backs the 'sage' (INT8 SageAttention) and 'triton_flash' kernels.
+    # JIT-compiled via the CUDA driver — no nvcc, no flash-attn-style wheel
+    # build. Ships with the CUDA torch wheel on Linux; declared here so it's
+    # explicit. (Linux-only: Triton has no macOS/Windows wheels.)
+    "triton>=2.1 ; platform_system=='Linux'",
 ]
 streaming = [
     "safetensors>=0.4",
 
@@ -371,7 +371,14 @@ def from_pretrained(
                     and silently falls back to ``"sdpa"`` if FA2 isn't
                     installed/supported. Pass ``"flash_attention_2"`` to
                     require it (raises if unavailable), or ``"sdpa"`` /
-                    ``"eager"`` to force a backend. Ignored on the streaming
+                    ``"eager"`` to force a backend. ``"sage"`` runs INT8
+                    SageAttention on prefill via a Triton kernel (no nvcc /
+                    no flash-attn build needed; INT8 tensor-core QKᵀ on
+                    Ampere, SDPA fallback for decode); ``"triton_flash"`` is
+                    the FP16 Triton flash baseline; ``"flashinfer"`` routes
+                    decode attention through FlashInfer. Each JIT/optional
+                    backend self-checks against SDPA on first use and
+                    disables itself on mismatch. Ignored on the streaming
                     path. To load a **quantized** checkpoint (AWQ/GPTQ →
                     Marlin int4 GEMM on Ampere, ~4× less weight bandwidth →
                     higher decode tok/s), just pass a quantized ``model_name``;
 
@@ -1,17 +1,60 @@
-"""Proven external inference kernels kvboost routes to at runtime.
+"""Proven external/JIT inference kernels kvboost routes to at runtime.
 
-Currently: FlashInfer decode-attention (see ``flashinfer_attn``). Each kernel
-is gated on availability + a numerical self-check, and falls back to PyTorch
-SDPA so a missing or misbehaving kernel never corrupts output.
+* FlashInfer decode-attention (``flashinfer_attn``) — ``--attn-impl flashinfer``.
+* SageAttention INT8 prefill + FP16 Triton flash (``sage_attn``) —
+  ``--attn-impl sage`` / ``--attn-impl triton_flash``.
+
+Each kernel is gated on availability + a one-time numerical self-check, and
+falls back to PyTorch SDPA so a missing or misbehaving kernel never corrupts
+output. ``resolve_attn_impl`` registers the requested backend with HuggingFace
+(if its dependency is present) before model load, else downgrades to ``sdpa``.
 """
-from .flashinfer_attn import (
-    flashinfer_available,
-    install_flashinfer_attention,
-    resolve_attn_impl,
+import logging
+
+from .flashinfer_attn import flashinfer_available, install_flashinfer_attention
+from .sage_attn import (
+    install_sage_attention,
+    sage_attention_forward,
+    sage_available,
+    triton_available,
+    triton_flash_attention_forward,
 )
 
+_log = logging.getLogger("kvboost.kernels")
+
+
+def resolve_attn_impl(requested: str) -> str:
+    """Map a requested attn-impl to one HF can actually load.
+
+    Registers the backend with HuggingFace if its dependency is importable,
+    otherwise falls back to ``"sdpa"`` with a warning. ``"auto"`` and stock
+    impls (``"sdpa"``, ``"eager"``, ``"flash_attention_2"``) pass through.
+    """
+    if requested == "flashinfer":
+        if install_flashinfer_attention():
+            return "flashinfer"
+        _log.warning("attn-impl 'flashinfer' requested but unavailable; using sdpa.")
+        return "sdpa"
+
+    if requested in ("sage", "triton_flash"):
+        if install_sage_attention():
+            return requested
+        _log.warning(
+            "attn-impl '%s' requested but Triton is unavailable; using sdpa.",
+            requested,
+        )
+        return "sdpa"
+
+    return requested
+
+
 __all__ = [
     "flashinfer_available",
     "install_flashinfer_attention",
+    "install_sage_attention",
+    "sage_attention_forward",
+    "sage_available",
+    "triton_available",
+    "triton_flash_attention_forward",
     "resolve_attn_impl",
 ]
Original file line number	Diff line number	Diff line change
`@@ -40,6 +40,11 @@ dev = [`
`40`	`40`	`]`
`41`	`41`	`cuda = [`
`42`	`42`	`"ninja>=1.11",`
	`43`	`+ # Triton backs the 'sage' (INT8 SageAttention) and 'triton_flash' kernels.`
	`44`	`+ # JIT-compiled via the CUDA driver — no nvcc, no flash-attn-style wheel`
	`45`	`+ # build. Ships with the CUDA torch wheel on Linux; declared here so it's`
	`46`	`+ # explicit. (Linux-only: Triton has no macOS/Windows wheels.)`
	`47`	`+ "triton>=2.1 ; platform_system=='Linux'",`
`43`	`48`	`]`
`44`	`49`	`streaming = [`
`45`	`50`	`"safetensors>=0.4",`