pythongiant
diff --git a/src/kvboost/engine.py b/src/kvboost/engine.py
Lines changed: 128 additions & 7 deletions
diff --git a/src/kvboost/server/__main__.py b/src/kvboost/server/__main__.py
Lines changed: 63 additions & 0 deletions
diff --git a/src/kvboost/speculative/__init__.py b/src/kvboost/speculative/__init__.py
Lines changed: 20 additions & 1 deletion
@@ -30,7 +30,10 @@
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import Callable, Dict, List, Optional, Set, Tuple
+from typing import Any, Callable, Dict, List, Optional, Set, TYPE_CHECKING, Tuple
+
+if TYPE_CHECKING:
+    from .speculative.tree.config import TreeSpeculativeConfig
 
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -101,6 +104,11 @@ def __init__(
         prefill_chunk_size: int = 0,
         # Speculative decoding (None = disabled, baseline decode path)
         speculative_config: Optional["SpeculativeConfig"] = None,
+        # Tree speculative — SpecBlock-inspired, may coexist with flat
+        tree_speculative_config: Optional["TreeSpeculativeConfig"] = None,
+        # Cost coefficients (probed at server startup) for cost-aware
+        # tree shape + mode selection. None = degraded mode (defaults).
+        cost_coefficients: Any = None,
     ):
         if device is None:
             device = default_device()
@@ -160,30 +168,105 @@ def __init__(
 
         # Speculative decoding (decode-phase orthogonal to recompute_strategy).
         # CacheBlend handles prefill; speculative handles decode. They stack.
+        # Two flavors: flat (token-by-token K draft) and tree (SpecBlock-
+        # inspired). Both may be present; the bridge / ModeSelector picks
+        # per-request when so.
         self.speculative_config = speculative_config
+        self.tree_speculative_config = tree_speculative_config
+        self.cost_coefficients = cost_coefficients
         self.speculative_engine = None
-        if speculative_config is not None:
-            speculative_config.validate()
+        self.tree_speculative_engine = None
+        self.mode_selector = None
+
+        need_draft = (
+            speculative_config is not None
+            or tree_speculative_config is not None
+        )
+        if need_draft:
             from .speculative.draft import DraftModel
-            from .speculative.engine import SpeculativeEngine
             from .speculative.stats import SpeculativeStats
             from .speculative.verifier import TargetVerifier
 
+            # Validate whichever configs are present. The DraftModel
+            # itself needs a flat-style ``SpeculativeConfig`` so its
+            # model-load path stays one code path (the tree config
+            # doesn't carry draft_model_id / draft_streaming_config; if
+            # only tree is wired we still rely on the flat config for
+            # the drafter handle).
+            if speculative_config is not None:
+                speculative_config.validate()
+            if tree_speculative_config is not None:
+                tree_speculative_config.validate()
+
+            # The drafter is shared across flat + tree.
+            if speculative_config is None:
+                raise ValueError(
+                    "tree_speculative_config requires a flat "
+                    "SpeculativeConfig (drafter model handle); pass "
+                    "both."
+                )
+
+            self._speculative_stats = SpeculativeStats()
             log.info(
-                "Speculative decoding enabled: %s",
+                "Speculative decoding enabled: flat=%s tree=%s",
                 speculative_config.summary(),
+                tree_speculative_config.summary()
+                if tree_speculative_config else "off",
             )
-            self._speculative_stats = SpeculativeStats()
             draft = DraftModel(
                 speculative_config, target_tokenizer=tokenizer
             )
             verifier = TargetVerifier(self.model, device=device)
+
+            # Flat engine: existing path, unchanged.
+            from .speculative.engine import SpeculativeEngine
             self.speculative_engine = SpeculativeEngine(
                 cfg=speculative_config,
                 target_verifier=verifier,
                 draft_model=draft,
                 stats=self._speculative_stats,
             )
+
+            # Tree engine: only when its config is provided.
+            if tree_speculative_config is not None:
+                from .speculative.tree.engine import TreeSpeculativeEngine
+
+                target_step_ms = (
+                    cost_coefficients.step_latency_ms
+                    if cost_coefficients is not None else 50.0
+                )
+                # Draft step latency is unknown without probing the
+                # drafter directly; approximate as a small fraction of
+                # the target step (drafter is ~1/10th model size).
+                draft_step_ms = max(1.0, target_step_ms * 0.15)
+
+                self.tree_speculative_engine = TreeSpeculativeEngine(
+                    cfg=tree_speculative_config,
+                    target_verifier=verifier,
+                    draft_model=draft,
+                    cost_coefficients=cost_coefficients,
+                    target_step_ms=target_step_ms,
+                    draft_step_ms=draft_step_ms,
+                    mode=speculative_config.mode,
+                    temperature=speculative_config.temperature,
+                    stats=self._speculative_stats,
+                )
+
+                # Build the auto-selector. Shares the tree engine's
+                # EWMA so its scoring reads the same observations the
+                # tree engine writes after every round.
+                from .speculative.mode_selector import ModeSelector
+                self.mode_selector = ModeSelector(
+                    target_step_ms=target_step_ms,
+                    draft_step_ms=draft_step_ms,
+                    flat_available=True,
+                    tree_available=True,
+                    tree_config=tree_speculative_config,
+                    flat_k=speculative_config.draft_k,
+                    flat_cold_accept=0.4,
+                    tree_ewma=self.tree_speculative_engine.ewma,
+                    cost_coefficients=cost_coefficients,
+                )
         else:
             self._speculative_stats = None
 
@@ -270,6 +353,35 @@ def reset_cache(self) -> None:
         """
         self.cache_manager.clear()
 
+    def set_cost_coefficients(self, cc: Any) -> None:
+        """Install cost coefficients after the engine has been built.
+
+        Stores ``cc`` on this engine and forwards it to the tree
+        speculative engine and the mode selector when they exist. If
+        ``cc`` exposes a numeric ``step_latency_ms``, that value also
+        replaces ``target_step_ms`` on both of them. The server calls
+        this once its probe has produced coefficients; until then the
+        components run on the values they were constructed with.
+        Calling it again simply overwrites the previous values.
+
+        Args:
+            cc: Cost-coefficients object. ``None``, or an object whose
+                ``step_latency_ms`` is missing or not convertible to
+                ``float``, leaves the current ``target_step_ms`` as is.
+        """
+        self.cost_coefficients = cc
+
+        # Resolve the measured latency once so both consumers receive
+        # the same value. A missing attribute (or cc=None) is a normal
+        # "nothing to apply" case; a present but unusable value is
+        # logged instead of being swallowed silently.
+        raw_latency = getattr(cc, "step_latency_ms", None)
+        step_ms = None
+        if raw_latency is not None:
+            try:
+                step_ms = float(raw_latency)
+            except (TypeError, ValueError, OverflowError):
+                log.warning(
+                    "Ignoring unusable step_latency_ms=%r; keeping the "
+                    "current target_step_ms",
+                    raw_latency,
+                )
+
+        # Tree engine first, then the selector. A wrong target_step_ms
+        # skews their cost estimates, so it is only overwritten when a
+        # valid number was resolved.
+        for consumer in (self.tree_speculative_engine, self.mode_selector):
+            if consumer is None:
+                continue
+            consumer.cc = cc
+            if step_ms is not None:
+                consumer.target_step_ms = step_ms
+        # NOTE(review): the constructor derives draft_step_ms from the
+        # target step it had at build time; that value is not refreshed
+        # here — confirm whether it should follow the calibrated latency.
+
+
     def generate(
         self,
         prompt: str,
@@ -877,7 +989,11 @@ def _decode_with_kv(
         # We extend past_kv by one forward to cover that first sampled
         # token, then hand off — speculative's invariant is that past_kv
         # exactly covers the input prompt_ids.
-        if self.speculative_engine is not None and len(generated) < max_new_tokens:
+        any_spec = (
+            self.speculative_engine is not None
+            or self.tree_speculative_engine is not None
+        )
+        if any_spec and len(generated) < max_new_tokens:
             extended_pos = cached_len + len(live_ids)
             first_t = torch.tensor(
                 [[generated[-1]]], dtype=torch.long, device=self.device
@@ -896,11 +1012,16 @@ def _decode_with_kv(
             extended_prompt_ids = list(full_token_ids) + [generated[-1]]
             from .speculative.bridge import run_speculative_decode
 
+            tree_cfg = self.tree_speculative_config
+            policy = tree_cfg.policy if tree_cfg is not None else "auto"
             spec_generated, past_kv = run_speculative_decode(
                 full_token_ids=extended_prompt_ids,
                 target_past_kv=past_kv,
                 cached_length=len(extended_prompt_ids),
                 spec_engine=self.speculative_engine,
+                tree_engine=self.tree_speculative_engine,
+                mode_selector=self.mode_selector,
+                policy=policy,
                 max_new_tokens=max_new_tokens - len(generated),
                 eos_token_id=self.tokenizer.eos_token_id,
                 on_token=on_token,
 
@@ -157,6 +157,34 @@ def parse_args():
                    help="Temperature applied to target logits in sampling mode "
                         "(default: 1.0). Ignored in greedy mode.")
 
+    # SpecBlock-inspired tree speculative decoding. Requires the flat
+    # speculative drafter to be set (uses the same draft model with a
+    # tree-drafting wrapper). The ``ModeSelector`` then picks per-request
+    # between flat-K and tree-(B,D) by expected wall-time tokens/s.
+    p.add_argument("--speculative-tree", action="store_true", default=False,
+                   help="Enable SpecBlock-inspired tree speculative "
+                        "decoding alongside flat. Requires --speculative-"
+                        "draft-model. Per-request mode is auto-selected "
+                        "by the cost model unless --speculative-mode-policy "
+                        "overrides.")
+    p.add_argument("--speculative-mode-policy", default=None,
+                   choices=["auto", "flat", "tree", "none"],
+                   help="Force one speculative mode per request. Default "
+                        "is 'auto' when --speculative-tree is set, else "
+                        "'flat'. 'none' disables speculation entirely.")
+    p.add_argument("--speculative-tree-max-branching", type=int, default=4,
+                   help="Cap on per-node children in the draft tree "
+                        "(default: 4). Higher = wider tree.")
+    p.add_argument("--speculative-tree-max-depth", type=int, default=6,
+                   help="Cap on tree depth (default: 6). Deeper trees "
+                        "win more when acceptance is high.")
+    p.add_argument("--speculative-tree-node-budget", type=int, default=32,
+                   help="Total node-count cap for the tree (default: 32). "
+                        "Hard-bounds the target verifier's cost.")
+    p.add_argument("--speculative-tree-cold-accept", type=float, default=0.5,
+                   help="Seed acceptance prior for the tree EWMA (default: "
+                        "0.5). Used until 16+ samples per (B,D) cohort.")
+
     # Server
     p.add_argument("--host", default="0.0.0.0")
     p.add_argument("--port", type=int, default=8000)
@@ -363,6 +391,7 @@ def load_engine(args):
                 args.streaming_quant_kernel,
             )
             speculative_cfg = _build_speculative_config(args)
+            tree_speculative_cfg = _build_tree_speculative_config(args)
             engine = InferenceEngine.from_pretrained(
                 args.model,
                 streaming_config=streaming_config,
@@ -375,6 +404,7 @@ def load_engine(args):
                 prefill_chunk_size=args.prefill_chunk_size,
                 device=device,
                 speculative_config=speculative_cfg,
+                tree_speculative_config=tree_speculative_cfg,
             )
             log.info("Model loaded.")
             return engine
@@ -421,6 +451,7 @@ def load_engine(args):
             prefill_chunk_size=args.prefill_chunk_size,
             device=device,
             speculative_config=_build_speculative_config(args),
+            tree_speculative_config=_build_tree_speculative_config(args),
         )
 
     log.info("Model loaded.")
@@ -441,6 +472,34 @@ def _build_speculative_config(args):
     )
 
 
+def _build_tree_speculative_config(args):
+    """Translate parsed CLI args into a TreeSpeculativeConfig.
+
+    Returns ``None`` when ``--speculative-tree`` was not requested.
+    When it was requested without ``--speculative-draft-model``, the
+    process exits with an explanatory message rather than quietly
+    running without tree mode, which would hide the misconfiguration.
+    The mode policy falls back to ``"auto"`` when the CLI left it unset.
+    """
+    tree_requested = getattr(args, "speculative_tree", False)
+    if not tree_requested:
+        return None
+
+    draft_model = getattr(args, "speculative_draft_model", None)
+    if not draft_model:
+        raise SystemExit(
+            "ERROR: --speculative-tree requires --speculative-draft-model "
+            "(the tree drafter wraps the same small model). Pass both, "
+            "or drop --speculative-tree."
+        )
+
+    # Imported lazily, only once tree mode is actually being configured.
+    from ..speculative import TreeSpeculativeConfig
+
+    chosen_policy = getattr(args, "speculative_mode_policy", None)
+    if not chosen_policy:
+        chosen_policy = "auto"
+
+    tree_kwargs = {
+        "max_branching": args.speculative_tree_max_branching,
+        "max_depth": args.speculative_tree_max_depth,
+        "node_budget": args.speculative_tree_node_budget,
+        "cold_accept": args.speculative_tree_cold_accept,
+        "policy": chosen_policy,
+    }
+    return TreeSpeculativeConfig(**tree_kwargs)
+
+
 def main():
     args = parse_args()
 
@@ -502,6 +561,10 @@ def main():
             "OOM planning enabled: auto_truncate=%s, safety_margin=%.0f%%",
             args.auto_truncate, args.planner_safety_margin * 100,
         )
+        # Same coefficients drive tree-shape selection. The engine
+        # already constructed its tree engine with defaults; this
+        # writes the calibrated values in.
+        engine.set_cost_coefficients(cost_coefficients)
 
     worker = EngineWorker(
         engine=engine,
 
@@ -27,12 +27,21 @@
 from .config import SpeculativeConfig, SpeculativeMode
 from .draft import DraftModel
 from .engine import SpeculativeEngine
-from .rollback import truncate_past_kv
+from .mode_selector import ChosenMode, ModeSelector
+from .rollback import gather_kv_columns, truncate_past_kv
 from .sampler import verify_greedy, verify_sampling
 from .stats import SpeculativeStats
+from .tree import (
+    AcceptanceEWMA,
+    TreeShape,
+    TreeSpeculativeConfig,
+    pick_shape,
+)
+from .tree.engine import TreeSpeculativeEngine
 from .verifier import TargetVerifier
 
 __all__ = [
+    # flat
     "SpeculativeConfig",
     "SpeculativeMode",
     "SpeculativeEngine",
@@ -42,5 +51,15 @@
     "verify_greedy",
     "verify_sampling",
     "truncate_past_kv",
+    "gather_kv_columns",
     "run_speculative_decode",
+    # tree
+    "TreeSpeculativeConfig",
+    "TreeSpeculativeEngine",
+    "TreeShape",
+    "AcceptanceEWMA",
+    "pick_shape",
+    # mode selection
+    "ModeSelector",
+    "ChosenMode",
 ]