feat(spec): SpecInput data contract + host/device boundary annotations (#1064)

zorrofox · web-flow · commit f4d39f2e8db5 · 2026-05-14T15:37:46.000+08:00
Part of #1053 (P1-5a). Defines the spec-decode data contract that #1053 P1-2 (BaseSpecWorker/BaseDraftWorker class refactor) will build on, so the abstraction layer has a fixed interface to target. - spec_info.SpecInput: runtime_checkable Protocol exposing the three token counts the RFC requires to be separated (logical / allocated / verify) plus is_draft_input / is_verify_input / get_spec_adjust_token_coefficient / filter_batch / merge_batch. Its docstring fixes the host/device boundary and the DP-padded layout (Route 1) semantics. - EagleDraftInput: implements SpecInput; per-field doc comments annotate device vs host, shape, JIT-cache-key participation, and the allocate_lens vs accept_length vs new_seq_lens distinction. - EagleVerifyInput: implements SpecInput; per-field doc comments annotate device vs host and static-metadata fields; filter_batch / merge_batch raise NotImplementedError (verify input is single-round). - test_spec_info.py: Protocol conformance + three-token-count distinction. No intended behavior change to the existing dp=1 path: the changes are docstrings, the Protocol, and new methods, with one exception — EagleDraftInput.merge_batch now concatenates hidden_states / verified_id / topk_p / topk_index with jnp.concatenate instead of np.concatenate, so the merged fields stay jax arrays. Type annotations on device fields changed np.ndarray → jax.Array | None to reflect actual runtime types after #1063. Depends on #1063 (P1-0) — diff is on top of that branch.
diff --git a/python/sgl_jax/srt/speculative/eagle_util.py b/python/sgl_jax/srt/speculative/eagle_util.py
@@ -379,42 +379,75 @@ def build_tree_kernel_efficient(
 @register_pytree_node_class
 @dataclass
 class EagleDraftInput:
-    # Constant: alloc length per decode step
+    """Next-round draft state — the only persistent cross-round spec state.
+
+    Implements ``SpecInput``. MUST NOT hold worker/runner/pool/future handles.
+    Under DP (Route 1), per-request fields use DP-padded order.
+    """
+
     ALLOC_LEN_PER_DECODE: ClassVar[int] = None
 
-    # The inputs for decode
-    # shape: (b, topk)
-    topk_p: np.ndarray = None
-    topk_index: np.ndarray = None
-    # shape: (b, hidden_size)
-    hidden_states: np.ndarray = None
+    # --- Cross-round draft state (device arrays, consumed by next draft) ---
+    #: device ``(b, topk)`` — top-k probs from previous draft/draft_extend.
+    topk_p: jax.Array | None = None
+    #: device ``(b, topk)`` — top-k token ids.
+    topk_index: jax.Array | None = None
+    #: device ``(b, hidden_size)`` — minimal hidden state for next draft step.
+    #: Multi-layer MTP keeps per-step hidden locally inside one
+    #: ``MultiLayerDraftWorker.draft()``; only this cross-round slice persists.
+    hidden_states: jax.Array | None = None
+    #: static metadata (pytree aux); changing it triggers a new compile shape.
     capture_hidden_mode: CaptureHiddenMode = CaptureHiddenMode.FULL
 
-    # Inputs for extend
-    # shape: (b,)
-    verified_id: np.ndarray = None
-    accept_length: np.ndarray = None
+    # --- Draft-extend inputs (device unless ``_cpu`` suffixed) ---
+    #: device ``(b,)`` — verified token starting the next draft.
+    verified_id: jax.Array | None = None
+    #: device ``(b,)`` — accepted length used to select hidden in draft-extend.
+    accept_length: jax.Array | None = None
+    #: host ``(b,)`` int32 mirror of ``accept_length`` for scheduler bookkeeping.
     accept_length_cpu: np.ndarray | None = None
 
-    # Inputs for the attention backends
-    # shape: (b + 1,)
-    kv_indptr: np.ndarray = None
-    kv_indices: np.ndarray = None
+    # --- Attention-backend metadata (host, participates in metadata build) ---
+    kv_indptr: np.ndarray | None = None
+    kv_indices: np.ndarray | None = None
 
-    # Shape info for padding
+    # --- Padding shape (static; participates in JIT cache key) ---
     num_tokens_per_batch: int = -1
     num_tokens_for_logprob_per_batch: int = -1
 
-    # Inputs for draft extend
-    # shape: (b,)
-    seq_lens_for_draft_extend: np.ndarray = None
-    req_pool_indices_for_draft_extend: np.ndarray = None
+    # --- Draft-extend bookkeeping (host) ---
+    seq_lens_for_draft_extend: np.ndarray | None = None
+    req_pool_indices_for_draft_extend: np.ndarray | None = None
 
-    # Inputs for V2 overlap worker
-    # future_indices: Optional[FutureIndices] = None
+    # --- KV lifetime (host, scheduler-visible) ---
+    #: host ``(b,)`` — KV length already allocated in ``req_to_token_pool`` for
+    #: next-round pre-allocation and over-allocated slot release. Distinct from
+    #: ``accept_length`` (logical) and ``new_seq_lens`` (scheduler-visible).
     allocate_lens: np.ndarray | None = None
+    #: host ``(b,)`` — scheduler-visible logical length after verify. May be
+    #: derived from ``old_seq_lens + accept_length`` if not stored.
     new_seq_lens: np.ndarray | None = None
-    # verify_done: Optional[torch.cuda.Event] = None
+
+    # ---- SpecInput protocol -------------------------------------------------
+    def is_draft_input(self) -> bool:
+        return True
+
+    def is_verify_input(self) -> bool:
+        return False
+
+    def get_spec_adjust_token_coefficient(self) -> int:
+        return EagleDraftInput.ALLOC_LEN_PER_DECODE or 1
+
+    def get_logical_token_num(self, bs: int) -> np.ndarray:
+        if self.accept_length_cpu is not None:
+            return self.accept_length_cpu
+        return np.ones(bs, dtype=np.int32)
+
+    def get_allocated_token_num(self) -> np.ndarray | None:
+        return self.allocate_lens
+
+    def get_verify_token_num(self, bs: int) -> int:
+        return 0
 
     def tree_flatten(self):
         accept_length_cpu_arr = (
@@ -662,11 +695,10 @@ def merge_batch(self, spec_info: EagleDraftInput):
             return
         if spec_info.hidden_states is None:
             return
-        # FIXME(pc) this operate should be put on cpu
-        self.hidden_states = np.concatenate([self.hidden_states, spec_info.hidden_states], axis=0)
-        self.verified_id = np.concatenate([self.verified_id, spec_info.verified_id], axis=0)
-        self.topk_p = np.concatenate([self.topk_p, spec_info.topk_p])
-        self.topk_index = np.concatenate([self.topk_index, spec_info.topk_index])
+        self.hidden_states = jnp.concatenate([self.hidden_states, spec_info.hidden_states], axis=0)
+        self.verified_id = jnp.concatenate([self.verified_id, spec_info.verified_id], axis=0)
+        self.topk_p = jnp.concatenate([self.topk_p, spec_info.topk_p])
+        self.topk_index = jnp.concatenate([self.topk_index, spec_info.topk_index])
         self.allocate_lens = np.concatenate([self.allocate_lens, spec_info.allocate_lens])
 
 
@@ -687,22 +719,65 @@ class EagleVerifyOutput:
 @register_pytree_node_class
 @dataclass
 class EagleVerifyInput:
-    # container type for pytree
+    """Target-verify input. Implements ``SpecInput``.
+
+    Fully describes token/position/mask/tree-index for verify so
+    ``BaseSpecWorker.verify()`` never reads draft-worker internal state.
+    Under DP (Route 1), per-request fields use DP-padded order; verify
+    metadata must reshape to per-DP view before generating cu_q/kv_lens.
+    """
+
+    # --- Device arrays (enter target verify forward / sampling) ---
+    #: device ``(b*draft_token_num,)`` — flattened draft tokens to verify.
     draft_token: jax.Array
+    #: device ``(sum(q_i*kv_i),)`` — tree attention mask; shape participates
+    #: in the JIT cache key.
     custom_mask: jax.Array
+    #: device ``(b*draft_token_num,)`` — verify positions (follows
+    #: ``ForwardBatch`` host/device convention).
     positions: jax.Array
+    #: device — tree verify index (sampling-kernel convention).
     retrive_index: jax.Array
+    #: device — tree child pointer for tree sampling.
     retrive_next_token: jax.Array
+    #: device — tree sibling pointer for tree sampling.
     retrive_next_sibling: jax.Array
     retrive_cum_len: jax.Array
+    #: host ``(b,)`` — for verify attention metadata + DP token accounting.
     seq_lens_cpu: np.ndarray
-    # common type for pytree
+
+    # --- Static metadata (pytree aux; changes trigger new compile shape) ---
     spec_steps: int
     topk: int
+    #: per-request verify token count (constant within a precompile shape).
     draft_token_num: int
     seq_lens_sum: int
     capture_hidden_mode: CaptureHiddenMode
-    # grammar: BaseGrammarObject = None
+
+    # ---- SpecInput protocol -------------------------------------------------
+    def is_draft_input(self) -> bool:
+        return False
+
+    def is_verify_input(self) -> bool:
+        return True
+
+    def get_spec_adjust_token_coefficient(self) -> int:
+        return self.draft_token_num
+
+    def get_logical_token_num(self, bs: int) -> np.ndarray:
+        return np.ones(bs, dtype=np.int32)
+
+    def get_allocated_token_num(self) -> np.ndarray | None:
+        return None
+
+    def get_verify_token_num(self, bs: int) -> int:
+        return bs * self.draft_token_num
+
+    def filter_batch(self, new_indices: np.ndarray, has_been_filtered: bool = True) -> None:
+        raise NotImplementedError("EagleVerifyInput is consumed within one round")
+
+    def merge_batch(self, other) -> None:
+        raise NotImplementedError("EagleVerifyInput is consumed within one round")
 
     def tree_flatten(self):
         seq_lens_sum_arr = _as_int32_array(self.seq_lens_sum, fallback=0)
diff --git a/python/sgl_jax/srt/speculative/spec_info.py b/python/sgl_jax/srt/speculative/spec_info.py
@@ -1,13 +1,60 @@
+from __future__ import annotations
+
 import logging
 from enum import IntEnum, auto
+from typing import Protocol, runtime_checkable
 
 import jax
+import numpy as np
 
 from sgl_jax.srt.layers.logits_processor import LogitsProcessorOutput
 
 logger = logging.getLogger(__name__)
 
 
+@runtime_checkable
+class SpecInput(Protocol):
+    """Common interface for speculative-decode state passed through
+    ``ModelWorkerBatch.spec_info`` (#1053 P1-5a data contract).
+
+    Separates three token counts that the scheduler / KV allocator / verify
+    path each need but which differ under spec decode:
+
+    - **logical** — tokens the scheduler advances request output by
+      (= accepted count incl. bonus). Host per-request ``(bs,)`` array.
+    - **allocated** — KV slots already pre-allocated this round (for trimming
+      over-allocation on finished reqs). Host array.
+    - **verify** — flattened token count target verify will forward (drives
+      verify attention metadata + DP token accounting). Host scalar.
+
+    Implementations MUST NOT hold worker/runner/pool/future/callback handles
+    in pytree children (these would enter the JIT cache key). Device arrays
+    (``topk_p``, ``hidden_states``, ``draft_token``, ...) stay on device;
+    lengths/indices stay host-side ``np.ndarray``.
+
+    DP layout (Route 1, target+draft both DP): all per-request fields use
+    DP-padded order — section ``[dp_rank*per_dp_bs : dp_rank*per_dp_bs+real_bs]``.
+    Padding slots MUST NOT participate in valid state updates.
+    """
+
+    def is_draft_input(self) -> bool: ...
+    def is_verify_input(self) -> bool: ...
+
+    def get_spec_adjust_token_coefficient(self) -> int:
+        """Multiplier for scheduler new-token budgeting (e.g. draft_token_num)."""
+        ...
+
+    def get_logical_token_num(self, bs: int) -> np.ndarray:
+        """Per-request host int32 ``(bs,)``; callers sum for batch totals."""
+        ...
+
+    def get_allocated_token_num(self) -> np.ndarray | None: ...
+    def get_verify_token_num(self, bs: int) -> int: ...
+
+    def filter_batch(self, new_indices: np.ndarray, has_been_filtered: bool = True) -> None: ...
+    def merge_batch(self, other: SpecInput) -> None: ...
+
+
 class SpeculativeAlgorithm(IntEnum):
     NONE = auto()
     EAGLE = auto()
diff --git a/python/sgl_jax/test/speculative/test_spec_info.py b/python/sgl_jax/test/speculative/test_spec_info.py
@@ -0,0 +1,54 @@
+"""SpecInput protocol conformance for EagleDraftInput / EagleVerifyInput."""
+
+import numpy as np
+
+from sgl_jax.srt.model_executor.forward_batch_info import CaptureHiddenMode
+from sgl_jax.srt.speculative.eagle_util import EagleDraftInput, EagleVerifyInput
+from sgl_jax.srt.speculative.spec_info import SpecInput
+
+
+def test_eagle_draft_input_is_spec_input():
+    di = EagleDraftInput(
+        accept_length_cpu=np.array([2, 3, 1], dtype=np.int32),
+        allocate_lens=np.array([10, 12, 8], dtype=np.int32),
+    )
+    assert isinstance(di, SpecInput)
+    assert di.is_draft_input() and not di.is_verify_input()
+    assert (di.get_logical_token_num(bs=3) == np.array([2, 3, 1])).all()
+    assert di.get_verify_token_num(bs=3) == 0
+    assert (di.get_allocated_token_num() == np.array([10, 12, 8])).all()
+    assert di.get_spec_adjust_token_coefficient() >= 1
+
+
+def test_eagle_verify_input_is_spec_input():
+    vi = EagleVerifyInput(
+        draft_token=np.zeros(8, dtype=np.int32),
+        custom_mask=np.zeros(1, dtype=np.int32),
+        positions=np.zeros(8, dtype=np.int32),
+        retrive_index=np.zeros(8, dtype=np.int32),
+        retrive_next_token=np.zeros(8, dtype=np.int32),
+        retrive_next_sibling=np.zeros(8, dtype=np.int32),
+        retrive_cum_len=np.zeros(3, dtype=np.int32),
+        seq_lens_cpu=np.array([5, 7], dtype=np.int32),
+        spec_steps=3,
+        topk=1,
+        draft_token_num=4,
+        seq_lens_sum=12,
+        capture_hidden_mode=CaptureHiddenMode.FULL,
+    )
+    assert isinstance(vi, SpecInput)
+    assert vi.is_verify_input() and not vi.is_draft_input()
+    assert vi.get_verify_token_num(bs=2) == 8
+    assert vi.get_spec_adjust_token_coefficient() == 4
+    assert vi.get_allocated_token_num() is None
+
+
+def test_three_token_counts_are_distinct():
+    """RFC #1053: logical / allocated / verify must be exposed independently."""
+    di = EagleDraftInput(
+        accept_length_cpu=np.array([2, 2], dtype=np.int32),
+        allocate_lens=np.array([100, 100], dtype=np.int32),
+    )
+    assert int(di.get_logical_token_num(bs=2).sum()) == 4
+    assert int(di.get_allocated_token_num().sum()) == 200
+    assert di.get_verify_token_num(bs=2) == 0
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
@@ -459,6 +459,7 @@ def run_one_file(
         TestFile("python/sgl_jax/test/mem_cache/test_hybrid_req_to_token_pool.py", 1),
         TestFile("python/sgl_jax/test/speculative/test_eagle_tree_build.py", 1),
         TestFile("python/sgl_jax/test/speculative/test_eagle_utils.py", 1),
+        TestFile("python/sgl_jax/test/speculative/test_spec_info.py", 0.2, runner="pytest"),
         TestFile("python/sgl_jax/test/models/test_mimo_v2_nextn.py", 0.2, runner="pytest"),
         TestFile("python/sgl_jax/test/multimodal/test_wan_vae_precision.py", 1),
         TestFile("python/sgl_jax/test/multimodal/test_vae_scheduler.py", 2.5),