@@ -23,8 +23,11 @@
 
 from __future__ import annotations
 
+import math
 import os
+import re
 from collections import Counter
+from dataclasses import dataclass
 
 import pytest
 
@@ -41,23 +44,109 @@
 
 _NO_WINDOW = -1
 
-# Position of ``sliding_window`` in ``paged_attention_primitive``'s
-# positional signature (see ``attention_sdpa.py:489-510``).
-_SLIDING_WINDOW_ARG_INDEX = 11
+# The bug only manifests once the prompt exceeds Gemma4 E2B's 512-token
+# sliding window. Build a prompt with a comfortable margin so the kernel
+# must actually decide whether to enforce the window.
+_LONG_CONTEXT_TOKEN_MARGIN = 128
+_LONG_CONTEXT_MIN_TOKENS = _E2B_SLIDING_WINDOW + _LONG_CONTEXT_TOKEN_MARGIN
+_LONG_CONTEXT_TARGET_TOKENS = _LONG_CONTEXT_MIN_TOKENS + 1
+_FRAGMENT_REPEAT_SAMPLE_COUNT = 2
+_MAX_MODEL_LEN = 1024
+_MAX_TOKENS = 1
+_PROMPT_FRAGMENT = "The capital of France is Paris. "
 
 # Ratio tolerance: layer_types is a config constant, but prefill and
 # decode may dispatch slightly different counts across forwards, so we
 # accept a 1% slack.
 _RATIO_TOLERANCE = 0.01
 
+_NB_PARAM_RE = re.compile(r"([A-Za-z_]\w*)\s*:")
+
+
+@dataclass(frozen=True)
+class _KernelDispatch:
+    sliding_window: int
+    max_seq_len: int
+
+
+def _nanobind_param_indices(fn, *names: str) -> dict[str, int]:
+    """Resolve parameter positions from nanobind's runtime signature metadata."""
+    overloads = getattr(fn, "__nb_signature__", ())
+    if not overloads:
+        raise RuntimeError("paged_attention_primitive is missing __nb_signature__")
+
+    signature_text = overloads[0][0]
+    params_text = signature_text.partition("(")[2].rpartition(")")[0]
+    param_names = _NB_PARAM_RE.findall(params_text)
+
+    indices: dict[str, int] = {}
+    for name in names:
+        if name not in param_names:
+            raise RuntimeError(
+                f"parameter {name!r} not found in nanobind signature: {signature_text}"
+            )
+        indices[name] = param_names.index(name)
+    return indices
+
+
+def _get_call_arg(
+    args: tuple[object, ...],
+    kwargs: dict[str, object],
+    param_indices: dict[str, int],
+    name: str,
+) -> object:
+    """Read a native-op argument by name from positional/keyword call data."""
+    index = param_indices[name]
+    if len(args) > index:
+        return args[index]
+    if name in kwargs:
+        return kwargs[name]
+    raise RuntimeError(f"paged_attention_primitive call missing {name!r}")
+
+
+def _build_long_prompt(tokenizer) -> str:
+    """Return a prompt whose tokenized length exceeds Gemma4's window size."""
+    first_fragment_token_count = len(
+        tokenizer.encode(text=_PROMPT_FRAGMENT, add_special_tokens=False)
+    )
+    if first_fragment_token_count <= 0:
+        raise AssertionError("prompt fragment must tokenize to at least one token")
+
+    repeated_fragment_sample = _PROMPT_FRAGMENT * _FRAGMENT_REPEAT_SAMPLE_COUNT
+    repeat_increment_token_count = (
+        len(tokenizer.encode(text=repeated_fragment_sample, add_special_tokens=False))
+        - first_fragment_token_count
+    )
+    if repeat_increment_token_count <= 0:
+        raise AssertionError(
+            "prompt fragment repeat must increase token count by at least one token"
+        )
+
+    additional_repeat_count = math.ceil(
+        max(0, _LONG_CONTEXT_TARGET_TOKENS - first_fragment_token_count)
+        / repeat_increment_token_count
+    )
+    repeat_count = 1 + additional_repeat_count
+    prompt = _PROMPT_FRAGMENT * repeat_count
+    token_count = len(tokenizer.encode(text=prompt, add_special_tokens=False))
+    if token_count > _LONG_CONTEXT_MIN_TOKENS:
+        return prompt
+    raise AssertionError(
+        "failed to construct a prompt longer than Gemma4's sliding window: "
+        f"repeat_count={repeat_count}, token_count={token_count}, "
+        f"first_fragment_token_count={first_fragment_token_count}, "
+        f"repeat_increment_token_count={repeat_increment_token_count}, "
+        f"target>{_LONG_CONTEXT_MIN_TOKENS}"
+    )
+
 
 @pytest.fixture(scope="module")
-def kernel_sliding_window_log() -> list[int]:
+def kernel_dispatch_log() -> list[_KernelDispatch]:
     """Run one Gemma4 inference with a spy on ``paged_attention_primitive``.
 
-    Returns the list of ``sliding_window`` ints passed to every kernel
-    dispatch during the inference. Skips if the model path env var is
-    unset.
+    Returns the ``sliding_window`` and ``max_seq_len`` seen by every
+    kernel dispatch during one long-context inference. Skips if the
+    model path env var is unset.
     """
     model_path = os.environ.get(MODEL_ENV)
     if not model_path:
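
A quick illustration of what `_nanobind_param_indices` extracts. The signature string below is hypothetical (the real `paged_attention_primitive` takes many more parameters); it only mirrors the shape the helper reads from `overloads[0][0]`:

```python
import re

_NB_PARAM_RE = re.compile(r"([A-Za-z_]\w*)\s*:")

# Hypothetical nanobind signature text -- parameter names are invented here.
sig = "paged_attention_primitive(out: Tensor, query: Tensor, sliding_window: int, max_seq_len: int) -> None"
params_text = sig.partition("(")[2].rpartition(")")[0]
names = _NB_PARAM_RE.findall(params_text)
print(names)                          # ['out', 'query', 'sliding_window', 'max_seq_len']
print(names.index("sliding_window"))  # 2 -- the position _get_call_arg would read
```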
@@ -76,24 +165,32 @@ def kernel_sliding_window_log() -> list[int]:
 
     ops = get_ops()
     orig_fn = ops.paged_attention_primitive
-    captured: list[int] = []
+    param_indices = _nanobind_param_indices(
+        orig_fn, "sliding_window", "max_seq_len"
+    )
+    captured: list[_KernelDispatch] = []
 
     def spy(*args, **kwargs):
-        sw = (
-            args[_SLIDING_WINDOW_ARG_INDEX]
-            if len(args) > _SLIDING_WINDOW_ARG_INDEX
-            else kwargs.get("sliding_window")
+        captured.append(
+            _KernelDispatch(
+                sliding_window=int(
+                    _get_call_arg(args, kwargs, param_indices, "sliding_window")
+                ),
+                max_seq_len=int(
+                    _get_call_arg(args, kwargs, param_indices, "max_seq_len")
+                ),
+            )
         )
-        captured.append(sw)
         return orig_fn(*args, **kwargs)
 
     mp.setattr(ops, "paged_attention_primitive", spy)
 
     from vllm import LLM, SamplingParams
 
-    llm = LLM(model=model_path, max_model_len=512, max_num_seqs=1)
-    sp = SamplingParams(temperature=0, max_tokens=5, ignore_eos=True)
-    llm.generate(["The capital of France is"], sp)
+    llm = LLM(model=model_path, max_model_len=_MAX_MODEL_LEN, max_num_seqs=1)
+    prompt = _build_long_prompt(llm.get_tokenizer())
+    sp = SamplingParams(temperature=0, max_tokens=_MAX_TOKENS, ignore_eos=True)
+    llm.generate([prompt], sp)
 
     return captured
 
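
For a sense of the sizing arithmetic `_build_long_prompt` performs for this fixture, here is a worked example with assumed token counts (the fragment's real tokenized length depends on the tokenizer):

```python
import math

first = 8               # assumed first_fragment_token_count
step = 8                # assumed repeat_increment_token_count
target = 512 + 128 + 1  # _LONG_CONTEXT_TARGET_TOKENS with a 512-token window

additional = math.ceil(max(0, target - first) / step)  # ceil(633 / 8) = 80
repeat_count = 1 + additional                          # 81
token_count = first + additional * step                # 648 > 640, clears the window
```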
@@ -103,26 +200,26 @@ class TestGemma4KernelReceivesPerLayerSlidingWindow:
     """Kernel-level assertions on the sliding_window values dispatched."""
 
     def test_only_expected_window_values_appear(
-        self, kernel_sliding_window_log: list[int]
+        self, kernel_dispatch_log: list[_KernelDispatch]
     ) -> None:
         """No stray values leak from wiring errors."""
         # Act
         unexpected = {
-            w
-            for w in kernel_sliding_window_log
-            if w not in (_E2B_SLIDING_WINDOW, _NO_WINDOW)
+            dispatch.sliding_window
+            for dispatch in kernel_dispatch_log
+            if dispatch.sliding_window not in (_E2B_SLIDING_WINDOW, _NO_WINDOW)
         }
         # Assert
         assert not unexpected, (
             f"kernel received unexpected sliding_window values: {unexpected}"
         )
 
     def test_both_sliding_and_full_layers_dispatch(
-        self, kernel_sliding_window_log: list[int]
+        self, kernel_dispatch_log: list[_KernelDispatch]
     ) -> None:
         """``sliding_window=512`` and ``-1`` both appear."""
         # Act
-        counts = Counter(kernel_sliding_window_log)
+        counts = Counter(dispatch.sliding_window for dispatch in kernel_dispatch_log)
         # Assert
         assert counts[_E2B_SLIDING_WINDOW] > 0, (
             "sliding layers never received their window -- enforcement is "
@@ -132,8 +229,18 @@ def test_both_sliding_and_full_layers_dispatch(
             "full layers never received -1 -- they may be incorrectly getting a window"
         )
 
+    def test_kernel_sees_context_longer_than_the_window(
+        self, kernel_dispatch_log: list[_KernelDispatch]
+    ) -> None:
+        """The regression test must actually exercise long-context behavior."""
+        max_seen = max(dispatch.max_seq_len for dispatch in kernel_dispatch_log)
+        assert max_seen > _E2B_SLIDING_WINDOW, (
+            f"long-context path was not exercised: max_seq_len={max_seen}, "
+            f"sliding_window={_E2B_SLIDING_WINDOW}"
+        )
+
     def test_ratio_matches_layer_types_config(
-        self, kernel_sliding_window_log: list[int]
+        self, kernel_dispatch_log: list[_KernelDispatch]
     ) -> None:
         """Sliding/full dispatch ratio matches the 28:7 layer_types split.
 
@@ -144,7 +251,7 @@ def test_ratio_matches_layer_types_config(
         ``layer_types`` and not stochastic.
         """
         # Act
-        counts = Counter(kernel_sliding_window_log)
+        counts = Counter(dispatch.sliding_window for dispatch in kernel_dispatch_log)
         sliding = counts[_E2B_SLIDING_WINDOW]
         full = counts[_NO_WINDOW]
         total = sliding + full