Add kv_canary PP self-test fixture and SWA divergence coverage (#27410)

fzyzcjy · web-flow · commit 21201ef718ed · 2026-06-06T09:06:19.000+08:00
diff --git a/python/sglang/srt/kv_canary/runner/swa_divergence.py b/python/sglang/srt/kv_canary/runner/swa_divergence.py
@@ -149,6 +149,13 @@ def find_last(cls, text: str) -> Optional[tuple["SwaDivergenceLog", str]]:
             return None
         return cls(**json.loads(last_match.group(1))), last_match.group(0)
 
+    @classmethod
+    def find_all(cls, text: str) -> list[tuple["SwaDivergenceLog", str]]:
+        return [
+            (cls(**json.loads(match.group(1))), match.group(0))
+            for match in _SWA_DIVERGENCE_LINE_RE.finditer(text)
+        ]
+
 
 def compute_swa_out_of_window_tokens(
     *,
diff --git a/python/sglang/test/kv_canary/e2e_base.py b/python/sglang/test/kv_canary/e2e_base.py
@@ -126,13 +126,17 @@ def send_parallel_requests(
         assert_all_success: bool = True,
         max_new_tokens: int = 2048,
         timeout: float = 240.0,
+        ignore_eos: Optional[bool] = None,
     ) -> list[dict]:
         """Fan out n parallel /generate requests; return list of response dicts."""
+        if ignore_eos is None:
+            ignore_eos = self.model_mode == "swa"
         results = post_parallel_generate(
             url=self.base_url + "/generate",
             prompts=self.make_prompts(n),
             max_new_tokens=max_new_tokens,
             timeout=timeout,
+            ignore_eos=ignore_eos,
         )
         if assert_all_success:
             for result in results:
@@ -155,44 +159,54 @@ def assert_swa_divergence_observed(
         """Assert that the SWA path was genuinely exercised.
 
         Three signals must all hold:
-          - ``swa_out_of_window_tokens >= 1``: at least one prefix token has been clipped
-            out of the sliding window (its SWA mapping is 0). Any prompt longer than the
-            SWA window produces this — proves the SWA window slide actually ran.
+          - ``swa_out_of_window_tokens >= 1``: at least one token has slid out of the
+            sliding window (its SWA mapping is 0). This only appears once a request decodes
+            past the window and eviction begins — proves the SWA window slide actually ran.
           - ``swa_full_idx_divergence >= 1``: SWA pool has actually remapped at least one
             slot to a non-identity index (i.e. real slot reuse / eviction occurred). The
             workload must drive SWA pool pressure for this to fire — required because the
             "pool reuse" path is the one production hits under sustained long-context
             traffic, and we must keep it covered.
           - ``verify_swa < verify_full``: SWA verify kernel processed fewer tokens than
             FULL — proves both kernel groups ran and the window short-circuited SWA.
+
+        The first two signals are checked as the *peak* across all sampled forwards, not
+        only the last sample. The divergence reporter snapshots one live forward batch per
+        interval; under PP it snapshots a single micro-batch, which may hold only in-window
+        requests even when another micro-batch diverged. "Was the SWA path ever exercised?"
+        is a max-over-samples question, so a trailing in-window sample must not mask an
+        earlier diverging one. ``verify_swa``/``verify_full`` are monotonic running totals,
+        so the lag check (run only when ``require_verify_lag`` is set) reads the last sample.
         """
-        last_parsed = None
-        last_line: str = ""
+        samples: list[tuple[SwaDivergenceLog, str]] = []
         for _ in range(max_retries):
             time.sleep(flush_wait_seconds)
-            log_text = self._captured_log_text()
-            found = SwaDivergenceLog.find_last(log_text)
-            if found is not None:
-                last_parsed, last_line = found
+            samples = SwaDivergenceLog.find_all(self._captured_log_text())
+            if samples:
                 break
 
-        if last_parsed is None:
+        if not samples:
             raise AssertionError(
                 "No kv_canary swa_divergence line found in server log after "
                 f"{max_retries} retries (wait={flush_wait_seconds}s each). "
                 f"Log tail:\n{self._captured_log_text()[-2000:]}"
             )
 
-        if last_parsed.swa_out_of_window_tokens < min_swa_out_of_window_tokens:
+        peak_out_of_window = max(p.swa_out_of_window_tokens for p, _ in samples)
+        peak_full_idx_divergence = max(p.swa_full_idx_divergence for p, _ in samples)
+        last_parsed, last_line = samples[-1]
+
+        if peak_out_of_window < min_swa_out_of_window_tokens:
             raise AssertionError(
-                f"SWA path not exercised: swa_out_of_window_tokens={last_parsed.swa_out_of_window_tokens} "
-                f"< min={min_swa_out_of_window_tokens}. Line: {last_line}"
+                f"SWA path not exercised: peak swa_out_of_window_tokens={peak_out_of_window} "
+                f"< min={min_swa_out_of_window_tokens} across {len(samples)} samples. "
+                f"Last line: {last_line}"
             )
-        if last_parsed.swa_full_idx_divergence < min_swa_full_idx_divergence:
+        if peak_full_idx_divergence < min_swa_full_idx_divergence:
             raise AssertionError(
-                f"SWA pool reuse not exercised: swa_full_idx_divergence={last_parsed.swa_full_idx_divergence} "
-                f"< min={min_swa_full_idx_divergence}. The workload did not drive enough SWA pool pressure "
-                f"to force slot remap. Line: {last_line}"
+                f"SWA pool reuse not exercised: peak swa_full_idx_divergence={peak_full_idx_divergence} "
+                f"< min={min_swa_full_idx_divergence} across {len(samples)} samples. The workload "
+                f"did not drive enough SWA pool pressure to force slot remap. Last line: {last_line}"
             )
         if require_verify_lag and not (
             last_parsed.verify_swa < last_parsed.verify_full
diff --git a/python/sglang/test/kv_canary/pp_fixture.py b/python/sglang/test/kv_canary/pp_fixture.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from typing import ClassVar
+
+from sglang.test.kv_canary.consts import SWA_POOL_SERVER_ARGS
+from sglang.test.kv_canary.e2e_base import CanaryE2EBase
+
+PP_SIZE: int = 2
+
+
+class CanaryPPFixture(CanaryE2EBase):
+
+    model_mode: ClassVar[str] = "swa"
+    workload_n_batches: ClassVar[int] = 2
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        cls.extra_server_args = (
+            "--pp-size",
+            str(PP_SIZE),
+            "--disable-cuda-graph",
+            *SWA_POOL_SERVER_ARGS,
+            *cls.extra_server_args,
+        )
+        super().setUpClass()
diff --git a/python/sglang/test/kv_canary/utils.py b/python/sglang/test/kv_canary/utils.py
@@ -33,6 +33,7 @@ def post_parallel_generate(
     prompts: list[str],
     max_new_tokens: int,
     timeout: float,
+    ignore_eos: bool = False,
 ) -> list[dict]:
     def _send(prompt: str) -> dict:
         try:
@@ -43,6 +44,7 @@ def _send(prompt: str) -> dict:
                     "sampling_params": {
                         "max_new_tokens": max_new_tokens,
                         "temperature": 0.0,
+                        "ignore_eos": ignore_eos,
                     },
                 },
                 timeout=timeout,
diff --git a/test/registered/kv_canary/test_self_e2e_pp_baseline.py b/test/registered/kv_canary/test_self_e2e_pp_baseline.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+import unittest
+
+from sglang.srt.kv_canary.config import CanaryMode
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.kv_canary.pp_fixture import CanaryPPFixture
+
+register_cuda_ci(est_time=220, stage="extra-a", runner_config="2-gpu-large")
+
+
+class TestPPBaselineSwa(CanaryPPFixture):
+
+    kv_canary_mode = CanaryMode.LOG
+
+    def test_no_violation(self) -> None:
+        for _ in range(self.workload_n_batches):
+            self.send_parallel_requests()
+        self.assert_no_violation(wait_seconds=2.0)
+        self.maybe_assert_swa_divergence_observed()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/registered/kv_canary/test_self_e2e_pp_perturb.py b/test/registered/kv_canary/test_self_e2e_pp_perturb.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import unittest
+from typing import ClassVar
+
+from sglang.srt.kv_canary.config import CanaryMode
+from sglang.srt.kv_canary.perturb.config import TargetGroupKind
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.kv_canary.pp_fixture import CanaryPPFixture
+
+register_cuda_ci(est_time=220, stage="extra-a", runner_config="2-gpu-large")
+
+
+class TestPPPerturbSwaSwa(CanaryPPFixture):
+
+    kv_canary_mode = CanaryMode.LOG
+    target_group: ClassVar[TargetGroupKind] = TargetGroupKind.SWA
+    extra_server_args = ("--kv-canary-real-data", "partial")
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        cls.extra_env = {
+            "SGLANG_KV_CANARY_PERTURB_REAL_KV_USED_PROB": "0.1",
+            "SGLANG_KV_CANARY_PERTURB_TARGET_GROUP": str(cls.target_group),
+            "SGLANG_KV_CANARY_PERTURB_WARMUP_STEPS": "0",
+        }
+        super().setUpClass()
+
+    def test_real_kv_used_perturbation_reports_real_kv_hash_violation(self) -> None:
+        for _ in range(self.workload_n_batches):
+            self.send_parallel_requests()
+        self.assert_per_forward_violation_reported(
+            fail_reason="verify_real_kv_hash",
+            target_group=self.target_group,
+        )
+        self.maybe_assert_swa_divergence_observed()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/registered/kv_canary/test_self_unit_e2e_base.py b/test/registered/kv_canary/test_self_unit_e2e_base.py
@@ -69,17 +69,58 @@ def test_assert_swa_divergence_observed_passes_when_above_threshold(self) -> Non
                 max_retries=1,
             )
 
-    def test_assert_swa_divergence_observed_uses_latest_line(self) -> None:
-        log = _GOOD_LINE + "\n" + _LATER_LINE + "\n"
-        harness, patcher = self._make_harness(log)
+    def test_assert_swa_divergence_observed_uses_peak_out_of_window(self) -> None:
+        diverged = SwaDivergenceLog(
+            forward_ct=120,
+            verify_full=10000,
+            verify_swa=4200,
+            swa_full_idx_divergence=512,
+            swa_out_of_window_tokens=8192,
+        ).format()
+        trailing_zero = SwaDivergenceLog(
+            forward_ct=240,
+            verify_full=20000,
+            verify_swa=8400,
+            swa_full_idx_divergence=0,
+            swa_out_of_window_tokens=0,
+        ).format()
+        harness, patcher = self._make_harness(diverged + "\n" + trailing_zero + "\n")
         with patcher:
             harness.assert_swa_divergence_observed(
-                min_swa_full_idx_divergence=1000,
+                min_swa_out_of_window_tokens=1,
+                min_swa_full_idx_divergence=1,
                 require_verify_lag=True,
                 flush_wait_seconds=0.0,
                 max_retries=1,
             )
 
+    def test_assert_swa_divergence_observed_checks_verify_lag_on_latest_line(
+        self,
+    ) -> None:
+        lagging = SwaDivergenceLog(
+            forward_ct=120,
+            verify_full=10000,
+            verify_swa=4200,
+            swa_full_idx_divergence=512,
+            swa_out_of_window_tokens=8192,
+        ).format()
+        no_lag = SwaDivergenceLog(
+            forward_ct=240,
+            verify_full=20000,
+            verify_swa=20000,
+            swa_full_idx_divergence=1024,
+            swa_out_of_window_tokens=16384,
+        ).format()
+        harness, patcher = self._make_harness(lagging + "\n" + no_lag + "\n")
+        with patcher:
+            with self.assertRaisesRegex(AssertionError, "verify_swa=20000"):
+                harness.assert_swa_divergence_observed(
+                    min_swa_full_idx_divergence=1,
+                    require_verify_lag=True,
+                    flush_wait_seconds=0.0,
+                    max_retries=1,
+                )
+
     def test_assert_swa_divergence_observed_raises_when_below_threshold(self) -> None:
         zero_mapping_line = SwaDivergenceLog(
             forward_ct=100,
diff --git a/test/registered/kv_canary/test_self_unit_runner_swa_divergence.py b/test/registered/kv_canary/test_self_unit_runner_swa_divergence.py
@@ -410,6 +410,40 @@ def test_swa_divergence_report_emits_swa_full_idx_divergence_from_compute(
         self.assertEqual(parsed.verify_swa, 3)
 
 
+class TestSwaDivergenceLogFindAll(CustomTestCase):
+    def test_find_all_returns_every_sample_in_order(self) -> None:
+        text = "\n".join(
+            SwaDivergenceLog(
+                forward_ct=ct,
+                verify_full=100 * ct,
+                verify_swa=10 * ct,
+                swa_full_idx_divergence=ct,
+                swa_out_of_window_tokens=0,
+            ).format()
+            for ct in (20, 40, 60)
+        )
+        parsed = SwaDivergenceLog.find_all(text)
+        self.assertEqual([p.forward_ct for p, _ in parsed], [20, 40, 60])
+
+    def test_find_all_peak_survives_trailing_zero_sample(self) -> None:
+        text = "\n".join(
+            SwaDivergenceLog(
+                forward_ct=ct,
+                verify_full=1,
+                verify_swa=0,
+                swa_full_idx_divergence=1,
+                swa_out_of_window_tokens=oow,
+            ).format()
+            for ct, oow in ((20, 0), (40, 4080), (60, 0))
+        )
+        parsed = SwaDivergenceLog.find_all(text)
+        self.assertEqual(max(p.swa_out_of_window_tokens for p, _ in parsed), 4080)
+        self.assertEqual(parsed[-1][0].swa_out_of_window_tokens, 0)
+
+    def test_find_all_returns_empty_list_when_no_lines(self) -> None:
+        self.assertEqual(SwaDivergenceLog.find_all("nothing here\n"), [])
+
+
 class TestCanaryManagerSwaDivergenceWiring(CanaryManagerTestCase):
     def test_swa_divergence_report_is_none_when_env_disabled(self) -> None:
         with envs.SGLANG_KV_CANARY_SWA_DIVERGENCE_STATS_INTERVAL.override(