
Commit e32453d

aolemila and claude committed
test(kda): hoist KDAAttnBackendForTest shim to test_utils.py
Both KDA test files were carrying an identical copy of the `pool=` → `recurrent_state_pool=` translation shim added in 466afff. Move it to test_utils.py and import it from both, dropping the local underscore prefix since it's now a shared helper.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 466afff commit e32453d

3 files changed: 29 additions & 53 deletions
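The shim exists because of a kwarg-name mismatch between two layers. Below is a minimal sketch of the routing described in the shim's docstring; only the names HybridLinearAttnBackend, KDAAttnBackend, `pool`, and `recurrent_state_pool` come from this commit, and the signatures are illustrative, not the real API:

class KDAAttnBackendSketch:
    """Stand-in for KDAAttnBackend: expects the state pool as `recurrent_state_pool=`."""

    def __call__(self, *args, recurrent_state_pool=None, **kwargs):
        ...  # linear-attention forward over the recurrent state


class HybridLinearAttnBackendSketch:
    """Stand-in for HybridLinearAttnBackend, the production wrapper."""

    def __init__(self, linear_backend):
        self.linear_backend = linear_backend

    def __call__(self, *args, pool=None, **kwargs):
        # RadixLinearAttention's call convention passes `pool=`; rename it
        # to the kwarg the linear sub-backend expects.
        return self.linear_backend(*args, recurrent_state_pool=pool, **kwargs)

Tests that install the raw KDA backend directly skip the hybrid wrapper, which is the gap the hoisted shim fills.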


python/sgl_jax/test/test_kda_attention.py

Lines changed: 2 additions & 26 deletions
@@ -15,36 +15,12 @@
 from sgl_jax.srt.mem_cache.recurrent_state_pool import RecurrentStatePool
 from sgl_jax.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sgl_jax.srt.utils.mesh_utils import create_device_mesh
+from sgl_jax.test.test_utils import KDAAttnBackendForTest

 mesh = create_device_mesh(ici_parallelism=[1, -1], dcn_parallelism=[1, 1])
 jax.sharding.set_mesh(mesh)


-class _KDAAttnBackendForTest:
-    """Test wrapper that translates `pool=` kwarg to `recurrent_state_pool=`.
-
-    Production routes through HybridLinearAttnBackend, which accepts `pool=`
-    (RadixLinearAttention's call convention) and forwards it to KDA as
-    `recurrent_state_pool=`. These tests assign the raw KDA backend as
-    `forward_batch.attn_backend`, bypassing that wrapper, so we replicate the
-    same translation here.
-    """
-
-    def __init__(self, backend):
-        object.__setattr__(self, "_backend", backend)
-
-    def __call__(self, *args, **kwargs):
-        if "pool" in kwargs:
-            kwargs["recurrent_state_pool"] = kwargs.pop("pool")
-        return self._backend(*args, **kwargs)
-
-    def __getattr__(self, name):
-        return getattr(self._backend, name)
-
-    def __setattr__(self, name, value):
-        setattr(self._backend, name, value)
-
-
 def _scaled_randn(rng: np.random.Generator, shape, scale: float = 0.1) -> np.ndarray:
     # scale=0.1 is a test-only hack: it shrinks the recurrent state so bf16 noise
     # in the delta-rule update fits the global atol=1e-2 (shared with flashattn).
@@ -348,7 +324,7 @@ def conv_weight():
     extend_prefix_lens = np.zeros(batch_size, dtype=np.int32) if mode == "prefill" else None
     has_initial_state_np = np.asarray(has_initial_state_per_req, dtype=np.bool_)

-    backend = _KDAAttnBackendForTest(KDAAttnBackend(mesh=test_mesh))
+    backend = KDAAttnBackendForTest(KDAAttnBackend(mesh=test_mesh))

     mwb = ModelWorkerBatch(
         bid=1,
python/sgl_jax/test/test_kda_attention_dp.py

Lines changed: 2 additions & 27 deletions
@@ -16,7 +16,7 @@
 from sgl_jax.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sgl_jax.srt.utils.common_utils import pad_to_bucket
 from sgl_jax.srt.utils.mesh_utils import create_device_mesh
-from sgl_jax.test.test_utils import CustomTestCase
+from sgl_jax.test.test_utils import CustomTestCase, KDAAttnBackendForTest


 def _scaled_randn(rng: np.random.Generator, shape, scale: float = 0.1) -> np.ndarray:
@@ -27,31 +27,6 @@ def _scaled_randn(rng: np.random.Generator, shape, scale: float = 0.1) -> np.ndarray:
     return rng.standard_normal(shape).astype(np.float32) * scale


-class _KDAAttnBackendForTest:
-    """Test wrapper that translates `pool=` kwarg to `recurrent_state_pool=`.
-
-    Production routes through HybridLinearAttnBackend, which accepts `pool=`
-    (RadixLinearAttention's call convention) and forwards it to KDA as
-    `recurrent_state_pool=`. These tests assign the raw KDA backend as
-    `forward_batch.attn_backend`, bypassing that wrapper, so we replicate the
-    same translation here.
-    """
-
-    def __init__(self, backend):
-        object.__setattr__(self, "_backend", backend)
-
-    def __call__(self, *args, **kwargs):
-        if "pool" in kwargs:
-            kwargs["recurrent_state_pool"] = kwargs.pop("pool")
-        return self._backend(*args, **kwargs)
-
-    def __getattr__(self, name):
-        return getattr(self._backend, name)
-
-    def __setattr__(self, name, value):
-        setattr(self._backend, name, value)
-
-
 # Reference baselines duplicated from test_kda_attention.py — keep in sync.


@@ -420,7 +395,7 @@ def conv_weight():
     req_pool_indices_cpu = np.arange(total_bs, dtype=np.int32)

     real_bs_per_dp = [len(lens_per_rank.get(r, [])) for r in range(dp_size)]
-    backend = _KDAAttnBackendForTest(KDAAttnBackend(mesh=mesh))
+    backend = KDAAttnBackendForTest(KDAAttnBackend(mesh=mesh))

     mwb = ModelWorkerBatch(
         bid=1,

python/sgl_jax/test/test_utils.py

Lines changed: 25 additions & 0 deletions
@@ -788,3 +788,28 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+class KDAAttnBackendForTest:
+    """Test wrapper that translates `pool=` kwarg to `recurrent_state_pool=`.
+
+    Production routes through HybridLinearAttnBackend, which accepts `pool=`
+    (RadixLinearAttention's call convention) and forwards it to the linear
+    sub-backend as `recurrent_state_pool=`. Tests that assign a raw linear
+    backend (e.g. KDAAttnBackend) as `forward_batch.attn_backend` bypass the
+    wrapper, so this shim replicates the same translation.
+    """
+
+    def __init__(self, backend):
+        object.__setattr__(self, "_backend", backend)
+
+    def __call__(self, *args, **kwargs):
+        if "pool" in kwargs:
+            kwargs["recurrent_state_pool"] = kwargs.pop("pool")
+        return self._backend(*args, **kwargs)
+
+    def __getattr__(self, name):
+        return getattr(self._backend, name)
+
+    def __setattr__(self, name, value):
+        setattr(self._backend, name, value)
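One subtlety in the hoisted shim: because `__setattr__` forwards every attribute write to the wrapped backend, `__init__` must plant `_backend` via `object.__setattr__`. A plain `self._backend = backend` would hit the override before `_backend` exists, the lookup of `self._backend` would fall through to `__getattr__`, and that reads `self._backend` again, recursing forever. A minimal reproduction with a hypothetical class (standard Python attribute semantics, unrelated to any sgl_jax API):

class BrokenProxy:
    def __init__(self, backend):
        self._backend = backend  # routed through the __setattr__ override below

    def __getattr__(self, name):
        # Fires for any attribute missing from the instance dict,
        # including `_backend` itself before __init__ has stored it.
        return getattr(self._backend, name)

    def __setattr__(self, name, value):
        # `self._backend` is not set yet, so this lookup triggers __getattr__,
        # which looks up `self._backend` again: RecursionError.
        setattr(self._backend, name, value)


BrokenProxy(object())  # raises RecursionError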
