Commit 6f9def6

test(bench_serving_dense_tp_4): cap context-length to 3072 (#1077)
The current TestBenchServingDenseTp4 launches the server without --context-length, so the control plane pads per-seq KV allocation to Qwen3-8B's full 40K context. RPA v3's DECODE block-size heuristic then picks bkv_sz=16384, far larger than the test's actual ~1K-token sequences, causing a ~13% throughput regression after #934 routed decode batches to the DECODE sub-kernel. Bound context-length to 3072 (which still covers test_itl's 1024+1024-token worst case) so RPA picks bkv_sz=3072 instead, and raise the throughput threshold from 9866 to 11000 (single-run measured: 12101 tok/s). See #1044 for the full root-cause analysis.
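The interaction described above can be sketched as follows. Note that `pick_bkv_sz`, the 16384 cap, and the min-based rule are illustrative assumptions chosen to reproduce the numbers in this commit, not the actual RPA v3 DECODE heuristic:

```python
# Hypothetical sketch of the block-size interaction described in this commit.
# pick_bkv_sz, the 16384 cap, and the min() rule are illustrative assumptions,
# not the real RPA v3 DECODE block-size heuristic.

def pick_bkv_sz(padded_kv_len: int, max_bkv: int = 16384) -> int:
    """Choose a KV block size that tracks the padded per-sequence KV length."""
    return min(padded_kv_len, max_bkv)

# Without --context-length, KV is padded to Qwen3-8B's full 40K context,
# so the heuristic lands on the 16384 cap despite ~1K-token test sequences.
print(pick_bkv_sz(40960))  # 16384
# Bounding --context-length to 3072 right-sizes the block instead.
print(pick_bkv_sz(3072))   # 3072
```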
1 parent c68d685 commit 6f9def6

1 file changed (3 additions, 1 deletion)

File tree

test/srt/test_bench_serving_dense_tp_4.py

```diff
@@ -52,6 +52,8 @@ def setUpClass(cls):
                 "--page-size",
                 "256",
                 "--disable-radix-cache",
+                "--context-length",
+                "3072",
             ],
             env={
                 "JAX_COMPILATION_CACHE_DIR": "/tmp/jax_compilation_cache",
@@ -105,7 +107,7 @@ def test_output_throughput_default_tp_4(self):
             f"### test_output_throughput_default_tp_4\n"
             f"Output throughput: {res['output_throughput']:.2f} token/s\n"
         )
-        self.assertGreater(res["output_throughput"], 9866)
+        self.assertGreater(res["output_throughput"], 11000)

     def test_ttft_default_tp_4(self):
         args = get_benchmark_args(
```
