sgl-project
diff --git a/‎python/sglang/test/chunked_prefill_test_utils.py‎
Lines changed: 129 additions & 0 deletions b/‎python/sglang/test/chunked_prefill_test_utils.py‎
Lines changed: 129 additions & 0 deletions
diff --git a/‎python/sglang/test/scripted_runtime/context/api.py‎
Lines changed: 6 additions & 2 deletions b/‎python/sglang/test/scripted_runtime/context/api.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎python/sglang/test/scripted_runtime/context/http_post.py‎
Lines changed: 24 additions & 9 deletions b/‎python/sglang/test/scripted_runtime/context/http_post.py‎
Lines changed: 24 additions & 9 deletions
diff --git a/‎python/sglang/test/scripted_runtime/context/lifecycle.py‎
Lines changed: 12 additions & 2 deletions b/‎python/sglang/test/scripted_runtime/context/lifecycle.py‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎python/sglang/test/scripted_runtime/context/queries.py‎
Lines changed: 14 additions & 3 deletions b/‎python/sglang/test/scripted_runtime/context/queries.py‎
Lines changed: 14 additions & 3 deletions
diff --git a/‎python/sglang/test/scripted_runtime/context/req_starter.py‎
Lines changed: 7 additions & 1 deletion b/‎python/sglang/test/scripted_runtime/context/req_starter.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎python/sglang/test/scripted_runtime/req_handle.py‎
Lines changed: 9 additions & 2 deletions b/‎python/sglang/test/scripted_runtime/req_handle.py‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎python/sglang/test/scripted_runtime/scheduler_hook.py‎
Lines changed: 3 additions & 0 deletions b/‎python/sglang/test/scripted_runtime/scheduler_hook.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎python/sglang/test/scripted_runtime_chunked_helpers.py‎
Lines changed: 11 additions & 2 deletions b/‎python/sglang/test/scripted_runtime_chunked_helpers.py‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎test/manual/chunked_prefill/__init__.py‎ b/‎test/manual/chunked_prefill/__init__.py‎
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+import time
+from types import SimpleNamespace
+from typing import ClassVar, List, Optional
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.server_fixtures.disaggregation_fixture import (
+    PDDisaggregationServerBase,
+)
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+    try_cached_model,
+)
+
+DEFAULT_MODEL: str = "Qwen/Qwen3-0.6B"
+
+DEFAULT_CHUNKED_PREFILL_SIZE: int = 256
+DEFAULT_NUM_EXAMPLES: int = 100
+DEFAULT_NUM_SHOTS: int = 10
+LONG_PROMPT_NUM_SHOTS: int = 24
+DEFAULT_NUM_THREADS: int = 128
+DEFAULT_MAX_TOKENS: int = 512
+DEFAULT_SEED: int = 42
+
+KV_CANARY_ARGS: List[str] = [
+    "--kv-canary",
+    "raise",
+    "--kv-canary-real-data",
+    "partial",
+    "--kv-canary-sweep-interval",
+    "100",
+    "--disable-piecewise-cuda-graph",
+]
+
+
+class ChunkedGsm8kMixin:
+    """Shared GSM8K accuracy check for chunked-prefill test fixtures.
+
+    The mixin itself defines neither ``base_url`` nor a value for
+    ``gsm8k_threshold``, and it relies on ``assertIsNotNone`` /
+    ``assertGreaterEqual``; the class it is mixed into must supply all of
+    these. Every other knob has a class-level default that subclasses may
+    override.
+    """
+
+    # NOTE(review): presumably the pytest ``__test__`` convention, keeping this
+    # mixin from being collected as a test class on its own -- confirm.
+    __test__ = False
+    # When true, KV_CANARY_ARGS is appended to the server arguments.
+    use_kv_canary: ClassVar[bool] = True
+    model: ClassVar[str] = DEFAULT_MODEL
+    # Extra server flags placed between the chunk-size flag and the canary
+    # flags. This class only ever copies the list (``list(...)``); it never
+    # mutates the shared default in place.
+    feature_args: ClassVar[List[str]] = []
+
+    chunked_prefill_size: ClassVar[int] = DEFAULT_CHUNKED_PREFILL_SIZE
+    num_shots: ClassVar[int] = DEFAULT_NUM_SHOTS
+    num_examples: ClassVar[int] = DEFAULT_NUM_EXAMPLES
+    num_threads: ClassVar[int] = DEFAULT_NUM_THREADS
+    max_tokens: ClassVar[int] = DEFAULT_MAX_TOKENS
+    # Annotated without a value: the minimum acceptable score has to be set by
+    # the concrete test class, otherwise reading it raises AttributeError.
+    gsm8k_threshold: ClassVar[float]
+
+    def build_prefill_side_args(self) -> List[str]:
+        """Return the server CLI arguments as a new list.
+
+        Order: ``--chunked-prefill-size <size>``, then a copy of
+        ``feature_args``, then a copy of ``KV_CANARY_ARGS`` when
+        ``use_kv_canary`` is true.
+        """
+        canary = list(KV_CANARY_ARGS) if self.use_kv_canary else []
+        return (
+            ["--chunked-prefill-size", str(self.chunked_prefill_size)]
+            + list(self.feature_args)
+            + canary
+        )
+
+    def test_mixed_prefix_gsm8k_chunked(self):
+        """Run the ``mixed_prefix_gsm8k`` eval and check the score.
+
+        Fails when ``run_eval`` returns no ``"score"`` entry or when the score
+        is below ``gsm8k_threshold``.
+        """
+        # Concrete class name, used only to tag the printed metrics line.
+        fixture_name = type(self).__name__
+
+        # NOTE(review): run_eval is handed an attribute namespace; presumably it
+        # expects argparse-style arguments -- confirm against run_eval.
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mixed_prefix_gsm8k",
+            api="chat_completion",
+            max_tokens=self.max_tokens,
+            num_examples=self.num_examples,
+            num_threads=self.num_threads,
+            num_shots=self.num_shots,
+            mixed_prefix_gsm8k_secondary_pool_size=15,
+            mixed_prefix_gsm8k_seed=DEFAULT_SEED,
+            gsm8k_data_path=None,
+            temperature=0.0,
+        )
+        # Wall-clock duration of the eval call, stored alongside the metrics.
+        tic = time.perf_counter()
+        metrics = run_eval(args)
+        metrics["elapsed_sec"] = time.perf_counter() - tic
+        print(f"[{fixture_name}] {metrics} threshold={self.gsm8k_threshold:.4f}")
+
+        score = metrics.get("score")
+        self.assertIsNotNone(score, "run_eval returned no score")
+        self.assertGreaterEqual(score, self.gsm8k_threshold)
+
+
+class ChunkedTestBase(ChunkedGsm8kMixin, CustomTestCase):
+    """Fixture that launches one server per test class for the GSM8K check.
+
+    ``setUpClass`` starts the server with the arguments from
+    ``build_prefill_side_args``; ``tearDownClass`` kills its process tree.
+    """
+
+    # NOTE(review): presumably the pytest ``__test__`` convention, so that only
+    # subclasses which re-enable it are collected -- confirm.
+    __test__ = False
+
+    base_url: ClassVar[str] = DEFAULT_URL_FOR_TEST
+    launch_timeout: ClassVar[int] = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+
+    # Value returned by popen_launch_server; stays None until setUpClass runs.
+    process: ClassVar[Optional[object]] = None
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the server for ``cls.model`` at ``cls.base_url``."""
+        # build_prefill_side_args is an instance method, so a throwaway
+        # instance is constructed (with the test method's name as argument)
+        # solely to call it.
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=cls.launch_timeout,
+            other_args=cls("test_mixed_prefix_gsm8k_chunked").build_prefill_side_args(),
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Kill the launched server's process tree, if a launch was recorded."""
+        if cls.process is not None:
+            kill_process_tree(cls.process.pid)
+
+
+class ChunkedTestPDBase(ChunkedGsm8kMixin, PDDisaggregationServerBase):
+    """GSM8K check fixture built on ``PDDisaggregationServerBase``.
+
+    ``setUpClass`` fills ``extra_prefill_args`` and ``extra_decode_args``
+    before invoking the base fixture's setup and ``launch_all``.
+    """
+
+    # NOTE(review): presumably the pytest ``__test__`` convention, so that only
+    # subclasses which re-enable it are collected -- confirm.
+    __test__ = False
+    # Extra flags appended after the canary flags in ``extra_decode_args``.
+    decode_feature_args: ClassVar[List[str]] = []
+
+    @classmethod
+    def setUpClass(cls):
+        """Compute prefill/decode arguments, run base setup, then launch."""
+        # build_prefill_side_args is an instance method, so a throwaway
+        # instance is constructed solely to call it.
+        cls.extra_prefill_args = cls(
+            "test_mixed_prefix_gsm8k_chunked"
+        ).build_prefill_side_args()
+        canary = list(KV_CANARY_ARGS) if cls.use_kv_canary else []
+        cls.extra_decode_args = canary + list(cls.decode_feature_args)
+        # NOTE(review): called on the base class explicitly, so ``cls`` inside
+        # it is PDDisaggregationServerBase rather than this subclass; confirm
+        # this is intended instead of ``super().setUpClass()``.
+        PDDisaggregationServerBase.setUpClass()
+        # Rebinds ``model`` to whatever try_cached_model returns.
+        cls.model = try_cached_model(cls.model)
+        cls.launch_all()
+
+    @classmethod
+    def tearDownClass(cls):
+        """Delegate teardown to the base fixture."""
+        # NOTE(review): same explicit base-class call as in setUpClass, so the
+        # base teardown sees PDDisaggregationServerBase as ``cls``; confirm it
+        # still reaches whatever ``launch_all`` stored on the subclass.
+        PDDisaggregationServerBase.tearDownClass()
@@ -64,6 +64,8 @@ def start_req(
         return_logprob: bool = False,
         logprob_start_len: Optional[int] = None,
         top_logprobs_num: Optional[int] = None,
+        stop_token_ids: Optional[List[int]] = None,
+        temperature: Optional[float] = None,
         lora_path: Optional[str] = None,
     ) -> "ScriptedReqHandle":
         return self._req_starter.start_req(
@@ -77,6 +79,8 @@ def start_req(
             return_logprob=return_logprob,
             logprob_start_len=logprob_start_len,
             top_logprobs_num=top_logprobs_num,
+            stop_token_ids=stop_token_ids,
+            temperature=temperature,
             lora_path=lora_path,
         )
 
@@ -89,8 +93,8 @@ def continue_generation(self, *, torch_empty_cache: bool = False) -> None:
     def abort_all(self) -> None:
         return lifecycle.abort_all(self)
 
-    def abort(self, handle: "ScriptedReqHandle") -> None:
-        return lifecycle.abort(self, rid=handle.rid)
+    def abort(self, handle: "ScriptedReqHandle", *, await_arrival: bool = True) -> None:
+        # Aborts the request identified by handle.rid; await_arrival is passed
+        # through unchanged to lifecycle.abort.
+        return lifecycle.abort(self, rid=handle.rid, await_arrival=await_arrival)
 
     def flush_cache(self) -> None:
         return lifecycle.flush_cache(self)
 
@@ -19,19 +19,34 @@ def _http_post_and_await_recv_msg(
     predicate: Callable[[Any], bool],
     description: str,
     timeout_s: float = RECV_MSG_ARRIVAL_TIMEOUT_S,
+) -> None:
+    _submit_post(ctx, path=path, json=json)
+    ctx._tokenizer_recv_proxy.wait_until_arrived(
+        predicate,
+        timeout_s=timeout_s,
+        description=description,
+    )
+
+
+def _http_post_fire_and_forget(
+    ctx: "ScriptedContext",
+    *,
+    path: str,
+    json: Optional[Dict[str, Any]],
+) -> None:
+    """Submit the POST via ``_submit_post`` and do nothing else.
+
+    Unlike ``_http_post_and_await_recv_msg``, no wait on
+    ``ctx._tokenizer_recv_proxy`` follows the submission.
+    """
+    _submit_post(ctx, path=path, json=json)
+
+
+def _submit_post(
+    ctx: "ScriptedContext",
+    *,
+    path: str,
+    json: Optional[Dict[str, Any]],
 ) -> None:
+    """Hand a POST of ``json`` to ``path`` over to ``ctx._http_poster``.
+
+    The URL is built from the scheduler's ``server_args`` host and port. The
+    request is wrapped in a coroutine and passed to ``submit_coro``; no
+    arrival wait is performed here.
+    """
     server_args = ctx.scheduler.server_args
     url = f"http://{server_args.host}:{server_args.port}{path}"
 
     async def _post() -> None:
-        try:
-            await ctx._http_poster.post(url, json)
-        except Exception:  # noqa: BLE001 — fire-and-forget background POST
-            logger.exception("scripted_runtime: POST %s failed", path)
+        # NOTE(review): the try/except that logged POST failures is removed
+        # here; confirm submit_coro surfaces exceptions from this coroutine.
+        await ctx._http_poster.post(url, json)
 
     ctx._http_poster.submit_coro(_post())
-    ctx._tokenizer_recv_proxy.wait_until_arrived(
-        predicate,
-        timeout_s=timeout_s,
-        description=description,
-    )
 
@@ -10,15 +10,24 @@
 )
 from sglang.test.scripted_runtime.context.http_post import (
     _http_post_and_await_recv_msg,
+    _http_post_fire_and_forget,
 )
 
 if TYPE_CHECKING:
     from sglang.test.scripted_runtime.context.api import ScriptedContext
 
 
 def _await_control(
-    ctx: "ScriptedContext", *, path: str, json, expect_type: type
+    ctx: "ScriptedContext",
+    *,
+    path: str,
+    json,
+    expect_type: type,
+    await_arrival: bool = True,
 ) -> None:
+    if not await_arrival:
+        _http_post_fire_and_forget(ctx, path=path, json=json)
+        return
     _http_post_and_await_recv_msg(
         ctx,
         path=path,
@@ -57,12 +66,13 @@ def abort_all(ctx: "ScriptedContext") -> None:
     )
 
 
-def abort(ctx: "ScriptedContext", *, rid: str) -> None:
+def abort(ctx: "ScriptedContext", *, rid: str, await_arrival: bool = True) -> None:
+    """Post an abort for ``rid`` to ``/abort_request``.
+
+    ``await_arrival`` is forwarded to ``_await_control``; when it is false the
+    request is sent through the fire-and-forget path instead of the awaited
+    one.
+    """
     _await_control(
         ctx,
         path="/abort_request",
         json={"rid": rid, "abort_all": False},
         expect_type=AbortReq,
+        await_arrival=await_arrival,
     )
 
 
 
@@ -80,9 +80,20 @@ def find_req_by_rid(ctx: "ScriptedContext", rid: str) -> Optional["Req"]:
 
 def is_finished(ctx: "ScriptedContext", rid: str) -> bool:
+    """Return whether the request ``rid`` has finished.
+
+    A request still found by ``find_req_by_rid`` answers via
+    ``req.finished()``. Otherwise it counts as finished when it is in
+    ``ctx._seen_rids`` or appears in any batch-log record (in which case it is
+    also added to ``ctx._seen_rids``); a rid found nowhere returns False.
+    """
     req = find_req_by_rid(ctx, rid)
-    if req is None:
-        return rid in ctx._seen_rids
-    return req.finished()
+    if req is not None:
+        return req.finished()
+    if rid in ctx._seen_rids:
+        return True
+    # Fallback: if the req ran in a forward batch (recorded in _batch_log) but
+    # is now absent from all active scheduler sets, it must have finished.
+    # This catches requests that completed without ever being observed via
+    # find_req_by_rid (e.g. when Python short-circuit evaluation prevents the
+    # query while another request is still running).
+    log = ctx._scheduler_hook._batch_log
+    if any(rid in record.rids for record in log):
+        ctx._seen_rids.add(rid)
+        return True
+    return False
 
 
 def is_chunking(ctx: "ScriptedContext", rid: str) -> bool:
 
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import uuid
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, List, Optional
 
 from sglang.test.scripted_runtime.context.http_post import (
     _http_post_and_await_recv_msg,
@@ -30,6 +30,8 @@ def start_req(
         return_logprob: bool = False,
         logprob_start_len: Optional[int] = None,
         top_logprobs_num: Optional[int] = None,
+        stop_token_ids: Optional[List[int]] = None,
+        temperature: Optional[float] = None,
         lora_path: Optional[str] = None,
     ) -> ScriptedReqHandle:
         ctx = self._ctx
@@ -39,6 +41,10 @@ def start_req(
             self._req_counter += 1
 
         sampling_params = {"max_new_tokens": max_new_tokens, "ignore_eos": ignore_eos}
+        if stop_token_ids is not None:
+            sampling_params["stop_token_ids"] = stop_token_ids
+        if temperature is not None:
+            sampling_params["temperature"] = temperature
         payload = {
             "input_ids": [prompt_token] * prompt_len,
             "sampling_params": sampling_params,
 
@@ -3,6 +3,8 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
+from sglang.test.scripted_runtime.context.radix import _node_lock_ref
+
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import Req
     from sglang.test.scripted_runtime.context.api import ScriptedContext
@@ -47,5 +49,10 @@ def kv_pages(self) -> int:
 
     @property
     def lock_refs(self) -> int:
-        node = self.req.last_node
-        return node.lock_ref if node is not None else 0
+        # Returns 0 when there is no request object or it has no last_node;
+        # otherwise the count comes from _node_lock_ref(last_node).
+        req = self.req
+        if req is None:
+            return 0
+        node = req.last_node
+        if node is None:
+            return 0
+        return _node_lock_ref(node)
@@ -95,6 +95,9 @@ def _drive_engine_through_warmup(ctx: ScriptedContext) -> Generator:
 def _reset_engine_state(ctx: ScriptedContext) -> Generator:
     scheduler = ctx.scheduler
 
+    if scheduler._engine_paused:
+        ctx.continue_generation()
+
     ctx._release_exhausted_pools()
     ctx.abort_all()
     for _ in range(RESET_DRAIN_MAX_STEPS):
 
@@ -41,13 +41,16 @@ def run_until_finished(handle, *, max_steps: int = DEFAULT_MAX_STEPS):
 
 
 def run_until_all_finished(handles: List[Any], *, max_steps: int = DEFAULT_MAX_STEPS):
+    """Generator that yields until every handle has reported finished.
+
+    Returns as soon as all handles are done; raises AssertionError when that
+    has not happened after ``max_steps`` iterations.
+    """
+    # Latch per-handle completion: a handle that reported finished once stays
+    # done, and every not-yet-done handle is polled on each step (the old
+    # ``all(...)`` stopped at the first unfinished handle).
+    done = [False] * len(handles)
     for _ in range(max_steps):
-        if all(h.finished for h in handles):
+        for i, h in enumerate(handles):
+            done[i] = done[i] or h.finished
+        if all(done):
             return
         yield
     raise AssertionError(
         f"run_until_all_finished: not all reqs finished after {max_steps} "
-        f"steps (finished={[h.finished for h in handles]})"
+        f"steps (finished={done})"
     )
 
 
@@ -65,6 +68,12 @@ def warmup_radix(t, prompt_tokens: List[int], *, max_steps: int = DEFAULT_MAX_ST
 
 BALLAST_MAX_NEW_TOKENS: int = 30000
 
+SMALL_KV_POOL_MAX_TOTAL_TOKENS: int = 4096
+
+SMALL_KV_POOL_BALLAST_MAX_NEW_TOKENS: int = 512
+
+SMALL_KV_POOL_BALLAST_PROMPT_LEN: int = 1536
+
 
 def exhaust_row_pool(t, *, leave_rows: int, max_steps: int = DEFAULT_MAX_STEPS):
     target: int = t.scheduler.req_to_token_pool.available_size() - leave_rows