sgl-project
diff --git a/‎.claude/rules/modify-component-must-read.md‎
Lines changed: 1 addition & 0 deletions b/‎.claude/rules/modify-component-must-read.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.claude/skills/scripted-runtime-notes/SKILL.md‎
Lines changed: 29 additions & 0 deletions b/‎.claude/skills/scripted-runtime-notes/SKILL.md‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎python/sglang/srt/environ.py‎
Lines changed: 5 additions & 0 deletions b/‎python/sglang/srt/environ.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎python/sglang/srt/managers/scheduler.py‎
Lines changed: 20 additions & 0 deletions b/‎python/sglang/srt/managers/scheduler.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎python/sglang/srt/managers/scheduler_components/ipc_channels.py‎
Lines changed: 16 additions & 2 deletions b/‎python/sglang/srt/managers/scheduler_components/ipc_channels.py‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎python/sglang/srt/managers/scheduler_components/request_receiver.py‎
Lines changed: 9 additions & 1 deletion b/‎python/sglang/srt/managers/scheduler_components/request_receiver.py‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎python/sglang/srt/utils/network.py‎
Lines changed: 1 addition & 1 deletion b/‎python/sglang/srt/utils/network.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/sglang/test/scripted_runtime/__init__.py‎ b/‎python/sglang/test/scripted_runtime/__init__.py‎
diff --git a/‎python/sglang/test/scripted_runtime/background_http_poster.py‎
Lines changed: 68 additions & 0 deletions b/‎python/sglang/test/scripted_runtime/background_http_poster.py‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎python/sglang/test/scripted_runtime/context/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎python/sglang/test/scripted_runtime/context/__init__.py‎
Lines changed: 3 additions & 0 deletions
@@ -5,3 +5,4 @@ Before modifying the following components, read the listed skill first.
 - **Speculative decoding code** (anything under `python/sglang/srt/speculative/`, related attention backends, scheduler accumulators, IPC fields, observability metrics, or CLI flags) → [`speculative-naming`](../skills/speculative-naming/SKILL.md)
 - **`Scheduler` / `TokenizerManager` / `ModelRunner` `__init__`** (`python/sglang/srt/managers/scheduler.py`, `python/sglang/srt/managers/tokenizer_manager.py`, `python/sglang/srt/model_executor/model_runner.py`) → [`large-class-init-style`](../skills/large-class-init-style/SKILL.md)
 - **Environment variables** (adding, renaming, or reviewing any `SGLANG_*` env var, migrating a legacy `SGL_*` alias, or touching `python/sglang/srt/environ.py`) → [`env-var-conventions`](../skills/env-var-conventions/SKILL.md)
+- **Scripted runtime** (anything related to the scripted runtime) → [`scripted-runtime-notes`](../skills/scripted-runtime-notes/SKILL.md)
@@ -0,0 +1,29 @@
+---
+name: scripted-runtime-notes
+description: Requirements for the SGLang scripted runtime, chiefly when to add (vs not add) a harness API. Use for anything related to the scripted runtime.
+---
+
+# Scripted Runtime — Notes
+
+Notes for anything related to the SGLang scripted runtime.
+
+## When to Add an API
+
+Tests read `r.req.*` and `t._scheduler.*` directly — there is no encapsulation boundary. A thin wrapper buys zero isolation; it only grows the surface.
+
+Add an API only if it does real work:
+
+1. **Control primitive** — drives the engine through a real path (`start_req`, `pause_generation`, `abort`, `evict_radix`, `exhaust_kv`). Reuse the real path; never hand-mutate state.
+2. **Hook-backed** — value cannot be read from a snapshot; accumulate via `scheduler_hook.on_run_batch` or the recv proxy (`chunks_done`). Read-only; never monkey-patch; never add `*_count` to `srt/`.
+3. **Multi-structure derivation, widely reused** — scans `chunked_req` + `waiting_queue` + `running_batch` + `last_batch` (`is_idle`, `status`, `batch_composition`).
+
+Otherwise, don't add one. Read `r.req.X` / `t._scheduler.X` directly in the test, and inline single-use accessors.
+
+Never:
+
+- Weaken an assertion to fit a missing probe.
+- Probe implementation details ("field non-None", "which branch ran") — assert the consequence.
+
+## Other Tips
+
+- **Engine-self-driven behavior: drive the real loop, don't call the private method.** Never synchronously call a private scheduler method (e.g. `scheduler._abort_on_waiting_timeout()`) from the harness/test — doing so runs it at the wrong loop phase, bypasses the ordered `recv_requests` → `process_input_requests` injection, and can fire in states the real loop never reaches (e.g. while paused). For sweeps that the engine runs on its own (timeout/idle), enable the relevant config/env and advance the loop with `yield`.
@@ -337,6 +337,11 @@ class Envs:
     SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None)
     SGLANG_TEST_FORCE_OPTIMISTIC_PREFILL_RETRY_PROB = EnvFloat(0.0)
 
+    SGLANG_TEST_SCRIPTED_RUNTIME = EnvBool(False)
+    SGLANG_TEST_SCRIPTED_RUNTIME_IPC_ADDR = EnvStr(None)
+    SGLANG_TEST_SCRIPTED_RUNTIME_OUT_OF_BAND_ERROR_PATH = EnvStr(None)
+    SGLANG_TEST_SCRIPTED_RUNTIME_SYS_PATH_ENTRY = EnvStr(None)
+
     # Model Parallel
     SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)
     SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS = EnvBool(False)
 
@@ -545,6 +545,8 @@ def __init__(
         # Init the grammar backend for constrained generation
         self.init_grammar_manager()
 
+        self.maybe_init_scripted_scheduler_hook()
+
         self.init_request_receiver()
 
         self.init_dp_attn_adapter()
@@ -611,6 +613,7 @@ def init_ipc_channels(self, port_args: PortArgs):
                 self.ps.attn_tp_rank == 0
                 or self.server_args.enable_metrics_for_all_schedulers
             ),
+            enable_scripted_runtime=envs.SGLANG_TEST_SCRIPTED_RUNTIME.get(),
         )
 
         self.load_snapshot_writer = None
@@ -1601,6 +1604,19 @@ def init_lora_overlap_loader(self) -> None:
     def init_grammar_manager(self) -> None:
         self.grammar_manager = GrammarManager(self)
 
+    def maybe_init_scripted_scheduler_hook(self) -> None:
+        """Attach the scripted-runtime scheduler hook when the test flag is set.
+
+        Sets ``self.scripted_scheduler_hook`` to a ``ScriptedSchedulerHook``
+        when ``SGLANG_TEST_SCRIPTED_RUNTIME`` is enabled, and to ``None``
+        otherwise.
+        """
+        if not envs.SGLANG_TEST_SCRIPTED_RUNTIME.get():
+            self.scripted_scheduler_hook = None
+            return
+
+        # Imported inside the enabled branch so the test package is only
+        # loaded when the flag is on.
+        from sglang.test.scripted_runtime.scheduler_hook import (
+            ScriptedSchedulerHook,
+        )
+
+        self.scripted_scheduler_hook = ScriptedSchedulerHook(
+            scheduler=self,
+            tokenizer_recv_proxy=self.ipc_channels.recv_from_tokenizer,
+        )
+
     def init_request_receiver(self) -> None:
         self.request_receiver = SchedulerRequestReceiver(
             recv_from_tokenizer=self.ipc_channels.recv_from_tokenizer,
@@ -1623,6 +1639,7 @@ def init_request_receiver(self) -> None:
             get_last_forward_mode=lambda: (
                 self.last_batch.forward_mode if self.last_batch is not None else None
             ),
+            scripted_scheduler_hook=self.scripted_scheduler_hook,
         )
 
     def init_dp_attn_adapter(self) -> None:
@@ -2978,6 +2995,9 @@ def run_batch(
         self.forward_ct += 1
         batch.forward_iter = self.forward_ct
 
+        if self.scripted_scheduler_hook is not None:
+            self.scripted_scheduler_hook.on_run_batch(batch)
+
         # Whether to run the profiler
         self.profiler_manager._profile_batch_predicate(batch)
         if self.forward_sleep_time is not None:
 
@@ -1,16 +1,21 @@
 from dataclasses import dataclass
-from typing import Optional
+from typing import TYPE_CHECKING, Optional, Union
 
 import zmq
 
 from sglang.srt.managers.scheduler_components.output_sender import SenderWrapper
 from sglang.srt.server_args import PortArgs
 from sglang.srt.utils.network import get_zmq_socket
 
+if TYPE_CHECKING:
+    from sglang.test.scripted_runtime.tokenizer_recv_proxy import (
+        ScriptedTokenizerRecvProxy,
+    )
+
 
 @dataclass(frozen=True, slots=True, kw_only=True)
 class SchedulerIpcChannels:
-    recv_from_tokenizer: Optional[zmq.Socket]
+    recv_from_tokenizer: Union[zmq.Socket, "ScriptedTokenizerRecvProxy"]
     recv_from_rpc: Optional[zmq.Socket]
     send_to_tokenizer: SenderWrapper
     send_to_detokenizer: SenderWrapper
@@ -24,13 +29,22 @@ def create(
         is_rank_zero: bool,
         skip_tokenizer_init: bool,
         metrics_enabled: bool,
+        enable_scripted_runtime: bool,
     ) -> "SchedulerIpcChannels":
         context = zmq.Context(2)
 
         if is_rank_zero:
             recv_from_tokenizer = get_zmq_socket(
                 context, zmq.PULL, port_args.scheduler_input_ipc_name, False
             )
+            if enable_scripted_runtime:
+                from sglang.test.scripted_runtime.tokenizer_recv_proxy import (
+                    ScriptedTokenizerRecvProxy,
+                )
+
+                recv_from_tokenizer = ScriptedTokenizerRecvProxy(
+                    underlying=recv_from_tokenizer
+                )
             recv_from_rpc = get_zmq_socket(
                 context, zmq.DEALER, port_args.rpc_ipc_name, False
             )
 
@@ -34,11 +34,15 @@
     from sglang.srt.configs.model_config import ModelConfig
     from sglang.srt.distributed.parallel_state_wrapper import ParallelState
     from sglang.srt.server_args import ServerArgs
+    from sglang.test.scripted_runtime.scheduler_hook import ScriptedSchedulerHook
+    from sglang.test.scripted_runtime.tokenizer_recv_proxy import (
+        ScriptedTokenizerRecvProxy,
+    )
 
 
 @dataclass(kw_only=True, slots=True, frozen=True)
 class SchedulerRequestReceiver:
-    recv_from_tokenizer: zmq.Socket
+    recv_from_tokenizer: Union[zmq.Socket, "ScriptedTokenizerRecvProxy"]
     recv_from_rpc: Optional[zmq.Socket]
     recv_skipper: Any
     input_blocker: Any
@@ -56,6 +60,7 @@ class SchedulerRequestReceiver:
     max_recv_per_poll: int
     stream_output: Callable[..., None]
     get_last_forward_mode: Callable[[], Any]
+    scripted_scheduler_hook: Optional["ScriptedSchedulerHook"] = None
 
     def recv_limit_reached(self, num_recv_reqs: int) -> bool:
         if self.max_recv_per_poll < 0:
@@ -67,6 +72,9 @@ def recv_requests(
     ) -> List[Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput, Any]]:
         """Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
 
+        if self.scripted_scheduler_hook is not None:
+            self.scripted_scheduler_hook.step()
+
         if self.recv_skipper is not None:
             if not self.recv_skipper.handle(self.get_last_forward_mode()):
                 return []
 
@@ -231,7 +231,7 @@ def set_recv_opt():
         set_send_opt()
     elif socket_type == zmq.PULL:
         set_recv_opt()
-    elif socket_type in [zmq.DEALER, zmq.REQ, zmq.REP]:
+    elif socket_type in [zmq.DEALER, zmq.REQ, zmq.REP, zmq.PAIR]:
         set_send_opt()
         set_recv_opt()
     else:
 
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import threading
+from concurrent.futures import Future
+from typing import Any, Coroutine, Optional
+
+import aiohttp
+
+logger = logging.getLogger(__name__)
+
+JOIN_TIMEOUT_S: float = 10.0
+
+
+class BackgroundHttpPoster:
+    """Run fire-and-forget HTTP POST coroutines on a private event-loop thread.
+
+    Construction creates a new asyncio event loop and starts a daemon thread
+    that runs it until ``close`` is called. The ``aiohttp`` session is created
+    lazily on the first ``post``.
+    """
+
+    def __init__(self) -> None:
+        self._session: Optional[aiohttp.ClientSession] = None
+        self._loop = asyncio.new_event_loop()
+        self._thread = threading.Thread(
+            target=self._run_loop, name="scripted-runtime-async", daemon=True
+        )
+        self._thread.start()
+
+    def _run_loop(self) -> None:
+        """Thread target: bind the private loop to this thread and run it."""
+        asyncio.set_event_loop(self._loop)
+        self._loop.run_forever()
+
+    def submit_coro(self, coro: Coroutine) -> None:
+        """Schedule ``coro`` on the background loop without waiting for it.
+
+        Exceptions raised by the coroutine are logged by a done-callback
+        instead of being propagated to the caller.
+        """
+        future = asyncio.run_coroutine_threadsafe(coro, self._loop)
+        future.add_done_callback(self._log_coro_exception)
+
+    @staticmethod
+    def _log_coro_exception(future: Future) -> None:
+        """Done-callback: log the coroutine's exception, ignoring cancellation."""
+        # A cancelled concurrent.futures.Future raises
+        # concurrent.futures.CancelledError from result(). On Python 3.8+ that
+        # is an Exception subclass distinct from asyncio.CancelledError, so it
+        # would otherwise be logged as a failure below.
+        if future.cancelled():
+            return
+        try:
+            future.result()
+        except asyncio.CancelledError:
+            # Kept for interpreters where the two CancelledError types coincide.
+            pass
+        except Exception:
+            logger.exception("scripted_runtime: background async coroutine failed")
+
+    async def post(self, url: str, json: Any) -> None:
+        """POST ``json`` to ``url`` on the shared session and read the body.
+
+        The response status is not checked; the body is read and discarded.
+        """
+        session = self._ensure_session()
+        async with session.post(url, json=json) as resp:
+            await resp.read()
+
+    def _ensure_session(self) -> aiohttp.ClientSession:
+        """Return the shared session, creating it if it is missing or closed."""
+        if self._session is None or self._session.closed:
+            # limit=0 disables aiohttp's cap on simultaneous connections.
+            self._session = aiohttp.ClientSession(
+                connector=aiohttp.TCPConnector(limit=0)
+            )
+        return self._session
+
+    def close(self) -> None:
+        """Close the session, stop the loop, join the thread, release the loop.
+
+        Best-effort: every failure is logged rather than raised. Calling this
+        again after the loop has been closed is a no-op.
+        """
+        if self._loop.is_closed():
+            return
+        try:
+            if self._session is not None:
+                future = asyncio.run_coroutine_threadsafe(
+                    self._session.close(), self._loop
+                )
+                future.result(timeout=JOIN_TIMEOUT_S)
+        except Exception:
+            logger.exception("scripted_runtime: failed to close aiohttp session")
+        try:
+            self._loop.call_soon_threadsafe(self._loop.stop)
+            self._thread.join(timeout=JOIN_TIMEOUT_S)
+            # Only close once the thread has really exited; closing a loop
+            # that is still running raises RuntimeError. Closing releases the
+            # loop's selector and self-pipe file descriptors.
+            if not self._thread.is_alive():
+                self._loop.close()
+        except Exception:
+            logger.exception("scripted_runtime: failed to stop background async loop")
@@ -0,0 +1,3 @@
+from sglang.test.scripted_runtime.context.api import ScriptedContext
+
+__all__ = ["ScriptedContext"]
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from sglang.test.scripted_runtime.context.api import ScriptedContext`
	`2`	`+`
	`3`	`+__all__ = ["ScriptedContext"]`