lightseekorg
diff --git a/‎.github/CODEOWNERS‎
Lines changed: 3 additions & 3 deletions b/‎.github/CODEOWNERS‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎bindings/python/src/lib.rs‎
Lines changed: 1 addition & 0 deletions b/‎bindings/python/src/lib.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/grpc_client/python/README.md‎
Lines changed: 1 addition & 1 deletion b/‎crates/grpc_client/python/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/grpc_client/python/pyproject.toml‎
Lines changed: 2 additions & 2 deletions b/‎crates/grpc_client/python/pyproject.toml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/getting-started/kv-events-cache-aware.md‎
Lines changed: 28 additions & 0 deletions b/‎docs/getting-started/kv-events-cache-aware.md‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎docs/reference/configuration.md‎
Lines changed: 29 additions & 0 deletions b/‎docs/reference/configuration.md‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎grpc_servicer/pyproject.toml‎
Lines changed: 6 additions & 3 deletions b/‎grpc_servicer/pyproject.toml‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎grpc_servicer/smg_grpc_servicer/kv_events.py‎
Lines changed: 164 additions & 0 deletions b/‎grpc_servicer/smg_grpc_servicer/kv_events.py‎
Lines changed: 164 additions & 0 deletions
@@ -6,7 +6,7 @@
 
 # Python bindings
 /bindings/python @CatherineSue @key4ng @gongwei-130 @slin1237
-/bindings/golang @whybeyoung @slin1237
+/bindings/golang @slin1237
 
 # E2E tests
 /e2e_test @CatherineSue @key4ng @XinyueZhang369 @slin1237
@@ -26,12 +26,12 @@
 
 # Workspace crates
 /crates/auth @slin1237
-/crates/data_connector @key4ng @zhoug9127 @zhaowenzi
+/crates/data_connector @key4ng @zhoug9127
 /crates/grpc_client @CatherineSue @slin1237
 /crates/grpc_client/proto/vllm_engine.proto @njhill @CatherineSue @slin1237
 /crates/grpc_client/proto/common.proto @njhill @CatherineSue @slin1237
 /crates/kv_index @slin1237
-/crates/mcp @key4ng @CatherineSue @slin1237 @zhoug9127 @zhaowenzi
+/crates/mcp @key4ng @CatherineSue @slin1237 @zhoug9127
 /crates/mesh @tonyluj @llfl @slin1237
 /crates/multimodal @slin1237 @CatherineSue
 /crates/protocols @CatherineSue @key4ng
 
@@ -1198,6 +1198,7 @@ impl Router {
                 host: self.host.clone(),
                 port: self.port,
                 health_check_port: self.health_check_port,
+                runtime_worker_threads: None,
                 router_config,
                 max_payload_size: self.max_payload_size,
                 log_dir: self.log_dir.clone(),
 
@@ -17,7 +17,7 @@ This package provides pre-compiled Python gRPC stubs for:
 pip install smg-grpc-proto
 ```
 
-Requires `grpcio>=1.78.0` and `protobuf>=5.26.0`.
+Requires `grpcio>=1.81.1` and `protobuf>=5.26.0`.
 
 ## Usage
 
 
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=68.0", "grpcio-tools>=1.78.0"]
+requires = ["setuptools>=68.0", "grpcio-tools>=1.81.1"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -8,7 +8,7 @@ version = "0.4.11"
 description = "SMG gRPC proto definitions for vLLM, TRT-LLM, MLX, TokenSpeed, and SGLang"
 requires-python = ">=3.10"
 dependencies = [
-    "grpcio>=1.78.0",
+    "grpcio>=1.81.1",
     "protobuf>=5.26.0",
 ]
 readme = "README.md"
 
@@ -117,6 +117,31 @@ SMG learns the block size from the `BlockStored` events themselves, so you needn
 
 Everything downstream — SMG flags, block-size learning, and the verification logs — is unchanged; `KvEventMonitor` consumes the events the same way for any gRPC worker.
 
+### Alternative: launch a TokenSpeed worker
+
+TokenSpeed's scheduler publishes KV cache events on a ZMQ socket; enable them with `--kv-events-config`. The TokenSpeed gRPC server *is* the SMG gRPC entrypoint, so there is no separate `--grpc` flag:
+
+```bash
+# TokenSpeed is installed from source (engine + kernel + scheduler); see
+# scripts/ci_install_tokenspeed.sh. Install the bridge's extra deps:
+pip install "smg-grpc-servicer[tokenspeed]"
+
+# --kv-events-config turns on KV-event publishing in the scheduler:
+python -m smg_grpc_servicer.tokenspeed \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --host 0.0.0.0 \
+  --port 50051 \
+  --kv-events-config '{"enable_kv_cache_events": true, "publisher": "zmq", "endpoint": "tcp://*:5557", "topic": "kv-events"}'
+```
+
+| Field | Why |
+|---|---|
+| `enable_kv_cache_events: true` | TokenSpeed master switch. Without it the scheduler records no events even if a publisher is set. |
+| `publisher: "zmq"` | Selects the ZMQ publisher the servicer bridges. If unset, it defaults to `"zmq"` when events are enabled; `"null"` (or any other value) disables bridging. |
+| `endpoint` / `topic` | ZMQ `PUB` address and topic prefix. Use a **bind-style** endpoint (`tcp://*:PORT`) — TokenSpeed only *binds* when the endpoint contains `*`/`::`/`ipc://`/`inproc://`, so a concrete address like `tcp://127.0.0.1:PORT` makes it *connect* instead, leaving nothing bound and the stream idle. For data-parallel deployments, each rank publishes on port `endpoint_port + dp_rank`, and SMG currently consumes rank 0. |
+
+`--kv-events-config` is parsed by TokenSpeed's `KVEventsConfig.from_cli`. SMG learns the block size from the `BlockStored` events themselves, so you needn't set it; pass TokenSpeed's `--page-size N` only to pin a non-default value. Everything downstream is identical to the SGLang and vLLM paths.
+
 ---
 
 ## Step 2 — Launch SMG
@@ -240,4 +265,7 @@ If events never arrive, the policy keeps working — it falls back to the approx
 - Event subscription manager: `model_gateway/src/worker/kv_event_monitor.rs`
 - KV event proto: `crates/grpc_client/proto/common.proto` (messages `KvEventBatch`, `KvCacheEvent`, `KvBlocksStored`, `KvBlocksRemoved`)
 - Servicer bridge: `grpc_servicer/smg_grpc_servicer/sglang/servicer.py` (`SubscribeKvEvents`)
+- Shared ZMQ→proto conversion: `grpc_servicer/smg_grpc_servicer/kv_events.py` (engine-neutral; used by the vLLM and TokenSpeed bridges)
+- TokenSpeed servicer bridge: `grpc_servicer/smg_grpc_servicer/tokenspeed/servicer.py` (`SubscribeKvEvents`) + config resolver `grpc_servicer/smg_grpc_servicer/tokenspeed/kv_events.py`
 - SGLang upstream config: `python/sglang/srt/disaggregation/kv_events.py` (class `KVEventsConfig`)
+- TokenSpeed upstream config: `tokenspeed/runtime/pd/kv_events.py` (class `KVEventsConfig`)
@@ -517,6 +517,35 @@ headers can otherwise spoof storage hook request context values.
 
 ---
 
+## Runtime Configuration
+
+Controls the tokio async runtime that backs request handling.
+
+By default the runtime is **container-aware**. tokio sizes its worker pool to
+`std::thread::available_parallelism()`, which on Rust 1.95+ already reads the
+cgroup CPU quota — so under a Kubernetes `limits.cpu` the worker count matches
+the pod's quota, not the host's core count. No extra configuration is needed for
+the default to be right under a CPU limit.
+
+Do **not** set an inflated `TOKIO_WORKER_THREADS` (for example a fixed `32`).
+That overrides the container-aware default and oversubscribes worker threads
+against the cores the scheduler actually grants, causing scheduler thrash,
+tail-latency spikes, and `/health` starvation. Leaving it unset is the correct
+production configuration.
+
+### Worker Threads
+
+Explicit async runtime worker-thread count. Leave unset to use tokio's
+container-aware default above; set it only to pin an explicit count (overriding
+the cgroup-quota-derived default).
+
+| Option | `--runtime-worker-threads` |
+|--------|----------------------------|
+| Environment | - |
+| Default | tokio default (`available_parallelism()`, cgroup-quota-aware) |
+
+---
+
 ## Rate Limiting Configuration
 
 ### Concurrent Request Limit
 
@@ -9,9 +9,9 @@ description = "SMG gRPC servicer implementations for LLM inference engines (vLLM
 requires-python = ">=3.10"
 dependencies = [
     "smg-grpc-proto>=0.4.11",
-    "grpcio>=1.78.0",
-    "grpcio-reflection>=1.78.0",
-    "grpcio-health-checking>=1.78.0",
+    "grpcio>=1.81.1",
+    "grpcio-reflection>=1.81.1",
+    "grpcio-health-checking>=1.81.1",
 ]
 readme = "README.md"
 license = { text = "Apache-2.0" }
@@ -36,6 +36,9 @@ sglang = ["sglang>=0.5.10"]
 # without this floor, installing [mlx] against an older proto build would
 # crash at import time when smg_grpc_servicer.mlx.server runs.
 mlx = ["smg-grpc-proto>=0.4.7", "mlx>=0.22.0", "mlx-lm>=0.22.0"]
+# TokenSpeed itself is installed from source (no PyPI release); these are the
+# extra runtime deps the KV-event bridge needs on top of a TokenSpeed install.
+tokenspeed = ["pyzmq>=25.0.0", "msgspec>=0.18.0"]
 
 [project.urls]
 Homepage = "https://github.com/lightseekorg/smg"
 
@@ -0,0 +1,164 @@
+"""Engine-neutral KV-cache-event → proto conversion and ZMQ streaming.
+
+Shared by every engine bridge (vLLM, TokenSpeed, ...). Imports only stdlib +
+the generated proto, and dispatches engine events by class name (BlockStored /
+BlockRemoved / AllBlocksCleared), so it needs no engine import and is
+unit-testable without any engine installed.
+
+Each engine package keeps its own ``resolve_kv_events_config`` (the only
+engine-specific seam); everything here is wire-format-only.
+"""
+
+import logging
+from collections.abc import AsyncIterator, Awaitable, Callable
+
+from smg_grpc_proto.generated import common_pb2
+
+logger = logging.getLogger(__name__)
+
+_U64_MASK = 0xFFFFFFFFFFFFFFFF
+_I64_SIGN_BIT = 0x8000000000000000
+_U64_MODULUS = 0x10000000000000000
+
+
+def to_int64(value: int | bytes) -> int:
+    """Reduce an engine block hash to a signed int64 for the proto block_hash field.
+
+    An engine's block hash may be ``int | bytes`` (sha256 bytes when int hashes
+    are disabled); bytes are read big-endian. SMG uses the hash only as a node
+    identity, so the 64-bit reduction is safe as long as it stays deterministic.
+    """
+    if isinstance(value, (bytes, bytearray)):
+        value = int.from_bytes(value, "big")
+    masked = value & _U64_MASK
+    if masked >= _I64_SIGN_BIT:
+        masked -= _U64_MODULUS
+    return masked
+
+
def endpoint_for_rank(endpoint: str, dp_rank: int) -> str:
    """Resolve a KV-events PUB endpoint to a connectable SUB address.

    For ``tcp://`` endpoints the host is rewritten to ``127.0.0.1`` only when
    it *is* a bind wildcard (``*`` or ``0.0.0.0``; the latter is not
    connectable on macOS/Windows). Concrete addresses that merely contain
    those characters (e.g. ``10.0.0.0``) are left untouched. For data-parallel
    deployments each rank publishes on ``base_port + dp_rank``, so a numeric
    port is shifted by ``dp_rank`` when it is non-zero.

    Non-tcp endpoints (ipc://, inproc://, or strings without a scheme) get the
    wildcard text substituted but no port arithmetic.

    Args:
        endpoint: the publisher's configured endpoint string.
        dp_rank: data-parallel rank whose publisher should be reached.

    Returns:
        The endpoint string a SUB socket should connect to.
    """
    loopback = "127.0.0.1"

    scheme, scheme_sep, address = endpoint.partition("://")
    if not scheme_sep or scheme != "tcp":
        # No host/port structure to reason about: plain textual substitution.
        return endpoint.replace("*", loopback).replace("0.0.0.0", loopback)

    host, port_sep, port = address.rpartition(":")
    if not port_sep:
        # "tcp://host" with no port component at all.
        host, port = address, ""

    # Compare the whole host so "10.0.0.0" is not mangled into "1127.0.0.1".
    # NOTE: IPv6 wildcards such as "[::]" are passed through unchanged.
    if host in ("*", "0.0.0.0"):
        host = loopback

    if not port_sep:
        return f"tcp://{host}"
    if dp_rank and port.isdigit():
        port = str(int(port) + dp_rank)
    return f"tcp://{host}:{port}"
+
+
def convert_event(event: object, event_id: int) -> common_pb2.KvCacheEvent | None:
    """Translate one decoded engine event into a proto ``KvCacheEvent``.

    Dispatch is by the event's class name, so no engine module is imported:

    * ``AllBlocksCleared`` -> ``cleared``
    * ``BlockRemoved``     -> ``removed`` with every hash reduced via ``to_int64``
    * ``BlockStored``      -> ``stored``; ``token_ids`` is cut into consecutive
      ``block_size``-long slices, one per entry of ``block_hashes``. The
      optional ``lora_id`` and ``parent_block_hash`` attributes are copied only
      when present and not ``None``.

    Returns ``None`` (after a debug log) for any other class name.
    """
    kind = type(event).__name__

    if kind == "AllBlocksCleared":
        return common_pb2.KvCacheEvent(event_id=event_id, cleared=common_pb2.KvCacheCleared())

    if kind == "BlockRemoved":
        removed_hashes = [to_int64(raw_hash) for raw_hash in event.block_hashes]
        return common_pb2.KvCacheEvent(
            event_id=event_id,
            removed=common_pb2.KvBlocksRemoved(block_hashes=removed_hashes),
        )

    if kind != "BlockStored":
        logger.debug("Unknown KV event type %r, skipping", kind)
        return None

    tokens_per_block = int(event.block_size)
    proto_blocks = []
    offset = 0  # index into token_ids where the current block's tokens begin
    for raw_hash in event.block_hashes:
        kv_block = common_pb2.KvBlock(
            block_hash=to_int64(raw_hash),
            token_ids=list(event.token_ids[offset : offset + tokens_per_block]),
            block_size=tokens_per_block,
        )
        adapter_id = getattr(event, "lora_id", None)
        if adapter_id is not None:
            kv_block.lora_id = to_int64(adapter_id)
        proto_blocks.append(kv_block)
        offset += tokens_per_block

    stored = common_pb2.KvBlocksStored(blocks=proto_blocks)
    parent_hash = getattr(event, "parent_block_hash", None)
    if parent_hash is not None:
        stored.parent_block_hash = to_int64(parent_hash)
    return common_pb2.KvCacheEvent(event_id=event_id, stored=stored)
+
+
def convert_batch(
    raw_batch: object, seq_num: int, event_id_start: int
) -> tuple[common_pb2.KvEventBatch, int]:
    """Build a proto ``KvEventBatch`` from a decoded engine batch.

    Args:
        raw_batch: decoded engine batch exposing ``ts`` and ``events``, and
            optionally ``data_parallel_rank`` (vLLM) or ``attn_dp_rank``
            (TokenSpeed).
        seq_num: value stored in the proto's ``sequence_number``.
        event_id_start: id counter before this batch; the first event of the
            batch is numbered ``event_id_start + 1``.

    Returns:
        ``(proto_batch, last_event_id)``. The counter moves forward once per
        input event, including events ``convert_event`` could not translate,
        so ids remain monotonic. ``dp_rank`` is left unset when neither rank
        attribute yields a non-``None`` value.
    """
    batch = common_pb2.KvEventBatch(sequence_number=seq_num, timestamp=raw_batch.ts)

    # First non-None rank attribute wins; the vLLM spelling is checked first.
    for rank_attr in ("data_parallel_rank", "attn_dp_rank"):
        rank = getattr(raw_batch, rank_attr, None)
        if rank is not None:
            batch.dp_rank = rank
            break

    last_id = event_id_start
    for last_id, raw_event in enumerate(raw_batch.events, start=event_id_start + 1):
        converted = convert_event(raw_event, last_id)
        if converted is not None:
            batch.events.append(converted)
    return batch, last_id
+
+
async def stream_kv_events(
    sub_socket: object,
    decode: Callable[[bytes], object],
    send_initial_metadata: Callable[[], Awaitable[None]],
    is_cancelled: Callable[[], bool],
    *,
    recv_timeout: float = 1.0,
) -> AsyncIterator[common_pb2.KvEventBatch]:
    """Core ZMQ→proto streaming loop, decoupled from any engine and gRPC types.

    Args:
        sub_socket: a connected ``zmq.asyncio`` SUB socket (duck-typed; only
            ``poll()`` and ``recv_multipart()`` are used). The caller owns the
            socket lifecycle (this function never closes it).
        decode: bytes → decoded engine batch (e.g. ``msgspec.msgpack.Decoder(KVEventBatch).decode``).
        send_initial_metadata: awaitable called once before the first recv so the
            gRPC client's ``subscribe_kv_events().await`` resolves promptly.
        is_cancelled: returns True when the RPC is cancelled; loop then exits.
        recv_timeout: poll timeout in seconds so cancellation is observed even
            when idle.

    Yields proto KvEventBatch using the ZMQ publisher's native sequence numbers.

    A message with fewer than three frames is skipped silently. A message whose
    payload cannot be decoded, or whose decoded batch cannot be converted to
    proto, is logged at WARNING and dropped; the stream keeps running and the
    event-id counter is left where it was.
    """
    await send_initial_metadata()
    poll_timeout_ms = int(recv_timeout * 1000)  # loop-invariant; zmq polls in ms
    event_id = 0
    while not is_cancelled():
        # poll() before recv: cancelling a zmq.asyncio recv future does not
        # cancel the in-flight ZMQ recv and can drop an already-dequeued message.
        if not await sub_socket.poll(timeout=poll_timeout_ms):
            continue
        frames = await sub_socket.recv_multipart()

        # ZMQ multipart: [topic, 8-byte big-endian seq, msgpack payload].
        if len(frames) < 3:
            continue
        zmq_seq = int.from_bytes(frames[1], "big")
        try:
            raw_batch = decode(frames[2])
        except Exception as e:  # noqa: BLE001 - one bad frame must not kill the stream
            logger.warning("Failed to decode KV event batch: %s", e)
            continue

        # A batch can decode cleanly and still fail conversion (unexpected
        # attribute types, out-of-range proto values). Treat it like a bad
        # frame instead of letting the exception end the subscription.
        try:
            proto_batch, event_id = convert_batch(raw_batch, zmq_seq, event_id)
        except Exception as e:  # noqa: BLE001 - one bad batch must not kill the stream
            logger.warning("Failed to convert KV event batch (seq=%d): %s", zmq_seq, e)
            continue

        # Yield outside the try blocks so consumer-side exceptions thrown into
        # the generator are never mistaken for conversion failures.
        yield proto_batch