lightseekorg
diff --git a/‎docs/reference/configuration.md‎
Lines changed: 7 additions & 1 deletion b/‎docs/reference/configuration.md‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎grpc_servicer/smg_grpc_servicer/tokenspeed/servicer.py‎
Lines changed: 77 additions & 21 deletions b/‎grpc_servicer/smg_grpc_servicer/tokenspeed/servicer.py‎
Lines changed: 77 additions & 21 deletions
diff --git a/‎grpc_servicer/tests/test_tokenspeed_multimodal_shm.py‎
Lines changed: 74 additions & 0 deletions b/‎grpc_servicer/tests/test_tokenspeed_multimodal_shm.py‎
Lines changed: 74 additions & 0 deletions
@@ -942,11 +942,17 @@ smg \
 These env-only variables tune how the router ships preprocessed multimodal
 tensors (image/video encoder inputs) to a TokenSpeed worker. They do not affect
 accuracy — the inline and shared-memory paths produce byte-identical tensors.
+SHM handles carry a byte offset, so multi-item TokenSpeed encoder inputs may
+share one packed segment. The tensor payloads stay byte-exact, and per-tensor
+file lifecycle overhead is reduced.
 
 | Environment Variable | Default | Description |
 |---------------------|---------|-------------|
-| `SMG_TOKENSPEED_MM_TENSOR_TRANSPORT` | `inline` | Transport for large MM tensors: `inline` (gRPC bytes), `shm` (always use `/dev/shm`), or `auto` (use `/dev/shm` only when the worker is *verified* to share it). In `auto`, the router compares the worker's advertised `/dev/shm` namespace token (`GetServerInfo`) to its own and uses SHM only on a match; otherwise it falls back to inline. No locality configuration is needed. |
+| `SMG_TOKENSPEED_MM_TENSOR_TRANSPORT` | image/audio: `inline`; video: `auto` | Transport for large MM tensors: `inline` (gRPC bytes), `shm` (always use `/dev/shm`), or `auto` (use `/dev/shm` only when the worker is *verified* to share it). When unset, image/audio stay inline while video uses `auto` to avoid the high-throughput video gRPC byte-copy path on colocated workers without hurting image TTFT. In `auto`, the router compares the worker's advertised `/dev/shm` namespace token (`GetServerInfo`) to its own and uses SHM only on a match; otherwise it falls back to inline. No locality configuration is needed. |
 | `SMG_TOKENSPEED_MM_SHM_MIN_BYTES` | `65536` | Minimum tensor size (bytes) before the SHM path is used; smaller tensors stay inline. |
+| `SMG_MM_PREPROCESS_PAR_MIN_BYTES` | `524288` | Minimum output size before CPU image/video preprocessing splits work across helper threads. |
+| `SMG_MM_PREPROCESS_PAR_MIN_ROWS` | `32` | Minimum output rows or block bands per helper thread for CPU multimodal preprocessing. |
+| `SMG_MM_PREPROCESS_PAR_MAX_THREADS` | `8` | Maximum helper threads spawned per image/video preprocessing pass. Raise for large single requests; keep lower for high-concurrency TTFT. |
 | `SMG_LOG_MM_TIMING` | `false` | Log per-stage multimodal preprocessing/assembly timing at `INFO`. Accepts `1`/`true`/`yes`. |
 
 The TokenSpeed gRPC servicer (worker side) reads two companion variables:
 
@@ -13,6 +13,7 @@
 import dataclasses
 import json
 import logging
+import math
 import os
 import re
 import time
@@ -994,23 +995,26 @@ def _mm_inputs_from_itemized_proto(
                 for name, tensor_data in item_proto.model_specific_tensors.items()
             }
             model_elapsed_ms = (
-                (time.perf_counter() - model_started) * 1000 if model_started is not None else None
+                (time.perf_counter() - model_started) * 1000
+                if model_started is not None
+                else None
             )
             self._validate_item_tensor_consistency(modality, model_specific_data)
 
-            if not item_proto.placeholders:
-                raise ValueError("MultimodalItem carried no placeholders")
-            if any(p.length <= 0 for p in item_proto.placeholders):
-                raise ValueError("MultimodalItem.placeholders.length must be > 0")
-            offsets = [(p.offset, p.offset + p.length - 1) for p in item_proto.placeholders]
+            offsets, token_count, offset_ends, offset_prefix = (
+                self._offsets_from_proto_placeholders(item_proto.placeholders)
+            )
 
             content_hash = bytes(item_proto.content_hash)
             mm_item = MultimodalDataItem(
                 modality=modality,
                 feature=feature,
                 model_specific_data=model_specific_data,
                 offsets=offsets,
+                token_count=token_count,
                 hash=int.from_bytes(content_hash[:8], "little") if content_hash else None,
+                offset_ends=offset_ends,
+                offset_prefix=offset_prefix,
             )
             mm_item.set_pad_value()
             items.append(mm_item)
@@ -1055,6 +1059,7 @@ def _mm_inputs_from_itemized_proto(
             mm_items=items,
             im_token_id=im_token_id,
             video_token_id=video_token_id,
+            pad_values_ready=True,
         )
 
     @staticmethod
@@ -1092,6 +1097,44 @@ def _validate_item_tensor_consistency(
         if modality == Modality.VIDEO and not has_video_grid:
             raise ValueError("VIDEO MultimodalItem must carry video_grid_thw")
 
+    @staticmethod
+    def _offsets_from_proto_placeholders(  # Validate placeholder ranges and build inclusive offsets plus token-count bookkeeping.
+        placeholders,  # sized, indexable sequence of objects exposing .offset and .length
+    ) -> tuple[list[tuple[int, int]], int, list[int] | None, list[int] | None]:  # (offsets, token_count, offset_ends, offset_prefix)
+        if len(placeholders) == 1:  # fast path: a single range needs no ordering check
+            placeholder = placeholders[0]
+            length = int(placeholder.length)
+            if length <= 0:
+                raise ValueError("MultimodalItem.placeholders.length must be > 0")
+            start = int(placeholder.offset)
+            end = start + length - 1  # end index is inclusive
+            return [(start, end)], length, [end], [0, length]
+
+        offsets = []  # inclusive (start, end) pair per placeholder, in input order
+        offset_ends = []  # inclusive end of each range, in input order
+        offset_prefix = [0]  # running token totals; entry i is the count before range i
+        sorted_non_overlapping = True
+        prev_end = -1  # sentinel: a first range with start <= -1 also clears the sorted flag
+        token_count = 0
+        for placeholder in placeholders:
+            length = int(placeholder.length)
+            if length <= 0:
+                raise ValueError("MultimodalItem.placeholders.length must be > 0")
+            start = int(placeholder.offset)
+            end = start + length - 1  # end index is inclusive
+            if start <= prev_end:  # range begins at or before the previous range's end
+                sorted_non_overlapping = False
+            offsets.append((start, end))
+            offset_ends.append(end)
+            token_count += length
+            offset_prefix.append(token_count)
+            prev_end = end
+        if not offsets:  # empty input lands here because the loop body never ran
+            raise ValueError("MultimodalItem carried no placeholders")
+        if not sorted_non_overlapping:
+            return offsets, token_count, None, None  # unsorted or overlapping: offset_ends/offset_prefix are withheld
+        return offsets, token_count, offset_ends, offset_prefix
+
     @staticmethod
     def _tensor_from_proto(
         tensor_data: tokenspeed_scheduler_pb2.TensorData,
@@ -1107,7 +1150,7 @@ def _tensor_from_proto(
 
         if tensor_data.dtype == "bfloat16":
             # numpy has no bfloat16 — read the raw bits as uint16, reinterpret.
-            expected = int(np.prod(shape, dtype=np.int64)) * np.dtype(np.uint16).itemsize
+            expected = math.prod(shape) * np.dtype(np.uint16).itemsize
             if len(raw) != expected:
                 raise ValueError(
                     f"TensorData byte length mismatch for bfloat16 shape={shape}: "
@@ -1118,7 +1161,7 @@ def _tensor_from_proto(
             )
         else:
             dtype = np.dtype(tensor_data.dtype)
-            expected = int(np.prod(shape, dtype=np.int64)) * dtype.itemsize
+            expected = math.prod(shape) * dtype.itemsize
             if len(raw) != expected:
                 raise ValueError(
                     f"TensorData byte length mismatch for dtype={tensor_data.dtype}, "
@@ -1146,27 +1189,32 @@ def _feature_from_proto(
             return TokenSpeedSchedulerServicer._tensor_from_proto(tensor_data, cast_to=cast_to)
 
         dtype = TokenSpeedSchedulerServicer._torch_dtype_from_proto(tensor_data.dtype)
-        if (
-            cast_to is not None
-            and dtype != cast_to
-            and torch.is_floating_point(torch.empty((), dtype=dtype))
-        ):
-            return TokenSpeedSchedulerServicer._tensor_from_proto(tensor_data, cast_to=cast_to)
-
-        shm = tensor_data.shm
-        if shm.offset != 0:
+        if cast_to is not None and dtype != cast_to:
             return TokenSpeedSchedulerServicer._tensor_from_proto(tensor_data, cast_to=cast_to)
 
         shape = tuple(int(dim) for dim in tensor_data.shape)
-        expected = int(np.prod(shape, dtype=np.int64)) * torch.empty((), dtype=dtype).element_size()
-        if int(shm.nbytes) != expected:
+        expected = math.prod(shape) * TokenSpeedSchedulerServicer._torch_dtype_size(dtype)
+        shm = tensor_data.shm
+        offset = int(shm.offset)
+        nbytes = int(shm.nbytes)
+        if offset < 0:
+            raise ValueError(
+                f"TensorData.shm offset must be non-negative for shape={list(shape)}: {offset}"
+            )
+        if nbytes != expected:
             raise ValueError(
                 f"TensorData.shm byte length mismatch for dtype={tensor_data.dtype}, "
-                f"shape={list(shape)}: expected {expected}, got {int(shm.nbytes)}"
+                f"shape={list(shape)}: expected {expected}, got {nbytes}"
             )
 
         name = TokenSpeedSchedulerServicer._validated_shm_name(shm.name)
-        return ShmTensorHandle(shm_name=name, shape=shape, dtype=dtype)
+        return ShmTensorHandle(
+            shm_name=name,
+            shape=shape,
+            dtype=dtype,
+            offset=offset,
+            nbytes=nbytes,
+        )
 
     @staticmethod
     def _tensor_payload_bytes(tensor_data: tokenspeed_scheduler_pb2.TensorData) -> bytes:
@@ -1223,6 +1271,14 @@ def _torch_dtype_from_proto(dtype: str) -> torch.dtype:
             return torch.float32
         raise ValueError(f"Unsupported TensorData dtype for SHM feature: {dtype!r}")
 
+    @staticmethod
+    def _torch_dtype_size(dtype: torch.dtype) -> int:  # Return the per-element byte size of dtype.
+        if dtype is torch.float32:  # hard-coded sizes avoid building the temporary tensor used in the fallback
+            return 4
+        if dtype is torch.float16 or dtype is torch.bfloat16:
+            return 2
+        return torch.empty((), dtype=dtype).element_size()  # generic fallback: ask a zero-dim tensor for its element size
+
     @staticmethod
     def _torch_dtype_to_proto(dtype: torch.dtype | None) -> str:
         if dtype is torch.bfloat16:
 
@@ -0,0 +1,74 @@
+import pytest
+import torch
+from smg_grpc_proto.generated import tokenspeed_scheduler_pb2
+from smg_grpc_servicer.tokenspeed.servicer import TokenSpeedSchedulerServicer
+from tokenspeed.runtime.multimodal.shm_transport import ShmTensorHandle
+
+
+def test_feature_from_proto_preserves_offset_shm_handle():  # A non-zero SHM offset must come back as a ShmTensorHandle carrying offset and nbytes.
+    tensor = tokenspeed_scheduler_pb2.TensorData(
+        shape=[3, 4],
+        dtype="float32",
+        shm=tokenspeed_scheduler_pb2.ShmHandle(
+            name="smg-tokenspeed-test",
+            offset=128,  # non-zero on purpose: the handle must keep it
+            nbytes=3 * 4 * 4,  # 3x4 elements at 4 bytes each (float32)
+            owner_id="smg:test",
+        ),
+    )
+
+    feature = TokenSpeedSchedulerServicer._feature_from_proto(tensor)
+
+    assert isinstance(feature, ShmTensorHandle)
+    assert feature.shm_name == "smg-tokenspeed-test"
+    assert feature.shape == (3, 4)  # shape is expected as a tuple, not the proto's repeated field
+    assert feature.dtype is torch.float32
+    assert feature.offset == 128
+    assert feature.nbytes == 3 * 4 * 4
+
+
+def test_feature_from_proto_rejects_offset_shm_length_mismatch():  # nbytes that disagrees with shape x dtype size must raise ValueError.
+    tensor = tokenspeed_scheduler_pb2.TensorData(
+        shape=[3, 4],
+        dtype="float32",
+        shm=tokenspeed_scheduler_pb2.ShmHandle(
+            name="smg-tokenspeed-test",
+            offset=128,
+            nbytes=4,  # deliberately wrong: a 3x4 float32 tensor needs 48 bytes
+            owner_id="smg:test",
+        ),
+    )
+
+    with pytest.raises(ValueError, match="byte length mismatch"):
+        TokenSpeedSchedulerServicer._feature_from_proto(tensor)
+
+
+def test_offsets_from_proto_placeholders_validates_and_builds_offsets_once():  # Two sorted, disjoint ranges yield offsets, total, ends and prefix sums.
+    placeholders = [
+        tokenspeed_scheduler_pb2.PlaceholderRange(offset=10, length=3),
+        tokenspeed_scheduler_pb2.PlaceholderRange(offset=20, length=1),
+    ]
+
+    assert TokenSpeedSchedulerServicer._offsets_from_proto_placeholders(
+        placeholders
+    ) == ([(10, 12), (20, 20)], 4, [12, 20], [0, 3, 4])  # inclusive ends 12 and 20; prefix sums 0, 3, 4
+
+
+def test_offsets_from_proto_placeholders_single_placeholder_fast_path():  # A one-element input exercises the len == 1 early return.
+    placeholders = [
+        tokenspeed_scheduler_pb2.PlaceholderRange(offset=10, length=3),
+    ]
+
+    assert TokenSpeedSchedulerServicer._offsets_from_proto_placeholders(
+        placeholders
+    ) == ([(10, 12)], 3, [12], [0, 3])  # same tuple layout as the multi-range path
+
+
+def test_offsets_from_proto_placeholders_rejects_empty_and_non_positive_lengths():  # Both validation errors must surface as ValueError.
+    with pytest.raises(ValueError, match="no placeholders"):
+        TokenSpeedSchedulerServicer._offsets_from_proto_placeholders([])  # empty list skips the single-item fast path
+
+    with pytest.raises(ValueError, match="length must be > 0"):
+        TokenSpeedSchedulerServicer._offsets_from_proto_placeholders(
+            [tokenspeed_scheduler_pb2.PlaceholderRange(offset=10, length=0)]  # zero length is rejected in the single-item fast path
+        )