Skip to content

Commit cd7ba84

Browse files
alec-flowersclaude
andcommitted
feat: add mocker backend for smoke-testing the full srt-slurm pipeline
Adds a new `mocker` backend type that uses `dynamo.mocker` — a scheduler simulator that validates model paths, loads tokenizer config, and registers with etcd/NATS discovery without loading model weights. This enables fast end-to-end validation of SLURM jobs, container mounts, discovery, frontend routing, and benchmark clients. Usage: set `backend.type: mocker` in any recipe to swap in the mock server. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 835ddb6 commit cd7ba84

7 files changed

Lines changed: 786 additions & 3 deletions

File tree

recipes/mocker/agg.yaml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Mocker smoke test - Aggregated mode
2+
# Validates full srt-slurm pipeline (SLURM, container, mounts, tokenizer,
3+
# discovery, frontend, benchmark) without loading model weights.
4+
#
5+
# Usage:
6+
# srtctl apply -f recipes/mocker/agg.yaml
7+
# srtctl dry-run -f recipes/mocker/agg.yaml
8+
9+
name: "smoke-test-agg"
10+
11+
slurm:
12+
time_limit: "00:15:00"
13+
14+
model:
15+
path: "hf:Qwen/Qwen3-0.6B"
16+
container: "dynamo-sglang"
17+
precision: "fp16"
18+
19+
resources:
20+
gpu_type: "gb200"
21+
gpus_per_node: 4
22+
agg_nodes: 1
23+
agg_workers: 1
24+
25+
frontend:
26+
type: dynamo
27+
enable_multiple_frontends: false
28+
29+
backend:
30+
type: mocker
31+
speedup_ratio: 100
32+
engine_type: vllm
33+
34+
benchmark:
35+
type: "sa-bench"
36+
isl: 128
37+
osl: 128
38+
concurrencies: "4"

recipes/mocker/disagg.yaml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Mocker smoke test - Disaggregated mode
2+
# Validates prefill/decode disaggregation pipeline with bootstrap rendezvous.
3+
#
4+
# Usage:
5+
# srtctl apply -f recipes/mocker/disagg.yaml
6+
# srtctl dry-run -f recipes/mocker/disagg.yaml
7+
8+
name: "smoke-test-disagg"
9+
10+
slurm:
11+
time_limit: "00:15:00"
12+
13+
model:
14+
path: "hf:Qwen/Qwen3-0.6B"
15+
container: "dynamo-sglang"
16+
precision: "fp16"
17+
18+
resources:
19+
gpu_type: "gb200"
20+
gpus_per_node: 4
21+
prefill_nodes: 1
22+
decode_nodes: 0
23+
prefill_workers: 1
24+
decode_workers: 1
25+
gpus_per_prefill: 1
26+
gpus_per_decode: 1
27+
28+
frontend:
29+
type: dynamo
30+
enable_multiple_frontends: false
31+
32+
backend:
33+
type: mocker
34+
speedup_ratio: 100
35+
engine_type: vllm
36+
37+
benchmark:
38+
type: "sa-bench"
39+
isl: 128
40+
osl: 128
41+
concurrencies: "4"

src/srtctl/backends/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,13 @@
1010
"""
1111

1212
from .base import BackendProtocol, BackendType, SrunConfig
13+
from .mocker import MockerProtocol, MockerServerConfig
1314
from .sglang import SGLangProtocol, SGLangServerConfig
1415
from .trtllm import TRTLLMProtocol, TRTLLMServerConfig
1516
from .vllm import VLLMProtocol, VLLMServerConfig
1617

1718
# Union type for all backend configs
18-
BackendConfig = SGLangProtocol | TRTLLMProtocol | VLLMProtocol
19+
BackendConfig = SGLangProtocol | TRTLLMProtocol | VLLMProtocol | MockerProtocol
1920

2021
__all__ = [
2122
# Base types
@@ -32,4 +33,7 @@
3233
# vLLM
3334
"VLLMProtocol",
3435
"VLLMServerConfig",
36+
# Mocker
37+
"MockerProtocol",
38+
"MockerServerConfig",
3539
]

src/srtctl/backends/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ class BackendType(str, Enum):
2222

2323
SGLANG = "sglang"
2424
TRTLLM = "trtllm"
25+
VLLM = "vllm"
26+
MOCKER = "mocker"
2527

2628

2729
@dataclass

src/srtctl/backends/mocker.py

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""
5+
Dynamo Mocker backend configuration.
6+
7+
Implements BackendProtocol for the dynamo.mocker scheduler simulator.
8+
Used for smoke-testing the full srt-slurm pipeline (SLURM, mounts,
9+
tokenizer, discovery, frontend, benchmark) without loading model weights.
10+
11+
The mocker validates model paths, reads tokenizer config, registers with
12+
etcd/NATS discovery, simulates scheduling and KV cache management, and
13+
generates random tokens at configurable simulated latency.
14+
"""
15+
16+
import builtins
17+
from collections.abc import Sequence
18+
from dataclasses import field
19+
from pathlib import Path
20+
from typing import (
21+
TYPE_CHECKING,
22+
Any,
23+
ClassVar,
24+
Literal,
25+
)
26+
27+
from marshmallow import Schema
28+
from marshmallow_dataclass import dataclass
29+
30+
if TYPE_CHECKING:
31+
from srtctl.backends.base import SrunConfig
32+
from srtctl.core.runtime import RuntimeContext
33+
from srtctl.core.topology import Endpoint, Process
34+
35+
# Type alias for worker modes
36+
WorkerMode = Literal["prefill", "decode", "agg"]
37+
38+
39+
@dataclass(frozen=True)
class MockerServerConfig:
    """Mocker CLI configuration per mode (prefill/decode/aggregated).

    Each mode can have its own configuration dict that gets converted
    to CLI flags when starting the mocker. Use for per-mode overrides
    of mocker-specific parameters.
    """

    # Per-mode override dicts; keys are turned into kebab-case CLI flags
    # when the worker command is built. None means "no overrides for
    # this mode".
    prefill: dict[str, Any] | None = None
    decode: dict[str, Any] | None = None
    aggregated: dict[str, Any] | None = None

    # NOTE(review): presumably replaced by a generated schema via the
    # marshmallow_dataclass decorator; declared so the attribute is
    # visible to type checkers — confirm against the other backends.
    Schema: ClassVar[type[Schema]] = Schema
53+
54+
55+
@dataclass(frozen=True)
class MockerProtocol:
    """Dynamo Mocker protocol - implements BackendProtocol.

    This frozen dataclass both holds configuration AND implements the
    BackendProtocol methods for process allocation and launching.

    The mocker is a drop-in replacement for real inference backends
    (sglang, vllm, trtllm) that simulates scheduling without loading
    model weights. It validates model paths, loads tokenizer config,
    and registers with etcd/NATS discovery identically to real workers.

    Example YAML:
        backend:
          type: mocker
          speedup_ratio: 100
          engine_type: vllm

    Or with per-mode overrides:
        backend:
          type: mocker
          speedup_ratio: 100
          engine_type: sglang
          mocker_config:
            prefill:
              max-num-seqs: 512
            decode:
              max-num-seqs: 128
    """

    # Discriminator used by the BackendConfig union to select this backend.
    type: Literal["mocker"] = "mocker"

    # Simulation parameters (forwarded as CLI flags to dynamo.mocker).
    engine_type: str = "vllm"
    speedup_ratio: float = 100.0
    decode_speedup_ratio: float = 1.0
    num_gpu_blocks_override: int = 16384
    max_num_seqs: int = 256
    max_num_batched_tokens: int = 8192
    block_size: int | None = None
    data_parallel_size: int = 1
    num_workers: int = 1
    startup_time: float | None = None
    kv_transfer_bandwidth: float | None = None
    kv_cache_dtype: str | None = None
    enable_prefix_caching: bool = True
    enable_chunked_prefill: bool = True
    preemption_mode: str | None = None

    # Environment variables per mode (merged into the worker's env).
    prefill_environment: dict[str, str] = field(default_factory=dict)
    decode_environment: dict[str, str] = field(default_factory=dict)
    aggregated_environment: dict[str, str] = field(default_factory=dict)

    # Per-mode CLI overrides (appended after the core flags, so they can
    # add — but not replace — earlier arguments).
    mocker_config: MockerServerConfig | None = None

    # builtins.type is used because the dataclass field `type` above
    # shadows the builtin name in this class body.
    Schema: ClassVar[builtins.type[Schema]] = Schema

    # =========================================================================
    # BackendProtocol Implementation
    # =========================================================================

    def get_srun_config(self) -> "SrunConfig":
        """Mocker uses per-process launching (one srun per node)."""
        # Local import to avoid importing srtctl internals at module load.
        from srtctl.backends.base import SrunConfig

        return SrunConfig(mpi=None, oversubscribe=False, launch_per_endpoint=False)

    def get_config_for_mode(self, mode: WorkerMode) -> dict[str, Any]:
        """Get merged config dict for a worker mode.

        Returns a copy, so callers may mutate the result freely.
        """
        if not self.mocker_config:
            return {}

        if mode == "prefill":
            return dict(self.mocker_config.prefill or {})
        elif mode == "decode":
            return dict(self.mocker_config.decode or {})
        elif mode == "agg":
            return dict(self.mocker_config.aggregated or {})
        return {}

    def get_environment_for_mode(self, mode: WorkerMode) -> dict[str, str]:
        """Get environment variables for a worker mode (copied)."""
        if mode == "prefill":
            return dict(self.prefill_environment)
        elif mode == "decode":
            return dict(self.decode_environment)
        elif mode == "agg":
            return dict(self.aggregated_environment)
        return {}

    def get_process_environment(self, process: "Process") -> dict[str, str]:
        """Get process-specific environment variables.

        The mocker does not need per-process env vars (no NIXL ports, etc.).
        """
        return {}

    def get_served_model_name(self, default: str) -> str:
        """Get served model name — mocker uses default (model path basename)."""
        return default

    def allocate_endpoints(
        self,
        num_prefill: int,
        num_decode: int,
        num_agg: int,
        gpus_per_prefill: int,
        gpus_per_decode: int,
        gpus_per_agg: int,
        gpus_per_node: int,
        available_nodes: Sequence[str],
    ) -> list["Endpoint"]:
        """Allocate endpoints to nodes.

        Delegates to the shared topology helper; the mocker has no
        backend-specific placement constraints.
        """
        from srtctl.core.topology import allocate_endpoints

        return allocate_endpoints(
            num_prefill=num_prefill,
            num_decode=num_decode,
            num_agg=num_agg,
            gpus_per_prefill=gpus_per_prefill,
            gpus_per_decode=gpus_per_decode,
            gpus_per_agg=gpus_per_agg,
            gpus_per_node=gpus_per_node,
            available_nodes=available_nodes,
        )

    def endpoints_to_processes(
        self,
        endpoints: list["Endpoint"],
        base_sys_port: int = 8081,
    ) -> list["Process"]:
        """Convert endpoints to processes (delegates to topology helper)."""
        from srtctl.core.topology import endpoints_to_processes

        return endpoints_to_processes(endpoints, base_sys_port=base_sys_port)

    def build_worker_command(
        self,
        process: "Process",
        endpoint_processes: list["Process"],
        runtime: "RuntimeContext",
        frontend_type: str = "dynamo",
        nsys_prefix: list[str] | None = None,
        dump_config_path: Path | None = None,
    ) -> list[str]:
        """Build the command to start a mocker worker process.

        The resulting argv order is: optional nsys prefix, the
        ``python3 -m dynamo.mocker`` invocation with the model path,
        disaggregation/bootstrap flags, core simulation flags,
        optional non-default flags, then per-mode overrides.

        Args:
            process: The process to start
            endpoint_processes: All processes for this endpoint (for multi-node)
            runtime: Runtime context with paths and settings
            frontend_type: Frontend type (mocker always uses dynamo discovery)
            nsys_prefix: Optional nsys profiling command prefix
            dump_config_path: Unused (mocker has no config dump)
        """
        mode = process.endpoint_mode
        config = self.get_config_for_mode(mode)

        # Determine model path: HF model ID or container mount path
        # NOTE(review): assumes non-HF models are mounted at /model inside
        # the container — confirm against the container mount setup.
        model_arg = str(runtime.model_path) if runtime.is_hf_model else "/model"

        # Start with nsys prefix if provided
        cmd: list[str] = list(nsys_prefix) if nsys_prefix else []

        cmd.extend(
            [
                "python3",
                "-m",
                "dynamo.mocker",
                "--model-path",
                model_arg,
            ]
        )

        # Disaggregation mode for prefill/decode workers
        if mode != "agg":
            cmd.extend(["--disaggregation-mode", mode])

        # Bootstrap port for prefill workers (disaggregated serving rendezvous)
        # NOTE(review): flag is plural (--bootstrap-ports) but a single port
        # is passed — verify against the dynamo.mocker CLI.
        if mode == "prefill" and process.bootstrap_port is not None:
            cmd.extend(["--bootstrap-ports", str(process.bootstrap_port)])

        # Core simulation parameters (always emitted)
        cmd.extend(["--engine-type", self.engine_type])
        cmd.extend(["--speedup-ratio", str(self.speedup_ratio)])
        cmd.extend(["--data-parallel-size", str(self.data_parallel_size)])
        cmd.extend(["--num-gpu-blocks-override", str(self.num_gpu_blocks_override)])
        cmd.extend(["--max-num-seqs", str(self.max_num_seqs)])
        cmd.extend(["--max-num-batched-tokens", str(self.max_num_batched_tokens)])

        # Optional parameters (only emitted when non-default)
        if self.decode_speedup_ratio != 1.0:
            cmd.extend(["--decode-speedup-ratio", str(self.decode_speedup_ratio)])
        if self.block_size is not None:
            cmd.extend(["--block-size", str(self.block_size)])
        if self.num_workers > 1:
            cmd.extend(["--num-workers", str(self.num_workers)])
        if self.startup_time is not None:
            cmd.extend(["--startup-time", str(self.startup_time)])
        if self.kv_transfer_bandwidth is not None:
            cmd.extend(["--kv-transfer-bandwidth", str(self.kv_transfer_bandwidth)])
        if self.kv_cache_dtype is not None:
            cmd.extend(["--kv-cache-dtype", self.kv_cache_dtype])
        if not self.enable_prefix_caching:
            cmd.append("--no-enable-prefix-caching")
        if not self.enable_chunked_prefill:
            cmd.append("--no-enable-chunked-prefill")
        if self.preemption_mode is not None:
            cmd.extend(["--preemption-mode", self.preemption_mode])

        # Per-mode config overrides from mocker_config
        cmd.extend(_config_to_cli_args(config))

        return cmd
271+
272+
273+
def _config_to_cli_args(config: dict[str, Any]) -> list[str]:
274+
"""Convert config dict to CLI arguments."""
275+
args: list[str] = []
276+
for key, value in sorted(config.items()):
277+
flag_name = key.replace("_", "-")
278+
if isinstance(value, bool):
279+
if value:
280+
args.append(f"--{flag_name}")
281+
elif isinstance(value, list):
282+
args.append(f"--{flag_name}")
283+
args.extend(str(v) for v in value)
284+
elif value is not None:
285+
args.extend([f"--{flag_name}", str(value)])
286+
return args

0 commit comments

Comments
 (0)