NVIDIA
diff --git a/src/srtctl/benchmarks/__init__.py b/src/srtctl/benchmarks/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/src/srtctl/benchmarks/base.py b/src/srtctl/benchmarks/base.py
Lines changed: 12 additions & 0 deletions
diff --git a/src/srtctl/benchmarks/custom.py b/src/srtctl/benchmarks/custom.py
Lines changed: 42 additions & 0 deletions
diff --git a/src/srtctl/cli/do_sweep.py b/src/srtctl/cli/do_sweep.py
Lines changed: 18 additions & 2 deletions
diff --git a/src/srtctl/cli/mixins/__init__.py b/src/srtctl/cli/mixins/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/src/srtctl/cli/mixins/benchmark_stage.py b/src/srtctl/cli/mixins/benchmark_stage.py
Lines changed: 5 additions & 2 deletions
diff --git a/src/srtctl/cli/mixins/postprocess_stage.py b/src/srtctl/cli/mixins/postprocess_stage.py
Lines changed: 61 additions & 30 deletions
@@ -5,6 +5,7 @@
 
 # Import runners to trigger registration
 from srtctl.benchmarks import (
+    custom,
     gpqa,
     gsm8k,
     longbenchv2,
@@ -28,6 +29,7 @@
     "list_benchmarks",
     "register_benchmark",
     # Runners
+    "custom",
     "sa_bench",
     "sglang_bench",
     "mmlu",
 
@@ -59,6 +59,18 @@ def build_command(
         """
         ...
 
+    def get_container_image(self, config: SrtConfig, runtime: RuntimeContext) -> str | Path:
+        """Get the container image used for the benchmark process.
+
+        The default is the image already recorded on ``runtime``. Runners
+        that need a different image override this hook.
+
+        Args:
+            config: Job configuration; unused by the default implementation.
+            runtime: Runtime context supplying the default image.
+
+        Returns:
+            ``runtime.container_image``, unchanged.
+        """
+        return runtime.container_image
+
+    def get_container_mounts(self, config: SrtConfig, runtime: RuntimeContext) -> dict[Path, Path]:
+        """Get mounts used for the benchmark process.
+
+        Args:
+            config: Job configuration; unused by the default implementation.
+            runtime: Runtime context supplying the default mounts.
+
+        Returns:
+            ``runtime.container_mounts`` itself, not a copy.
+        """
+        # NOTE(review): the shared mapping is handed out as-is, so a caller
+        # that mutates the result would also change the runtime's mounts.
+        return runtime.container_mounts
+
+    def get_environment(self, config: SrtConfig, runtime: RuntimeContext) -> dict[str, str]:
+        """Get benchmark-specific environment variables.
+
+        The benchmark stage merges the returned mapping into the benchmark
+        environment with ``dict.update``, so keys returned here replace
+        entries with the same name.
+
+        Args:
+            config: Job configuration; unused by the default implementation.
+            runtime: Runtime context; unused by the default implementation.
+
+        Returns:
+            A new empty dict: by default no extra variables are added.
+        """
+        return {}
+
 
 class AIPerfBenchmarkRunner(BenchmarkRunner):
     """Base class for AIPerf-driven benchmarks.
 
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Custom benchmark runner."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from srtctl.benchmarks.base import BenchmarkRunner, register_benchmark
+from srtctl.core.runtime import RuntimeContext
+from srtctl.core.schema import SrtConfig
+
+
+@register_benchmark("custom")
+class CustomBenchmarkRunner(BenchmarkRunner):
+    """Run an arbitrary benchmark command inside a container.
+
+    The command comes from ``config.benchmark.command`` and is executed
+    through ``bash -lc``. The container image and extra environment
+    variables can be set via ``config.benchmark.container_image`` and
+    ``config.benchmark.env``.
+    """
+
+    @property
+    def name(self) -> str:
+        """Human-readable runner name."""
+        return "Custom"
+
+    @property
+    def script_path(self) -> str:
+        """Placeholder for the script path; a custom command has no script file."""
+        return "<custom command>"
+
+    def validate_config(self, config: SrtConfig) -> list[str]:
+        """Check that a command was configured.
+
+        Returns:
+            An empty list when ``config.benchmark.command`` is set (truthy),
+            otherwise a single-element list with the error message.
+        """
+        if config.benchmark.command:
+            return []
+        return ["benchmark.command is required for benchmark.type=custom"]
+
+    def build_command(self, config: SrtConfig, runtime: RuntimeContext) -> list[str]:
+        """Wrap the configured command in ``bash -lc``.
+
+        Args:
+            config: Job configuration providing ``benchmark.command``.
+            runtime: Unused; accepted to match the runner interface.
+
+        Returns:
+            ``["bash", "-lc", <command>]``.
+
+        Raises:
+            ValueError: If ``benchmark.command`` is missing or empty.
+        """
+        del runtime
+        command = config.benchmark.command
+        # Raise explicitly instead of asserting: asserts are stripped under
+        # ``python -O``, and an empty string must be rejected here just as
+        # validate_config rejects it.
+        if not command:
+            raise ValueError("benchmark.command is required for benchmark.type=custom")
+        return ["bash", "-lc", command]
+
+    def get_container_image(self, config: SrtConfig, runtime: RuntimeContext) -> str | Path:
+        """Return the benchmark-specific image, falling back to the runtime image."""
+        return config.benchmark.container_image or runtime.container_image
+
+    def get_environment(self, config: SrtConfig, runtime: RuntimeContext) -> dict[str, str]:
+        """Return a copy of ``config.benchmark.env`` so callers cannot mutate the config."""
+        del runtime
+        return dict(config.benchmark.env)
@@ -21,7 +21,13 @@
 from dataclasses import dataclass
 from pathlib import Path
 
-from srtctl.cli.mixins import BenchmarkStageMixin, FrontendStageMixin, PostProcessStageMixin, WorkerStageMixin
+from srtctl.cli.mixins import (
+    BenchmarkStageMixin,
+    FrontendStageMixin,
+    PostProcessStageMixin,
+    TelemetryStageMixin,
+    WorkerStageMixin,
+)
 from srtctl.core.config import load_config
 from srtctl.core.health import wait_for_port
 from srtctl.core.lockfile import write_lockfile
@@ -42,7 +48,13 @@
 
 
 @dataclass
-class SweepOrchestrator(WorkerStageMixin, FrontendStageMixin, BenchmarkStageMixin, PostProcessStageMixin):
+class SweepOrchestrator(
+    WorkerStageMixin,
+    FrontendStageMixin,
+    TelemetryStageMixin,
+    BenchmarkStageMixin,
+    PostProcessStageMixin,
+):
     """Main orchestrator for benchmark sweeps.
 
     Usage:
@@ -225,6 +237,10 @@ def run(self) -> int:
             for proc in frontend_procs:
                 registry.add_process(proc)
 
+            telemetry_procs = self.start_telemetry()
+            for proc in telemetry_procs:
+                registry.add_process(proc)
+
             self._print_connection_info()
 
             # Stage 4: Benchmark (status reported AFTER health check passes)
 
@@ -14,11 +14,13 @@
 from srtctl.cli.mixins.benchmark_stage import BenchmarkStageMixin
 from srtctl.cli.mixins.frontend_stage import FrontendStageMixin
 from srtctl.cli.mixins.postprocess_stage import PostProcessStageMixin
+from srtctl.cli.mixins.telemetry_stage import TelemetryStageMixin
 from srtctl.cli.mixins.worker_stage import WorkerStageMixin
 
 __all__ = [
     "WorkerStageMixin",
     "FrontendStageMixin",
+    "TelemetryStageMixin",
     "BenchmarkStageMixin",
     "PostProcessStageMixin",
 ]
@@ -178,6 +178,9 @@ def _run_benchmark_script(
 
         cmd = runner.build_command(self.config, self.runtime)
         env_to_set = self._get_benchmark_env(runner)
+        env_to_set.update(runner.get_environment(self.config, self.runtime))
+        container_image = runner.get_container_image(self.config, self.runtime)
+        container_mounts = runner.get_container_mounts(self.config, self.runtime)
 
         logger.info("Script: %s", runner.script_path)
         logger.info("Command: %s", shlex.join(cmd))
@@ -187,8 +190,8 @@ def _run_benchmark_script(
             command=cmd,
             nodelist=[self.runtime.nodes.head],
             output=str(log_file),
-            container_image=str(self.runtime.container_image),
-            container_mounts=self.runtime.container_mounts,
+            container_image=str(container_image),
+            container_mounts=container_mounts,
             env_to_set=env_to_set,
         )
 
 
@@ -38,6 +38,9 @@
 
 logger = logging.getLogger(__name__)
 
+# Exit codes emitted by the post-processing shell script.
+# Parse failed: srtlog could not be installed or `srtlog parse` failed, but
+# the S3 upload itself succeeded, so the S3 URL is still reported.
+POSTPROCESS_PARSE_FAILED_EXIT = 20
+# Upload failed: the upload tooling could not be installed or `aws s3 sync`
+# returned non-zero; no S3 URL is reported in this case.
+POSTPROCESS_UPLOAD_FAILED_EXIT = 11
+
 
 class PostProcessStageMixin:
     """Mixin for post-process stage after benchmark completion.
@@ -254,34 +257,7 @@ def _run_postprocess_container(self) -> tuple[Path | None, str | None]:
         endpoint_flag = f"--endpoint-url {s3_config.endpoint_url}" if s3_config.endpoint_url else ""
 
         # Build the post-processing script
-        script = f"""
-set -e
-
-# Install uv, srtlog, and awscli
-echo "Installing uv..."
-pip install uv
-
-echo "Installing srtlog and awscli..."
-cd /tmp
-git clone --depth 1 https://github.com/ishandhanani/srtlog.git
-uv pip install --system ./srtlog awscli
-
-# Run srtlog to generate parquet
-echo "Running srtlog parse..."
-cd /logs
-srtlog parse .
-
-# Upload entire log directory to S3
-echo "Uploading entire log directory to S3..."
-aws s3 sync /logs {s3_url} {endpoint_flag}
-echo "Upload complete: {s3_url}"
-
-# Report what was uploaded
-echo ""
-echo "Uploaded files:"
-find /logs -type f | wc -l
-echo "files total"
-"""
+        script = self._build_postprocess_script(s3_url, endpoint_flag)
 
         # Build env for AWS credentials
         env: dict[str, str] = {}
@@ -301,7 +277,7 @@ def _run_postprocess_container(self) -> tuple[Path | None, str | None]:
                 nodelist=[self.runtime.nodes.head],
                 output=str(self.runtime.log_dir / "postprocess.log"),
                 container_image="python:3.11",
-                container_mounts={str(self.runtime.log_dir): "/logs"},
+                container_mounts={self.runtime.log_dir: Path("/logs")},
                 env_to_set=env,
             )
             proc.wait(timeout=600)  # 10 min timeout for install + parse + full sync
@@ -311,6 +287,9 @@ def _run_postprocess_container(self) -> tuple[Path | None, str | None]:
             if proc.returncode == 0:
                 logger.info("Post-processing complete: %s", s3_url)
                 return parquet_path if parquet_path.exists() else None, s3_url
+            if proc.returncode == POSTPROCESS_PARSE_FAILED_EXIT:
+                logger.warning("srtlog parsing failed, but raw logs were still uploaded to %s", s3_url)
+                return parquet_path if parquet_path.exists() else None, s3_url
             else:
                 logger.warning("Post-processing failed (exit code: %s)", proc.returncode)
                 return parquet_path if parquet_path.exists() else None, None
@@ -323,6 +302,58 @@ def _run_postprocess_container(self) -> tuple[Path | None, str | None]:
             logger.warning("Post-processing container failed: %s", e)
             return None, None
 
+    def _build_postprocess_script(self, s3_url: str, endpoint_flag: str) -> str:
+        """Build the post-processing shell script.
+
+        Upload is always attempted if awscli installs successfully. Parsing is
+        best-effort so raw logs survive parser/tooling failures: failing to
+        install uv or srtlog, to enter /logs, or to parse only sets
+        PARSE_STATUS and never prevents the upload.
+
+        Args:
+            s3_url: Destination passed to ``aws s3 sync``; also recorded in
+                ``/logs/postprocess-status.json``.
+            endpoint_flag: Extra ``aws`` arguments (``--endpoint-url ...``) or
+                an empty string. Interpolated unquoted so it can expand to
+                several shell words.
+
+        Returns:
+            The script text. The script exits 0 on full success, with
+            POSTPROCESS_UPLOAD_FAILED_EXIT when awscli cannot be installed or
+            the sync fails, and with POSTPROCESS_PARSE_FAILED_EXIT when the
+            upload succeeded but parsing did not.
+        """
+        # awscli is installed on its own so that a uv install failure cannot
+        # block the upload; uv is only needed for the best-effort srtlog step.
+        # NOTE(review): s3_url is interpolated into the shell and the JSON
+        # status file without escaping; confirm it can never contain quotes
+        # or shell metacharacters.
+        return f"""
+set -u
+set -o pipefail
+
+PARSE_STATUS=0
+UPLOAD_STATUS=0
+
+echo "Installing awscli..."
+if ! pip install awscli; then
+  echo "Failed to install awscli"
+  exit {POSTPROCESS_UPLOAD_FAILED_EXIT}
+fi
+
+echo "Installing uv and srtlog..."
+if pip install uv &&
+   cd /tmp &&
+   git clone --depth 1 https://github.com/ishandhanani/srtlog.git &&
+   uv pip install --system ./srtlog; then
+  echo "Running srtlog parse..."
+  cd /logs && srtlog parse . || PARSE_STATUS=$?
+else
+  echo "Failed to install uv/srtlog; continuing with raw log upload"
+  PARSE_STATUS=1
+fi
+
+cat > /logs/postprocess-status.json <<EOF
+{{"parse_status": $PARSE_STATUS, "s3_url": "{s3_url}"}}
+EOF
+
+echo "Uploading entire log directory to S3..."
+aws s3 sync /logs {s3_url} {endpoint_flag} || UPLOAD_STATUS=$?
+
+if [ "$UPLOAD_STATUS" -ne 0 ]; then
+  echo "Upload failed with status $UPLOAD_STATUS"
+  exit {POSTPROCESS_UPLOAD_FAILED_EXIT}
+fi
+
+echo "Upload complete: {s3_url}"
+echo ""
+echo "Uploaded files:"
+find /logs -type f | wc -l
+echo "files total"
+
+if [ "$PARSE_STATUS" -ne 0 ]; then
+  exit {POSTPROCESS_PARSE_FAILED_EXIT}
+fi
+"""
+
     def _report_metrics(self, benchmark_results: dict[str, Any] | None, s3_url: str | None, exit_code: int) -> None:
         """Report metrics to dashboard via status API.
 
@@ -443,7 +474,7 @@ def _run_ai_analysis(self, config: AIAnalysisConfig) -> None:
                 nodelist=[self.runtime.nodes.head],
                 output=str(analysis_log),
                 container_image="python:3.11",
-                container_mounts={str(self.runtime.log_dir): "/logs"},
+                container_mounts={self.runtime.log_dir: Path("/logs")},
                 env_to_set=env_to_set,
             )