
Commit 73bbc25

alec-flowers, claude, and ishandhanani authored
feat: trace-replay benchmark with aiperf_args passthrough (#18)
* feat: add aiperf_args passthrough and fix tokenizer for trace-replay

  - Add --tokenizer-trust-remote-code to aiperf calls (fixes Kimi tokenizer)
  - Install tiktoken if missing (required by Kimi's custom tokenizer)
  - Add aiperf_args dict to BenchmarkConfig for passing extra aiperf CLI flags
  - bench.sh accepts extra args after positional params
  - Add production aiperf flags to kimi recipe (duration, timeout, workers, etc)
  - Increase file descriptor limit and add PYTHONUNBUFFERED for real-time logging

* fix: direct warmup artifacts to ARTIFACT_DIR instead of cwd

  The aiperf warmup call was missing --artifact-dir, causing it to write artifacts to the working directory (creating artifacts/ in repo root).

* test: add tests for aiperf_args passthrough in trace-replay

  Tests that key-value args are passed as --key value flags and boolean args are passed as --flag (true) or omitted (false).

* feat: add aiperf_package config for controlling aiperf version

  Add benchmark.aiperf_package field to specify the pip install spec for aiperf (e.g., "aiperf>=0.7.0"). Passed as AIPERF_PACKAGE env var to bench.sh, which does pip install --upgrade. Defaults to "aiperf" if not set. Always installs tiktoken alongside.

  Needed because container-bundled aiperf may predate fixes like trust-remote-code propagation to pool workers (aiperf PR #744).

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: ishandhanani <82981111+ishandhanani@users.noreply.github.com>
1 parent 835ddb6 commit 73bbc25

7 files changed

Lines changed: 147 additions & 11 deletions


src/srtctl/benchmarks/base.py

Lines changed: 14 additions & 1 deletion
@@ -61,7 +61,20 @@ def build_command(
 
 
 class AIPerfBenchmarkRunner(BenchmarkRunner):
-    """Marker base class for AIPerf-driven benchmarks."""
+    """Base class for AIPerf-driven benchmarks.
+
+    Provides shared aiperf_args handling for subclasses.
+    """
+
+    def append_aiperf_args(self, cmd: list[str], config: SrtConfig) -> list[str]:
+        """Append aiperf_args from config as CLI flags."""
+        for key, value in config.benchmark.aiperf_args.items():
+            if isinstance(value, bool):
+                if value:
+                    cmd.append(f"--{key}")
+            else:
+                cmd.extend([f"--{key}", str(value)])
+        return cmd
 
 
 # Registry of benchmark runners
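To make the mapping concrete, here is a minimal sketch of how append_aiperf_args turns a config dict into CLI flags. The SimpleNamespace stub stands in for a real SrtConfig (only benchmark.aiperf_args is read), and the loop body mirrors the method above:

from types import SimpleNamespace

# Stand-in for SrtConfig: append_aiperf_args only reads config.benchmark.aiperf_args.
config = SimpleNamespace(benchmark=SimpleNamespace(aiperf_args={
    "benchmark-duration": 600,  # key/value pair -> "--benchmark-duration 600"
    "export-http-trace": True,  # True bool      -> bare "--export-http-trace"
    "disabled-flag": False,     # False bool     -> omitted entirely
}))

cmd = ["bash", "bench.sh"]
for key, value in config.benchmark.aiperf_args.items():
    if isinstance(value, bool):
        if value:
            cmd.append(f"--{key}")
    else:
        cmd.extend([f"--{key}", str(value)])

print(cmd)
# ['bash', 'bench.sh', '--benchmark-duration', '600', '--export-http-trace']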

src/srtctl/benchmarks/mooncake_router.py

Lines changed: 5 additions & 1 deletion
@@ -101,7 +101,7 @@ def build_command(
         # For HF models, use the model ID directly so transformers downloads it
         tokenizer_path = str(runtime.model_path) if runtime.is_hf_model else "/model"
 
-        return [
+        cmd = [
             "bash",
             self.script_path,
             endpoint,
@@ -111,3 +111,7 @@ def build_command(
             str(itl_threshold),
             tokenizer_path,
         ]
+
+        self.append_aiperf_args(cmd, config)
+
+        return cmd

src/srtctl/benchmarks/scripts/trace-replay/bench.sh

Lines changed: 40 additions & 7 deletions
@@ -6,17 +6,25 @@
 # Replays a user-provided JSONL trace dataset at configurable concurrency levels.
 # Uses aiperf with --custom-dataset-type mooncake_trace.
 #
-# Usage: bench.sh ENDPOINT MODEL_NAME TRACE_FILE CONCURRENCIES [TTFT_THRESHOLD] [ITL_THRESHOLD] [TOKENIZER_PATH]
+# Usage: bench.sh ENDPOINT MODEL_NAME TRACE_FILE CONCURRENCIES [TTFT_THRESHOLD] [ITL_THRESHOLD] [TOKENIZER_PATH] [EXTRA_ARGS]
+#
+# EXTRA_ARGS: JSON-encoded string of additional aiperf flags (passed from Python)
 
 set -e
 
+# Ensure Python output is unbuffered for real-time logging
+export PYTHONUNBUFFERED=1
+
 ENDPOINT=$1
 MODEL_NAME=${2:-"test-model"}
 TRACE_FILE=$3
 CONCURRENCIES=${4:-"1"}
 TTFT_THRESHOLD=${5:-2000}
 ITL_THRESHOLD=${6:-25}
 TOKENIZER_PATH=${7:-"/model"}
+# Remaining args are extra aiperf flags
+shift 7 2>/dev/null || true
+EXTRA_ARGS=("$@")
 
 # Optional: extra Prometheus endpoints for AIPerf server metrics
 SERVER_METRICS_ARGS=()
@@ -32,6 +40,9 @@ BASE_DIR="${BASE_DIR:-/logs}"
 ARTIFACT_DIR="${ARTIFACT_DIR:-${BASE_DIR}/artifacts}"
 mkdir -p "${ARTIFACT_DIR}"
 
+# Increase file descriptor limit for high concurrency
+ulimit -n 600000 2>/dev/null || ulimit -n 65536 2>/dev/null || true
+
 # Increase aiperf HTTP timeout
 export AIPERF_HTTP_SO_RCVTIMEO=120
 
@@ -45,6 +56,9 @@ echo "Concurrencies: ${CONCURRENCIES}"
 echo "TTFT Threshold: ${TTFT_THRESHOLD}ms"
 echo "ITL Threshold: ${ITL_THRESHOLD}ms"
 echo "Tokenizer Path: ${TOKENIZER_PATH}"
+if [ ${#EXTRA_ARGS[@]} -gt 0 ]; then
+    echo "Extra Args: ${EXTRA_ARGS[*]}"
+fi
 echo "=============================================="
 
 # Validate trace file exists
@@ -53,23 +67,40 @@ if [ ! -f "${TRACE_FILE}" ]; then
     exit 1
 fi
 
-# Install aiperf if not present
-if ! command -v aiperf &> /dev/null; then
-    echo "Installing aiperf..."
-    pip install aiperf
+# Create isolated aiperf environment (avoids polluting container packages)
+# AIPERF_PACKAGE env var controls the version (e.g., "aiperf>=0.7.0")
+AIPERF_SPEC="${AIPERF_PACKAGE:-aiperf}"
+AIPERF_VENV="/tmp/aiperf-${SLURM_JOB_ID:-$$}"
+
+echo "Setting up aiperf environment: ${AIPERF_SPEC}"
+
+# Install uv if not in container
+if ! command -v uv &> /dev/null; then
+    echo "Installing uv..."
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    export PATH="$HOME/.local/bin:$PATH"
 fi
 
+uv venv "${AIPERF_VENV}"
+uv pip install -p "${AIPERF_VENV}" "${AIPERF_SPEC}" tiktoken
+export PATH="${AIPERF_VENV}/bin:${PATH}"
+echo "aiperf $(aiperf --version 2>/dev/null || echo 'installed') in ${AIPERF_VENV}"
+
 # Run small benchmark for warmup
 echo "Running warmup..."
+WARMUP_DIR="${ARTIFACT_DIR}/warmup"
+mkdir -p "${WARMUP_DIR}"
 aiperf profile \
     -m "${MODEL_NAME}" \
     --tokenizer "${TOKENIZER_PATH}" \
+    --tokenizer-trust-remote-code \
     --url "${ENDPOINT}" \
     --streaming \
     --ui simple \
     --extra-inputs ignore_eos:true \
     --concurrency 1 \
-    --request-count 5
+    --request-count 5 \
+    --artifact-dir "${WARMUP_DIR}"
 echo "Warmup complete"
 
 # Setup artifact directory
@@ -92,6 +123,7 @@ for C in "${CONCURRENCY_LIST[@]}"; do
     aiperf profile \
         -m "${MODEL_NAME}" \
        --tokenizer "${TOKENIZER_PATH}" \
+        --tokenizer-trust-remote-code \
         --input-file "${TRACE_FILE}" \
         --custom-dataset-type mooncake_trace \
         --url "${ENDPOINT}" \
@@ -102,7 +134,8 @@
         --ui simple \
         --artifact-dir "${RUN_ARTIFACT_DIR}" \
         "${SERVER_METRICS_ARGS[@]}" \
-        --goodput "time_to_first_token:${TTFT_THRESHOLD} inter_token_latency:${ITL_THRESHOLD}"
+        --goodput "time_to_first_token:${TTFT_THRESHOLD} inter_token_latency:${ITL_THRESHOLD}" \
+        "${EXTRA_ARGS[@]}"
 
     echo "$(date '+%Y-%m-%d %H:%M:%S') - Concurrency ${C} complete"
 
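The shift 7 line above defines the calling contract between the Python runners and this script: the first seven argv slots are positional, and everything after them is captured in EXTRA_ARGS and forwarded to aiperf verbatim. A sketch of a command list a runner might build (the endpoint, model name, and extra flag values here are illustrative, not taken from the commit):

# Hypothetical bench.sh command line; slots follow the Usage comment above.
cmd = [
    "bash", "bench.sh",
    "http://localhost:8000",    # $1 ENDPOINT
    "test-model",               # $2 MODEL_NAME
    "/traces/dataset.jsonl",    # $3 TRACE_FILE
    "4",                        # $4 CONCURRENCIES
    "2000",                     # $5 TTFT_THRESHOLD (ms)
    "25",                       # $6 ITL_THRESHOLD (ms)
    "/model",                   # $7 TOKENIZER_PATH
    # Everything below survives the shift 7 and lands in EXTRA_ARGS
    "--benchmark-duration", "600",
    "--workers-max", "200",
]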

src/srtctl/benchmarks/trace_replay.py

Lines changed: 5 additions & 1 deletion
@@ -82,7 +82,7 @@ def build_command(
 
         tokenizer_path = str(runtime.model_path) if runtime.is_hf_model else "/model"
 
-        return [
+        cmd = [
             "bash",
             self.script_path,
             endpoint,
@@ -93,3 +93,7 @@ def build_command(
             str(itl_threshold),
             tokenizer_path,
         ]
+
+        self.append_aiperf_args(cmd, config)
+
+        return cmd

src/srtctl/cli/mixins/benchmark_stage.py

Lines changed: 3 additions & 1 deletion
@@ -285,8 +285,10 @@ def _get_benchmark_env(self, runner: "BenchmarkRunner") -> dict[str, str]:
         env = self._get_benchmark_profiling_env(runner)
         env["SRTCTL_FRONTEND_TYPE"] = self.config.frontend.type
 
-        # Add AIPerf metrics URLs for AIPerf-driven benchmarks
+        # Add AIPerf-specific env vars for AIPerf-driven benchmarks only
         if isinstance(runner, AIPerfBenchmarkRunner):
             env.update(self._get_aiperf_server_metrics_env())
+            if self.config.benchmark.aiperf_package:
+                env["AIPERF_PACKAGE"] = self.config.benchmark.aiperf_package
 
         return env

src/srtctl/core/schema.py

Lines changed: 5 additions & 0 deletions
@@ -546,6 +546,11 @@ class BenchmarkConfig:
     trace_file: str | None = None  # Path to trace JSONL file (container path, e.g., /traces/dataset.jsonl)
     custom_tokenizer: str | None = None  # Custom tokenizer class (e.g., "module.path.ClassName")
     use_chat_template: bool = True  # Pass --use-chat-template to benchmark (default: true)
+    # aiperf pip install spec (e.g., "aiperf>=0.7.0", "aiperf @ git+https://...@commit")
+    # If set, runs pip install <spec> before benchmarking. Upgrades if already installed.
+    aiperf_package: str | None = None
+    # Extra aiperf CLI flags passed through to bench.sh (e.g., benchmark-duration: 600, workers-max: 200)
+    aiperf_args: dict[str, Any] = field(default_factory=dict)
 
     def get_concurrency_list(self) -> list[int]:
         if self.concurrencies is None:
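As a usage sketch, the two new fields combine like this (field names come from the diff above; the flow comments summarize the base.py and bench.sh changes earlier in this commit, and the flag values are illustrative):

from srtctl.core.schema import BenchmarkConfig

# aiperf_package -> AIPERF_PACKAGE env var -> bench.sh installs that spec into a venv
# aiperf_args    -> append_aiperf_args     -> trailing --flags on the bench.sh argv
benchmark = BenchmarkConfig(
    type="trace-replay",
    trace_file="/traces/dataset.jsonl",
    concurrencies=[4],
    aiperf_package="aiperf>=0.7.0",   # pin a spec newer than the container-bundled copy
    aiperf_args={
        "benchmark-duration": 600,    # becomes: --benchmark-duration 600
        "export-http-trace": True,    # becomes: --export-http-trace
    },
)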

tests/test_benchmarks.py

Lines changed: 75 additions & 0 deletions
@@ -322,6 +322,81 @@ def test_build_command_default_thresholds(self):
         assert cmd[6] == "2000"  # default ttft
         assert cmd[7] == "25"  # default itl
 
+    def test_build_command_with_aiperf_args(self):
+        """aiperf_args are passed through as CLI flags."""
+        from unittest.mock import MagicMock
+
+        from srtctl.benchmarks.trace_replay import TraceReplayRunner
+
+        runner = TraceReplayRunner()
+        runtime = MagicMock()
+        runtime.frontend_port = 8000
+        runtime.is_hf_model = False
+
+        from srtctl.core.schema import BenchmarkConfig, ModelConfig, ResourceConfig, SrtConfig
+
+        config = SrtConfig(
+            name="test",
+            model=ModelConfig(path="/model/kimi", container="/image", precision="fp4"),
+            resources=ResourceConfig(gpu_type="gb200"),
+            benchmark=BenchmarkConfig(
+                type="trace-replay",
+                trace_file="/traces/dataset.jsonl",
+                concurrencies=[4],
+                aiperf_args={
+                    "benchmark-duration": 600,
+                    "workers-max": 200,
+                    "request-timeout-seconds": 1200,
+                    "profile-export-level": "raw",
+                },
+            ),
+        )
+
+        cmd = runner.build_command(config, runtime)
+
+        # Positional args come first (9 of them)
+        assert cmd[8] == "/model"  # tokenizer path
+
+        # aiperf_args appended after positional args
+        extra = cmd[9:]
+        assert "--benchmark-duration" in extra
+        assert extra[extra.index("--benchmark-duration") + 1] == "600"
+        assert "--workers-max" in extra
+        assert extra[extra.index("--workers-max") + 1] == "200"
+        assert "--request-timeout-seconds" in extra
+        assert "--profile-export-level" in extra
+        assert extra[extra.index("--profile-export-level") + 1] == "raw"
+
+    def test_build_command_aiperf_args_bool(self):
+        """Boolean aiperf_args are passed as flags without values."""
+        from unittest.mock import MagicMock
+
+        from srtctl.benchmarks.trace_replay import TraceReplayRunner
+
+        runner = TraceReplayRunner()
+        runtime = MagicMock()
+        runtime.frontend_port = 8000
+        runtime.is_hf_model = False
+
+        from srtctl.core.schema import BenchmarkConfig, ModelConfig, ResourceConfig, SrtConfig
+
+        config = SrtConfig(
+            name="test",
+            model=ModelConfig(path="/model/test", container="/image", precision="fp4"),
+            resources=ResourceConfig(gpu_type="gb200"),
+            benchmark=BenchmarkConfig(
+                type="trace-replay",
+                trace_file="/traces/dataset.jsonl",
+                concurrencies=[1],
+                aiperf_args={"export-http-trace": True, "disabled-flag": False},
+            ),
+        )
+
+        cmd = runner.build_command(config, runtime)
+        extra = cmd[9:]
+        assert "--export-http-trace" in extra
+        assert "--disabled-flag" not in extra
+
     def test_config_roundtrip(self):
         """Config with trace-replay loads correctly from YAML."""
         import tempfile
