# Replays a user-provided JSONL trace dataset at configurable concurrency levels.
# Uses aiperf with --custom-dataset-type mooncake_trace.
#
# Usage: bench.sh ENDPOINT MODEL_NAME TRACE_FILE CONCURRENCIES [TTFT_THRESHOLD] [ITL_THRESHOLD] [TOKENIZER_PATH] [EXTRA_ARGS]
#
# EXTRA_ARGS: JSON-encoded string of additional aiperf flags (passed from Python)
1012
set -e

# Ensure Python output is unbuffered for real-time logging
export PYTHONUNBUFFERED=1

# Positional arguments (see usage header at top of file).
ENDPOINT=$1                    # inference endpoint URL
MODEL_NAME=${2:-"test-model"}  # model name passed to aiperf -m
TRACE_FILE=$3                  # JSONL mooncake trace to replay
CONCURRENCIES=${4:-"1"}        # concurrency levels to benchmark (split later)
TTFT_THRESHOLD=${5:-2000}      # goodput time-to-first-token threshold (ms)
ITL_THRESHOLD=${6:-25}         # goodput inter-token-latency threshold (ms)
TOKENIZER_PATH=${7:-"/model"}  # tokenizer path passed to aiperf

# Remaining args are extra aiperf flags. "shift 7" fails (without shifting)
# when fewer than 7 positionals were supplied, so guard it under set -e.
shift 7 2>/dev/null || true
EXTRA_ARGS=("$@")
2028
# Optional: extra Prometheus endpoints for AIPerf server metrics.
# Starts empty; populated conditionally before being expanded into the
# aiperf command line as "${SERVER_METRICS_ARGS[@]}".
SERVER_METRICS_ARGS=()
@@ -32,6 +40,9 @@ BASE_DIR="${BASE_DIR:-/logs}"
# Artifact output directory. BASE_DIR is set earlier in the script
# (defaults to /logs per the surrounding context).
ARTIFACT_DIR="${ARTIFACT_DIR:-${BASE_DIR}/artifacts}"
mkdir -p "${ARTIFACT_DIR}"

# Increase file descriptor limit for high concurrency; try the large value
# first, fall back to a conservative one, and never abort on failure.
ulimit -n 600000 2>/dev/null || ulimit -n 65536 2>/dev/null || true

# Increase aiperf HTTP receive timeout (seconds)
export AIPERF_HTTP_SO_RCVTIMEO=120
3748
@@ -45,6 +56,9 @@ echo "Concurrencies: ${CONCURRENCIES}"
# Summarize the run configuration before benchmarking starts.
echo "TTFT Threshold: ${TTFT_THRESHOLD} ms"
echo "ITL Threshold: ${ITL_THRESHOLD} ms"
echo "Tokenizer Path: ${TOKENIZER_PATH}"
# Only mention extra aiperf flags when any were actually supplied.
if [ ${#EXTRA_ARGS[@]} -gt 0 ]; then
    echo "Extra Args: ${EXTRA_ARGS[*]}"
fi
echo "=============================================="
4963
5064# Validate trace file exists
@@ -53,23 +67,40 @@ if [ ! -f "${TRACE_FILE}" ]; then
5367 exit 1
5468fi
5569
# Create isolated aiperf environment (avoids polluting container packages).
# AIPERF_PACKAGE env var controls the version (e.g., "aiperf>=0.7.0").
AIPERF_SPEC="${AIPERF_PACKAGE:-aiperf}"
# Keep the venv job-unique: use the Slurm job id when present, else this PID.
AIPERF_VENV="/tmp/aiperf-${SLURM_JOB_ID:-$$}"

echo "Setting up aiperf environment: ${AIPERF_SPEC}"

# Install uv if not already in the container image
if ! command -v uv &>/dev/null; then
    echo "Installing uv..."
    curl -LsSf https://astral.sh/uv/install.sh | sh
    export PATH="$HOME/.local/bin:$PATH"
fi

uv venv "${AIPERF_VENV}"
# -p targets the venv's interpreter; tiktoken is needed for tokenizer support
uv pip install -p "${AIPERF_VENV}" "${AIPERF_SPEC}" tiktoken
export PATH="${AIPERF_VENV}/bin:${PATH}"
echo "aiperf $(aiperf --version 2>/dev/null || echo 'installed') in ${AIPERF_VENV}"
# Run a small benchmark for warmup so the first measured run is not cold.
# Warmup artifacts are kept separate from the measured runs.
echo "Running warmup..."
WARMUP_DIR="${ARTIFACT_DIR}/warmup"
mkdir -p "${WARMUP_DIR}"
aiperf profile \
    -m "${MODEL_NAME}" \
    --tokenizer "${TOKENIZER_PATH}" \
    --tokenizer-trust-remote-code \
    --url "${ENDPOINT}" \
    --streaming \
    --ui simple \
    --extra-inputs ignore_eos:true \
    --concurrency 1 \
    --request-count 5 \
    --artifact-dir "${WARMUP_DIR}"
echo "Warmup complete"
74105
75106# Setup artifact directory
@@ -92,6 +123,7 @@ for C in "${CONCURRENCY_LIST[@]}"; do
92123 aiperf profile \
93124 -m " ${MODEL_NAME} " \
94125 --tokenizer " ${TOKENIZER_PATH} " \
126+ --tokenizer-trust-remote-code \
95127 --input-file " ${TRACE_FILE} " \
96128 --custom-dataset-type mooncake_trace \
97129 --url " ${ENDPOINT} " \
@@ -102,7 +134,8 @@ for C in "${CONCURRENCY_LIST[@]}"; do
102134 --ui simple \
103135 --artifact-dir " ${RUN_ARTIFACT_DIR} " \
104136 " ${SERVER_METRICS_ARGS[@]} " \
105- --goodput " time_to_first_token:${TTFT_THRESHOLD} inter_token_latency:${ITL_THRESHOLD} "
137+ --goodput " time_to_first_token:${TTFT_THRESHOLD} inter_token_latency:${ITL_THRESHOLD} " \
138+ " ${EXTRA_ARGS[@]} "
106139
107140 echo " $( date ' +%Y-%m-%d %H:%M:%S' ) - Concurrency ${C} complete"
108141
0 commit comments