Skip to content

Commit 6d5cb7a

Browse files
authored
Merge branch 'main' into aflowers/fix-cross-arch-uv
2 parents c044da9 + 18f0ec9 commit 6d5cb7a

13 files changed

Lines changed: 782 additions & 17 deletions

File tree

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ dev = [
4040
"ty", # Astral's fast type checker (replaces mypy)
4141
"fastapi>=0.109.0",
4242
"httpx>=0.27.0", # Required by FastAPI TestClient
43+
"uvicorn>=0.27.0", # Required for integration test mock server
44+
"aiperf", # Benchmark tool for trace replay integration tests
4345
]
4446

4547
# =============================================================================
@@ -77,7 +79,10 @@ testpaths = ["tests"]
7779
pythonpath = ["tests"]
7880
python_files = ["test_*.py"]
7981
python_functions = ["test_*"]
80-
addopts = "-v --tb=short"
82+
addopts = "-v --tb=short -m 'not integration'"
83+
markers = [
84+
"integration: slow tests that run real aiperf against a mock server",
85+
]
8186

8287
# =============================================================================
8388
# ty - Astral's fast type checker (10-100x faster than mypy)

src/srtctl/benchmarks/__init__.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,17 @@
44
"""Benchmark runners for srtctl."""
55

66
# Import runners to trigger registration
7-
from srtctl.benchmarks import gpqa, gsm8k, longbenchv2, mmlu, mooncake_router, router, sa_bench, sglang_bench
7+
from srtctl.benchmarks import (
8+
gpqa,
9+
gsm8k,
10+
longbenchv2,
11+
mmlu,
12+
mooncake_router,
13+
router,
14+
sa_bench,
15+
sglang_bench,
16+
trace_replay,
17+
)
818
from srtctl.benchmarks.base import (
919
BenchmarkRunner,
1020
get_runner,
@@ -26,4 +36,5 @@
2636
"longbenchv2",
2737
"router",
2838
"mooncake_router",
39+
"trace_replay",
2940
]

src/srtctl/benchmarks/sa_bench.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,5 +99,7 @@ def build_command(
9999
str(b.random_range_ratio) if b.random_range_ratio is not None else "0.8",
100100
str(b.num_prompts_mult) if b.num_prompts_mult is not None else "10",
101101
str(b.num_warmup_mult) if b.num_warmup_mult is not None else "2",
102+
b.custom_tokenizer or "",
103+
str(b.use_chat_template).lower(),
102104
]
103105
return cmd

src/srtctl/benchmarks/scripts/mooncake-router/bench.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ if [ -n "${AIPERF_SERVER_METRICS_URLS:-}" ]; then
2626
fi
2727
fi
2828

29-
# Setup directories
30-
BASE_DIR="/logs"
31-
TRACE_DIR="${BASE_DIR}/traces"
32-
ARTIFACT_DIR="${BASE_DIR}/artifacts"
29+
# Setup directories (BASE_DIR defaults to /logs inside container, overridable for testing)
30+
BASE_DIR="${BASE_DIR:-/logs}"
31+
TRACE_DIR="${TRACE_DIR:-${BASE_DIR}/traces}"
32+
ARTIFACT_DIR="${ARTIFACT_DIR:-${BASE_DIR}/artifacts}"
3333
mkdir -p "${TRACE_DIR}"
3434
mkdir -p "${ARTIFACT_DIR}"
3535

src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py

Lines changed: 67 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -565,10 +565,52 @@ def _fix_v5_tokenizer_components(tokenizer, model_name_or_path):
565565
backend.decoder = raw.decoder
566566

567567

568+
def _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path: str) -> "PreTrainedTokenizerFast":
    """Load the GLM-Moe-Dsa / GLM-5 tokenizer straight from ``tokenizer.json``.

    Bypasses ``AutoTokenizer`` to work around incompatibilities when the
    checkpoint was saved with transformers 5.x (TokenizersBackend /
    list-style ``extra_special_tokens``).  Only a conservative subset of
    ``tokenizer_config.json`` is forwarded to the fast tokenizer.
    """
    # Imports are local so this helper stays inert unless actually used.
    import json
    from pathlib import Path

    from tokenizers import Tokenizer as RustTokenizer
    from transformers import PreTrainedTokenizerFast

    # Config entries that PreTrainedTokenizerFast accepts safely; anything
    # else in a v5-saved config may not be understood by older transformers.
    safe_keys = (
        "pad_token", "pad_token_id", "eos_token", "eos_token_id",
        "bos_token", "bos_token_id", "unk_token", "unk_token_id",
        "model_max_length", "padding_side", "truncation_side",
    )

    model_dir = Path(pretrained_model_name_or_path)
    tokenizer_json = model_dir / "tokenizer.json"
    if not tokenizer_json.exists():
        raise FileNotFoundError(
            f"Expected tokenizer.json at {tokenizer_json}. "
            "GlmMoeDsaTokenizer loads from tokenizer.json only."
        )

    rust_tok = RustTokenizer.from_file(str(tokenizer_json))

    init_kwargs = {}
    config_path = model_dir / "tokenizer_config.json"
    if config_path.exists():
        config = json.loads(config_path.read_text(encoding="utf-8"))
        init_kwargs = {key: config[key] for key in safe_keys if key in config}
        # v5 stores extras under "extra_special_tokens"; map them onto the
        # classic kwarg so the fast tokenizer still registers them.
        if "extra_special_tokens" in config:
            init_kwargs["additional_special_tokens"] = config["extra_special_tokens"]

    return PreTrainedTokenizerFast(tokenizer_object=rust_tok, **init_kwargs)
607+
608+
568609
def get_tokenizer(
569610
pretrained_model_name_or_path: str,
570611
tokenizer_mode: str = "auto",
571612
trust_remote_code: bool = False,
613+
custom_tokenizer: str | None = None,
572614
**kwargs,
573615
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
574616
if pretrained_model_name_or_path is not None and not os.path.exists(pretrained_model_name_or_path):
@@ -587,14 +629,31 @@ def get_tokenizer(
587629
"to use mistral tokenizer mode."
588630
) from e
589631
return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
590-
else:
591-
tokenizer = AutoTokenizer.from_pretrained(
592-
pretrained_model_name_or_path,
593-
trust_remote_code=trust_remote_code,
594-
**kwargs,
595-
)
596-
_fix_v5_tokenizer_components(tokenizer, pretrained_model_name_or_path)
597-
return tokenizer
632+
if custom_tokenizer:
633+
if custom_tokenizer == "glm_moe_dsa":
634+
return _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path)
635+
from importlib import import_module
636+
try:
637+
module_path, class_name = custom_tokenizer.rsplit('.', 1)
638+
module = import_module(module_path)
639+
tokenizer_class = getattr(module, class_name)
640+
return tokenizer_class.from_pretrained(
641+
pretrained_model_name_or_path,
642+
trust_remote_code=trust_remote_code,
643+
**kwargs,
644+
)
645+
except (ValueError, ImportError, AttributeError) as e:
646+
raise ValueError(
647+
f"Failed to load custom_tokenizer '{custom_tokenizer}'. "
648+
"Expected 'glm_moe_dsa' or 'module.path.ClassName'.") from e
649+
650+
tokenizer = AutoTokenizer.from_pretrained(
651+
pretrained_model_name_or_path,
652+
trust_remote_code=trust_remote_code,
653+
**kwargs,
654+
)
655+
_fix_v5_tokenizer_components(tokenizer, pretrained_model_name_or_path)
656+
return tokenizer
598657

599658

600659
ASYNC_REQUEST_FUNCS = {

src/srtctl/benchmarks/scripts/sa-bench/bench.sh

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,20 @@ DECODE_GPUS=${11:-0}
6262
RANDOM_RANGE_RATIO=${12:-0.8}
6363
NUM_PROMPTS_MULT=${13:-10}
6464
NUM_WARMUP_MULT=${14:-2}
65+
CUSTOM_TOKENIZER=${15:-}
66+
USE_CHAT_TEMPLATE=${16:-true}
67+
68+
# Build optional custom tokenizer args
69+
CUSTOM_TOKENIZER_ARGS=()
70+
if [ -n "$CUSTOM_TOKENIZER" ]; then
71+
CUSTOM_TOKENIZER_ARGS=(--custom-tokenizer "$CUSTOM_TOKENIZER")
72+
fi
73+
74+
# Build optional chat template args
75+
CHAT_TEMPLATE_ARGS=()
76+
if [ "$USE_CHAT_TEMPLATE" = "true" ]; then
77+
CHAT_TEMPLATE_ARGS=(--use-chat-template)
78+
fi
6579

6680
# Parse endpoint into host:port
6781
HOST=$(echo "$ENDPOINT" | sed 's|http://||' | cut -d: -f1)
@@ -121,7 +135,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do
121135
--request-rate 250 \
122136
--percentile-metrics ttft,tpot,itl,e2el \
123137
--max-concurrency "$concurrency" \
124-
--trust-remote-code
138+
--trust-remote-code \
139+
"${CUSTOM_TOKENIZER_ARGS[@]}"
125140

126141
num_prompts=$((concurrency * NUM_PROMPTS_MULT))
127142

@@ -151,7 +166,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do
151166
--percentile-metrics ttft,tpot,itl,e2el \
152167
--max-concurrency "$concurrency" \
153168
--trust-remote-code \
154-
--use-chat-template \
169+
"${CHAT_TEMPLATE_ARGS[@]}" \
170+
"${CUSTOM_TOKENIZER_ARGS[@]}" \
155171
--save-result --result-dir "$result_dir" --result-filename "$result_filename"
156172
set +x
157173

src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -837,6 +837,7 @@ def main(args: argparse.Namespace):
837837
tokenizer_id,
838838
tokenizer_mode=tokenizer_mode,
839839
trust_remote_code=args.trust_remote_code,
840+
custom_tokenizer=args.custom_tokenizer,
840841
)
841842

842843
if args.dataset is not None:
@@ -1279,6 +1280,14 @@ def main(args: argparse.Namespace):
12791280
'"custom" will use --tokenizer to select the preregistered tokenizer.',
12801281
)
12811282

1283+
parser.add_argument(
1284+
"--custom-tokenizer",
1285+
type=str,
1286+
default=None,
1287+
help="Custom tokenizer to use (e.g., 'glm_moe_dsa' or 'module.path.ClassName'). "
1288+
"When set, overrides the default tokenizer loading.",
1289+
)
1290+
12821291
parser.add_argument(
12831292
"--served-model-name",
12841293
type=str,
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Trace Replay Benchmark using aiperf
# Replays a user-provided JSONL trace dataset at configurable concurrency levels.
# Uses aiperf with --custom-dataset-type mooncake_trace.
#
# Usage: bench.sh ENDPOINT MODEL_NAME TRACE_FILE CONCURRENCIES [TTFT_THRESHOLD] [ITL_THRESHOLD] [TOKENIZER_PATH]

# -e: abort on any command failure; -o pipefail: a failing stage fails the
# whole pipeline (plain -e would mask it).  -u is deliberately omitted:
# expanding an empty array under set -u errors on bash < 4.4.
set -eo pipefail

ENDPOINT=${1:-}
MODEL_NAME=${2:-"test-model"}
TRACE_FILE=${3:-}
CONCURRENCIES=${4:-"1"}          # comma-separated list, e.g. "1,4,16"
TTFT_THRESHOLD=${5:-2000}        # goodput time-to-first-token threshold (ms)
ITL_THRESHOLD=${6:-25}           # goodput inter-token-latency threshold (ms)
TOKENIZER_PATH=${7:-"/model"}

# Validate required positional args up front so a missing arg fails loudly
# here instead of surfacing as a confusing aiperf error later.
if [ -z "${ENDPOINT}" ]; then
    echo "ERROR: ENDPOINT (arg 1) is required"
    exit 1
fi
if [ -z "${TRACE_FILE}" ]; then
    echo "ERROR: TRACE_FILE (arg 3) is required"
    exit 1
fi

# Optional: extra Prometheus endpoints for AIPerf server metrics
SERVER_METRICS_ARGS=()
if [ -n "${AIPERF_SERVER_METRICS_URLS:-}" ]; then
    IFS=',' read -r -a server_metrics_urls <<< "${AIPERF_SERVER_METRICS_URLS}"
    if [ ${#server_metrics_urls[@]} -gt 0 ]; then
        SERVER_METRICS_ARGS+=(--server-metrics "${server_metrics_urls[@]}")
    fi
fi

# Setup directories (BASE_DIR defaults to /logs inside container, overridable for testing)
BASE_DIR="${BASE_DIR:-/logs}"
ARTIFACT_DIR="${ARTIFACT_DIR:-${BASE_DIR}/artifacts}"
mkdir -p "${ARTIFACT_DIR}"

# Increase aiperf HTTP timeout
export AIPERF_HTTP_SO_RCVTIMEO=120

echo "=============================================="
echo "Trace Replay Benchmark (aiperf)"
echo "=============================================="
echo "Endpoint: ${ENDPOINT}"
echo "Model: ${MODEL_NAME}"
echo "Trace File: ${TRACE_FILE}"
echo "Concurrencies: ${CONCURRENCIES}"
echo "TTFT Threshold: ${TTFT_THRESHOLD}ms"
echo "ITL Threshold: ${ITL_THRESHOLD}ms"
echo "Tokenizer Path: ${TOKENIZER_PATH}"
echo "=============================================="

# Validate trace file exists
if [ ! -f "${TRACE_FILE}" ]; then
    echo "ERROR: Trace file not found: ${TRACE_FILE}"
    exit 1
fi

# Install aiperf if not present
if ! command -v aiperf &> /dev/null; then
    echo "Installing aiperf..."
    pip install aiperf
fi

# Run small benchmark for warmup
echo "Running warmup..."
aiperf profile \
    -m "${MODEL_NAME}" \
    --tokenizer "${TOKENIZER_PATH}" \
    --url "${ENDPOINT}" \
    --streaming \
    --ui simple \
    --extra-inputs ignore_eos:true \
    --concurrency 1 \
    --request-count 5
echo "Warmup complete"

# Setup artifact directory
MODEL_BASE_NAME="${MODEL_NAME##*/}"
TIMESTAMP=$(date '+%Y%m%d_%H%M%S')

# Parse concurrencies (comma-separated)
IFS=',' read -r -a CONCURRENCY_LIST <<< "${CONCURRENCIES}"

for C in "${CONCURRENCY_LIST[@]}"; do
    echo ""
    echo "=============================================="
    echo "Running concurrency=${C}"
    echo "=============================================="
    echo "$(date '+%Y-%m-%d %H:%M:%S') - Starting benchmark at concurrency ${C}"

    RUN_ARTIFACT_DIR="${ARTIFACT_DIR}/${MODEL_BASE_NAME}_trace_c${C}_${TIMESTAMP}"
    mkdir -p "${RUN_ARTIFACT_DIR}"

    aiperf profile \
        -m "${MODEL_NAME}" \
        --tokenizer "${TOKENIZER_PATH}" \
        --input-file "${TRACE_FILE}" \
        --custom-dataset-type mooncake_trace \
        --url "${ENDPOINT}" \
        --streaming \
        --extra-inputs ignore_eos:true \
        --concurrency "${C}" \
        --random-seed 42 \
        --ui simple \
        --artifact-dir "${RUN_ARTIFACT_DIR}" \
        "${SERVER_METRICS_ARGS[@]}" \
        --goodput "time_to_first_token:${TTFT_THRESHOLD} inter_token_latency:${ITL_THRESHOLD}"

    echo "$(date '+%Y-%m-%d %H:%M:%S') - Concurrency ${C} complete"

    # List artifacts
    ls -la "${RUN_ARTIFACT_DIR}" 2>/dev/null || true
done

echo ""
echo "=============================================="
echo "Trace Replay Benchmark Complete"
echo "Results saved to: ${ARTIFACT_DIR}"
echo "=============================================="

0 commit comments

Comments
 (0)