Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# Qwen3.5-397B-A17B-FP8 disaggregated 1P1D: TP4 prefill + DEP4 decode (Mooncake).
# Model / resources / backend / sglang_config: copied from
# recipes/qwen3.5/fp8/disagg/mooncake/profile/1p1d-dep4-nsys-profile.yaml
# This file changes frontend + profiling + benchmark, and disables staging buffer, to pair nsys with SGLang decode /slow_down.
#
# Slow-down is meant to be used with SA-Bench warmup skipped (num_warmup_mult: 0). The
# separate benchmark warmup is disabled so step indices stay predictable; the role of
# "warming up" decode (graphs, batching) is instead covered by a short span of real
# forwards *after* slow_down auto-clears and *before* the nsys decode window.
#
# Choose profiling.decode.start_step as:
# decode.start_step = bootstrap_steps + slow_down_steps + warmup_steps
# In this example (osl=1024, slow_down window ≈4 steps, post-slowdown warmup ≈72 steps):
# 1100 = 1024 + 4 + 72
# — bootstrap_steps is taken as osl (decode gen length) for this workload;
# — slow_down_steps: forwards while /slow_down is active (tune with slow_down_*);
# — warmup_steps: extra forwards after slow_down ends so decode is hot before capture.
# Adjust the three terms if you change osl, concurrency, or slow_down timing.

name: "qwen3.5-1p1d-dep4-nsys-profile-slowdown"

model:
  path: "qwen3.5-fp8"
  container: "dev-0318"
  precision: "fp8"

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1

frontend:
  type: "sglang"
  enable_multiple_frontends: false

backend:

  prefill_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    MC_FORCE_MNNVL: "1"
    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
    # Staging buffer intentionally disabled for this slow_down recipe (see file header).
    # SGLANG_DISAGG_STAGING_BUFFER: "1"
    # SGLANG_DISAGG_STAGING_BUFFER_SIZE_MB: "128"
    SGLANG_LOG_FORWARD_ITERS: "1"

  decode_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    MC_FORCE_MNNVL: "1"
    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
    # Staging buffer intentionally disabled for this slow_down recipe (see file header).
    # SGLANG_DISAGG_STAGING_BUFFER: "1"
    # SGLANG_DISAGG_STAGING_BUFFER_SIZE_MB: "128"
    SGLANG_LOG_FORWARD_ITERS: "1"

sglang_config:
  prefill:
    served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
    model-path: "/model/"

    attention-backend: "trtllm_mha"
    kv-cache-dtype: "fp8_e4m3"

    tensor-parallel-size: 4
    data-parallel-size: 1
    expert-parallel-size: 1

    mamba-scheduler-strategy: "no_buffer"
    disable-radix-cache: true
    mamba-track-interval: 2048
    mamba-ssm-dtype: "bfloat16"

    disaggregation-mode: "prefill"

    mem-fraction-static: 0.80
    chunked-prefill-size: 16384
    context-length: 4096
    load-balance-method: "round_robin"
    watchdog-timeout: 1000000

  decode:
    served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
    model-path: "/model/"

    attention-backend: "trtllm_mha"
    quantization: "fp8"
    kv-cache-dtype: "fp8_e4m3"
    moe-runner-backend: "flashinfer_trtllm"

    # DEP4: DP4 + TP4 + EP4 with dp-attention
    tensor-parallel-size: 4
    data-parallel-size: 4
    expert-parallel-size: 4
    enable-dp-attention: true
    enable-dp-lm-head: true
    moe-dense-tp-size: 1

    mamba-scheduler-strategy: "no_buffer"
    disable-radix-cache: true
    # NOTE(review): stated requirement is "> isl+osl to avoid checkpointing", but
    # isl+osl = 1024+1024 = 2048 — exactly equal, not greater. Confirm whether
    # equality suffices or whether this should be raised (e.g. 4096).
    mamba-track-interval: 2048
    mamba-ssm-dtype: "bfloat16"

    disaggregation-mode: "decode"

    mem-fraction-static: 0.80
    chunked-prefill-size: 16384
    context-length: 4096
    cuda-graph-max-bs: 1024
    decode-log-interval: 1
    stream-interval: 50
    watchdog-timeout: 1000000

profiling:
  type: "nsys"
  prefill:
    start_step: 10
    stop_step: 30
  decode:
    # nsys starts after bootstrap (≈osl) + slow_down (4) + post-slowdown warmup (72) — see file header
    start_step: 1100
    stop_step: 1120

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "2048"
  req_rate: "inf"
  random_range_ratio: 1.0
  num_warmup_mult: 0    # skip SA-Bench warmup; use slow_down then post-slowdown steps as warmup
  num_prompts_mult: 1
  slow_down_sleep_time: 30.0    # s per forward while slow_down is on
  slow_down_wait_time: 120.0    # then clear slow_down
3 changes: 3 additions & 0 deletions src/srtctl/benchmarks/sa_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ class SABenchRunner(BenchmarkRunner):
- benchmark.req_rate: Request rate (default: "inf")
- benchmark.dataset_name: "random" (default) or "custom"
- benchmark.dataset_path: Container path to dataset file (required when dataset_name="custom")
- benchmark.slow_down_sleep_time / benchmark.slow_down_wait_time: When both are set and
frontend is sglang, SA-Bench POSTs /slow_down on each decode worker leader (framework-derived
URLs). Omit either field to disable slow_down.
"""

@property
Expand Down
58 changes: 42 additions & 16 deletions src/srtctl/benchmarks/scripts/sa-bench/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,28 @@ if [ "$DATASET_NAME" = "random" ]; then
)
fi

# Optional SGLang /slow_down (set by srtctl for SA-Bench when YAML provides slow_down_* and frontend is sglang):
#   SA_BENCH_SLOW_DOWN_URLS: comma-separated http://host:port base URLs (decode workers)
#   SA_BENCH_SLOW_DOWN_SLEEP_TIME / SA_BENCH_SLOW_DOWN_WAIT_TIME
SLOW_DOWN_ARGS=()
if [ -n "${SA_BENCH_SLOW_DOWN_URLS:-}" ]; then
  IFS=',' read -r -a _sd_urls <<< "${SA_BENCH_SLOW_DOWN_URLS}"
  for u in "${_sd_urls[@]}"; do
    # Trim surrounding whitespace with parameter expansion. The previous
    # `$(echo "$u" | xargs)` forked two processes per URL and xargs
    # additionally interprets quotes/backslashes in its input.
    u="${u#"${u%%[![:space:]]*}"}"   # strip leading whitespace
    u="${u%"${u##*[![:space:]]}"}"   # strip trailing whitespace
    if [ -n "$u" ]; then
      SLOW_DOWN_ARGS+=(--slow-down-server "$u")
    fi
  done
fi
# Sleep/wait knobs are only meaningful when at least one server URL was given.
if [ "${#SLOW_DOWN_ARGS[@]}" -gt 0 ]; then
  SLOW_DOWN_EXTRA=(
    --slow-down-sleep-time "${SA_BENCH_SLOW_DOWN_SLEEP_TIME:-1}"
    --slow-down-wait-time "${SA_BENCH_SLOW_DOWN_WAIT_TIME:-60}"
  )
else
  SLOW_DOWN_EXTRA=()
fi

# Parse endpoint into host:port
HOST=$(echo "$ENDPOINT" | sed 's|http://||' | cut -d: -f1)
PORT=$(echo "$ENDPOINT" | sed 's|http://||' | cut -d: -f2 | cut -d/ -f1)
Expand Down Expand Up @@ -155,22 +177,24 @@ start_all_profiling

for concurrency in "${CONCURRENCY_LIST[@]}"; do

num_warmup_prompts=$((concurrency * NUM_WARMUP_MULT))
python3 -u "${WORK_DIR}/benchmark_serving.py" \
--model "${MODEL_NAME}" --tokenizer "${MODEL_PATH}" \
--host "$HOST" --port "$PORT" \
--backend "dynamo" --endpoint /v1/completions \
--disable-tqdm \
"${DATASET_ARGS[@]}" \
--num-prompts "$num_warmup_prompts" \
"${RANDOM_LEN_ARGS[@]}" \
--ignore-eos \
--request-rate 250 \
--percentile-metrics ttft,tpot,itl,e2el \
--max-concurrency "$concurrency" \
--trust-remote-code \
"${CHAT_TEMPLATE_ARGS[@]}" \
"${CUSTOM_TOKENIZER_ARGS[@]}"
if [ "$NUM_WARMUP_MULT" -gt 0 ]; then
num_warmup_prompts=$((concurrency * NUM_WARMUP_MULT))
python3 -u "${WORK_DIR}/benchmark_serving.py" \
--model "${MODEL_NAME}" --tokenizer "${MODEL_PATH}" \
--host "$HOST" --port "$PORT" \
--backend "dynamo" --endpoint /v1/completions \
--disable-tqdm \
"${DATASET_ARGS[@]}" \
--num-prompts "$num_warmup_prompts" \
"${RANDOM_LEN_ARGS[@]}" \
--ignore-eos \
--request-rate 250 \
--percentile-metrics ttft,tpot,itl,e2el \
--max-concurrency "$concurrency" \
--trust-remote-code \
"${CHAT_TEMPLATE_ARGS[@]}" \
"${CUSTOM_TOKENIZER_ARGS[@]}"
fi

num_prompts=$((concurrency * NUM_PROMPTS_MULT))

Expand Down Expand Up @@ -200,6 +224,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do
--trust-remote-code \
"${CHAT_TEMPLATE_ARGS[@]}" \
"${CUSTOM_TOKENIZER_ARGS[@]}" \
"${SLOW_DOWN_ARGS[@]}" \
"${SLOW_DOWN_EXTRA[@]}" \
--save-result --result-dir "$result_dir" --result-filename "$result_filename"
set +x

Expand Down
Loading
Loading