Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 218 additions & 0 deletions recipes/b200-fp8/glm5.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
---
# B200-FP8 GLM5 disaggregated (prefill/decode) serving recipe.
# Covers both 8k1k and 1k1k ISL/OSL benchmark sweeps via the
# zip_override_* sections at the bottom of this file.

base:
  name: "b200-fp8-glm5"

model:
  path: "glm5-fp8"
  container: "sglang-v0.5.10-cu130.post1"
  precision: "fp8"

# Exact artifact pins for reproducibility.
identity:
  model:
    repo: "zai-org/GLM-5-FP8"
    # Commit SHA pin — quoted so YAML keeps it a string.
    revision: "4f96cc5eec29dcee5d6ded54f7ffe889438f9516"
  frameworks:
    dynamo: "1.1.0.dev2"
    sglang: "0.5.10.post1"

# Base hardware shape; per-sweep node/worker counts come from the
# zip_override_* sections below.
resources:
  gpu_type: "b200"
  gpus_per_node: 8

# Request-routing frontend (version matches identity.frameworks.dynamo above).
frontend:
  type: "dynamo"
  dynamo:
    version: "1.1.0.dev2"

# Worker configuration: environment variables plus sglang launch flags,
# split by disaggregation role (prefill vs decode).
backend:
  # NOTE(review): prefill_environment and decode_environment are intentionally
  # identical — keep them in sync when editing either one.
  prefill_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
    # Very large values effectively disable the disaggregation heartbeat /
    # bootstrap / waiting timeouts so long benchmark runs are not torn down.
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    NCCL_CUMEM_ENABLE: "1"
    DYN_REQUEST_PLANE: "nats"

  decode_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    NCCL_CUMEM_ENABLE: "1"
    DYN_REQUEST_PLANE: "nats"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "GLM-5-FP8"
      trust-remote-code: true
      quantization: "fp8"
      kv-cache-dtype: "fp8_e4m3"

      # Disaggregation mode
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: "nixl"

      # Size limits
      max-running-requests: 256
      cuda-graph-max-bs: 256
      mem-fraction-static: 0.7
      # 9600 covers 8192 ISL + 1024 OSL with headroom.
      context-length: 9600
      chunked-prefill-size: 65536
      max-prefill-tokens: 8192

      # Parallelism
      tensor-parallel-size: 8
      data-parallel-size: 8
      expert-parallel-size: 1
      enable-dp-attention: true
      enable-dp-lm-head: true
      load-balance-method: "total_tokens"

      # Kernel backends
      nsa-decode-backend: "trtllm"
      nsa-prefill-backend: "trtllm"
      moe-runner-backend: "flashinfer_trtllm"

      # Other flags
      enable-flashinfer-allreduce-fusion: true
      # Prefix caching off — presumably for benchmark reproducibility; confirm.
      disable-radix-cache: true
      stream-interval: 30
      model-loader-extra-config: '{"enable_multithread_load": true}'

    decode:
      # Model configuration
      served-model-name: "GLM-5-FP8"
      trust-remote-code: true
      quantization: "fp8"
      kv-cache-dtype: "fp8_e4m3"

      # Disaggregation mode
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: "nixl"

      # Memory and token limits
      mem-fraction-static: 0.8
      context-length: 9600

      # Parallelism — data-parallel-size is supplied per-sweep by the
      # zip_override_* sections below.
      tensor-parallel-size: 8
      expert-parallel-size: 1

      # Kernel backends
      nsa-decode-backend: "trtllm"
      nsa-prefill-backend: "trtllm"
      moe-runner-backend: "flashinfer_trtllm"

      # Other flags
      enable-flashinfer-allreduce-fusion: true
      disable-radix-cache: true
      stream-interval: 30
      model-loader-extra-config: '{"enable_multithread_load": true}'

# NOTE(review): nesting reconstructed — confirm health_check and benchmark are
# top-level keys (siblings of backend), matching the zip_override_* sections.
health_check:
  # 360 attempts x 10 s = up to 1 hour for workers to become ready.
  max_attempts: 360
  interval_seconds: 10

benchmark:
  type: "sa-bench"
  # "inf": issue requests as fast as the concurrency limit allows.
  req_rate: "inf"

#### 8k1k (ISL 8192 / OSL 1024) sweeps ####

# High-throughput sweep. List-valued keys are presumably zipped element-wise
# (hence "zip_override"): index i across resources/backend/benchmark forms one
# benchmark configuration — TODO confirm against the recipe runner.
zip_override_8k1k_hightpt:
  resources:
    prefill_nodes: [2, 1, 1]
    prefill_workers: [2, 1, 1]
    decode_nodes: [1, 1, 2]
    decode_workers: [1, 1, 2]
  backend:
    sglang_config:
      decode:
        # Enable DP attention on decode for throughput-optimized configs.
        data-parallel-size: 8
        enable-dp-lm-head: true
        enable-dp-attention: true
        load-balance-method: "total_tokens"

        max-running-requests: [544, 224, 208]
        cuda-graph-max-bs: [544, 224, 208]

  benchmark:
    isl: 8192
    osl: 1024
    concurrencies: ["560", "240", "224"]

# Low-latency sweep: single prefill node, decode scaled wide with small
# per-configuration batch limits.
zip_override_8k1k_lowlat:
  resources:
    prefill_nodes: 1
    prefill_workers: 1
    # NOTE(review): node counts skip 6 — confirm that is intentional.
    decode_nodes: [2, 3, 4, 5, 7, 8]
    decode_workers: [2, 3, 4, 5, 7, 8]
  backend:
    sglang_config:
      decode:
        # TP-only decode (no DP attention) for latency-optimized configs.
        data-parallel-size: 1

        max-running-requests: [80, 48, 34, 22, 8, 1]
        cuda-graph-max-bs: [80, 48, 34, 22, 8, 1]

  benchmark:
    isl: 8192
    osl: 1024
    concurrencies: ["256", "256", "200", "128", "64", "12"]



#### 1k1k (ISL 1024 / OSL 1024) sweeps ####

# High-throughput sweep; list-valued keys are zipped element-wise — index i
# across resources/backend/benchmark forms one configuration.
zip_override_1k1k_hightpt:
  resources:
    prefill_nodes: [1, 1, 1, 1]
    prefill_workers: [1, 1, 1, 1]
    decode_nodes: [1, 2, 3, 4]
    decode_workers: [1, 2, 3, 4]
  backend:
    sglang_config:
      decode:
        # Enable DP attention on decode for throughput-optimized configs.
        data-parallel-size: 8
        enable-dp-lm-head: true
        enable-dp-attention: true
        load-balance-method: "total_tokens"

        max-running-requests: [2560, 1232, 784, 560]
        cuda-graph-max-bs: [2560, 1232, 784, 560]

  benchmark:
    isl: 1024
    osl: 1024
    concurrencies: ["2576", "1248", "800", "576"]


# Low-latency sweep: single prefill node, two 8-node decode configurations.
zip_override_1k1k_lowlat:
  resources:
    prefill_nodes: 1
    prefill_workers: 1
    decode_nodes: [8, 8]
    decode_workers: [8, 8]
  backend:
    sglang_config:
      decode:
        # TP-only decode (no DP attention) for latency-optimized configs.
        data-parallel-size: 1

        max-running-requests: [64, 1]
        cuda-graph-max-bs: [64, 1]

  benchmark:
    isl: 1024
    osl: 1024
    # NOTE(review): "512x256x128x64x32" presumably expands to multiple
    # concurrency points within one run — confirm the sweep syntax the
    # sa-bench harness expects.
    concurrencies: ["512x256x128x64x32", "16"]
Loading
Loading