
Commit 5ed8518

Add GLM5 disagg MTP recipes and GB300 STP/MTP recipes
- Add GB200 MTP recipes for ISL1K_OSL1K (8 configs) and ISL8K_OSL1K (8 configs)
- Add GB300 STP recipes for ISL1K_OSL1K (8 configs) and ISL8K_OSL1K (9 configs)
- Add GB300 MTP recipes for ISL1K_OSL1K (10 configs) and ISL8K_OSL1K (10 configs)
- Update existing GB200 STP recipes with container and model path fixes
- All configs validated against source data tables (env_vars, ctx_config, gen_config)
1 parent ef5f32a commit 5ed8518

66 files changed: 7371 additions & 29 deletions

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp2"

# ctx: 1 prefill worker, TP4/EP4
# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
# concurrency: 666

model:
  path: "/mnt/lustre01/models/glm-5-nvfp4"
  container: "/mnt/lustre01/users/slurm-shared/yeswanthk/squashs/dynamo-trtllm-rihuo-glm5-2.0-arm64.sqsh"
  precision: "fp4"

resources:
  gpu_type: "gb200"

  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4

  decode_workers: 1
  decode_nodes: 4
  gpus_per_decode: 16

  gpus_per_node: 4

backend:
  type: trtllm

prefill_environment:
  ENROOT_ALLOW_DEV: "yes"
  MIMALLOC_PURGE_DELAY: "0"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"

decode_environment:
  ENROOT_ALLOW_DEV: "yes"
  MIMALLOC_PURGE_DELAY: "0"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"

trtllm_config:
  prefill:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    enable_attention_dp: true
    disable_overlap_scheduler: true
    trust_remote_code: true
    custom_tokenizer: "glm_moe_dsa"
    max_batch_size: 16
    max_num_tokens: 16384
    max_seq_len: 1064
    print_iter_log: true
    cuda_graph_config: null
    moe_config:
      backend: CUTEDSL
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 2

  decode:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    pipeline_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    trust_remote_code: true
    custom_tokenizer: "glm_moe_dsa"
    max_batch_size: 32
    max_num_tokens: 96
    max_seq_len: 2088
    print_iter_log: true
    stream_interval: 100
    num_postprocess_workers: 4
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 24
        - 32
    moe_config:
      backend: CUTEDSL
      use_low_precision_moe_combine: true
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    nvfp4_gemm_config:
      allowed_backends:
        - cutlass
        - cublaslt
        - cutedsl
        - cuda_core
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 2

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "666"
  req_rate: "inf"
  custom_tokenizer: "glm_moe_dsa"
  use_chat_template: false

frontend:
  type: "dynamo"
  enable_multiple_frontends: false

health_check:
  max_attempts: 360
  interval_seconds: 10

dynamo:
  install: false
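
A few sizing relationships tie this recipe together: the decode worker spans decode_nodes × gpus_per_node = 4 × 4 = 16 GPUs, matching TP16/EP16; with MTP-2, each decode step can process up to 1 + 2 tokens per sequence, consistent with max_num_tokens = 32 × (1 + 2) = 96; and decode max_seq_len = 2088 covers ISL 1024 + OSL 1024 with roughly 40 tokens of headroom. A minimal sanity-check sketch of these invariants follows (not part of this commit; the file name and the stated MTP sizing rule are assumptions):

import yaml  # pip install pyyaml

def check_recipe(path: str) -> None:
    with open(path) as f:
        cfg = yaml.safe_load(f)

    res = cfg["resources"]
    dec = cfg["trtllm_config"]["decode"]
    bench = cfg["benchmark"]

    # Decode worker spans whole nodes: 4 nodes x 4 GPUs/node = TP16/EP16.
    assert res["gpus_per_decode"] == res["decode_nodes"] * res["gpus_per_node"]
    assert dec["tensor_parallel_size"] == res["gpus_per_decode"]

    # Assumed MTP sizing rule: each decode step handles up to 1 + N tokens
    # per sequence, so max_num_tokens = max_batch_size * (1 + N) = 32 * 3 = 96.
    n = dec["speculative_config"]["num_nextn_predict_layers"]
    assert dec["max_num_tokens"] == dec["max_batch_size"] * (1 + n)

    # Decode sequences must fit ISL + OSL (2088 leaves ~40 tokens of slack).
    assert dec["max_seq_len"] >= bench["isl"] + bench["osl"]

check_recipe("recipe.yaml")  # hypothetical local copy of the file above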
Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch64_eplb0_mtp1"

# ctx: 1 prefill worker, TP4/EP4
# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64
# concurrency: 1229

model:
  path: "/mnt/lustre01/models/glm-5-nvfp4"
  container: "/mnt/lustre01/users/slurm-shared/yeswanthk/squashs/dynamo-trtllm-rihuo-glm5-2.0-arm64.sqsh"
  precision: "fp4"

resources:
  gpu_type: "gb200"

  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4

  decode_workers: 1
  decode_nodes: 4
  gpus_per_decode: 16

  gpus_per_node: 4

backend:
  type: trtllm

prefill_environment:
  ENROOT_ALLOW_DEV: "yes"
  MIMALLOC_PURGE_DELAY: "0"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"

decode_environment:
  ENROOT_ALLOW_DEV: "yes"
  MIMALLOC_PURGE_DELAY: "0"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"

trtllm_config:
  prefill:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    enable_attention_dp: true
    disable_overlap_scheduler: true
    trust_remote_code: true
    custom_tokenizer: "glm_moe_dsa"
    max_batch_size: 16
    max_num_tokens: 16384
    max_seq_len: 1064
    print_iter_log: true
    cuda_graph_config: null
    moe_config:
      backend: CUTEDSL
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 1

  decode:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    pipeline_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    trust_remote_code: true
    custom_tokenizer: "glm_moe_dsa"
    max_batch_size: 64
    max_num_tokens: 128
    max_seq_len: 2088
    print_iter_log: true
    stream_interval: 100
    num_postprocess_workers: 4
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 24
        - 32
        - 40
        - 48
        - 56
        - 64
    moe_config:
      backend: CUTEDSL
      use_low_precision_moe_combine: true
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    nvfp4_gemm_config:
      allowed_backends:
        - cutlass
        - cublaslt
        - cutedsl
        - cuda_core
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 1

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1229"
  req_rate: "inf"
  custom_tokenizer: "glm_moe_dsa"
  use_chat_template: false

frontend:
  type: "dynamo"
  enable_multiple_frontends: false

health_check:
  max_attempts: 360
  interval_seconds: 10

dynamo:
  install: false
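
This recipe is the batch32/mtp2 one above with a handful of knobs turned: MTP depth 1 instead of 2, decode max_batch_size 64 (with max_num_tokens rescaled to 64 × (1 + 1) = 128), a CUDA-graph batch-size list extended to 64, and benchmark concurrency 1229 instead of 666. With 66 near-identical files landing in one commit, a small script that diffs two recipes and prints only the changed keys makes each sweep explicit; a sketch under the assumption that two recipes sit in local files (names and helpers hypothetical, not from this repo):

import yaml  # pip install pyyaml

def flatten(d, prefix=""):
    # Flatten nested dicts to dotted keys; lists stay as leaf values.
    out = {}
    for k, v in d.items():
        if isinstance(v, dict):
            out.update(flatten(v, f"{prefix}{k}."))
        else:
            out[f"{prefix}{k}"] = v
    return out

def load(path):
    with open(path) as f:
        return flatten(yaml.safe_load(f))

a = load("batch32_mtp2.yaml")  # hypothetical file names
b = load("batch64_mtp1.yaml")
for key in sorted(a.keys() | b.keys()):
    if a.get(key) != b.get(key):
        print(f"{key}: {a.get(key)!r} -> {b.get(key)!r}")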
