Skip to content

Commit 05852d7

Browse files
committed
Add Kimi K2.5 disagg STP and MTP recipes for GB200 NVfp4 (ISL8K_OSL1K and ISL1K_OSL1K)
Add optimized disaggregated inference recipes for Kimi K2.5 model with NVfp4 precision on GB200 GPUs. Includes both STP and MTP configurations for ISL8K_OSL1K and ISL1K_OSL1K workloads covering concurrency points from 5 to 2253, with Eagle speculative decoding for MTP variants.
1 parent ecd7a15 commit 05852d7

29 files changed

Lines changed: 4158 additions & 0 deletions
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp3"

# ctx: 1 prefill worker, TP4/EP4
# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
# MTP (Eagle speculative decoding, max_draft_len=3)
# concurrency: 666

model:
  path: "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/models/Kimi-K2.5-NVFP4"
  container: "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/squash/trtllm-main_aarch-46939060.sqsh"
  precision: "fp4"

resources:
  gpu_type: "gb200"

  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4

  decode_workers: 1
  decode_nodes: 4
  gpus_per_decode: 16

  gpus_per_node: 4

backend:
  type: trtllm

prefill_environment:
  ENROOT_ALLOW_DEV: "yes"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"
  HF_HOME: "/lustre/fsw/coreai_comparch_infbench/common/hf_cache"

decode_environment:
  ENROOT_ALLOW_DEV: "yes"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"
  HF_HOME: "/lustre/fsw/coreai_comparch_infbench/common/hf_cache"

trtllm_config:
  prefill:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    enable_attention_dp: true
    disable_overlap_scheduler: true
    trust_remote_code: true
    max_batch_size: 16
    max_num_tokens: 16384
    max_seq_len: 1064
    print_iter_log: true
    cuda_graph_config: null
    moe_config:
      backend: TRTLLM
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    speculative_config:
      decoding_type: Eagle
      max_draft_len: 3
      speculative_model_dir: "/eagle-model"

  decode:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    pipeline_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    trust_remote_code: true
    max_batch_size: 32
    max_num_tokens: 128
    max_seq_len: 2088
    print_iter_log: true
    stream_interval: 100
    num_postprocess_workers: 4
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 24
        - 32
    moe_config:
      backend: TRTLLM
      use_low_precision_moe_combine: true
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    nvfp4_gemm_config:
      allowed_backends:
        - cutlass
        - cublaslt
        - cutedsl
        - cuda_core
    speculative_config:
      decoding_type: Eagle
      max_draft_len: 3
      speculative_model_dir: "/eagle-model"

extra_mount:
  - "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/models/Kimi-K2.5-Thinking-Eagle3:/eagle-model"

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "666"
  req_rate: "inf"

frontend:
  type: "dynamo"
  enable_multiple_frontends: false

health_check:
  max_attempts: 360
  interval_seconds: 10

dynamo:
  install: false
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_eplb0_mtp3"

# ctx: 1 prefill worker, TP4/EP4
# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
# MTP (Eagle speculative decoding, max_draft_len=3)
# concurrency: 666

model:
  path: "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/models/Kimi-K2.5-NVFP4"
  container: "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/squash/trtllm-main_aarch-46939060.sqsh"
  precision: "fp4"

resources:
  gpu_type: "gb200"

  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4

  decode_workers: 1
  decode_nodes: 8
  gpus_per_decode: 32

  gpus_per_node: 4

backend:
  type: trtllm

prefill_environment:
  ENROOT_ALLOW_DEV: "yes"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"
  HF_HOME: "/lustre/fsw/coreai_comparch_infbench/common/hf_cache"

decode_environment:
  ENROOT_ALLOW_DEV: "yes"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"
  HF_HOME: "/lustre/fsw/coreai_comparch_infbench/common/hf_cache"

trtllm_config:
  prefill:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    enable_attention_dp: true
    disable_overlap_scheduler: true
    trust_remote_code: true
    max_batch_size: 16
    max_num_tokens: 16384
    max_seq_len: 1064
    print_iter_log: true
    cuda_graph_config: null
    moe_config:
      backend: TRTLLM
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    speculative_config:
      decoding_type: Eagle
      max_draft_len: 3
      speculative_model_dir: "/eagle-model"

  decode:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    trust_remote_code: true
    max_batch_size: 16
    max_num_tokens: 64
    max_seq_len: 2088
    print_iter_log: true
    stream_interval: 100
    num_postprocess_workers: 4
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
    moe_config:
      backend: TRTLLM
      use_low_precision_moe_combine: true
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    nvfp4_gemm_config:
      allowed_backends:
        - cutlass
        - cublaslt
        - cutedsl
        - cuda_core
    speculative_config:
      decoding_type: Eagle
      max_draft_len: 3
      speculative_model_dir: "/eagle-model"

extra_mount:
  - "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/models/Kimi-K2.5-Thinking-Eagle3:/eagle-model"

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "666"
  req_rate: "inf"

frontend:
  type: "dynamo"
  enable_multiple_frontends: false

health_check:
  max_attempts: 360
  interval_seconds: 10

dynamo:
  install: false

0 commit comments

Comments
 (0)