Skip to content

Commit f8fb4d7

Browse files
committed
Add Kimi-K2.5-nvfp4 GB200-disagg 8k1k for vllm
1 parent 896eabe commit f8fb4d7

4 files changed

Lines changed: 401 additions & 0 deletions

File tree

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4"
2+
3+
model:
4+
path: "kimi-k2.5-nvfp4"
5+
container: "v0.18.0"
6+
precision: "fp4"
7+
8+
dynamo:
9+
version: 1.0.1
10+
install: true
11+
12+
setup_script: vllm-container-deps.sh
13+
14+
resources:
15+
gpu_type: "gb200"
16+
gpus_per_node: 4
17+
prefill_nodes: 1
18+
decode_nodes: 4
19+
prefill_workers: 1
20+
decode_workers: 4
21+
gpus_per_prefill: 4
22+
gpus_per_decode: 4
23+
24+
frontend:
25+
type: dynamo
26+
enable_multiple_frontends: false
27+
28+
backend:
29+
type: vllm
30+
connector: null
31+
32+
prefill_environment:
33+
VLLM_USE_FLASHINFER_MOE_FP4: "1"
34+
VLLM_USE_NCCL_SYMM_MEM: "1"
35+
NCCL_CUMEM_ENABLE: "1"
36+
NCCL_MNNVL_ENABLE: "1"
37+
NCCL_NVLS_ENABLE: "1"
38+
39+
decode_environment:
40+
VLLM_USE_FLASHINFER_MOE_FP4: "1"
41+
VLLM_USE_NCCL_SYMM_MEM: "1"
42+
NCCL_CUMEM_ENABLE: "1"
43+
NCCL_MNNVL_ENABLE: "1"
44+
NCCL_NVLS_ENABLE: "1"
45+
46+
vllm_config:
47+
prefill:
48+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
49+
served-model-name: "nvidia/Kimi-K2.5-NVFP4"
50+
kv-cache-dtype: "fp8"
51+
tensor-parallel-size: 1
52+
pipeline-parallel-size: 1
53+
data-parallel-size: 4
54+
data-parallel-rpc-port: 13345
55+
enable-expert-parallel: true
56+
max-model-len: 10240
57+
max-num-seqs: 64
58+
enforce-eager: true
59+
compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
60+
max-num-batched-tokens: 16384
61+
safetensors-load-strategy: "prefetch"
62+
trust-remote-code: true
63+
no-enable-prefix-caching: true
64+
no-enable-chunked-prefill: true
65+
attention-backend: "FLASHINFER_MLA"
66+
block-size: 64
67+
attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
68+
all2all-backend: "allgather_reducescatter"
69+
gpu-memory-utilization: 0.9
70+
71+
decode:
72+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
73+
served-model-name: "nvidia/Kimi-K2.5-NVFP4"
74+
kv-cache-dtype: "fp8"
75+
tensor-parallel-size: 4
76+
pipeline-parallel-size: 1
77+
enable-expert-parallel: true
78+
max-model-len: 10240
79+
max-num-seqs: 16
80+
max-num-batched-tokens: 10240
81+
safetensors-load-strategy: "prefetch"
82+
trust-remote-code: true
83+
no-enable-prefix-caching: true
84+
no-enable-chunked-prefill: true
85+
async-scheduling: true
86+
attention-backend: "FLASHINFER_MLA"
87+
block-size: 64
88+
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
89+
gpu-memory-utilization: 0.9
90+
stream-interval: 50
91+
max-cudagraph-capture-size: 16
92+
93+
benchmark:
94+
type: "sa-bench"
95+
isl: 8192
96+
osl: 1024
97+
concurrencies: "4x8x16x32x64x128x256"
98+
req_rate: "inf"
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
---
# Disaggregated vLLM benchmark config: Kimi-K2.5 NVFP4 on GB200,
# 3 prefill workers (DEP4) + 1 decode worker (DEP16), 8k ISL / 1k OSL.
# NOTE(review): nesting of the top-level stanzas was reconstructed from a
# flattened diff — confirm against the other configs in this directory.
name: "kimi-vllm-disagg-gb200-3p1d-dep4-dep16"

model:
  path: "kimi-k2.5-nvfp4"
  container: "v0.18.0"
  precision: "fp4"

dynamo:
  # Quoted so version strings are never re-typed by a YAML loader.
  version: "1.0.1"
  install: true

setup_script: vllm-container-deps.sh

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 3
  # One decode worker spanning 4 nodes (16 GPUs = 4 nodes x 4 GPUs).
  decode_nodes: 4
  prefill_workers: 3
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 16

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

# Env var values quoted deliberately — consumers expect strings, not booleans/ints.
prefill_environment:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"

decode_environment:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"

vllm_config:
  # Prefill: DP=4 with expert parallelism; eager mode (no cudagraphs for prefill).
  prefill:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "nvidia/Kimi-K2.5-NVFP4"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 4
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 10240
    max-num-seqs: 64
    enforce-eager: true
    compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
    max-num-batched-tokens: 16384
    safetensors-load-strategy: "prefetch"
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-chunked-prefill: true
    attention-backend: "FLASHINFER_MLA"
    block-size: 64
    attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
    all2all-backend: "allgather_reducescatter"
    gpu-memory-utilization: 0.9

  # Decode: wide data parallelism (DP=16) with decode-only cudagraphs.
  decode:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "nvidia/Kimi-K2.5-NVFP4"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 16
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 10240
    max-num-seqs: 256
    max-num-batched-tokens: 10240
    safetensors-load-strategy: "prefetch"
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-chunked-prefill: true
    async-scheduling: true
    attention-backend: "FLASHINFER_MLA"
    block-size: 64
    all2all-backend: "allgather_reducescatter"
    compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
    gpu-memory-utilization: 0.9
    stream-interval: 50
    max-cudagraph-capture-size: 256

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  # "x"-separated list of concurrency levels swept by the benchmark harness.
  concurrencies: "512x1024"
  req_rate: "inf"
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
---
# Disaggregated vLLM benchmark config: Kimi-K2.5 NVFP4 on GB200,
# 5 prefill workers (DEP4) + 1 decode worker (DEP8), 8k ISL / 1k OSL.
# NOTE(review): nesting of the top-level stanzas was reconstructed from a
# flattened diff — confirm against the other configs in this directory.
name: "kimi-vllm-disagg-gb200-5p1d-dep4-dep8"

model:
  path: "kimi-k2.5-nvfp4"
  container: "v0.18.0"
  precision: "fp4"

dynamo:
  # Quoted so version strings are never re-typed by a YAML loader.
  version: "1.0.1"
  install: true

setup_script: vllm-container-deps.sh

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 5
  # One decode worker spanning 2 nodes (8 GPUs = 2 nodes x 4 GPUs).
  decode_nodes: 2
  prefill_workers: 5
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 8

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

# Env var values quoted deliberately — consumers expect strings, not booleans/ints.
prefill_environment:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"

decode_environment:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"

vllm_config:
  # Prefill: DP=4 with expert parallelism; eager mode (no cudagraphs for prefill).
  prefill:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "nvidia/Kimi-K2.5-NVFP4"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 4
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 10240
    max-num-seqs: 64
    enforce-eager: true
    compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
    max-num-batched-tokens: 16384
    safetensors-load-strategy: "prefetch"
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-chunked-prefill: true
    attention-backend: "FLASHINFER_MLA"
    block-size: 64
    attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
    all2all-backend: "allgather_reducescatter"
    gpu-memory-utilization: 0.9

  # Decode: data parallelism (DP=8) with decode-only cudagraphs.
  decode:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "nvidia/Kimi-K2.5-NVFP4"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 8
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 10240
    max-num-seqs: 512
    max-num-batched-tokens: 10240
    safetensors-load-strategy: "prefetch"
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-chunked-prefill: true
    async-scheduling: true
    attention-backend: "FLASHINFER_MLA"
    block-size: 64
    all2all-backend: "allgather_reducescatter"
    compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
    gpu-memory-utilization: 0.9
    stream-interval: 50
    max-cudagraph-capture-size: 512

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  # "x"-separated list of concurrency levels swept by the benchmark harness.
  concurrencies: "2048"
  req_rate: "inf"

0 commit comments

Comments
 (0)