Skip to content

Commit f8fb4d7

Browse files
committed
Add Kimi-K2.5-nvfp4 GB200-disagg 8k1k for vllm
1 parent 896eabe commit f8fb4d7

4 files changed

Lines changed: 401 additions & 0 deletions

File tree

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4"
2+
3+
model:
4+
path: "kimi-k2.5-nvfp4"
5+
container: "v0.18.0"
6+
precision: "fp4"
7+
8+
dynamo:
9+
version: 1.0.1
10+
install: true
11+
12+
setup_script: vllm-container-deps.sh
13+
14+
resources:
15+
gpu_type: "gb200"
16+
gpus_per_node: 4
17+
prefill_nodes: 1
18+
decode_nodes: 4
19+
prefill_workers: 1
20+
decode_workers: 4
21+
gpus_per_prefill: 4
22+
gpus_per_decode: 4
23+
24+
frontend:
25+
type: dynamo
26+
enable_multiple_frontends: false
27+
28+
backend:
29+
type: vllm
30+
connector: null
31+
32+
prefill_environment:
33+
VLLM_USE_FLASHINFER_MOE_FP4: "1"
34+
VLLM_USE_NCCL_SYMM_MEM: "1"
35+
NCCL_CUMEM_ENABLE: "1"
36+
NCCL_MNNVL_ENABLE: "1"
37+
NCCL_NVLS_ENABLE: "1"
38+
39+
decode_environment:
40+
VLLM_USE_FLASHINFER_MOE_FP4: "1"
41+
VLLM_USE_NCCL_SYMM_MEM: "1"
42+
NCCL_CUMEM_ENABLE: "1"
43+
NCCL_MNNVL_ENABLE: "1"
44+
NCCL_NVLS_ENABLE: "1"
45+
46+
vllm_config:
47+
prefill:
48+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
49+
served-model-name: "nvidia/Kimi-K2.5-NVFP4"
50+
kv-cache-dtype: "fp8"
51+
tensor-parallel-size: 1
52+
pipeline-parallel-size: 1
53+
data-parallel-size: 4
54+
data-parallel-rpc-port: 13345
55+
enable-expert-parallel: true
56+
max-model-len: 10240
57+
max-num-seqs: 64
58+
enforce-eager: true
59+
compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
60+
max-num-batched-tokens: 16384
61+
safetensors-load-strategy: "prefetch"
62+
trust-remote-code: true
63+
no-enable-prefix-caching: true
64+
no-enable-chunked-prefill: true
65+
attention-backend: "FLASHINFER_MLA"
66+
block-size: 64
67+
attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
68+
all2all-backend: "allgather_reducescatter"
69+
gpu-memory-utilization: 0.9
70+
71+
decode:
72+
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
73+
served-model-name: "nvidia/Kimi-K2.5-NVFP4"
74+
kv-cache-dtype: "fp8"
75+
tensor-parallel-size: 4
76+
pipeline-parallel-size: 1
77+
enable-expert-parallel: true
78+
max-model-len: 10240
79+
max-num-seqs: 16
80+
max-num-batched-tokens: 10240
81+
safetensors-load-strategy: "prefetch"
82+
trust-remote-code: true
83+
no-enable-prefix-caching: true
84+
no-enable-chunked-prefill: true
85+
async-scheduling: true
86+
attention-backend: "FLASHINFER_MLA"
87+
block-size: 64
88+
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
89+
gpu-memory-utilization: 0.9
90+
stream-interval: 50
91+
max-cudagraph-capture-size: 16
92+
93+
benchmark:
94+
type: "sa-bench"
95+
isl: 8192
96+
osl: 1024
97+
concurrencies: "4x8x16x32x64x128x256"
98+
req_rate: "inf"
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
---
# Disaggregated vLLM benchmark config: Kimi-K2.5 NVFP4 on GB200,
# 3 prefill workers (DEP4) + 1 decode worker (DEP16), 8k ISL / 1k OSL.
# NOTE(review): nesting of the top-level stanzas was reconstructed from a
# flattened diff — confirm against the other configs in this directory.
name: "kimi-vllm-disagg-gb200-3p1d-dep4-dep16"

model:
  path: "kimi-k2.5-nvfp4"
  container: "v0.18.0"
  precision: "fp4"

dynamo:
  # Quoted so version strings are never re-typed by a YAML loader.
  version: "1.0.1"
  install: true

setup_script: vllm-container-deps.sh

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 3
  # One decode worker spanning 4 nodes (16 GPUs = 4 nodes x 4 GPUs).
  decode_nodes: 4
  prefill_workers: 3
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 16

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

# Env var values quoted deliberately — consumers expect strings, not booleans/ints.
prefill_environment:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"

decode_environment:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"

vllm_config:
  # Prefill: DP=4 with expert parallelism; eager mode (no cudagraphs for prefill).
  prefill:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "nvidia/Kimi-K2.5-NVFP4"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 4
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 10240
    max-num-seqs: 64
    enforce-eager: true
    compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
    max-num-batched-tokens: 16384
    safetensors-load-strategy: "prefetch"
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-chunked-prefill: true
    attention-backend: "FLASHINFER_MLA"
    block-size: 64
    attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
    all2all-backend: "allgather_reducescatter"
    gpu-memory-utilization: 0.9

  # Decode: wide data parallelism (DP=16) with decode-only cudagraphs.
  decode:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "nvidia/Kimi-K2.5-NVFP4"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 16
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 10240
    max-num-seqs: 256
    max-num-batched-tokens: 10240
    safetensors-load-strategy: "prefetch"
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-chunked-prefill: true
    async-scheduling: true
    attention-backend: "FLASHINFER_MLA"
    block-size: 64
    all2all-backend: "allgather_reducescatter"
    compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
    gpu-memory-utilization: 0.9
    stream-interval: 50
    max-cudagraph-capture-size: 256

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  # "x"-separated list of concurrency levels swept by the benchmark harness.
  concurrencies: "512x1024"
  req_rate: "inf"
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
---
# Disaggregated vLLM benchmark config: Kimi-K2.5 NVFP4 on GB200,
# 5 prefill workers (DEP4) + 1 decode worker (DEP8), 8k ISL / 1k OSL.
# NOTE(review): nesting of the top-level stanzas was reconstructed from a
# flattened diff — confirm against the other configs in this directory.
name: "kimi-vllm-disagg-gb200-5p1d-dep4-dep8"

model:
  path: "kimi-k2.5-nvfp4"
  container: "v0.18.0"
  precision: "fp4"

dynamo:
  # Quoted so version strings are never re-typed by a YAML loader.
  version: "1.0.1"
  install: true

setup_script: vllm-container-deps.sh

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 5
  # One decode worker spanning 2 nodes (8 GPUs = 2 nodes x 4 GPUs).
  decode_nodes: 2
  prefill_workers: 5
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 8

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

# Env var values quoted deliberately — consumers expect strings, not booleans/ints.
prefill_environment:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"

decode_environment:
  VLLM_USE_FLASHINFER_MOE_FP4: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"

vllm_config:
  # Prefill: DP=4 with expert parallelism; eager mode (no cudagraphs for prefill).
  prefill:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "nvidia/Kimi-K2.5-NVFP4"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 4
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 10240
    max-num-seqs: 64
    enforce-eager: true
    compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
    max-num-batched-tokens: 16384
    safetensors-load-strategy: "prefetch"
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-chunked-prefill: true
    attention-backend: "FLASHINFER_MLA"
    block-size: 64
    attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
    all2all-backend: "allgather_reducescatter"
    gpu-memory-utilization: 0.9

  # Decode: data parallelism (DP=8) with decode-only cudagraphs.
  decode:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "nvidia/Kimi-K2.5-NVFP4"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 8
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 10240
    max-num-seqs: 512
    max-num-batched-tokens: 10240
    safetensors-load-strategy: "prefetch"
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-chunked-prefill: true
    async-scheduling: true
    attention-backend: "FLASHINFER_MLA"
    block-size: 64
    all2all-backend: "allgather_reducescatter"
    compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
    gpu-memory-utilization: 0.9
    stream-interval: 50
    max-cudagraph-capture-size: 512

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  # "x"-separated list of concurrency levels swept by the benchmark harness.
  concurrencies: "2048"
  req_rate: "inf"

0 commit comments

Comments
 (0)