Skip to content

Commit 4a13ebb

Browse files
committed
Switch to flashinfer_nvlink_one_sided A2A backend for 8k1k config
1 parent fd250ac commit 4a13ebb

4 files changed

Lines changed: 7 additions & 7 deletions

File tree

recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ backend:
65 65
attention-backend: "FLASHINFER_MLA"
66 66
block-size: 64
67 67
attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
68-
all2all-backend: "allgather_reducescatter"
68+
all2all-backend: "flashinfer_nvlink_one_sided"
69 69
gpu-memory-utilization: 0.9
70 70

71 71
decode:

recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ backend:
65 65
attention-backend: "FLASHINFER_MLA"
66 66
block-size: 64
67 67
attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
68-
all2all-backend: "allgather_reducescatter"
68+
all2all-backend: "flashinfer_nvlink_one_sided"
69 69
gpu-memory-utilization: 0.9
70 70

71 71
decode:
@@ -87,7 +87,7 @@ backend:
87 87
async-scheduling: true
88 88
attention-backend: "FLASHINFER_MLA"
89 89
block-size: 64
90-
all2all-backend: "allgather_reducescatter"
90+
all2all-backend: "flashinfer_nvlink_one_sided"
91 91
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
92 92
gpu-memory-utilization: 0.9
93 93
stream-interval: 50

recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ backend:
65 65
attention-backend: "FLASHINFER_MLA"
66 66
block-size: 64
67 67
attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
68-
all2all-backend: "allgather_reducescatter"
68+
all2all-backend: "flashinfer_nvlink_one_sided"
69 69
gpu-memory-utilization: 0.9
70 70

71 71
decode:
@@ -87,7 +87,7 @@ backend:
87 87
async-scheduling: true
88 88
attention-backend: "FLASHINFER_MLA"
89 89
block-size: 64
90-
all2all-backend: "allgather_reducescatter"
90+
all2all-backend: "flashinfer_nvlink_one_sided"
91 91
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
92 92
gpu-memory-utilization: 0.9
93 93
stream-interval: 50

recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ backend:
65 65
attention-backend: "FLASHINFER_MLA"
66 66
block-size: 64
67 67
attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
68-
all2all-backend: "allgather_reducescatter"
68+
all2all-backend: "flashinfer_nvlink_one_sided"
69 69
gpu-memory-utilization: 0.9
70 70

71 71
decode:
@@ -87,7 +87,7 @@ backend:
87 87
async-scheduling: true
88 88
attention-backend: "FLASHINFER_MLA"
89 89
block-size: 64
90-
all2all-backend: "allgather_reducescatter"
90+
all2all-backend: "flashinfer_nvlink_one_sided"
91 91
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
92 92
gpu-memory-utilization: 0.9
93 93
stream-interval: 50

0 commit comments

Comments (0)