Switch to flashinfer_nvlink_one_sided all2all (A2A) backend for Kimi K2.5 8k1k configs

kyleliang-nv · kyleliang-nv · commit 4a13ebbc1d95 · 2026-04-03T09:45:04.000-07:00
diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
@@ -65,7 +65,7 @@ backend:
       attention-backend: "FLASHINFER_MLA"
       block-size: 64
       attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
-      all2all-backend: "allgather_reducescatter"
+      all2all-backend: "flashinfer_nvlink_one_sided"
       gpu-memory-utilization: 0.9
 
     decode:
diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml
@@ -65,7 +65,7 @@ backend:
       attention-backend: "FLASHINFER_MLA"
       block-size: 64
       attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
-      all2all-backend: "allgather_reducescatter"
+      all2all-backend: "flashinfer_nvlink_one_sided"
       gpu-memory-utilization: 0.9
 
     decode:
@@ -87,7 +87,7 @@ backend:
       async-scheduling: true
       attention-backend: "FLASHINFER_MLA"
       block-size: 64
-      all2all-backend: "allgather_reducescatter"
+      all2all-backend: "flashinfer_nvlink_one_sided"
       compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
       gpu-memory-utilization: 0.9
       stream-interval: 50
diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml
@@ -65,7 +65,7 @@ backend:
       attention-backend: "FLASHINFER_MLA"
       block-size: 64
       attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
-      all2all-backend: "allgather_reducescatter"
+      all2all-backend: "flashinfer_nvlink_one_sided"
       gpu-memory-utilization: 0.9
 
     decode:
@@ -87,7 +87,7 @@ backend:
       async-scheduling: true
       attention-backend: "FLASHINFER_MLA"
       block-size: 64
-      all2all-backend: "allgather_reducescatter"
+      all2all-backend: "flashinfer_nvlink_one_sided"
       compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
       gpu-memory-utilization: 0.9
       stream-interval: 50
diff --git a/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml b/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml
@@ -65,7 +65,7 @@ backend:
       attention-backend: "FLASHINFER_MLA"
       block-size: 64
       attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}'
-      all2all-backend: "allgather_reducescatter"
+      all2all-backend: "flashinfer_nvlink_one_sided"
       gpu-memory-utilization: 0.9
 
     decode:
@@ -87,7 +87,7 @@ backend:
       async-scheduling: true
       attention-backend: "FLASHINFER_MLA"
       block-size: 64
-      all2all-backend: "allgather_reducescatter"
+      all2all-backend: "flashinfer_nvlink_one_sided"
       compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}'
       gpu-memory-utilization: 0.9
       stream-interval: 50