Commit 79ea365

Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 (#935)
* Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200
* Update perf-changelog.yaml

Co-authored-by: ankursingh-nv <ankusingh@nvidia.com>
1 parent 6a5dad4 commit 79ea365

File tree: 2 files changed (+10 −1 lines)


benchmarks/single_node/kimik2.5_int4_b200.sh

Lines changed: 3 additions & 1 deletion

```diff
@@ -21,6 +21,7 @@ hf download "$MODEL"
 nvidia-smi
 
 export PYTHONNOUSERSITE=1
+export VLLM_USE_FLASHINFER_MOE_INT4=1
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -38,7 +39,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
   --tool-call-parser kimi_k2 \
   --compilation_config.pass_config.fuse_allreduce_rms true \
   --trust-remote-code \
-  --disable-log-requests > $SERVER_LOG 2>&1 &
+  --disable-log-requests \
+  --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
```

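The first hunk exports VLLM_USE_FLASHINFER_MOE_INT4=1 before `vllm serve` launches, so the server process inherits the flag from its environment. A minimal sketch of that env-gating pattern (the conditional check is illustrative, not taken from the benchmark script):

```shell
#!/bin/sh
# Export the flag before starting the server: a child process such as
# `vllm serve` only sees variables that were exported beforehand.
export VLLM_USE_FLASHINFER_MOE_INT4=1

# Illustrative guard (not from the repo): confirm the flag is set,
# defaulting to "0" when the variable is absent.
if [ "${VLLM_USE_FLASHINFER_MOE_INT4:-0}" = "1" ]; then
  echo "FlashInfer INT4 MoE kernels requested"
fi
```

Because the variable is exported rather than set inline, it also applies to any later restarts of the server within the same script.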
perf-changelog.yaml

Lines changed: 7 additions & 0 deletions

```diff
@@ -1048,3 +1048,10 @@
   - "Replace FP8 with combination of TP4 and TP8 config"
   - "Add --enable-flashinfer-allreduce-fusion to TP8"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918
+
+- config-keys:
+    - kimik2.5-int4-b200-vllm
+  description:
+    - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935
+
```

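The changelog addition follows the same three-field shape as the neighboring entries: `config-keys` and `description` are lists, and `pr-link` is a URL. A minimal sketch of a shape check for such an entry (the `validate_entry` helper is hypothetical, not part of the InferenceX repo; field names are taken from the diff):

```python
# Hypothetical validator for one perf-changelog.yaml entry, mirroring the
# fields visible in the diff: config-keys, description, pr-link.
def validate_entry(entry: dict) -> bool:
    required = {"config-keys", "description", "pr-link"}
    if not required.issubset(entry):
        return False
    return (
        isinstance(entry["config-keys"], list)
        and isinstance(entry["description"], list)
        and entry["pr-link"].startswith("https://")
    )

# The entry added by this commit, as a plain dict.
entry = {
    "config-keys": ["kimik2.5-int4-b200-vllm"],
    "description": [
        "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark"
    ],
    "pr-link": "https://github.com/SemiAnalysisAI/InferenceX/pull/935",
}
print(validate_entry(entry))  # True
```

A check like this could run in CI to catch malformed changelog entries before merge.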