2 files changed: +10 −1 lines

@@ -21,6 +21,7 @@ hf download "$MODEL"
 nvidia-smi
 
 export PYTHONNOUSERSITE=1
+export VLLM_USE_FLASHINFER_MOE_INT4=1
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -38,7 +39,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \
   --tool-call-parser kimi_k2 \
   --compilation_config.pass_config.fuse_allreduce_rms true \
   --trust-remote-code \
-  --disable-log-requests > $SERVER_LOG 2>&1 &
+  --disable-log-requests \
+  --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
@@ -1048,3 +1048,9 @@
     - "Replace FP8 with combination of TP4 and TP8 config"
     - "Add --enable-flashinfer-allreduce-fusion to TP8"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/918
+
+- config-keys:
+    - kimik2.5-int4-b200-vllm
+  description:
+    - "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935
You can’t perform that action at this time.
0 commit comments