diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 72ef89c09..265dc48ca 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -334,7 +334,7 @@ kimik2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } minimaxm2.5-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:v0.19.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x @@ -345,15 +345,15 @@ minimaxm2.5-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 2, conc-end: 512 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index adfb959cf..569172cee 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -25,6 +25,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -49,8 +50,10 @@ vllm serve $MODEL --port $PORT \ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ +--kv-cache-dtype fp8 \ --block-size=32 \ --no-enable-prefix-caching \ +--attention-backend "ROCM_AITER_FA" \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c490922a4..b4aa8c595 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1244,4 +1244,16 @@ - "Remove ISL 1024 / OSL 8192 seq-len config" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm + description: + - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" + - "Add tp2 ep2 search-space entries for all seq-len configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1002 +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm + description: + - "Upgrade vLLM image to v0.19.0" + - "Enable FP8 KV cache + AITER FA for minimaxm2.5-fp8-mi355x-vllm" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1003