From 65fe8841bcc77f28788b04ffa97b660363e90c93 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 4 Apr 2026 11:42:00 +0000 Subject: [PATCH 1/8] Add TP2EP2 for minimaxm2.5-fp8-mi355x-vllm Fewer GPUs means less inter-GPU communication overhead, and MoE expert parallelism across 2 GPUs is very efficient for this model. --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 72ef89c09..f55b216a9 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -345,13 +345,13 @@ minimaxm2.5-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } From faf1537e697c1f4174b383847225a82f622e8c70 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 4 Apr 2026 18:08:13 +0000 Subject: [PATCH 2/8] Optimize config for minimaxm2.5-fp8-mi355x-vllm --- .github/configs/amd-master.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f55b216a9..587dde107 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -345,15 +345,15 @@ minimaxm2.5-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 2, conc-end: 512 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } - isl: 8192 osl: 1024 search-space: - 
- { tp: 2, ep: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 From 772019f7ac5fdeae25adcb689d719518120cfd86 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 4 Apr 2026 20:08:47 +0000 Subject: [PATCH 3/8] Update perf-changelog for minimaxm2.5-fp8-mi355x-vllm --- perf-changelog.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c490922a4..2b87585f9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1244,4 +1244,9 @@ - "Remove ISL 1024 / OSL 8192 seq-len config" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 - +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm + description: + - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" + - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1002 From e9e56b67a1b8c4366080f2a69572529b17dd41b1 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 4 Apr 2026 21:13:09 +0000 Subject: [PATCH 4/8] Upgrade minimaxm2.5-fp8-mi355x-vllm Image to v0.19.0 Enable FP8 KV cache + AITER FA for minimaxm2.5-fp8-mi355x-vllm --- .github/configs/amd-master.yaml | 2 +- benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 587dde107..265dc48ca 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -334,7 +334,7 @@ kimik2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } minimaxm2.5-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:v0.19.0 
model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index adfb959cf..148562d2c 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -49,8 +49,10 @@ vllm serve $MODEL --port $PORT \ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ +--kv-cache-dtype fp8 \ --block-size=32 \ --no-enable-prefix-caching \ +--attention-backend "ROCM_AITER_FA" \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! From 76d4d2db81f57c980d15ca3f0b1b2c552281aaf3 Mon Sep 17 00:00:00 2001 From: zhutaoyu Date: Sun, 5 Apr 2026 07:13:20 +0000 Subject: [PATCH 5/8] optimize all reduce --- benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index 148562d2c..569172cee 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -25,6 +25,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} From 2f7e7ffa0fbd7db277252a9fc767181b5a13024f Mon Sep 17 00:00:00 2001 From: zhutaoyu Date: Sun, 5 Apr 2026 07:25:16 +0000 Subject: [PATCH 6/8] fix pr --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2b87585f9..9aa44d15d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1249,4 +1249,4 @@ description: - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1002 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1003 From 
35be871144c2dbba8e4d873dedd82c5592ad22a8 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 5 Apr 2026 07:24:29 +0000 Subject: [PATCH 7/8] Update perf-changelog --- perf-changelog.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9aa44d15d..310296a03 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1249,4 +1249,6 @@ description: - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" + - "Upgrade vLLM image to v0.19.0" + - "Enable FP8 KV cache + AITER FA for minimaxm2.5-fp8-mi355x-vllm" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1003 From 6a510857a9f4f62843feab7cc2e51c413606291c Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sun, 5 Apr 2026 07:30:37 +0000 Subject: [PATCH 8/8] Fix the perf-changelog --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 310296a03..b4aa8c595 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1244,6 +1244,13 @@ - "Remove ISL 1024 / OSL 8192 seq-len config" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm + description: + - "Optimize MiniMax-M2.5 FP8 MI355X vLLM search-space" + - "Add tp2 ep2 search-space entries (conc 2-256) for all seq-len configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1002 + - config-keys: - minimaxm2.5-fp8-mi355x-vllm description: