From e655a4c7d9d00e6a96ba83f01e434dc19589e462 Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Mon, 29 Jun 2026 21:17:55 +0000
Subject: [PATCH 1/4] [AMD] enable AITER MoE for MiniMax-M3 MI355X vLLM MTP
 benchmarks

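Mirror the STP AITER MoE and shared-expert fusion knobs on the MXFP4/MXFP8
EAGLE3 launchers: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1,
and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and pass --moe-backend
aiter to vllm serve. The FP8 MTP launcher additionally exports
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 (INT6 quick-reduce).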

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh        | 5 +++++
 .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh | 5 +++++
 perf-changelog.yaml                                       | 8 ++++++++
 3 files changed, 18 insertions(+)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
index 96a5604934..69ba51f4f0 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
@@ -5,6 +5,7 @@
 # minimaxm3_fp4_mi355x_vllm.sh and uses three speculative tokens from
 # Inferact/MiniMax-M3-EAGLE3. The pinned nightly includes upstream AMD
 # MiniMax-M3 SupportsEagle3 support, so no runtime model patch is needed.
+# MoE serving mirrors minimaxm3_fp4_mi355x_vllm.sh (AITER MoE, vllm#46419).
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -36,6 +37,9 @@ fi
 SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_MOE=1
+export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -65,6 +69,7 @@ vllm serve "$MODEL" --port "$PORT" \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
+    --moe-backend aiter \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --enable-auto-tool-choice \
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
index 757d54786f..281dbe297b 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
@@ -61,6 +61,10 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600
 # Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0
 # avoids the M3-decode breakable-cudagraph path that previously forced eager.
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_MOE=1
+export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -177,6 +181,7 @@ vllm serve "$MODEL" --port "$PORT" \
     --max-model-len "$MAX_MODEL_LEN" \
     --kv-cache-dtype fp8 \
     --attention-backend TRITON_ATTN \
+    --moe-backend aiter \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --reasoning-parser minimax_m3 \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5d8427dffd..afff9af93c 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4316,3 +4316,11 @@
   description:
     - "Update the DeepSeek-V4-Pro B300 disaggregated Dynamo-vLLM benchmark to the vllm/vllm-openai:v0.23.0 image"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1952
+
+- config-keys:
+    - minimaxm3-fp4-mi355x-vllm-mtp
+    - minimaxm3-fp8-mi355x-vllm-mtp
+  description:
+    - "Enable AITER MoE on MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP benchmarks (MXFP4 and MXFP8): export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter."
+    - "MXFP8 MTP also exports VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6. Mirrors the STP AITER MoE knobs from #1954 with three Inferact/MiniMax-M3-EAGLE3 speculative tokens and --use-chat-template for serving."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER

From 75ed17089581d8cdf56679048057f66c908a9a2a Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Mon, 29 Jun 2026 21:18:08 +0000
Subject: [PATCH 2/4] fix: set perf-changelog pr-link for #1955

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index afff9af93c..e33dc5430b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4323,4 +4323,4 @@
   description:
     - "Enable AITER MoE on MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP benchmarks (MXFP4 and MXFP8): export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter."
     - "MXFP8 MTP also exports VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6. Mirrors the STP AITER MoE knobs from #1954 with three Inferact/MiniMax-M3-EAGLE3 speculative tokens and --use-chat-template for serving."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1955

From 811bd4e2605db0dff0d1e1339f7c7f2f3ce3a8be Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Mon, 29 Jun 2026 21:30:12 +0000
Subject: [PATCH 3/4] [AMD] pass --linear-backend emulation on MiniMax-M3 FP8
 MTP

Use the emulation linear backend for MXFP8 EAGLE3 serving on MI355X.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh       | 1 +
 perf-changelog.yaml                                             | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
index 281dbe297b..b8664a91f3 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
@@ -180,6 +180,7 @@ vllm serve "$MODEL" --port "$PORT" \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
     --kv-cache-dtype fp8 \
+    --linear-backend emulation \
     --attention-backend TRITON_ATTN \
     --moe-backend aiter \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e33dc5430b..91f0ebc24f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4322,5 +4322,5 @@
     - minimaxm3-fp8-mi355x-vllm-mtp
   description:
     - "Enable AITER MoE on MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP benchmarks (MXFP4 and MXFP8): export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter."
-    - "MXFP8 MTP also exports VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6. Mirrors the STP AITER MoE knobs from #1954 with three Inferact/MiniMax-M3-EAGLE3 speculative tokens and --use-chat-template for serving."
+    - "MXFP8 MTP also exports VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 and passes --linear-backend emulation. Mirrors the STP AITER MoE knobs from #1954 with three Inferact/MiniMax-M3-EAGLE3 speculative tokens and --use-chat-template for serving."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1955

From 1db8d73758caf612b9b6236901ab5c86f602e129 Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Mon, 29 Jun 2026 22:03:30 +0000
Subject: [PATCH 4/4] chore(amd): bump MiniMax-M3 MI355X vLLM to latest ROCm
 nightly

Pin nightly-4559c43a on all four MiniMax-M3 MI355X single-node vLLM
configs (FP8 and FP4, each with and without MTP) for AITER MoE,
shared-expert fusion, and FP8 linear-backend emulation support.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/configs/amd-master.yaml | 8 ++++----
 perf-changelog.yaml             | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f6166699aa..466c9f2d3f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2525,7 +2525,7 @@ dsv4-fp4-mi355x-atom-disagg:
 # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5
 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
 minimaxm3-fp8-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x
@@ -2562,7 +2562,7 @@ minimaxm3-fp8-mi355x-vllm:
 # acceptance dilutes in big batches, and the draft weights + draft KV shave
 # headroom — tp2-ep2 is dropped since its KV headroom was already thin.
 minimaxm3-fp8-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x
@@ -2641,7 +2641,7 @@ minimaxm3-fp4-mi355x-vllm-disagg:
 # language-model path and mirror the MXFP8 MI355X search space for a direct
 # precision comparison.
 minimaxm3-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
@@ -2672,7 +2672,7 @@ minimaxm3-fp4-mi355x-vllm:
 # tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base
 # FP4 sweep at extreme concurrency where speculative decoding loses value.
 minimaxm3-fp4-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 91f0ebc24f..e61f3ac315 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4323,4 +4323,5 @@
   description:
     - "Enable AITER MoE on MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP benchmarks (MXFP4 and MXFP8): export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter."
     - "MXFP8 MTP also exports VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 and passes --linear-backend emulation. Mirrors the STP AITER MoE knobs from #1954 with three Inferact/MiniMax-M3-EAGLE3 speculative tokens and --use-chat-template for serving."
+    - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) on all four MiniMax-M3 MI355X single-node vLLM configs."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1955