SemiAnalysisAI · hongxiayang · Jul 3, 2026 · Jul 3, 2026 · Jul 3, 2026 · Jul 4, 2026
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
@@ -32,11 +32,7 @@ SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 # MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus
-# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The
-# fusion checks this env directly and runs on both the aiter and native MXFP8
-# MoE paths (it is independent of the AITER master switch, and self-disables
-# under expert parallelism inside the model), so enable it unconditionally.
-# (The AITER master switch itself is set below, gated on expert parallelism.)
+# the router-append shared-experts MoE fusion (vllm-project/vllm#46545).
 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6
 
@@ -55,16 +51,15 @@ elif [ "$EP_SIZE" -gt 1 ]; then
     PARALLEL_ARGS+=(--enable-expert-parallel)
 fi
 
-# Gate the AITER master switch on expert parallelism. With EP, the aiter fused
-# MoE path is the auto-selected backend (no --moe-backend override). With EP
-# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3
-# output, so leave it off and fall back to the native MXFP8 path (the
-# shared-experts fusion set above still applies — it is master-independent).
-if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then
-    export VLLM_ROCM_USE_AITER=1
-else
-    export VLLM_ROCM_USE_AITER=0
-fi
+# Previously, TP-only runs (EP off) needed VLLM_ROCM_USE_AITER=0 because the
+# AITER master switch produced degenerate MiniMax-M3 output. After
+# https://github.com/vllm-project/vllm/pull/47158 it can simply be set to 1.
+# As the configs are now TP-only, the conditional check is removed.
+export VLLM_ROCM_USE_AITER=1
+
+# Larger per-step prefill token budget to improve TP4 throughput at high
+# concurrency. Overridable via env.
+MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"
 
 start_gpu_monitor
 
@@ -74,9 +69,12 @@ vllm serve "$MODEL" --port "$PORT" \
     --block-size 128 \
     --no-enable-prefix-caching \
     --language-model-only \
+    --moe-backend aiter \
     --max-model-len "$MAX_MODEL_LEN" \
+    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
     --kv-cache-dtype fp8 \
     --attention-backend TRITON_ATTN \
+    --linear-backend emulation \
     --tool-call-parser minimax_m3 \
     --reasoning-parser minimax_m3 \
     --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
@@ -38,7 +38,7 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
-DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"
+DRAFT_MODEL="${DRAFT_MODEL:-Inferact/MiniMax-M3-EAGLE3}"
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -62,11 +62,7 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600
 # avoids the M3-decode breakable-cudagraph path that previously forced eager.
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 # MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus
-# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The
-# fusion checks this env directly and runs on both the aiter and native MXFP8
-# MoE paths (it is independent of the AITER master switch, and self-disables
-# under expert parallelism inside the model), so enable it unconditionally.
-# (The AITER master switch itself is set below, gated on expert parallelism.)
+# the router-append shared-experts MoE fusion (vllm-project/vllm#46545).
 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6
 
@@ -85,20 +81,22 @@ elif [ "$EP_SIZE" -gt 1 ]; then
     PARALLEL_ARGS+=(--enable-expert-parallel)
 fi
 
-# Gate the AITER master switch on expert parallelism. With EP, the aiter fused
-# MoE path is the auto-selected backend (no --moe-backend override). With EP
-# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3
-# output, so leave it off and fall back to the native MXFP8 path (the
-# shared-experts fusion set above still applies — it is master-independent).
+# Gate the AITER master switch on expert parallelism. With EP enabled, the
+# AITER master switch produces degenerate MiniMax-M3 output, so leave it off;
+# TP-only runs turn it on.
 if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then
-    export VLLM_ROCM_USE_AITER=1
-else
     export VLLM_ROCM_USE_AITER=0
+else
+    export VLLM_ROCM_USE_AITER=1
 fi
 
 # use 3 speculative tokens for all configs for now
 NUM_SPEC_TOKENS=3
 
+# Larger per-step prefill token budget to improve TP4 throughput at high
+# concurrency. Overridable via env.
+MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"
+
 # [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the
 # SupportsEagle3 interface (functionstackx/vllm#1). Mirrors nvidia/model.py:
 # adds EagleModelMixin to the inner model + aux-hidden-state emission, and
@@ -193,9 +191,12 @@ vllm serve "$MODEL" --port "$PORT" \
     --block-size 128 \
     --no-enable-prefix-caching \
     --language-model-only \
+    --moe-backend aiter \
     --max-model-len "$MAX_MODEL_LEN" \
+    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
     --kv-cache-dtype fp8 \
     --attention-backend TRITON_ATTN \
+    --linear-backend emulation \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --reasoning-parser minimax_m3 \

@@ -2475,7 +2475,7 @@ dsv4-fp4-mi355x-atom-disagg:
 # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5
 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
 minimaxm3-fp8-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
+  image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x
@@ -2487,14 +2487,11 @@ minimaxm3-fp8-mi355x-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 32 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
+      - { tp: 4, conc-start: 1, conc-end: 512 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 2 }
-      - { tp: 4, conc-start: 2, conc-end: 128 }
+      - { tp: 4, conc-start: 1, conc-end: 512 }
 
 # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
 # minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
@@ -2507,7 +2504,7 @@ minimaxm3-fp8-mi355x-vllm:
 # acceptance dilutes in big batches, and the draft weights + draft KV shave
 # headroom — tp2-ep2 is dropped since its KV headroom was already thin.
 minimaxm3-fp8-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
+  image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x
@@ -2519,17 +2516,11 @@ minimaxm3-fp8-mi355x-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp }
-      - { tp: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 512, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
-      - { tp: 4, conc-start: 2, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, conc-start: 1, conc-end: 1, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 512, spec-decoding: mtp }
 
 # MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config.
 minimaxm3-fp4-mi355x-vllm-disagg:

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -4433,3 +4433,21 @@
     - "Add --online_quant_config with ptpc_fp8 and MoE layer exclusions (*block_sparse_moe) to all scripts."
     - "Replace deprecated AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 and ATOM_M3_SPARSE_USE_ASM_PA=1 with ATOM_FORCE_ATTN_TRITON=1."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2001
+
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm
+  description:
+    - "Bump the MiniMax-M3 MXFP8 MI355X vLLM image to nightly-09663abde0f50944a8d5ea30120666024b503faa"
+    - "Use --linear-backend emulation for the MXFP8 dense-linear path (beats the stock nightly native MXFP8 linear: ~+26% tput / -21% TPOT at 8k1k conc1, ~+2-3% at high concurrency)"
+    - "Add --max-num-batched-tokens 32768 (env MAX_NUM_BATCHED_TOKENS) to enlarge the per-step prefill budget and improve TP4 throughput at high concurrency"
+    - "Enable the AITER master switch (VLLM_ROCM_USE_AITER=1) for TP-only (no-EP) runs and select --moe-backend aiter: the earlier degenerate-output issue that forced it off for TP-only is fixed by vllm-project/vllm#47158, so TP4 uses the AITER_MXFP8 MoE path (verified GSM8K 0.9613 flex / 0.9621 strict on this nightly)"
+    - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4: TP8 has poor throughput/GPU and plain TP4 matches or beats TP4/EP4 at high concurrency)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003
+
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm-mtp
+  description:
+    - "Bump the MiniMax-M3 MXFP8 MI355X vLLM MTP (EAGLE3) image to nightly-09663abde0f50944a8d5ea30120666024b503faa, which natively supports SupportsEagle3 (the in-place EAGLE3 patch is now a no-op) and carries vllm-project/vllm#47158"
+    - "Port the non-MTP serve-command tuning to the MTP recipe: --moe-backend aiter, --linear-backend emulation, --max-num-batched-tokens 32768, and the AITER master switch on for TP-only runs (kept --speculative-config eagle3 with 3 draft tokens)"
+    - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4, matching the non-MTP entry; verified locally on this nightly at TP4 conc512, 5120/5120 completed)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003