SemiAnalysisAI · anish-shanbhag · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
@@ -3,7 +3,7 @@
 # MiniMax-M3 NVFP4 B300 single-node vLLM recipe with EAGLE3 speculative
 # decoding — same shape as minimaxm3_fp8_b300_mtp.sh but uses the
 # nvidia/MiniMax-M3-NVFP4 checkpoint. MiniMax-M3 modelopt NVFP4 support
-# (vllm-project/vllm PR #46380) is baked into the perf container image, so no
+# (vllm-project/vllm PR #46380) is baked into the nightly container image, so no
 # runtime patch is needed.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
@@ -52,6 +52,7 @@ SERVER_LOG=/workspace/server.log
 
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_FLOAT32_MATMUL_PRECISION=high
+export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm
 
 if [ "${DP_ATTENTION}" = "true" ]; then
   PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"

@@ -12838,13 +12838,13 @@ minimaxm3-fp4-b300-vllm:
 # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of MiniMax-M3 NVFP4
 # (nvidia/MiniMax-M3-NVFP4) B300 single-node vLLM, pairing the target with the
 # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). MiniMax-M3
-# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the perf
+# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the nightly
 # container image, so no runtime patch is needed; prompts are routed through the
 # chat template. Target weights are pre-staged read-only at
 # /scratch/models/MiniMax-M3-NVFP4 (added to the STAGED_MODELS allow-list in
 # launch_b300-nv.sh); the EAGLE3 draft is downloaded to the writable models dir.
 minimaxm3-fp4-b300-vllm-mtp:
-  image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41
+  image: vllm/vllm-openai:nightly-93d8f834dd8acf33eb0e2a75b2711b628cb6e226
   model: nvidia/MiniMax-M3-NVFP4
   model-prefix: minimaxm3
   runner: b300
@@ -12856,21 +12856,18 @@ minimaxm3-fp4-b300-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 1, conc-end: 4, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp }
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
       - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
+      - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+      - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+      - { tp: 2, conc-start: 8, conc-end: 256, spec-decoding: mtp }
+      - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp }
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -4440,3 +4440,12 @@
     - "Update Minimax M3 b200 vllm image tag"
     - "Update search space to cover more configs"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1978
+
+- config-keys:
+    - minimaxm3-fp4-b300-vllm-mtp
+  description:
+    - "Switch the MiniMax-M3 B300 vLLM MTP recipe from the custom perf image to a pinned mainline nightly."
+    - "Set VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm so the TRT-LLM backend is selected when FlashInfer all-reduce is enabled."
+    - "For 1k/1k, narrow TP8/EP1 to c1-c4 and TP8/EP8 to c128-c256 as bridge lanes, keep TP4/EP1 c1-c64 and TP4/EP4 c64-c256 unchanged, and replace the TP4/EP4 and TP8/EP8 DP-attention lanes with TP2/EP1 c128-c256 plus TP2/EP2 DP-attention c512."
+    - "For 8k/1k, use TP4/EP1 c1-c8 and TP2/EP1 c8-c256 plus TP2/EP2 DP-attention c512; drop all TP8 lanes and the TP4/EP4 lanes (including TP4/EP4 DP-attention)."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1991