From 651a1b5fee639351d0b0226923e936d4a89b6cdb Mon Sep 17 00:00:00 2001
From: Anish Shanbhag <ashanbhag@nvidia.com>
Date: Thu, 2 Jul 2026 11:29:03 -0700
Subject: [PATCH 1/7] perf: update MiniMax-M3 FP4 B300 vLLM MTP

---
 .github/configs/nvidia-master.yaml            | 22 ++++++++-----------
 .../fixed_seq_len/minimaxm3_fp4_b300_mtp.sh   |  4 +++-
 perf-changelog.yaml                           |  7 ++++++
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 54e73a4b5c..d71cdfd420 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12847,7 +12847,7 @@ minimaxm3-fp4-b300-vllm:
 # /scratch/models/MiniMax-M3-NVFP4 (added to the STAGED_MODELS allow-list in
 # launch_b300-nv.sh); the EAGLE3 draft is downloaded to the writable models dir.
 minimaxm3-fp4-b300-vllm-mtp:
-  image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41
+  image: vllm/vllm-openai:nightly-93d8f834dd8acf33eb0e2a75b2711b628cb6e226
   model: nvidia/MiniMax-M3-NVFP4
   model-prefix: minimaxm3
   runner: b300
@@ -12859,21 +12859,17 @@ minimaxm3-fp4-b300-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
+      - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp }
+      - { tp: 4, conc-start: 2, conc-end: 2, spec-decoding: mtp }
+      - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp }
+      - { tp: 4, conc-start: 2, conc-end: 2, spec-decoding: mtp }
+      - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp }
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
index 74cbcd0202..60299c29fc 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
@@ -52,6 +52,7 @@ SERVER_LOG=/workspace/server.log
 
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_FLOAT32_MATMUL_PRECISION=high
+export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm
 
 if [ "${DP_ATTENTION}" = "true" ]; then
   PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
@@ -73,8 +74,9 @@ start_gpu_monitor
 set -x
 vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port $PORT \
 $PARALLEL_ARGS \
---gpu-memory-utilization 0.90 \
+--gpu-memory-utilization 0.95 \
 --max-model-len $MAX_MODEL_LEN \
+--kv-cache-dtype fp8 \
 --block-size 128 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 818092577f..64b67d1e06 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4426,3 +4426,10 @@
     - "Use 1k/1k TP4/EP1 c1-c32 and TP4/EP4 c64-c256; use 8k/1k TP4/EP1 c1-c32 and TP4/EP4 DP-attention c64-c256."
     - "Drop the TP8/EP8 single-concurrency points."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1975
+
+- config-keys:
+    - minimaxm3-fp4-b300-vllm-mtp
+  description:
+    - "Update Minimax M3 b300 vllm mtp image tag"
+    - "Update search space to cover more configs"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/tree/codex/minimax-m3-b300-fp4-vllm-mtp-update

From dcaf629f5417d604a1cdffe3360bf46569b16422 Mon Sep 17 00:00:00 2001
From: Anish Shanbhag <ashanbhag@nvidia.com>
Date: Thu, 2 Jul 2026 12:51:47 -0700
Subject: [PATCH 2/7] fix: disable fp8 kv cache for MiniMax M3 MTP

---
 benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
index 60299c29fc..103fbe7ade 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
@@ -76,7 +76,6 @@ vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port $POR
 $PARALLEL_ARGS \
 --gpu-memory-utilization 0.95 \
 --max-model-len $MAX_MODEL_LEN \
---kv-cache-dtype fp8 \
 --block-size 128 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \

From a44fc870784b2ba9d5ff3a936dea6e002cbf718f Mon Sep 17 00:00:00 2001
From: Anish Shanbhag <ashanbhag@nvidia.com>
Date: Thu, 2 Jul 2026 13:19:33 -0700
Subject: [PATCH 3/7] perf: start TP4 at c1 and cap DP-attention at c512 for MiniMax M3 MTP

---
 .github/configs/nvidia-master.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index d71cdfd420..b493754110 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12860,16 +12860,16 @@ minimaxm3-fp4-b300-vllm-mtp:
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp }
-      - { tp: 4, conc-start: 2, conc-end: 2, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp }
       - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-      - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp }
+      - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp }
-      - { tp: 4, conc-start: 2, conc-end: 2, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp }
       - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-      - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp }
+      - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp }
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint

From b6d5f655f21b9414396d94b5f25ba337a219bd18 Mon Sep 17 00:00:00 2001
From: Anish Shanbhag <ashanbhag@nvidia.com>
Date: Thu, 2 Jul 2026 14:41:17 -0700
Subject: [PATCH 4/7] perf: lower gpu-memory-utilization back to 0.90 for MiniMax M3 MTP

---
 benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
index 103fbe7ade..e74d9224cd 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
@@ -74,7 +74,7 @@ start_gpu_monitor
 set -x
 vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port $PORT \
 $PARALLEL_ARGS \
---gpu-memory-utilization 0.95 \
+--gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
 --language-model-only \

From 3ca1694bf851f7d56cd9e1714670927559cfc879 Mon Sep 17 00:00:00 2001
From: Anish Shanbhag <ashanbhag@nvidia.com>
Date: Thu, 2 Jul 2026 14:43:12 -0700
Subject: [PATCH 5/7] chore: address review on MiniMax M3 MTP comments, env, and changelog

---
 .../single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh    | 3 +--
 configs/nvidia-master.yaml                                 | 2 +-
 perf-changelog.yaml                                        | 7 ++++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
index e74d9224cd..db87f3da1f 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
@@ -3,7 +3,7 @@
 # MiniMax-M3 NVFP4 B300 single-node vLLM recipe with EAGLE3 speculative
 # decoding — same shape as minimaxm3_fp8_b300_mtp.sh but uses the
 # nvidia/MiniMax-M3-NVFP4 checkpoint. MiniMax-M3 modelopt NVFP4 support
-# (vllm-project/vllm PR #46380) is baked into the perf container image, so no
+# (vllm-project/vllm PR #46380) is baked into the nightly container image, so no
 # runtime patch is needed.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
@@ -52,7 +52,6 @@ SERVER_LOG=/workspace/server.log
 
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_FLOAT32_MATMUL_PRECISION=high
-export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm
 
 if [ "${DP_ATTENTION}" = "true" ]; then
   PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
diff --git a/configs/nvidia-master.yaml b/configs/nvidia-master.yaml
index 912da6278c..5031e11895 100644
--- a/configs/nvidia-master.yaml
+++ b/configs/nvidia-master.yaml
@@ -12841,7 +12841,7 @@ minimaxm3-fp4-b300-vllm:
 # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of MiniMax-M3 NVFP4
 # (nvidia/MiniMax-M3-NVFP4) B300 single-node vLLM, pairing the target with the
 # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). MiniMax-M3
-# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the perf
+# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the nightly
 # container image, so no runtime patch is needed; prompts are routed through the
 # chat template. Target weights are pre-staged read-only at
 # /scratch/models/MiniMax-M3-NVFP4 (added to the STAGED_MODELS allow-list in
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 64b67d1e06..aaa9b3469f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4430,6 +4430,7 @@
 - config-keys:
     - minimaxm3-fp4-b300-vllm-mtp
   description:
-    - "Update Minimax M3 b300 vllm mtp image tag"
-    - "Update search space to cover more configs"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/tree/codex/minimax-m3-b300-fp4-vllm-mtp-update
+    - "Switch the MiniMax-M3 B300 vLLM MTP recipe from the custom perf image to a pinned mainline nightly."
+    - "Refocus both 1k/1k and 8k/1k search spaces on TP2 c4-c256 and TP2/EP2 DP-attention c512, while retaining TP8 and TP4 c1-c2."
+    - "Drop the TP8/EP8 and TP4/EP4 lanes."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1991

From 0220c7fa84f5107be571e4d356edce96f59969af Mon Sep 17 00:00:00 2001
From: Anish Shanbhag <ashanbhag@nvidia.com>
Date: Thu, 2 Jul 2026 14:45:27 -0700
Subject: [PATCH 6/7] perf: set VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm for MiniMax M3 MTP

---
 benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh | 1 +
 perf-changelog.yaml                                            | 1 +
 2 files changed, 2 insertions(+)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
index db87f3da1f..b69a2522aa 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
@@ -52,6 +52,7 @@ SERVER_LOG=/workspace/server.log
 
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_FLOAT32_MATMUL_PRECISION=high
+export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm
 
 if [ "${DP_ATTENTION}" = "true" ]; then
   PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index aaa9b3469f..3619a515c0 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4431,6 +4431,7 @@
     - minimaxm3-fp4-b300-vllm-mtp
   description:
     - "Switch the MiniMax-M3 B300 vLLM MTP recipe from the custom perf image to a pinned mainline nightly."
+    - "Select the TRT-LLM backend when FlashInfer all-reduce is enabled."
     - "Refocus both 1k/1k and 8k/1k search spaces on TP2 c4-c256 and TP2/EP2 DP-attention c512, while retaining TP8 and TP4 c1-c2."
     - "Drop the TP8/EP8 and TP4/EP4 lanes."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1991

From 0937baa0d3f0272e17f6b5aa41f79bebc8aeec5f Mon Sep 17 00:00:00 2001
From: Anish Shanbhag <ashanbhag@nvidia.com>
Date: Thu, 2 Jul 2026 17:07:20 -0700
Subject: [PATCH 7/7] perf: rework MiniMax M3 MTP search space; drop TP8 from 8k/1k

---
 configs/nvidia-master.yaml | 13 +++++++------
 perf-changelog.yaml        |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/configs/nvidia-master.yaml b/configs/nvidia-master.yaml
index ca782d7078..199e3a9024 100644
--- a/configs/nvidia-master.yaml
+++ b/configs/nvidia-master.yaml
@@ -12856,16 +12856,17 @@ minimaxm3-fp4-b300-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp }
-      - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 1, conc-end: 4, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+      - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp }
       - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp }
-      - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+      - { tp: 2, conc-start: 8, conc-end: 256, spec-decoding: mtp }
       - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp }
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 641794615f..4b6f0adc51 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4446,6 +4446,6 @@
   description:
     - "Switch the MiniMax-M3 B300 vLLM MTP recipe from the custom perf image to a pinned mainline nightly."
     - "Select the TRT-LLM backend when FlashInfer all-reduce is enabled."
-    - "Refocus both 1k/1k and 8k/1k search spaces on TP2 c4-c256 and TP2/EP2 DP-attention c512, while retaining TP8 and TP4 c1-c2."
-    - "Drop the TP8/EP8 and TP4/EP4 lanes."
+    - "For 1k/1k, retain TP8/EP1 c1-c4 and TP8/EP8 c128-c256 bridge lanes, TP4/EP1 c1-c64 and TP4/EP4 c64-c256, and limit TP2/EP1 to c128-c256 plus TP2/EP2 DP-attention c512."
+    - "For 8k/1k, use TP4/EP1 c1-c8 and TP2/EP1 c8-c256 plus TP2/EP2 DP-attention c512; drop TP8."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1991