From 651a1b5fee639351d0b0226923e936d4a89b6cdb Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 2 Jul 2026 11:29:03 -0700 Subject: [PATCH 1/7] perf: update MiniMax-M3 FP4 B300 vLLM MTP --- .github/configs/nvidia-master.yaml | 22 ++++++++----------- .../fixed_seq_len/minimaxm3_fp4_b300_mtp.sh | 4 +++- perf-changelog.yaml | 7 ++++++ 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 54e73a4b5c..d71cdfd420 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12847,7 +12847,7 @@ minimaxm3-fp4-b300-vllm: # /scratch/models/MiniMax-M3-NVFP4 (added to the STAGED_MODELS allow-list in # launch_b300-nv.sh); the EAGLE3 draft is downloaded to the writable models dir. minimaxm3-fp4-b300-vllm-mtp: - image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41 + image: vllm/vllm-openai:nightly-93d8f834dd8acf33eb0e2a75b2711b628cb6e226 model: nvidia/MiniMax-M3-NVFP4 model-prefix: minimaxm3 runner: b300 @@ -12859,21 +12859,17 @@ minimaxm3-fp4-b300-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } + - { tp: 4, conc-start: 2, conc-end: 2, spec-decoding: mtp } + - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } + - { tp: 4, conc-start: 2, conc-end: 2, spec-decoding: mtp } + - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp } # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh index 74cbcd0202..60299c29fc 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh @@ -52,6 +52,7 @@ SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_FLOAT32_MATMUL_PRECISION=high +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel" @@ -73,8 +74,9 @@ start_gpu_monitor set -x vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port $PORT \ $PARALLEL_ARGS \ ---gpu-memory-utilization 0.90 \ +--gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ +--kv-cache-dtype fp8 \ --block-size 128 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 818092577f..64b67d1e06 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4426,3 +4426,10 @@ - "Use 1k/1k TP4/EP1 c1-c32 and TP4/EP4 c64-c256; use 8k/1k TP4/EP1 c1-c32 and TP4/EP4 DP-attention c64-c256." - "Drop the TP8/EP8 single-concurrency points." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1975 + +- config-keys: + - minimaxm3-fp4-b300-vllm-mtp + description: + - "Update Minimax M3 b300 vllm mtp image tag" + - "Update search space to cover more configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/tree/codex/minimax-m3-b300-fp4-vllm-mtp-update From dcaf629f5417d604a1cdffe3360bf46569b16422 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 2 Jul 2026 12:51:47 -0700 Subject: [PATCH 2/7] fix: disable fp8 kv cache for MiniMax M3 MTP --- benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh index 60299c29fc..103fbe7ade 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh @@ -76,7 +76,6 @@ vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port $POR $PARALLEL_ARGS \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---kv-cache-dtype fp8 \ --block-size 128 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ From a44fc870784b2ba9d5ff3a936dea6e002cbf718f Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 2 Jul 2026 13:19:33 -0700 Subject: [PATCH 3/7] matrix --- .github/configs/nvidia-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d71cdfd420..b493754110 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12860,16 +12860,16 @@ minimaxm3-fp4-b300-vllm-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } - - { tp: 4, conc-start: 2, conc-end: 2, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp } - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } - - { tp: 4, conc-start: 2, conc-end: 2, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp } - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 1024, spec-decoding: mtp } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp } # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint From b6d5f655f21b9414396d94b5f25ba337a219bd18 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 2 Jul 2026 14:41:17 -0700 Subject: [PATCH 4/7] memory --- benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh index 103fbe7ade..e74d9224cd 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh @@ -74,7 +74,7 @@ start_gpu_monitor set -x vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port $PORT \ $PARALLEL_ARGS \ ---gpu-memory-utilization 0.95 \ +--gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ --language-model-only \ From 3ca1694bf851f7d56cd9e1714670927559cfc879 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 2 Jul 2026 14:43:12 -0700 Subject: [PATCH 5/7] review --- .../single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh | 3 +-- configs/nvidia-master.yaml | 2 +- perf-changelog.yaml | 7 ++++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh index e74d9224cd..db87f3da1f 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh @@ -3,7 +3,7 @@ # MiniMax-M3 NVFP4 B300 single-node vLLM recipe with EAGLE3 speculative # decoding — same shape as minimaxm3_fp8_b300_mtp.sh but uses the # nvidia/MiniMax-M3-NVFP4 checkpoint. MiniMax-M3 modelopt NVFP4 support -# (vllm-project/vllm PR #46380) is baked into the perf container image, so no +# (vllm-project/vllm PR #46380) is baked into the nightly container image, so no # runtime patch is needed. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -52,7 +52,6 @@ SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_FLOAT32_MATMUL_PRECISION=high -export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel" diff --git a/configs/nvidia-master.yaml b/configs/nvidia-master.yaml index 912da6278c..5031e11895 100644 --- a/configs/nvidia-master.yaml +++ b/configs/nvidia-master.yaml @@ -12841,7 +12841,7 @@ minimaxm3-fp4-b300-vllm: # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of MiniMax-M3 NVFP4 # (nvidia/MiniMax-M3-NVFP4) B300 single-node vLLM, pairing the target with the # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). MiniMax-M3 -# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the perf +# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the nightly # container image, so no runtime patch is needed; prompts are routed through the # chat template. Target weights are pre-staged read-only at # /scratch/models/MiniMax-M3-NVFP4 (added to the STAGED_MODELS allow-list in diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 64b67d1e06..aaa9b3469f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4430,6 +4430,7 @@ - config-keys: - minimaxm3-fp4-b300-vllm-mtp description: - - "Update Minimax M3 b300 vllm mtp image tag" - - "Update search space to cover more configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/tree/codex/minimax-m3-b300-fp4-vllm-mtp-update + - "Switch the MiniMax-M3 B300 vLLM MTP recipe from the custom perf image to a pinned mainline nightly." + - "Refocus both 1k/1k and 8k/1k search spaces on TP2 c4-c256 and TP2/EP2 DP-attention c512, while retaining TP8 and TP4 c1-c2." + - "Drop the TP8/EP8 and TP4/EP4 lanes." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1991 From 0220c7fa84f5107be571e4d356edce96f59969af Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 2 Jul 2026 14:45:27 -0700 Subject: [PATCH 6/7] flashinfer --- benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh | 1 + perf-changelog.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh index db87f3da1f..b69a2522aa 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh @@ -52,6 +52,7 @@ SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_FLOAT32_MATMUL_PRECISION=high +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index aaa9b3469f..3619a515c0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4431,6 +4431,7 @@ - minimaxm3-fp4-b300-vllm-mtp description: - "Switch the MiniMax-M3 B300 vLLM MTP recipe from the custom perf image to a pinned mainline nightly." + - "Select the TRT-LLM backend when FlashInfer all-reduce is enabled." - "Refocus both 1k/1k and 8k/1k search spaces on TP2 c4-c256 and TP2/EP2 DP-attention c512, while retaining TP8 and TP4 c1-c2." - "Drop the TP8/EP8 and TP4/EP4 lanes." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1991 From 0937baa0d3f0272e17f6b5aa41f79bebc8aeec5f Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 2 Jul 2026 17:07:20 -0700 Subject: [PATCH 7/7] matrix --- configs/nvidia-master.yaml | 13 +++++++------ perf-changelog.yaml | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/configs/nvidia-master.yaml b/configs/nvidia-master.yaml index ca782d7078..199e3a9024 100644 --- a/configs/nvidia-master.yaml +++ b/configs/nvidia-master.yaml @@ -12856,16 +12856,17 @@ minimaxm3-fp4-b300-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp } - - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 4, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp } - - { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 8, spec-decoding: mtp } + - { tp: 2, conc-start: 8, conc-end: 256, spec-decoding: mtp } - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp } # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 641794615f..4b6f0adc51 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4446,6 +4446,6 @@ description: - "Switch the MiniMax-M3 B300 vLLM MTP recipe from the custom perf image to a pinned mainline nightly." - "Select the TRT-LLM backend when FlashInfer all-reduce is enabled." - - "Refocus both 1k/1k and 8k/1k search spaces on TP2 c4-c256 and TP2/EP2 DP-attention c512, while retaining TP8 and TP4 c1-c2." - - "Drop the TP8/EP8 and TP4/EP4 lanes." + - "For 1k/1k, retain TP8/EP1 c1-c4 and TP8/EP8 c128-c256 bridge lanes, TP4/EP1 c1-c64 and TP4/EP4 c64-c256, and limit TP2/EP1 to c128-c256 plus TP2/EP2 DP-attention c512." + - "For 8k/1k, use TP4/EP1 c1-c8 and TP2/EP1 c8-c256 plus TP2/EP2 DP-attention c512; drop TP8." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1991