Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# MiniMax-M3 NVFP4 B300 single-node vLLM recipe with EAGLE3 speculative
# decoding — same shape as minimaxm3_fp8_b300_mtp.sh but uses the
# nvidia/MiniMax-M3-NVFP4 checkpoint. MiniMax-M3 modelopt NVFP4 support
# (vllm-project/vllm PR #46380) is baked into the perf container image, so no
# (vllm-project/vllm PR #46380) is baked into the nightly container image, so no
# runtime patch is needed.

source "$(dirname "$0")/../../benchmark_lib.sh"
Expand Down Expand Up @@ -52,6 +52,7 @@ SERVER_LOG=/workspace/server.log

export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_FLOAT32_MATMUL_PRECISION=high
export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm

if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
Expand Down
21 changes: 9 additions & 12 deletions configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12838,13 +12838,13 @@ minimaxm3-fp4-b300-vllm:
# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of MiniMax-M3 NVFP4
# (nvidia/MiniMax-M3-NVFP4) B300 single-node vLLM, pairing the target with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). MiniMax-M3
# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the perf
# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the nightly
# container image, so no runtime patch is needed; prompts are routed through the
# chat template. Target weights are pre-staged read-only at
# /scratch/models/MiniMax-M3-NVFP4 (added to the STAGED_MODELS allow-list in
# launch_b300-nv.sh); the EAGLE3 draft is downloaded to the writable models dir.
minimaxm3-fp4-b300-vllm-mtp:
image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41
image: vllm/vllm-openai:nightly-93d8f834dd8acf33eb0e2a75b2711b628cb6e226
Comment thread
anish-shanbhag marked this conversation as resolved.
model: nvidia/MiniMax-M3-NVFP4
model-prefix: minimaxm3
runner: b300
Expand All @@ -12856,21 +12856,18 @@ minimaxm3-fp4-b300-vllm-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 8, conc-start: 1, conc-end: 4, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
- { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 8, spec-decoding: mtp }
- { tp: 2, conc-start: 8, conc-end: 256, spec-decoding: mtp }
- { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp }

# MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4440,3 +4440,12 @@
- "Update Minimax M3 b200 vllm image tag"
- "Update search space to cover more configs"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1978

- config-keys:
- minimaxm3-fp4-b300-vllm-mtp
description:
- "Switch the MiniMax-M3 B300 vLLM MTP recipe from the custom perf image to a pinned mainline nightly."
- "Select the TRT-LLM backend when FlashInfer all-reduce is enabled."
- "For 1k/1k, retain TP8/EP1 c1-c4 and TP8/EP8 c128-c256 bridge lanes, TP4/EP1 c1-c64 and TP4/EP4 c64-c256, and limit TP2/EP1 to c128-c256 plus TP2/EP2 DP-attention c512."
- "For 8k/1k, use TP4/EP1 c1-c8 and TP2/EP1 c8-c256 plus TP2/EP2 DP-attention c512; drop TP8."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1991