diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh index 74cbcd020..b69a2522a 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh @@ -3,7 +3,7 @@ # MiniMax-M3 NVFP4 B300 single-node vLLM recipe with EAGLE3 speculative # decoding — same shape as minimaxm3_fp8_b300_mtp.sh but uses the # nvidia/MiniMax-M3-NVFP4 checkpoint. MiniMax-M3 modelopt NVFP4 support -# (vllm-project/vllm PR #46380) is baked into the perf container image, so no +# (vllm-project/vllm PR #46380) is baked into the nightly container image, so no # runtime patch is needed. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -52,6 +52,7 @@ SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_FLOAT32_MATMUL_PRECISION=high +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel" diff --git a/configs/nvidia-master.yaml b/configs/nvidia-master.yaml index a23ebe349..199e3a902 100644 --- a/configs/nvidia-master.yaml +++ b/configs/nvidia-master.yaml @@ -12838,13 +12838,13 @@ minimaxm3-fp4-b300-vllm: # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of MiniMax-M3 NVFP4 # (nvidia/MiniMax-M3-NVFP4) B300 single-node vLLM, pairing the target with the # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). MiniMax-M3 -# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the perf +# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the nightly # container image, so no runtime patch is needed; prompts are routed through the # chat template. Target weights are pre-staged read-only at # /scratch/models/MiniMax-M3-NVFP4 (added to the STAGED_MODELS allow-list in # launch_b300-nv.sh); the EAGLE3 draft is downloaded to the writable models dir. minimaxm3-fp4-b300-vllm-mtp: - image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41 + image: vllm/vllm-openai:nightly-93d8f834dd8acf33eb0e2a75b2711b628cb6e226 model: nvidia/MiniMax-M3-NVFP4 model-prefix: minimaxm3 runner: b300 @@ -12856,21 +12856,18 @@ minimaxm3-fp4-b300-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 4, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } + - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 8, spec-decoding: mtp } + - { tp: 2, conc-start: 8, conc-end: 256, spec-decoding: mtp } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512, spec-decoding: mtp } # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 943f4b390..4b6f0adc5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4440,3 +4440,12 @@ - "Update Minimax M3 b200 vllm image tag" - "Update search space to cover more configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1978 + +- config-keys: + - minimaxm3-fp4-b300-vllm-mtp + description: + - "Switch the MiniMax-M3 B300 vLLM MTP recipe from the custom perf image to a pinned mainline nightly." + - "Select the TRT-LLM backend when FlashInfer all-reduce is enabled." + - "For 1k/1k, retain TP8/EP1 c1-c4 and TP8/EP8 c128-c256 bridge lanes, TP4/EP1 c1-c64 and TP4/EP4 c64-c256, and limit TP2/EP1 to c128-c256 plus TP2/EP2 DP-attention c512." + - "For 8k/1k, use TP4/EP1 c1-c8 and TP2/EP1 c8-c256 plus TP2/EP2 DP-attention c512; drop TP8." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1991