diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f6166699aa..466c9f2d3f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2525,7 +2525,7 @@ dsv4-fp4-mi355x-atom-disagg: # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA. minimaxm3-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2562,7 +2562,7 @@ minimaxm3-fp8-mi355x-vllm: # acceptance dilutes in big batches, and the draft weights + draft KV shave # headroom — tp2-ep2 is dropped since its KV headroom was already thin. minimaxm3-fp8-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2641,7 +2641,7 @@ minimaxm3-fp4-mi355x-vllm-disagg: # language-model path and mirror the MXFP8 MI355X search space for a direct # precision comparison. minimaxm3-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x @@ -2672,7 +2672,7 @@ minimaxm3-fp4-mi355x-vllm: # tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base # FP4 sweep at extreme concurrency where speculative decoding loses value. minimaxm3-fp4-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh index 96a5604934..69ba51f4f0 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh @@ -5,6 +5,7 @@ # minimaxm3_fp4_mi355x_vllm.sh and uses three speculative tokens from # Inferact/MiniMax-M3-EAGLE3. The pinned nightly includes upstream AMD # MiniMax-M3 SupportsEagle3 support, so no runtime model patch is needed. +# MoE serving mirrors minimaxm3_fp4_mi355x_vllm.sh (AITER MoE, vllm#46419). source "$(dirname "$0")/../../benchmark_lib.sh" @@ -36,6 +37,9 @@ fi SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MOE=1 +export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context @@ -65,6 +69,7 @@ vllm serve "$MODEL" --port "$PORT" \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --attention-backend TRITON_ATTN \ + --moe-backend aiter \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --tool-call-parser minimax_m3 \ --enable-auto-tool-choice \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh index 757d54786f..b8664a91f3 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh @@ -61,6 +61,10 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600 # Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0 # avoids the M3-decode breakable-cudagraph path that previously forced eager. export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MOE=1 +export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context @@ -176,7 +180,9 @@ vllm serve "$MODEL" --port "$PORT" \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --kv-cache-dtype fp8 \ + --linear-backend emulation \ --attention-backend TRITON_ATTN \ + --moe-backend aiter \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --tool-call-parser minimax_m3 \ --reasoning-parser minimax_m3 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5d8427dffd..e61f3ac315 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4316,3 +4316,12 @@ description: - "Update the DeepSeek-V4-Pro B300 disaggregated Dynamo-vLLM benchmark to the vllm/vllm-openai:v0.23.0 image" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1952 + +- config-keys: + - minimaxm3-fp4-mi355x-vllm-mtp + - minimaxm3-fp8-mi355x-vllm-mtp + description: + - "Enable AITER MoE on MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP benchmarks (MXFP4 and MXFP8): export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter." + - "MXFP8 MTP also exports VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 and passes --linear-backend emulation. Mirrors the STP AITER MoE knobs from #1954 with three Inferact/MiniMax-M3-EAGLE3 speculative tokens and --use-chat-template for serving." + - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) on all four MiniMax-M3 MI355X single-node vLLM configs." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1955