Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 13 additions & 15 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,7 @@ SERVER_LOG=/workspace/server.log
export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_USE_BREAKABLE_CUDAGRAPH=0
# MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus
# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The
# fusion checks this env directly and runs on both the aiter and native MXFP8
# MoE paths (it is independent of the AITER master switch, and self-disables
# under expert parallelism inside the model), so enable it unconditionally.
# (The AITER master switch itself is set below, gated on expert parallelism.)
# the router-append shared-experts MoE fusion (vllm-project/vllm#46545).
export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6

Expand All @@ -55,16 +51,15 @@ elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS+=(--enable-expert-parallel)
fi

# Gate the AITER master switch on expert parallelism. With EP, the aiter fused
# MoE path is the auto-selected backend (no --moe-backend override). With EP
# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3
# output, so leave it off and fall back to the native MXFP8 path (the
# shared-experts fusion set above still applies — it is master-independent).
if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then
export VLLM_ROCM_USE_AITER=1
else
export VLLM_ROCM_USE_AITER=0
fi
# Previously, VLLM_ROCM_USE_AITER had to be off when EP was on.
# Since https://github.com/vllm-project/vllm/pull/47158,
# it can simply be set to VLLM_ROCM_USE_AITER=1.
# Because the configs are TP-only, the conditional check is removed.
export VLLM_ROCM_USE_AITER=1

# Larger per-step prefill token budget to improve TP4 throughput at high
# concurrency. Overridable via env.
MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"

start_gpu_monitor

Expand All @@ -74,9 +69,12 @@ vllm serve "$MODEL" --port "$PORT" \
--block-size 128 \
--no-enable-prefix-caching \
--language-model-only \
--moe-backend aiter \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--kv-cache-dtype fp8 \
--attention-backend TRITON_ATTN \
--linear-backend emulation \
--tool-call-parser minimax_m3 \
--reasoning-parser minimax_m3 \
--enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &
Expand Down
27 changes: 14 additions & 13 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ check_env_vars \
RANDOM_RANGE_RATIO \
RESULT_FILENAME

DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"
DRAFT_MODEL="${DRAFT_MODEL:-Inferact/MiniMax-M3-EAGLE3}"

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
Expand All @@ -62,11 +62,7 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600
# avoids the M3-decode breakable-cudagraph path that previously forced eager.
export VLLM_USE_BREAKABLE_CUDAGRAPH=0
# MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus
# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The
# fusion checks this env directly and runs on both the aiter and native MXFP8
# MoE paths (it is independent of the AITER master switch, and self-disables
# under expert parallelism inside the model), so enable it unconditionally.
# (The AITER master switch itself is set below, gated on expert parallelism.)
# the router-append shared-experts MoE fusion (vllm-project/vllm#46545).
export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6

Expand All @@ -85,20 +81,22 @@ elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS+=(--enable-expert-parallel)
fi

# Gate the AITER master switch on expert parallelism. With EP, the aiter fused
# MoE path is the auto-selected backend (no --moe-backend override). With EP
# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3
# output, so leave it off and fall back to the native MXFP8 path (the
# shared-experts fusion set above still applies — it is master-independent).
# Gate the AITER master switch on expert parallelism. With EP,
# the AITER master switch produces degenerate MiniMax-M3
# output, so leave it off.
if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then
export VLLM_ROCM_USE_AITER=1
else
export VLLM_ROCM_USE_AITER=0
else
export VLLM_ROCM_USE_AITER=1
fi

# use 3 speculative tokens for all configs for now
NUM_SPEC_TOKENS=3

# Larger per-step prefill token budget to improve TP4 throughput at high
# concurrency. Overridable via env.
MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"

# [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the
# SupportsEagle3 interface (functionstackx/vllm#1). Mirrors nvidia/model.py:
# adds EagleModelMixin to the inner model + aux-hidden-state emission, and
Expand Down Expand Up @@ -193,9 +191,12 @@ vllm serve "$MODEL" --port "$PORT" \
--block-size 128 \
--no-enable-prefix-caching \
--language-model-only \
--moe-backend aiter \
--max-model-len "$MAX_MODEL_LEN" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--kv-cache-dtype fp8 \
--attention-backend TRITON_ATTN \
--linear-backend emulation \
--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
--tool-call-parser minimax_m3 \
--reasoning-parser minimax_m3 \
Expand Down
21 changes: 6 additions & 15 deletions configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2475,7 +2475,7 @@ dsv4-fp4-mi355x-atom-disagg:
# https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5
# MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
minimaxm3-fp8-mi355x-vllm:
image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi355x
Expand All @@ -2487,14 +2487,11 @@ minimaxm3-fp8-mi355x-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
- { tp: 4, conc-start: 1, conc-end: 512 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 2 }
- { tp: 4, conc-start: 2, conc-end: 128 }
- { tp: 4, conc-start: 1, conc-end: 512 }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
Expand All @@ -2507,7 +2504,7 @@ minimaxm3-fp8-mi355x-vllm:
# acceptance dilutes in big batches, and the draft weights + draft KV shave
# headroom — tp2-ep2 is dropped since its KV headroom was already thin.
minimaxm3-fp8-mi355x-vllm-mtp:
image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi355x
Expand All @@ -2519,17 +2516,11 @@ minimaxm3-fp8-mi355x-vllm-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp }
- { tp: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 512, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
- { tp: 4, conc-start: 2, conc-end: 128, spec-decoding: mtp }
- { tp: 8, conc-start: 1, conc-end: 1, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 512, spec-decoding: mtp }

# MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config.
minimaxm3-fp4-mi355x-vllm-disagg:
Expand Down
18 changes: 18 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4433,3 +4433,21 @@
- "Add --online_quant_config with ptpc_fp8 and MoE layer exclusions (*block_sparse_moe) to all scripts."
- "Replace deprecated AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 and ATOM_M3_SPARSE_USE_ASM_PA=1 with ATOM_FORCE_ATTN_TRITON=1."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2001

- config-keys:
- minimaxm3-fp8-mi355x-vllm
description:
- "Bump the MiniMax-M3 MXFP8 MI355X vLLM image to nightly-09663abde0f50944a8d5ea30120666024b503faa"
- "Use --linear-backend emulation for the MXFP8 dense-linear path (outperforms the stock nightly's native MXFP8 linear: ~+26% throughput / -21% TPOT at 8k1k conc1, ~+2-3% at high concurrency)"
- "Add --max-num-batched-tokens 32768 (env MAX_NUM_BATCHED_TOKENS) to enlarge the per-step prefill budget and improve TP4 throughput at high concurrency"
- "Enable the AITER master switch for TP-only (no-EP) runs via --moe-backend aiter: the earlier degenerate-output issue that forced it off for TP-only is fixed by vllm-project/vllm#47158, so TP4 uses the AITER_MXFP8 MoE path (verified GSM8K 0.9613 flex / 0.9621 strict on this nightly)"
- "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4: TP8 has poor throughput/GPU and plain TP4 matches or beats TP4/EP4 at high concurrency)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003

- config-keys:
- minimaxm3-fp8-mi355x-vllm-mtp
description:
- "Bump the MiniMax-M3 MXFP8 MI355X vLLM MTP (EAGLE3) image to nightly-09663abde0f50944a8d5ea30120666024b503faa, which natively implements the SupportsEagle3 interface (making the in-place EAGLE3 patch a no-op) and includes vllm-project/vllm#47158"
- "Port the non-MTP serve-command tuning to the MTP recipe: --moe-backend aiter, --linear-backend emulation, --max-num-batched-tokens 32768, and the AITER master switch on for TP-only runs (kept --speculative-config eagle3 with 3 draft tokens)"
- "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4, matching the non-MTP entry; verified locally on this nightly at TP4 conc512, 5120/5120 completed)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003