Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b200_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ SERVER_LOG=/workspace/server.log

export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_FLOAT32_MATMUL_PRECISION=high
export VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm

if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
Expand All @@ -56,8 +57,36 @@ else
PARALLEL_ARGS="--tensor-parallel-size=$TP"
fi

# use 3 speculative tokens for all configs for now
NUM_SPEC_TOKENS=3
# Choose num_speculative_tokens per operating point instead of using a single
# fixed value. The overrides trace the Total-TPS/GPU vs median-interactivity
# Pareto frontier of the EAGLE3 offline sweep (bench_results_b200.json): fewer
# speculative tokens win at the throughput end (high concurrency), more win at
# the latency end (low concurrency).
#
# Lookup key: ISL:TP:EP_SIZE:DP_ATTENTION:CONC, i.e. exactly the env vars the
# launcher sets. For dp-attn configs TP holds the data-parallel size, matching
# how PARALLEL_ARGS passes $TP as --data-parallel-size in that mode.
# Any operating point without an entry here falls back to 3.
declare -A NUM_SPEC_TOKENS_MAP=()

# --- ISL=1024 / OSL=1024 ---
NUM_SPEC_TOKENS_MAP["1024:4:4:false:4"]=4
NUM_SPEC_TOKENS_MAP["1024:8:1:false:2"]=6
NUM_SPEC_TOKENS_MAP["1024:8:1:false:4"]=4

# --- ISL=8192 / OSL=1024 ---
NUM_SPEC_TOKENS_MAP["8192:2:1:false:1"]=4
NUM_SPEC_TOKENS_MAP["8192:2:1:false:256"]=4
NUM_SPEC_TOKENS_MAP["8192:2:2:false:1"]=4
NUM_SPEC_TOKENS_MAP["8192:2:2:false:16"]=4
NUM_SPEC_TOKENS_MAP["8192:2:2:false:32"]=2
NUM_SPEC_TOKENS_MAP["8192:2:2:false:64"]=4
NUM_SPEC_TOKENS_MAP["8192:2:2:false:128"]=4
NUM_SPEC_TOKENS_MAP["8192:2:2:false:256"]=4
NUM_SPEC_TOKENS_MAP["8192:2:2:false:512"]=2
NUM_SPEC_TOKENS_MAP["8192:4:1:false:256"]=2
NUM_SPEC_TOKENS_MAP["8192:4:4:false:256"]=4
NUM_SPEC_TOKENS_MAP["8192:8:1:false:1"]=4
NUM_SPEC_TOKENS_MAP["8192:8:1:false:2"]=4

# Build the key for the current run, then fall back to the default of 3 when
# the table has no override for it.
spec_tokens_key="${ISL}:${TP}:${EP_SIZE}:${DP_ATTENTION}:${CONC}"
NUM_SPEC_TOKENS="${NUM_SPEC_TOKENS_MAP[$spec_tokens_key]:-3}"

# Log the choice so the server log records which operating point was matched.
printf 'Selected NUM_SPEC_TOKENS=%s for ISL=%s TP=%s EP_SIZE=%s DP_ATTENTION=%s CONC=%s\n' \
  "$NUM_SPEC_TOKENS" "$ISL" "$TP" "$EP_SIZE" "$DP_ATTENTION" "$CONC"

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
Expand All @@ -68,7 +97,7 @@ start_gpu_monitor
set -x
vllm serve $MODEL --port $PORT \
$PARALLEL_ARGS \
--gpu-memory-utilization 0.90 \
--gpu-memory-utilization 0.9 \
--max-model-len $MAX_MODEL_LEN \
--block-size 128 \
--language-model-only \
Expand Down
18 changes: 10 additions & 8 deletions configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12787,17 +12787,19 @@ minimaxm3-fp4-b200-vllm-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 2, conc-list: [1, 2, 4, 8, 64, 128, 256, 512], spec-decoding: mtp }
- { tp: 2, ep: 2, conc-list: [8, 512], spec-decoding: mtp }
- { tp: 4, conc-list: [1, 2, 4, 16, 32, 64], spec-decoding: mtp }
- { tp: 4, ep: 4, conc-list: [1, 2, 4, 8], spec-decoding: mtp }
- { tp: 8, conc-list: [1, 2, 4, 8, 16], spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 2, conc-list: [1, 8, 16, 32, 256], spec-decoding: mtp }
- { tp: 2, ep: 2, conc-list: [1, 2, 4, 16, 32, 64, 128, 256, 512], spec-decoding: mtp }
- { tp: 4, conc-list: [1, 2, 4, 8, 16, 256], spec-decoding: mtp }
- { tp: 4, ep: 4, conc-list: [1, 2, 4, 8, 256], spec-decoding: mtp }
- { tp: 8, conc-list: [1, 2, 4, 8], spec-decoding: mtp }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-b300-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4433,3 +4433,10 @@
- "Add --online_quant_config with ptpc_fp8 and MoE layer exclusions (*block_sparse_moe) to all scripts."
- "Replace deprecated AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 and ATOM_M3_SPARSE_USE_ASM_PA=1 with ATOM_FORCE_ATTN_TRITON=1."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2001

- config-keys:
- minimaxm3-fp4-b200-vllm-mtp
description:
- "Update image tag to nightly"
- "Update B200 MTP search space"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2007