diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh index a0907f18c..dfb1e7a9f 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh @@ -31,6 +31,7 @@ if [ "$DP_ATTENTION" = "true" ]; then fi SPEC_ARGS=() +OPT_ARGS=(--online_quant_config '{"global_quant_config": "ptpc_fp8", "exclude_layer": ["lm_head", "model.embed_tokens", "vision_tower", "multi_modal_projector", "patch_merge_mlp", "*block_sparse_moe"]}') # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,8 +39,7 @@ MEM_FRAC_STATIC=0.8 set -x export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -export AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 -export ATOM_M3_SPARSE_USE_ASM_PA=1 +export ATOM_FORCE_ATTN_TRITON=1 export MAX_MODEL_LEN=32768 export MAX_NUM_BATCHED_TOKENS=32768 export MAX_NUM_SEQS=256 @@ -48,6 +48,7 @@ python3 -m atom.entrypoints.openai_server \ --server-port $PORT \ "${PARALLEL_ARGS[@]}" \ "${SPEC_ARGS[@]}" \ + "${OPT_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ --max-model-len $MAX_MODEL_LEN \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh index fe71c47a0..4ef60e71e 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh @@ -31,6 +31,7 @@ if [ "$DP_ATTENTION" = "true" ]; then fi SPEC_ARGS=(--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens 3 ) +OPT_ARGS=(--online_quant_config '{"global_quant_config": "ptpc_fp8", "exclude_layer": ["lm_head", "model.embed_tokens", "vision_tower", "multi_modal_projector", "patch_merge_mlp", "*block_sparse_moe"]}') # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,8 +39,7 @@ MEM_FRAC_STATIC=0.8 set -x export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -export AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 -export ATOM_M3_SPARSE_USE_ASM_PA=1 +export ATOM_FORCE_ATTN_TRITON=1 export MAX_MODEL_LEN=32768 export MAX_NUM_BATCHED_TOKENS=32768 export MAX_NUM_SEQS=256 @@ -48,6 +48,7 @@ python3 -m atom.entrypoints.openai_server \ --server-port $PORT \ "${PARALLEL_ARGS[@]}" \ "${SPEC_ARGS[@]}" \ + "${OPT_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ --max-model-len $MAX_MODEL_LEN \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh index 3506a3764..c5ce072fc 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh @@ -31,6 +31,7 @@ if [ "$DP_ATTENTION" = "true" ]; then fi SPEC_ARGS=() +OPT_ARGS=(--online_quant_config '{"global_quant_config": "ptpc_fp8", "exclude_layer": ["lm_head", "model.embed_tokens", "vision_tower", "multi_modal_projector", "patch_merge_mlp", "*block_sparse_moe"]}') # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,8 +39,7 @@ MEM_FRAC_STATIC=0.8 set -x export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -export AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 -export ATOM_M3_SPARSE_USE_ASM_PA=1 +export ATOM_FORCE_ATTN_TRITON=1 export MAX_MODEL_LEN=32768 export MAX_NUM_BATCHED_TOKENS=32768 export MAX_NUM_SEQS=256 @@ -48,6 +48,7 @@ python3 -m atom.entrypoints.openai_server \ --server-port $PORT \ "${PARALLEL_ARGS[@]}" \ "${SPEC_ARGS[@]}" \ + "${OPT_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ --max-model-len $MAX_MODEL_LEN \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh index a5b019b80..66320d03d 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh @@ -31,6 +31,7 @@ if [ "$DP_ATTENTION" = "true" ]; then fi SPEC_ARGS=(--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens 3 ) +OPT_ARGS=(--online_quant_config '{"global_quant_config": "ptpc_fp8", "exclude_layer": ["lm_head", "model.embed_tokens", "vision_tower", "multi_modal_projector", "patch_merge_mlp", "*block_sparse_moe"]}') # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -38,8 +39,7 @@ MEM_FRAC_STATIC=0.8 set -x export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -export AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 -export ATOM_M3_SPARSE_USE_ASM_PA=1 +export ATOM_FORCE_ATTN_TRITON=1 export MAX_MODEL_LEN=32768 export MAX_NUM_BATCHED_TOKENS=32768 export MAX_NUM_SEQS=256 @@ -48,6 +48,7 @@ python3 -m atom.entrypoints.openai_server \ --server-port $PORT \ "${PARALLEL_ARGS[@]}" \ "${SPEC_ARGS[@]}" \ + "${OPT_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ --max-model-len $MAX_MODEL_LEN \ diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml index 95a76ab6d..41f60afda 100644 --- a/configs/amd-master.yaml +++ b/configs/amd-master.yaml @@ -2645,7 +2645,7 @@ minimaxm3-fp4-mi355x-vllm-mtp: # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md # block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe. minimaxm3-fp4-mi355x-atom: - image: rocm/atom-dev:MiniMax-M3-20260623 + image: rocm/atom-dev:nightly_202607011530 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x @@ -2664,7 +2664,7 @@ minimaxm3-fp4-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } minimaxm3-fp4-mi355x-atom-mtp: - image: rocm/atom-dev:MiniMax-M3-20260623 + image: rocm/atom-dev:nightly_202607011530 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x @@ -2683,7 +2683,7 @@ minimaxm3-fp4-mi355x-atom-mtp: - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } minimaxm3-fp8-mi355x-atom: - image: rocm/atom-dev:MiniMax-M3-20260623 + image: rocm/atom-dev:nightly_202607011530 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2702,7 +2702,7 @@ minimaxm3-fp8-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } minimaxm3-fp8-mi355x-atom-mtp: - image: rocm/atom-dev:MiniMax-M3-20260623 + image: rocm/atom-dev:nightly_202607011530 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 6eb6ca61f..3944e67c5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4422,3 +4422,14 @@ - "Update Minimax M3 b200 vllm image tag" - "Update search space to cover more configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1978 + +- config-keys: + - minimaxm3-fp4-mi355x-atom + - minimaxm3-fp4-mi355x-atom-mtp + - minimaxm3-fp8-mi355x-atom + - minimaxm3-fp8-mi355x-atom-mtp + description: + - "Bump ATOM image from rocm/atom-dev:MiniMax-M3-20260623 to rocm/atom-dev:nightly_202607011530 for all single-node MiniMax-M3 ATOM recipes." + - "Add --online_quant_config with ptpc_fp8 and MoE layer exclusions (*block_sparse_moe) to all scripts." + - "Replace deprecated AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 and ATOM_M3_SPARSE_USE_ASM_PA=1 with ATOM_FORCE_ATTN_TRITON=1." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2001