Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
3ef1380
[AMD] refactor server_atom.sh and models_atom.yaml for model-specific…
seungrokj Jun 26, 2026
594f2bc
[AMD] add perf-changelog entry for minimaxm3-fp4-mi355x-atom-disagg a…
seungrokj Jun 26, 2026
8740b80
[AMD] add env dump in server_atom.sh and minimaxm3-fp4-mi355x-atom-di…
seungrokj Jun 26, 2026
7c1ef64
[AMD] fix server_atom.sh YAML loading: safe source, EP string compare…
seungrokj Jun 26, 2026
7cd3353
[AMD] cap minimaxm3-fp8-mi355x-atom-disagg conc to 256; fix missing n…
seungrokj Jun 26, 2026
8961125
[AMD] update amd-master.yaml: image bumps, search space tweaks for Mi…
seungrokj Jun 26, 2026
48b9946
[AMD] restore minimaxm3-fp4/fp8-mi355x-atom recipes; bump all ATOM im…
seungrokj Jun 26, 2026
7f94d30
[AMD] clean up minimaxm3-fp4-mi355x-atom search space; revert fp8-dis…
seungrokj Jun 26, 2026
1aa7ace
[AMD] add amd-master.yaml config
seungrokj Jul 3, 2026
ed5e874
[AMD] remove amd-master.yaml config
seungrokj Jul 3, 2026
8b6141c
[AMD] refactor ATOM disagg config: split per-role flags, move model d…
seungrokj Jul 3, 2026
a6ef155
[AMD] fix YAML server-tuning defaults never taking effect
seungrokj Jul 3, 2026
8d57cde
[AMD] add perf-changelog entry for MiniMax-M3 ATOM disagg refactor (P…
seungrokj Jul 3, 2026
0fd4545
[AMD] remove hf_overrides from models_atom.yaml and server_atom.sh
seungrokj Jul 3, 2026
113e568
[AMD] fix server-tuning: YAML values must override job.slurm Docker d…
seungrokj Jul 4, 2026
79a7c65
Merge branch 'main' into amd/m3_atom_pd_fp4fp8_0701
seungrokj Jul 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 8 additions & 20 deletions benchmarks/multi_node/amd_utils/env_atom.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,36 +32,24 @@ else
fi
export IBDEVICES

# Faster safetensors model loading, plus WARNING verbosity for every logger
# knob this script controls (component-specific and generic).
SAFETENSORS_FAST_GPU=1
VLLM_LOG_LEVEL=WARNING
ATOM_LOG_LEVEL=WARNING
AITER_LOG_LEVEL=WARNING
LOG_LEVEL=WARNING
LOGLEVEL=WARNING
export SAFETENSORS_FAST_GPU \
  VLLM_LOG_LEVEL ATOM_LOG_LEVEL AITER_LOG_LEVEL \
  LOG_LEVEL LOGLEVEL

# =============================================================================
# ATOM/mooncake-specific environment
# =============================================================================

# mooncake RDMA KV transfer library path
# Prepend the mooncake and ROCm library directories to any existing search path.
# The ':+' expansion emits the ':' separator only when LD_LIBRARY_PATH already
# has a value; an unconditional trailing ':' would leave an empty entry, which
# the dynamic linker interprets as the current working directory.
export LD_LIBRARY_PATH="/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"


# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)
# Faster model loading (applies to safetensors checkpoints only).
export SAFETENSORS_FAST_GPU=1

# vLLM / ATOM / aiter logging: WARNING keeps the output quiet; switch to DEBUG
# when troubleshooting.
export VLLM_LOG_LEVEL=WARNING ATOM_LOG_LEVEL=WARNING AITER_LOG_LEVEL=WARNING

# Extra MoE settings that apply to DeepSeek-V4-Pro only; every other
# MODEL_NAME value leaves the environment untouched.
case "$MODEL_NAME" in
  DeepSeek-V4-Pro)
    # ATOM MoE gather/scatter interleave optimization
    export ATOM_MOE_GU_ITLV=1
    # Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro)
    export AITER_BF16_FP8_MOE_BOUND=0
    ;;
esac

# Stale ATOM cache cleanup happens in server_atom.sh (via rm -rf); no
# environment variable is involved, it is only mentioned here for reference.

# Generic logger verbosity knobs.
LOG_LEVEL=WARNING
LOGLEVEL=WARNING
export LOG_LEVEL LOGLEVEL

# Turn shell command tracing off before printing the summary.
set +x

# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)
printf '%s\n' "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake"
printf '%s\n' "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake"
103 changes: 61 additions & 42 deletions benchmarks/multi_node/amd_utils/models_atom.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Model-specific SGLang server configurations for disaggregated inference.
# Model-specific ATOM server configurations for disaggregated inference.
#
# Each top-level key is a MODEL_NAME value (must match the directory name under MODEL_DIR).
#
Expand All @@ -7,50 +7,69 @@
#
# Schema:
# <model-name>:
# base_flags: str # Common flags for both prefill and decode
# mtp_flags: str # Appended to decode when DECODE_MTP_SIZE > 0
# dp_flags: str # Appended when DP is enabled (prefill or decode)
# prefill:
# mem_fraction_static: float
# disable_radix_cache: bool
# dp: # Config when data-parallel attention is enabled
# max_running_requests: int
# chunked_prefill_size: str # Can be integer or bash arithmetic expression
# cuda_graph_bs: str # Space-separated values
# no_dp: # Config when data-parallel attention is disabled
# max_running_requests: int
# chunked_prefill_size: int
# cuda_graph_bs_range: str # "start-end" expanded via seq
# decode:
# mem_fraction_static: float
# prefill_round_robin_balance: bool
# dp:
# max_running_requests: int
# chunked_prefill_size: str
# cuda_graph_bs_range: str
# ep_only: # Config when EP is enabled but DP is disabled
# max_running_requests: int
# chunked_prefill_size: int
# cuda_graph_bs_range: str
# no_dp:
# max_running_requests: int
# chunked_prefill_size: int
# cuda_graph_bs_range: str
# env: str # Space-separated KEY=VALUE pairs exported unconditionally
# tp_dp_flags: str # Shared TP+DPA flags (fallback when prefill/decode-specific keys are absent)
# prefill_tp_dp_flags: str # TP+DPA flags for prefill only (overrides tp_dp_flags)
# decode_tp_dp_flags: str # TP+DPA flags for decode only (overrides tp_dp_flags)
# tp_dp_env: str # Space-separated KEY=VALUE pairs exported only in TP+DPA mode
# ep_dp_flags: str # Shared EP+DPA flags (fallback when prefill/decode-specific keys are absent)
# prefill_ep_dp_flags: str # EP+DPA flags for prefill only (overrides ep_dp_flags)
# decode_ep_dp_flags: str # EP+DPA flags for decode only (overrides ep_dp_flags)
# ep_dp_env: str # Space-separated KEY=VALUE pairs exported only in EP+DPA mode
# mtp_flags: str # Flags passed to SPEC_ARGS before $DECODE_MTP_SIZE (e.g. "--method mtp --num-speculative-tokens")
# kv_cache_flags: str # Full --kv_cache_dtype flag string (e.g. "--kv_cache_dtype fp8", or "" for none)
# online_quant_config: str # JSON string passed to --online_quant_config (used when DPA is disabled)
# online_quant_dpa_config: str # JSON string passed to --online_quant_config when DPA is enabled (falls back to online_quant_config)
# block_size: str # --block-size value (overrides server_atom.sh default of 16)
# mem_frac_static: str # --gpu-memory-utilization value (overrides default of 0.85)
# max_model_len: str # --max-model-len value (unset by default)
# max_num_seqs: str # --max-num-seqs value (overrides default of 256)
# max_num_batched_tokens: str # --max-num-batched-tokens value (unset by default)

# ATOM server settings for MODEL_NAME=DeepSeek-V4-Pro.
DeepSeek-V4-Pro:
# ATOM engine (atom-disagg): server_atom.sh uses MEM_FRACTION/KV_CACHE_DTYPE/BLOCK_SIZE/MAX_NUM_SEQS
# directly from env vars (defaulting to 0.85/fp8/16/256). base_flags/dp_flags are not used by
# server_atom.sh; they are kept here for documentation and potential future use.
# NOTE(review): the schema header also lists block_size/mem_frac_static/max_num_seqs keys as
# overrides of those defaults — confirm the comment above is still accurate.
base_flags: ""
# NOTE(review): mtp_flags appears twice in this mapping (empty here, non-empty further down).
# YAML mapping keys are expected to be unique — confirm this empty entry is stale and which
# value the loader actually keeps.
mtp_flags: ""
dp_flags: ""
# KEY=VALUE pairs exported unconditionally for this model.
env: "ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0"
# Full --kv_cache_dtype flag string.
kv_cache_flags: "--kv_cache_dtype fp8"
# TP+DPA mode: shared flags, then the prefill-/decode-specific values that override them
# (all three are identical for this model).
tp_dp_flags: "--enable-dp-attention --enable-tbo"
prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo"
decode_tp_dp_flags: "--enable-dp-attention --enable-tbo"
# KEY=VALUE pairs exported only in TP+DPA mode.
tp_dp_env: "GPU_MAX_HW_QUEUES=5 ATOM_CPU_AFFINITY=1"
# EP+DPA mode: shared flags, then the prefill-/decode-specific values that override them
# (all three are identical for this model).
ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
# Speculative-decoding flags; per the schema, $DECODE_MTP_SIZE is appended after them,
# so the string intentionally ends with --num-speculative-tokens.
mtp_flags: "--method mtp --num-speculative-tokens"

# ATOM server settings for MODEL_NAME=MiniMax-M3-MXFP4.
MiniMax-M3-MXFP4:
base_flags: ""
# NOTE(review): mtp_flags appears twice in this mapping (empty here, non-empty further down).
# YAML mapping keys are expected to be unique — confirm this empty entry is stale and which
# value the loader actually keeps.
mtp_flags: ""
dp_flags: ""
# KEY=VALUE pairs exported unconditionally for this model.
env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1"
# Full --kv_cache_dtype flag string.
kv_cache_flags: "--kv_cache_dtype fp8"
# TP+DPA mode: shared fallback, then role-specific overrides. Only the prefill value
# adds --enable-tbo.
tp_dp_flags: "--enable-dp-attention"
prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo"
decode_tp_dp_flags: "--enable-dp-attention"
# EP+DPA mode: shared fallback, then role-specific overrides (all three identical).
ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
# Speculative-decoding flags; per the schema, $DECODE_MTP_SIZE is appended after them,
# so the string intentionally ends with --num-speculative-tokens.
mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
# JSON passed to --online_quant_config: the first value is used when DPA is disabled, the
# second when DPA is enabled. Both are identical for this model.
online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}'
online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}'
# Server tuning overrides: --block-size, --gpu-memory-utilization, --max-model-len,
# --max-num-seqs and --max-num-batched-tokens respectively (see the schema header).
block_size: "128"
mem_frac_static: "0.8"
max_model_len: "32768"
max_num_seqs: "256"
max_num_batched_tokens: "32768"

# ATOM server settings for MODEL_NAME=MiniMax-M3-MXFP8.
MiniMax-M3-MXFP8:
base_flags: ""
# NOTE(review): mtp_flags appears twice in this mapping (empty here, non-empty further down).
# YAML mapping keys are expected to be unique — confirm this empty entry is stale and which
# value the loader actually keeps.
mtp_flags: ""
dp_flags: ""
# KEY=VALUE pairs exported unconditionally for this model.
env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1"
# Full --kv_cache_dtype flag string.
kv_cache_flags: "--kv_cache_dtype fp8"
# TP+DPA mode: shared fallback, then role-specific overrides.
tp_dp_flags: "--enable-dp-attention"
# NOTE(review): this value ends with a bare "prefill" token after --enable-tbo, unlike the
# MiniMax-M3-MXFP4 entry ("--enable-dp-attention --enable-tbo"). Confirm whether --enable-tbo
# takes a value here or whether "prefill" is a leftover.
prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo prefill"
decode_tp_dp_flags: "--enable-dp-attention"
# EP+DPA mode: shared fallback, then role-specific overrides (all three identical).
ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
# Speculative-decoding flags; per the schema, $DECODE_MTP_SIZE is appended after them,
# so the string intentionally ends with --num-speculative-tokens.
mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
# JSON passed to --online_quant_config: the first value is used when DPA is disabled, the
# second when DPA is enabled. The DPA variant excludes "*.gate.*" and
# "*.block_sparse_moe.experts*" where the non-DPA variant excludes "*block_sparse_moe".
# NOTE(review): presumably intentional for FP8 — confirm the two exclude lists should differ.
online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}'
online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*.gate.*","*.block_sparse_moe.experts*"]}'
# Server tuning overrides: --block-size, --gpu-memory-utilization, --max-model-len,
# --max-num-seqs and --max-num-batched-tokens respectively (see the schema header).
block_size: "128"
mem_frac_static: "0.8"
max_model_len: "32768"
max_num_seqs: "256"
max_num_batched_tokens: "32768"
Loading