diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh index f2b9063128..089594cbe4 100644 --- a/benchmarks/multi_node/amd_utils/env_atom.sh +++ b/benchmarks/multi_node/amd_utils/env_atom.sh @@ -32,13 +32,6 @@ else fi export IBDEVICES -export SAFETENSORS_FAST_GPU=1 -export VLLM_LOG_LEVEL=WARNING -export ATOM_LOG_LEVEL=WARNING -export AITER_LOG_LEVEL=WARNING -export LOG_LEVEL=WARNING -export LOGLEVEL=WARNING - # ============================================================================= # ATOM/mooncake-specific environment # ============================================================================= @@ -46,22 +39,17 @@ export LOGLEVEL=WARNING # mooncake RDMA KV transfer library path export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-} - -# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) +# faster model loading (safetensors only) +export SAFETENSORS_FAST_GPU=1 # aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting) +export VLLM_LOG_LEVEL=WARNING +export ATOM_LOG_LEVEL=WARNING export AITER_LOG_LEVEL=WARNING - -if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - # ATOM MoE gather/scatter interleave optimization - export ATOM_MOE_GU_ITLV=1 - # Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro) - export AITER_BF16_FP8_MOE_BOUND=0 -fi - -# Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf) -# No env var needed; documented here for reference. 
+export LOG_LEVEL=WARNING +export LOGLEVEL=WARNING set +x -echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake" +# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) +echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml index 85771eeaac..4a854f49fd 100644 --- a/benchmarks/multi_node/amd_utils/models_atom.yaml +++ b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -1,4 +1,4 @@ -# Model-specific SGLang server configurations for disaggregated inference. +# Model-specific ATOM server configurations for disaggregated inference. # # Each top-level key is a MODEL_NAME value (must match the directory name under MODEL_DIR). # @@ -7,50 +7,69 @@ # # Schema: # : -# base_flags: str # Common flags for both prefill and decode -# mtp_flags: str # Appended to decode when DECODE_MTP_SIZE > 0 -# dp_flags: str # Appended when DP is enabled (prefill or decode) -# prefill: -# mem_fraction_static: float -# disable_radix_cache: bool -# dp: # Config when data-parallel attention is enabled -# max_running_requests: int -# chunked_prefill_size: str # Can be integer or bash arithmetic expression -# cuda_graph_bs: str # Space-separated values -# no_dp: # Config when data-parallel attention is disabled -# max_running_requests: int -# chunked_prefill_size: int -# cuda_graph_bs_range: str # "start-end" expanded via seq -# decode: -# mem_fraction_static: float -# prefill_round_robin_balance: bool -# dp: -# max_running_requests: int -# chunked_prefill_size: str -# cuda_graph_bs_range: str -# ep_only: # Config when EP is enabled but DP is disabled -# max_running_requests: int -# chunked_prefill_size: int -# cuda_graph_bs_range: str -# no_dp: -# max_running_requests: int -# chunked_prefill_size: int -# cuda_graph_bs_range: str +# env: str # Space-separated KEY=VALUE pairs 
exported unconditionally +# tp_dp_flags: str # Shared TP+DPA flags (fallback when prefill/decode-specific keys are absent) +# prefill_tp_dp_flags: str # TP+DPA flags for prefill only (overrides tp_dp_flags) +# decode_tp_dp_flags: str # TP+DPA flags for decode only (overrides tp_dp_flags) +# tp_dp_env: str # Space-separated KEY=VALUE pairs exported only in TP+DPA mode +# ep_dp_flags: str # Shared EP+DPA flags (fallback when prefill/decode-specific keys are absent) +# prefill_ep_dp_flags: str # EP+DPA flags for prefill only (overrides ep_dp_flags) +# decode_ep_dp_flags: str # EP+DPA flags for decode only (overrides ep_dp_flags) +# ep_dp_env: str # Space-separated KEY=VALUE pairs exported only in EP+DPA mode +# mtp_flags: str # Flags passed to SPEC_ARGS before $DECODE_MTP_SIZE (e.g. "--method mtp --num-speculative-tokens") +# kv_cache_flags: str # Full --kv_cache_dtype flag string (e.g. "--kv_cache_dtype fp8", or "" for none) +# online_quant_config: str # JSON string passed to --online_quant_config (used when DPA is disabled) +# online_quant_dpa_config: str # JSON string passed to --online_quant_config when DPA is enabled (falls back to online_quant_config) +# block_size: str # --block-size value (overrides server_atom.sh default of 16) +# mem_frac_static: str # --gpu-memory-utilization value (overrides default of 0.85) +# max_model_len: str # --max-model-len value (overrides default of unset) +# max_num_seqs: str # --max-num-seqs value (overrides default of 256) +# max_num_batched_tokens: str # --max-num-batched-tokens value (overrides default of unset) DeepSeek-V4-Pro: - # ATOM engine (atom-disagg): server_atom.sh uses MEM_FRACTION/KV_CACHE_DTYPE/BLOCK_SIZE/MAX_NUM_SEQS - # directly from env vars (defaulting to 0.85/fp8/16/256). base_flags/dp_flags are not used by - # server_atom.sh; they are kept here for documentation and potential future use. 
- base_flags: "" - mtp_flags: "" - dp_flags: "" + env: "ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0" + kv_cache_flags: "--kv_cache_dtype fp8" + tp_dp_flags: "--enable-dp-attention --enable-tbo" + prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo" + decode_tp_dp_flags: "--enable-dp-attention --enable-tbo" + tp_dp_env: "GPU_MAX_HW_QUEUES=5 ATOM_CPU_AFFINITY=1" + ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + mtp_flags: "--method mtp --num-speculative-tokens" MiniMax-M3-MXFP4: - base_flags: "" - mtp_flags: "" - dp_flags: "" + env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1" + kv_cache_flags: "--kv_cache_dtype fp8" + tp_dp_flags: "--enable-dp-attention" + prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo" + decode_tp_dp_flags: "--enable-dp-attention" + ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" + online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}' + online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}' + block_size: "128" + mem_frac_static: "0.8" + max_model_len: "32768" + max_num_seqs: "256" + max_num_batched_tokens: "32768" MiniMax-M3-MXFP8: - base_flags: "" - mtp_flags: "" - dp_flags: "" \ No newline at end of file + env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1" + kv_cache_flags: "--kv_cache_dtype fp8" + tp_dp_flags: 
"--enable-dp-attention" + prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo prefill" + decode_tp_dp_flags: "--enable-dp-attention" + ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" + online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}' + online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*.gate.*","*.block_sparse_moe.experts*"]}' + block_size: "128" + mem_frac_static: "0.8" + max_model_len: "32768" + max_num_seqs: "256" + max_num_batched_tokens: "32768" diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index ccc864030a..45f3053bd5 100755 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -36,8 +36,7 @@ DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" # MTP -SPEC_DECODING="${SPEC_DECODING:-}" -DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" # ATOM server ports (different from SGLang which uses 8000 for all) PREFILL_PORT="${PREFILL_PORT:-8010}" @@ -45,13 +44,7 @@ DECODE_PORT="${DECODE_PORT:-8020}" ROUTER_PORT="${ROUTER_PORT:-8000}" HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" -# ATOM server tuning (from reference script defaults) -MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" -KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" -BLOCK_SIZE="${BLOCK_SIZE:-16}" -MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" -MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" -MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-}" +# ATOM server tuning — 
defaults applied after YAML load (YAML > env var > shell default) EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:-}" # Benchmark Configuration @@ -78,6 +71,51 @@ if [[ -z "$host_ip" ]]; then fi host_name=$(hostname) +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +# Load model-specific config from YAML (single parse for all fields) +set -x +_yaml_tmp=$(mktemp) +python3 << PYEOF > "$_yaml_tmp" +import yaml +with open('${ATOM_WS_PATH}/models_atom.yaml') as f: + m = yaml.safe_load(f).get('${MODEL_NAME}', {}) +def sh(v): return v.replace("'", "'\\''") +print(f"MODEL_ENVS='{sh(m.get('env', ''))}'") +_tp_dp = m.get('tp_dp_flags', '') +print(f"PREFILL_MODEL_TP_DP_FLAGS='{sh(m.get('prefill_tp_dp_flags', _tp_dp))}'") +print(f"DECODE_MODEL_TP_DP_FLAGS='{sh(m.get('decode_tp_dp_flags', _tp_dp))}'") +_ep_dp = m.get('ep_dp_flags', '') +print(f"PREFILL_MODEL_EP_DP_FLAGS='{sh(m.get('prefill_ep_dp_flags', _ep_dp))}'") +print(f"DECODE_MODEL_EP_DP_FLAGS='{sh(m.get('decode_ep_dp_flags', _ep_dp))}'") +print(f"MODEL_TP_DP_ENV='{sh(m.get('tp_dp_env', ''))}'") +print(f"MODEL_EP_DP_ENV='{sh(m.get('ep_dp_env', ''))}'") +print(f"MODEL_MTP_FLAGS='{sh(m.get('mtp_flags', ''))}'") +print(f"MODEL_KV_ARG='{sh(m.get('kv_cache_flags', ''))}'") +print(f"_ONLINE_QUANT_CONFIG='{sh(m.get('online_quant_config', ''))}'") +print(f"_ONLINE_QUANT_DPA_CONFIG='{sh(m.get('online_quant_dpa_config', m.get('online_quant_config', '')))}'") +print(f"_YAML_BLOCK_SIZE='{sh(m.get('block_size', ''))}'") +print(f"_YAML_MEM_FRAC_STATIC='{sh(m.get('mem_frac_static', ''))}'") +print(f"_YAML_MAX_MODEL_LEN='{sh(m.get('max_model_len', ''))}'") +print(f"_YAML_MAX_NUM_SEQS='{sh(m.get('max_num_seqs', ''))}'") +print(f"_YAML_MAX_NUM_BATCHED_TOKENS='{sh(m.get('max_num_batched_tokens', ''))}'") +PYEOF +# shellcheck source=/dev/null +source "$_yaml_tmp" +rm -f "$_yaml_tmp" +unset 
_yaml_tmp + +# Apply server-tuning: YAML > env var > shell default +# (job.slurm injects BLOCK_SIZE/MEM_FRAC_STATIC/MAX_NUM_SEQS with hardcoded +# defaults into the Docker env, so env-first would always shadow the YAML.) +BLOCK_SIZE="${_YAML_BLOCK_SIZE:-${BLOCK_SIZE:-16}}" +MEM_FRAC_STATIC="${_YAML_MEM_FRAC_STATIC:-${MEM_FRAC_STATIC:-0.85}}" +MAX_MODEL_LEN="${_YAML_MAX_MODEL_LEN:-${MAX_MODEL_LEN:-}}" +MAX_NUM_SEQS="${_YAML_MAX_NUM_SEQS:-${MAX_NUM_SEQS:-256}}" +MAX_NUM_BATCHED_TOKENS="${_YAML_MAX_NUM_BATCHED_TOKENS:-${MAX_NUM_BATCHED_TOKENS:-}}" +unset _YAML_BLOCK_SIZE _YAML_MEM_FRAC_STATIC _YAML_MAX_MODEL_LEN _YAML_MAX_NUM_SEQS _YAML_MAX_NUM_BATCHED_TOKENS + # ============================================================================= # Cluster Topology Configuration # ============================================================================= @@ -111,56 +149,56 @@ PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" + + + # Parallel args PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP +ONLINE_QUANT_ARG="" if [ "$PREFILL_ENABLE_DP" = "true" ]; then - if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #TP+DPA+TBO - if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo ) - export GPU_MAX_HW_QUEUES=5 - export ATOM_CPU_AFFINITY=1 - else #TP+DPA - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) - fi + if [ "$PREFILL_ENABLE_EP" = "true" ]; then #EP+DPA + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${PREFILL_MODEL_EP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done + else #TP+DPA + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${PREFILL_MODEL_TP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done + fi + if [[ -n 
"$_ONLINE_QUANT_DPA_CONFIG" ]]; then + ONLINE_QUANT_ARG="--online_quant_config '${_ONLINE_QUANT_DPA_CONFIG}'" + fi +else + if [[ -n "$_ONLINE_QUANT_CONFIG" ]]; then + ONLINE_QUANT_ARG="--online_quant_config '${_ONLINE_QUANT_CONFIG}'" fi -fi +fi -# (srok), split DPA & TBO cases -DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP +DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP if [ "$DECODE_ENABLE_DP" = "true" ]; then - if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #TP+DPA+TBO - if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo ) - export GPU_MAX_HW_QUEUES=5 - export ATOM_CPU_AFFINITY=1 - else #TP+DPA - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) - fi + if [ "$DECODE_ENABLE_EP" = "true" ]; then #EP+DPA + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${DECODE_MODEL_EP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done + else #TP+DPA + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${DECODE_MODEL_TP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done fi -fi - -# MTP args -SPEC_ARGS=() #TP -if [ "$SPEC_DECODING" = "mtp" ]; then - SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") fi +unset _dp_env_pair +unset _ONLINE_QUANT_CONFIG _ONLINE_QUANT_DPA_CONFIG -# HF overrides (single-quoted JSON preserved through eval) -HF_OVERRIDES_ARG="" -if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" -fi +for _env_pair in ${MODEL_ENVS}; do + export "$_env_pair" +done +unset _env_pair -# KV cache dtype (skip if unset or 'auto') -KV_CACHE_ARG="" -if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then - KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}" +# MTP args +SPEC_ARGS=() +if [[ -n "$MODEL_MTP_FLAGS" && 
"${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then + SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE") fi +# KV cache arg - full flag string from YAML +KV_CACHE_ARG="${MODEL_KV_ARG}" + # Optional model length / batched-token cap MODEL_LEN_ARGS="" if [[ -n "$MAX_MODEL_LEN" ]]; then @@ -170,9 +208,6 @@ if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}" fi -if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then - export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -fi cat < YAML > shell default) and remove redundant SPEC_DECODING gating in favor of MODEL_MTP_FLAGS + DECODE_MTP_SIZE > 0." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2000 + - config-keys: - minimaxm3-fp8-mi355x-vllm-mtp description: