From 3ef13803854e2a4525dd3d522defc9282ea38091 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 26 Jun 2026 10:25:16 +0900 Subject: [PATCH 01/15] [AMD] refactor server_atom.sh and models_atom.yaml for model-specific ATOM config; add minimaxm3-fp4-mi355x-atom-disagg Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/env_atom.sh | 28 ++---- .../multi_node/amd_utils/models_atom.yaml | 68 ++++++-------- .../multi_node/amd_utils/server_atom.sh | 91 ++++++++++--------- configs/amd-master.yaml | 2 +- 4 files changed, 85 insertions(+), 104 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh index f2b9063128..089594cbe4 100644 --- a/benchmarks/multi_node/amd_utils/env_atom.sh +++ b/benchmarks/multi_node/amd_utils/env_atom.sh @@ -32,13 +32,6 @@ else fi export IBDEVICES -export SAFETENSORS_FAST_GPU=1 -export VLLM_LOG_LEVEL=WARNING -export ATOM_LOG_LEVEL=WARNING -export AITER_LOG_LEVEL=WARNING -export LOG_LEVEL=WARNING -export LOGLEVEL=WARNING - # ============================================================================= # ATOM/mooncake-specific environment # ============================================================================= @@ -46,22 +39,17 @@ export LOGLEVEL=WARNING # mooncake RDMA KV transfer library path export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-} - -# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) +# faster model loading (safetensors only) +export SAFETENSORS_FAST_GPU=1 # aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting) +export VLLM_LOG_LEVEL=WARNING +export ATOM_LOG_LEVEL=WARNING export AITER_LOG_LEVEL=WARNING - -if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - # ATOM MoE gather/scatter interleave optimization - export ATOM_MOE_GU_ITLV=1 - # Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro) - export AITER_BF16_FP8_MOE_BOUND=0 -fi - -# Clear stale 
ATOM cache on startup (server_atom.sh handles this via rm -rf) -# No env var needed; documented here for reference. +export LOG_LEVEL=WARNING +export LOGLEVEL=WARNING set +x -echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake" +# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) +echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml index 85771eeaac..d6f12ac065 100644 --- a/benchmarks/multi_node/amd_utils/models_atom.yaml +++ b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -1,4 +1,4 @@ -# Model-specific SGLang server configurations for disaggregated inference. +# Model-specific ATOM server configurations for disaggregated inference. # # Each top-level key is a MODEL_NAME value (must match the directory name under MODEL_DIR). # @@ -7,50 +7,34 @@ # # Schema: # : -# base_flags: str # Common flags for both prefill and decode -# mtp_flags: str # Appended to decode when DECODE_MTP_SIZE > 0 -# dp_flags: str # Appended when DP is enabled (prefill or decode) -# prefill: -# mem_fraction_static: float -# disable_radix_cache: bool -# dp: # Config when data-parallel attention is enabled -# max_running_requests: int -# chunked_prefill_size: str # Can be integer or bash arithmetic expression -# cuda_graph_bs: str # Space-separated values -# no_dp: # Config when data-parallel attention is disabled -# max_running_requests: int -# chunked_prefill_size: int -# cuda_graph_bs_range: str # "start-end" expanded via seq -# decode: -# mem_fraction_static: float -# prefill_round_robin_balance: bool -# dp: -# max_running_requests: int -# chunked_prefill_size: str -# cuda_graph_bs_range: str -# ep_only: # Config when EP is enabled but DP is disabled -# max_running_requests: int -# chunked_prefill_size: int -# cuda_graph_bs_range: str -# no_dp: -# 
max_running_requests: int -# chunked_prefill_size: int -# cuda_graph_bs_range: str +# env: str # Space-separated KEY=VALUE pairs exported unconditionally +# hf_overrides: str # JSON string passed to --hf-overrides +# tp_dp_flags: str # Parallel flags for TP+DPA case (must include --enable-dp-attention) +# tp_dp_env: str # Space-separated KEY=VALUE pairs exported only in TP+DPA mode +# ep_dp_flags: str # Parallel flags for EP+DPA case (must include --enable-expert-parallel --enable-dp-attention) +# ep_dp_env: str # Space-separated KEY=VALUE pairs exported only in EP+DPA mode +# mtp_flags: str # Flags passed to SPEC_ARGS before $DECODE_MTP_SIZE (e.g. "--method mtp --num-speculative-tokens") +# kv_cache_flags: str # Full --kv_cache_dtype flag string (e.g. "--kv_cache_dtype fp8", or "" for none) DeepSeek-V4-Pro: - # ATOM engine (atom-disagg): server_atom.sh uses MEM_FRACTION/KV_CACHE_DTYPE/BLOCK_SIZE/MAX_NUM_SEQS - # directly from env vars (defaulting to 0.85/fp8/16/256). base_flags/dp_flags are not used by - # server_atom.sh; they are kept here for documentation and potential future use. 
- base_flags: "" - mtp_flags: "" - dp_flags: "" + env: "ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0" + kv_cache_flags: "--kv_cache_dtype fp8" + tp_dp_flags: "--enable-dp-attention --enable-tbo" + tp_dp_env: "GPU_MAX_HW_QUEUES=5 ATOM_CPU_AFFINITY=1" + ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + mtp_flags: "--method mtp --num-speculative-tokens" + hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}' MiniMax-M3-MXFP4: - base_flags: "" - mtp_flags: "" - dp_flags: "" + env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0" + kv_cache_flags: "--kv_cache_dtype fp8" + tp_dp_flags: "--enable-dp-attention" + ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" MiniMax-M3-MXFP8: - base_flags: "" - mtp_flags: "" - dp_flags: "" \ No newline at end of file + env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0" + kv_cache_flags: "--kv_cache_dtype fp8" + tp_dp_flags: "--enable-dp-attention" + ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index ccc864030a..ab3c25da22 100755 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -47,7 +47,6 @@ HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" # ATOM server tuning (from reference script defaults) MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" -KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" BLOCK_SIZE="${BLOCK_SIZE:-16}" MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" @@ -78,6 +77,24 @@ if [[ -z "$host_ip" ]]; then fi host_name=$(hostname) +# 
============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +# Load model-specific config from YAML (single parse for all fields) +eval "$(python3 -c " +import yaml +with open('${ATOM_WS_PATH}/models_atom.yaml') as f: + m = yaml.safe_load(f).get('${MODEL_NAME}', {}) +print(f'MODEL_ENVS=\"{m.get(\"env\", \"\")}\"') +print(f'MODEL_TP_DP_FLAGS=\"{m.get(\"tp_dp_flags\", \"\")}\"') +print(f'MODEL_EP_DP_FLAGS=\"{m.get(\"ep_dp_flags\", \"\")}\"') +print(f'MODEL_TP_DP_ENV=\"{m.get(\"tp_dp_env\", \"\")}\"') +print(f'MODEL_EP_DP_ENV=\"{m.get(\"ep_dp_env\", \"\")}\"') +print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') +print(f'MODEL_KV_ARG=\"{m.get(\"kv_cache_flags\", \"\")}\"') +print(f'_HF_OVERRIDES=\"{m.get(\"hf_overrides\", \"\")}\"') +")" + # ============================================================================= # Cluster Topology Configuration # ============================================================================= @@ -114,53 +131,48 @@ DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" # Parallel args PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$PREFILL_ENABLE_DP" = "true" ]; then - if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #TP+DPA+TBO - if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo ) - export GPU_MAX_HW_QUEUES=5 - export ATOM_CPU_AFFINITY=1 - else #TP+DPA - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) - fi + if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #EP+DPA + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done + else #TP+DPA + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_TP_DP_FLAGS}) + for 
_dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done fi -fi +fi -# (srok), split DPA & TBO cases -DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP +DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP if [ "$DECODE_ENABLE_DP" = "true" ]; then - if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #TP+DPA+TBO - if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo ) - export GPU_MAX_HW_QUEUES=5 - export ATOM_CPU_AFFINITY=1 - else #TP+DPA - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) - fi + if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #EP+DPA + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done + else #TP+DPA + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_TP_DP_FLAGS}) + for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done fi -fi - -# MTP args -SPEC_ARGS=() #TP -if [ "$SPEC_DECODING" = "mtp" ]; then - SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") fi +unset _dp_env_pair # HF overrides (single-quoted JSON preserved through eval) HF_OVERRIDES_ARG="" -if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then - HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" +if [[ -n "$_HF_OVERRIDES" ]]; then + HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'" fi +unset _HF_OVERRIDES + +for _env_pair in ${MODEL_ENVS}; do + export "$_env_pair" +done +unset _env_pair -# KV cache dtype (skip if unset or 'auto') -KV_CACHE_ARG="" -if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then - KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}" +# MTP args +SPEC_ARGS=() +if [ "$SPEC_DECODING" = "mtp" ]; then + SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE") fi +# KV cache arg - full flag string from YAML 
+KV_CACHE_ARG="${MODEL_KV_ARG}" + # Optional model length / batched-token cap MODEL_LEN_ARGS="" if [[ -n "$MAX_MODEL_LEN" ]]; then @@ -170,9 +182,6 @@ if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}" fi -if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then - export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -fi cat < Date: Fri, 26 Jun 2026 10:28:57 +0900 Subject: [PATCH 02/15] [AMD] add perf-changelog entry for minimaxm3-fp4-mi355x-atom-disagg and server_atom.sh refactor (PR #1940) Co-Authored-By: Claude Sonnet 4.6 --- perf-changelog.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 6eb6ca61f6..35b796fddb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4304,6 +4304,17 @@ - "Pass --use-chat-template for MTP acceptance and mirror the existing MiniMax-M3 MXFP8 MI355X MTP TP/EP/DP-attention search space at 1k1k and 8k1k." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1939 +- config-keys: + - minimaxm3-fp4-mi355x-atom-disagg + description: + - "Add minimaxm3-fp4-mi355x-atom-disagg CI recipe: multi-node disaggregated PD on MI355X via ATOM for MiniMax-M3-MXFP4" + - "Image: rocm/atom-dev:MiniMax-M3-20260623; model: amd/MiniMax-M3-MXFP4; framework: atom-disagg" + - "Search space: ISL=8192 and ISL=1024, OSL=1024, 1P1D TP4, conc 1-512" + - "Refactor server_atom.sh to eliminate all hardcoded MODEL_NAME checks; all model-specific config (env, parallel flags, MTP flags, KV cache flags, HF overrides) now driven from models_atom.yaml" + - "Add MiniMax-M3-MXFP4 and MiniMax-M3-MXFP8 entries to models_atom.yaml with EAGLE3 MTP flags (--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3)" + - "Fix model HuggingFace path for minimaxm3-fp8-mi355x-atom-disagg: amd/MiniMax-M3-MXFP8 -> MiniMaxAI/MiniMax-M3-MXFP8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1940 + - config-keys: - 
minimaxm3-fp8-mi355x-vllm-mtp description: From 8740b804c24d88f0e2629bc226670b84b36819f3 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 26 Jun 2026 10:34:29 +0900 Subject: [PATCH 03/15] [AMD] add env dump in server_atom.sh and minimaxm3-fp4-mi355x-atom-disagg launch script Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 5 +++++ benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index ab3c25da22..8ef9a22176 100755 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -201,6 +201,11 @@ Opt args : ${HF_OVERRIDES_ARG} ===================== INFO +set -x +echo "::group::Environment Variables" +env +echo "::endgroup::" + # ============================================================================= # Node Role Assignment # diff --git a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh index 505f743195..9b1957fa5f 100644 --- a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh +++ b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh @@ -65,7 +65,6 @@ export SPEC_DECODING="none" export DECODE_MTP_SIZE=0 # Block size 128 -export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-auto}" export BLOCK_SIZE="${BLOCK_SIZE:-128}" export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}" export MAX_MODEL_LEN=32768 From 7c1ef64a4a3775e543861ec25a725349585de345 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 26 Jun 2026 11:38:15 +0900 Subject: [PATCH 04/15] [AMD] fix server_atom.sh YAML loading: safe source, EP string compare, SPEC_DECODING guard - Replace fragile eval "$(python3 -c "...")" with heredoc + source tempfile to avoid nested quote escaping issues that caused MODEL_ENVS to be empty at runtime - Fix PREFILL/DECODE_ENABLE_EP comparison from numeric 
-gt 1 to string = "true" to match the "true"/"false" values set by launch scripts - Fix SPEC_DECODING guard from hardcoded "mtp" to any non-none/non-empty value so EAGLE3 and future methods also activate SPEC_ARGS from models_atom.yaml Co-Authored-By: Claude Sonnet 4.6 --- .../multi_node/amd_utils/server_atom.sh | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 8ef9a22176..303e0d8767 100755 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -81,19 +81,25 @@ host_name=$(hostname) # Model-Specific Configuration from YAML # ============================================================================= # Load model-specific config from YAML (single parse for all fields) -eval "$(python3 -c " +_yaml_tmp=$(mktemp) +python3 << PYEOF > "$_yaml_tmp" import yaml with open('${ATOM_WS_PATH}/models_atom.yaml') as f: m = yaml.safe_load(f).get('${MODEL_NAME}', {}) -print(f'MODEL_ENVS=\"{m.get(\"env\", \"\")}\"') -print(f'MODEL_TP_DP_FLAGS=\"{m.get(\"tp_dp_flags\", \"\")}\"') -print(f'MODEL_EP_DP_FLAGS=\"{m.get(\"ep_dp_flags\", \"\")}\"') -print(f'MODEL_TP_DP_ENV=\"{m.get(\"tp_dp_env\", \"\")}\"') -print(f'MODEL_EP_DP_ENV=\"{m.get(\"ep_dp_env\", \"\")}\"') -print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') -print(f'MODEL_KV_ARG=\"{m.get(\"kv_cache_flags\", \"\")}\"') -print(f'_HF_OVERRIDES=\"{m.get(\"hf_overrides\", \"\")}\"') -")" +def sh(v): return v.replace("'", "'\\\\''")  # 4 backslashes on purpose: the unquoted heredoc halves them before Python runs, and Python halves them again, leaving the close-quote, escaped-quote, reopen-quote sequence +print(f"MODEL_ENVS='{sh(m.get('env', ''))}'") +print(f"MODEL_TP_DP_FLAGS='{sh(m.get('tp_dp_flags', ''))}'") +print(f"MODEL_EP_DP_FLAGS='{sh(m.get('ep_dp_flags', ''))}'") +print(f"MODEL_TP_DP_ENV='{sh(m.get('tp_dp_env', ''))}'") +print(f"MODEL_EP_DP_ENV='{sh(m.get('ep_dp_env', ''))}'") +print(f"MODEL_MTP_FLAGS='{sh(m.get('mtp_flags', ''))}'") +print(f"MODEL_KV_ARG='{sh(m.get('kv_cache_flags', ''))}'") 
+print(f"_HF_OVERRIDES='{sh(m.get('hf_overrides', ''))}'") +PYEOF +# shellcheck source=/dev/null +source "$_yaml_tmp" +rm -f "$_yaml_tmp" +unset _yaml_tmp # ============================================================================= # Cluster Topology Configuration @@ -131,7 +137,7 @@ DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" # Parallel args PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$PREFILL_ENABLE_DP" = "true" ]; then - if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #EP+DPA + if [ "$PREFILL_ENABLE_EP" = "true" ]; then #EP+DPA PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS}) for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done else #TP+DPA @@ -142,7 +148,7 @@ fi DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP if [ "$DECODE_ENABLE_DP" = "true" ]; then - if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #EP+DPA + if [ "$DECODE_ENABLE_EP" = "true" ]; then #EP+DPA DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS}) for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done else #TP+DPA @@ -166,7 +172,7 @@ unset _env_pair # MTP args SPEC_ARGS=() -if [ "$SPEC_DECODING" = "mtp" ]; then +if [[ "$SPEC_DECODING" != "none" && "$SPEC_DECODING" != "" && -n "$MODEL_MTP_FLAGS" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE") fi From 7cd3353d77ab4e902d10a5411e491c384ebbc180 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 26 Jun 2026 12:04:12 +0900 Subject: [PATCH 05/15] [AMD] cap minimaxm3-fp8-mi355x-atom-disagg conc to 256; fix missing newline in models_atom.yaml Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/models_atom.yaml | 2 +- configs/amd-master.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml index d6f12ac065..620aaf6c68 100644 --- a/benchmarks/multi_node/amd_utils/models_atom.yaml +++ 
b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -37,4 +37,4 @@ MiniMax-M3-MXFP8: kv_cache_flags: "--kv_cache_dtype fp8" tp_dp_flags: "--enable-dp-attention" ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" - mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" \ No newline at end of file + mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml index 97619caf72..971fe29733 100644 --- a/configs/amd-master.yaml +++ b/configs/amd-master.yaml @@ -2735,7 +2735,7 @@ minimaxm3-fp8-mi355x-atom-disagg: osl: 1024 search-space: # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 4 @@ -2755,7 +2755,7 @@ minimaxm3-fp8-mi355x-atom-disagg: osl: 1024 search-space: # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 4 From 89611257dd850465bc15e29ab16ecf87eabeb963 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 26 Jun 2026 12:11:29 +0900 Subject: [PATCH 06/15] [AMD] update amd-master.yaml: image bumps, search space tweaks for MiniMax-M3 ATOM recipes Co-Authored-By: Claude Sonnet 4.6 --- configs/amd-master.yaml | 53 +++++++---------------------------------- 1 file changed, 9 insertions(+), 44 deletions(-) diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml index 971fe29733..5327d95116 100644 --- a/configs/amd-master.yaml +++ b/configs/amd-master.yaml @@ -2645,7 +2645,7 @@ minimaxm3-fp4-mi355x-vllm-mtp: # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md # block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe. 
minimaxm3-fp4-mi355x-atom: - image: rocm/atom-dev:MiniMax-M3-20260623 + image: rocm/atom-dev:M3 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x @@ -2657,52 +2657,17 @@ minimaxm3-fp4-mi355x-atom: - isl: 1024 osl: 1024 search-space: + - { tp: 2, conc-start: 128, conc-end: 256 } - { tp: 4, conc-start: 1, conc-end: 256 } - isl: 8192 osl: 1024 search-space: + - { tp: 2, conc-start: 128, conc-end: 256 } - { tp: 4, conc-start: 1, conc-end: 256 } - -minimaxm3-fp4-mi355x-atom-mtp: - image: rocm/atom-dev:MiniMax-M3-20260623 - model: amd/MiniMax-M3-MXFP4 - model-prefix: minimaxm3 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - -minimaxm3-fp8-mi355x-atom: - image: rocm/atom-dev:MiniMax-M3-20260623 - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256 } + - { tp: 8, conc-start: 1, conc-end: 2 } minimaxm3-fp8-mi355x-atom-mtp: - image: rocm/atom-dev:MiniMax-M3-20260623 + image: rocm/atom-dev:MiniMax-M3-20260622 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2735,7 +2700,7 @@ minimaxm3-fp8-mi355x-atom-disagg: osl: 1024 search-space: # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] prefill: num-worker: 1 tp: 4 @@ -2755,7 +2720,7 @@ minimaxm3-fp8-mi355x-atom-disagg: osl: 1024 search-space: # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] prefill: num-worker: 1 
tp: 4 @@ -2786,7 +2751,7 @@ minimaxm3-fp4-mi355x-atom-disagg: osl: 1024 search-space: # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 4 @@ -2806,7 +2771,7 @@ minimaxm3-fp4-mi355x-atom-disagg: osl: 1024 search-space: # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 4 From 48b9946ae2d7ac205f737dc0438558e8ee6845a7 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 26 Jun 2026 12:16:20 +0900 Subject: [PATCH 07/15] [AMD] restore minimaxm3-fp4/fp8-mi355x-atom recipes; bump all ATOM images to 20260623 Co-Authored-By: Claude Sonnet 4.6 --- configs/amd-master.yaml | 44 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml index 5327d95116..86dceedcb9 100644 --- a/configs/amd-master.yaml +++ b/configs/amd-master.yaml @@ -2645,7 +2645,7 @@ minimaxm3-fp4-mi355x-vllm-mtp: # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md # block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe. 
minimaxm3-fp4-mi355x-atom: - image: rocm/atom-dev:M3 + image: rocm/atom-dev:MiniMax-M3-20260623 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x @@ -2666,8 +2666,46 @@ minimaxm3-fp4-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } - { tp: 8, conc-start: 1, conc-end: 2 } +minimaxm3-fp4-mi355x-atom-mtp: + image: rocm/atom-dev:MiniMax-M3-20260623 + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + +minimaxm3-fp8-mi355x-atom: + image: rocm/atom-dev:MiniMax-M3-20260623 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256 } + minimaxm3-fp8-mi355x-atom-mtp: - image: rocm/atom-dev:MiniMax-M3-20260622 + image: rocm/atom-dev:MiniMax-M3-20260623 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2686,7 +2724,7 @@ minimaxm3-fp8-mi355x-atom-mtp: - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } minimaxm3-fp8-mi355x-atom-disagg: - image: rocm/atom-dev:MiniMax-M3-20260622 + image: rocm/atom-dev:MiniMax-M3-20260623 model: amd/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x-disagg From 7f94d30e905105d5ab8c600b32e339b37223bb19 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 26 Jun 2026 12:17:31 +0900 Subject: [PATCH 08/15] [AMD] clean up minimaxm3-fp4-mi355x-atom search space; revert fp8-disagg image to 20260622 Co-Authored-By: Claude Sonnet 4.6 --- configs/amd-master.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 
deletions(-) diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml index 86dceedcb9..cd05d7dc22 100644 --- a/configs/amd-master.yaml +++ b/configs/amd-master.yaml @@ -2657,14 +2657,11 @@ minimaxm3-fp4-mi355x-atom: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 128, conc-end: 256 } - { tp: 4, conc-start: 1, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 128, conc-end: 256 } - { tp: 4, conc-start: 1, conc-end: 256 } - - { tp: 8, conc-start: 1, conc-end: 2 } minimaxm3-fp4-mi355x-atom-mtp: image: rocm/atom-dev:MiniMax-M3-20260623 @@ -2724,7 +2721,7 @@ minimaxm3-fp8-mi355x-atom-mtp: - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } minimaxm3-fp8-mi355x-atom-disagg: - image: rocm/atom-dev:MiniMax-M3-20260623 + image: rocm/atom-dev:MiniMax-M3-20260622 model: amd/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x-disagg From 1aa7acea1699e6ff82c38cd58dc381d59de57b3c Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 3 Jul 2026 13:45:19 +0900 Subject: [PATCH 09/15] [AMD] add amd-master.yaml config Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 3164 +++++++++++++++++++++++++++++++ 1 file changed, 3164 insertions(+) create mode 100644 .github/configs/amd-master.yaml diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml new file mode 100644 index 0000000000..f6166699aa --- /dev/null +++ b/.github/configs/amd-master.yaml @@ -0,0 +1,3164 @@ +dsr1-fp4-mi355x-sglang: + image: lmsysorg/sglang:v0.5.12-rocm700-mi35x + model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + # Agentic-coding sweep commented 
out for this image-bump PR — the + # 10-conc agentic matrix amplifies sweep cost and the bump validation + # only needs the fixed-seq-len throughput shape. Re-enable once the + # bump merges; the next agentic cron PR will pick it up. + # agentic-coding: + # - duration: 1800 + # search-space: + # - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256] } + +dsr1-fp4-mi355x-sglang-mtp: + image: lmsysorg/sglang:v0.5.12-rocm700-mi35x + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + +dsr1-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 + model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + +dsr1-fp4-mi355x-atom-mtp: + image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + runner: mi355x + precision: fp4 + # WIP framework (no customers yet) + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp } + - { tp: 8, 
conc-start: 4, conc-end: 256, spec-decoding: mtp } + +dsr1-fp8-mi300x-sglang: + image: lmsysorg/sglang:v0.5.12-rocm700-mi30x + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi300x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi325x-sglang: + image: lmsysorg/sglang:v0.5.12-rocm700-mi30x + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi355x-sglang: + image: lmsysorg/sglang:v0.5.12-rocm700-mi35x + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 32, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi355x-sglang-mtp: + image: lmsysorg/sglang:v0.5.12-rocm700-mi35x + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + +qwen3.5-bf16-mi355x-sglang: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + runner: mi355x + precision: bf16 + framework: sglang + multinode: false 
+ scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + +qwen3.5-bf16-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + runner: mi355x + precision: bf16 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + +qwen3.5-bf16-mi300x-sglang: + image: lmsysorg/sglang:v0.5.12-rocm720-mi30x + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + runner: mi300x + precision: bf16 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +qwen3.5-bf16-mi325x-sglang: + image: lmsysorg/sglang:v0.5.12-rocm720-mi30x + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + runner: mi325x + precision: bf16 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +qwen3.5-fp8-mi325x-sglang: + image: lmsysorg/sglang:v0.5.12-rocm720-mi30x + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +qwen3.5-fp8-mi355x-sglang: + image: 
lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + +qwen3.5-fp8-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + +# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is +# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. 
+qwen3.5-fp8-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + +qwen3.5-fp8-mi355x-atom: + image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + +qwen3.5-fp8-mi355x-atom-mtp: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + +qwen3.5-fp8-mi355x-sglang-disagg: + image: lmsysorg/sglang-rocm:v0.5.11-rocm700-mi35x-20260511 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + 
search-space: + # Matches qwen3.5-fp8-mi355x-sglang TP8/EP1 low-concurrency sweep + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # 1P+1D TP8/EP1 low-concurrency sweep. + # dp-attn intentionally false (matches the 1k1k row): with + # --enable-dp-attention + --moe-a2a-backend mori, sglang auto-promotes + # moe_ep_size=tp_size=8, but is_deepep_class_backend() excludes MoRI, + # so num_shared_slots stays at the global value (1) and the + # (num_experts - num_shared_slots) % moe_ep_size assertion in + # fused_moe_triton/layer.py fires for Qwen3.5 (512 routed + 1 shared). + # Track upstream sglang for a fix; flip back to dp-attn=true once + # MoRI is added to is_deepep_class_backend() or shared-slot + # accounting is reconciled. 
+ - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + +qwen3.5-fp4-mi355x-sglang: + image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612 + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + +qwen3.5-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + +qwen3.5-fp4-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612 + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } + +qwen3.5-fp4-mi355x-sglang-disagg: + image: 
lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523 + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP8/EP1, dp-attn false; MoRI conn.py overlay via job.slurm. + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + +qwen3.5-fp8-mi300x-sglang: + image: lmsysorg/sglang:v0.5.12-rocm720-mi30x + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi300x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +glm5-fp8-mi355x-sglang: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } + +glm5-fp8-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: mi355x + 
precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + +glm5-fp8-mi355x-sglang-disagg: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: mi355x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P+1D TP8/EP1 CI smoke sweep (aligned with glm5-fp8-mi355x-sglang conc range) + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # 1P+1D TP8/EP1 CI smoke sweep; dp-attn false (NSA / MoRI path) + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + +glm5-fp8-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 
4, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 256 } + +glm5.1-fp4-mi355x-sglang: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 + model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + +# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is +# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. +glm5.1-fp4-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + +glm5.1-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } + +kimik2.5-int4-mi355x-vllm: + image: vllm/vllm-openai-rocm:nightly-b8336c3c7c298e0878f22a7bf70f4e295b2f4e01 + model: moonshotai/Kimi-K2.5 + 
model-prefix: kimik2.5 + runner: mi355x + precision: int4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + +kimik2.5-int4-mi325x-vllm: + image: vllm/vllm-openai-rocm:v0.21.0 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: mi325x + precision: int4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +kimik2.5-int4-mi300x-vllm: + image: vllm/vllm-openai-rocm:v0.21.0 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: mi300x + precision: int4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +kimik2.5-fp4-mi355x-vllm: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + +# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. 
+# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' +kimik2.5-fp4-mi355x-vllm-agentic: + # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin + # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm + # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and + # includes all subsequent ROCm offload work. + image: vllm/vllm-openai-rocm:v0.21.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } + # CPU offload only above the KV cliff. Lower concurrencies fit + # entirely on-GPU, so paying the offload-path overhead there would + # just slow them down without measuring anything new. + - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } + # TP=4 probe: half-node layout doubles per-GPU weight footprint + # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to + # cliff-region concurrencies on both offload modes so we can directly + # compare TP=4 vs TP=8 at the same conc points. 
+ - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } + - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } + +kimik2.5-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + +gptoss-fp4-mi300x-vllm: + image: vllm/vllm-openai-rocm:v0.17.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: mi300x + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 64, conc-end: 256 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 16 } + +gptoss-fp4-mi325x-vllm: + image: vllm/vllm-openai-rocm:v0.22.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: mi325x + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 8 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +gptoss-fp4-mi355x-vllm: + image: 
vllm/vllm-openai-rocm:v0.22.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 4 } + - { tp: 8, conc-start: 4, conc-end: 8 } + +gptoss-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 16, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } + +dsr1-fp8-mi355x-atom: + image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x + precision: fp8 + # WIP framework (no customers yet) + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + +dsr1-fp8-mi355x-atom-mtp: + image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 
256, spec-decoding: mtp } + +dsr1-fp8-mi355x-sglang-disagg: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) + - spec-decoding: "none" + conc-list: [ 1536, 1024, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "none" + conc-list: [ 1024, 
2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + +dsr1-fp8-mi355x-sglang-disagg-mtp: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) + - spec-decoding: "mtp" + conc-list: [ 1536, 1024, 512, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - 
"DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" + +kimik2.5-fp4-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: 
mi355x-disagg + precision: fp4 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + +dsr1-fp4-mi355x-sglang-disagg: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 
1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 
+ ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 2*DEP8 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + +dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1P1D TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + # 1P2D TP4 
+ - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + +dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 1P1D pure TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 32, 64 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 640, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: 
true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + + # 1*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 128 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 64 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 2*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + +dsv4-fp4-mi355x-sglang: + image: lmsysorg/sglang-rocm:v0.5.13.post1-rocm720-mi35x-20260618 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 } + - { tp: 4, dp-attn: true, conc-start: 16, conc-end: 128 } + - { tp: 4, dp-attn: false, conc-start: 1 , conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 } + - { tp: 4, dp-attn: true, 
conc-start: 16, conc-end: 128 } + - { tp: 4, dp-attn: false, conc-start: 1, conc-end: 32 } + + +# MTP variant of dsv4-fp4-mi355x-sglang. Mirrors the base search space and adds +# spec-decoding: mtp, which routes to dsv4_fp4_mi355x_sglang_mtp.sh (EAGLE +# speculative decoding), per sgl-project/sglang#26383 ([AMD][DSV4] DSV4 MTP +# graph + sparse triton attn optimizations, merged to main 2026-05-27). That PR +# fixes the ROCm HIP-radix MTP CUDA-graph bug (the false-EOS symptom in sgl +# #20404) and validates GSM8K 0.950 with MTP on. +# +# #26383 is on sglang `main`, NOT the amd/deepseek_v4 branch the rocm/sgl-dev:*-DSv4 +# builds are cut from (latest da28108 = f96ac98 + build fixes + an unrelated +# MLA-decode refactor, still pre-#26383 -> kv_score crash, run 26723126211). So we +# pin the mainline ROCm nightly, which carries #26383. Mainline omits deep_gemm, +# but the recipe detects that and routes the DSv4 fp8 wo_a / topk paths to their +# torch fallbacks (see dsv4_fp4_mi355x_sglang_mtp.sh). When a -DSv4 image carrying +# #26383 ships, bump to it; the recipe auto-restores the deep_gemm perf path. +dsv4-fp4-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260601 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp } + - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp } + - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp } + +# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm +# nightly image. 
DSv4 base ROCm support (vllm-project/vllm#40871) merged +# on 2026-05-05, so any nightly built after that includes the +# DeepseekV4ForCausalLM model class. +# +# IMPORTANT: pin to a digest-suffixed nightly tag rather than the +# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs +# files keyed on the image string and short-circuits re-import if the +# file already exists, so the floating tag silently keeps a stale build +# even after Docker Hub updates `:nightly`. +# +# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the +# rest); InferenceX classifies this as fp4 — same as the sister sglang +# and atom DSv4 mi355x entries in this file. Image and serving flags follow the +# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp +# executor, triton_unfused MoE (required for the FP4 expert format), +# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, +# gpu-mem-util=0.6. TP8 sweeps conc 4-512; no DEP8 (DP+EP) row is +# currently listed in the search space. +dsv4-fp4-mi355x-vllm: + image: vllm/vllm-openai-rocm:v0.22.0 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 512 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 512 } + +# MTP variant of dsv4-fp4-mi355x-vllm. Mirrors the base recipe's search space +# and adds spec-decoding: mtp, which routes to dsv4_fp4_mi355x_vllm_mtp.sh +# (--speculative-config '{"method":"mtp","num_speculative_tokens":2}'), per +# vllm-project/vllm#43385 (ROCm DeepSeek-V4 MTP, merged 2026-05-24, included in +# v0.22.0). Full conc 4-512 range maps the complete crossover curve: MTP wins +# at low batch (PR perf data: +75% @ conc1, +38% @ conc8) and falls behind STP +# above ~conc32 (-37% @ conc32).
Image reuses the base entry's v0.22.0 ROCm +# build, which already contains the MTP commit. +dsv4-fp4-mi355x-vllm-mtp: + image: vllm/vllm-openai-rocm:v0.22.0 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp } + +dsv4-fp4-mi355x-atom: + image: rocm/atom-dev:nightly_202606161823 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # conc4-64, TP8 + # conc128-512, DPA + # conc1024-2048, DPA TBO + - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 } + - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 } + - isl: 8192 + osl: 1024 + search-space: + # conc4-64, TP8 + # conc128, DPA + # conc256-2048, DPA TBO + - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] } + - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] } + - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 } + +dsv4-fp4-mi355x-atom-mtp: + image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp } + +qwen3.5-bf16-mi325x-sglang-mtp: + image: lmsysorg/sglang:v0.5.12-rocm720-mi30x + model: Qwen/Qwen3.5-397B-A17B + model-prefix: qwen3.5 + runner: mi325x + precision: bf16 + framework: sglang + multinode: false + scenarios: + 
fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + +dsr1-fp8-mi325x-sglang-mtp: + image: lmsysorg/sglang:v0.5.12-rocm700-mi30x + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + +qwen3.5-fp8-mi325x-sglang-mtp: + image: lmsysorg/sglang:v0.5.12-rocm720-mi30x + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + +glm5-fp8-mi325x-sglang: + image: lmsysorg/sglang:v0.5.12-rocm720-mi30x + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +glm5-fp8-mi325x-sglang-mtp: + image: lmsysorg/sglang:v0.5.12-rocm720-mi30x + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: mi325x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, 
spec-decoding: mtp } + +# ============================================================================ +# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). +# Recipes that ALREADY existed on main were intentionally left at main's version +# to preserve main behavior; PR-branch modifications to those recipes are NOT +# brought in here. +# ============================================================================ + +qwen3.5-fp8-mi355x-sglang-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + +dsv4-fp4-mi355x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.21.0 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4] } + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } + +dsr1-fp4-mi355x-sglang-disagg-mtp: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 + model: amd/DeepSeek-R1-0528-MXFP4-v2 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1P1D TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - 
"DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 1P1D pure TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 
256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + # 1*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 128, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # 1*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 64, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # 2*DEP8 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + +# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the +# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the +# image tag, so bumping sglang is just an image tag bump here. Sweeps +# DP-attention on/off and EP=8. + +# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below; +# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - scenarios: replaced fixed-seq-len with agentic-coding. +# Image is identical to the base entry (rocm/sgl-dev DSv4 build). +# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware +# comparability. Offload sweep is none-only (SGLang has no equivalent of +# vLLM's SimpleCPUOffloadConnector path that we exercise on b200). 
+dsv4-fp4-mi355x-sglang-agentic: + image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [16, 32, 64] } + - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } + +# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm +# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged +# on 2026-05-05, so any nightly built after that includes the +# DeepseekV4ForCausalLM model class. +# +# IMPORTANT: pin to a digest-suffixed nightly tag rather than the +# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs +# files keyed on the image string and short-circuits re-import if the +# file already exists, so the floating tag silently keeps a stale build +# even after Docker Hub updates `:nightly`. +# +# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the +# rest); InferenceX classifies this as fp4 — same as the sister sglang +# and atom DSv4 mi355x entries below. Image and serving flags follow the +# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp +# executor, triton_unfused MoE (required for the FP4 expert format), +# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, +# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 +# probe to validate the ROCm DP+EP path. 
+ +dsv4-fp4-mi355x-atom-disagg: + image: rocm/atom-dev:nightly_202606101403 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + # 8k1k: 2P1D DPA+TP8 and 1P1D TP8 + - isl: 8192 + osl: 1024 + search-space: + # 2P1D DPA+TP8 + - conc-list: [ 256, 512, 768, 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + # 1P1D TP8 + - conc-list: [ 4, 8, 16, 32, 64, 128 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 1P1D TP8 + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512, 1024 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + +# MiniMax-M3 MXFP8 MI355X recipe: +# https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5 +# MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
+minimaxm3-fp8-mi355x-vllm: + image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } + - { tp: 4, conc-start: 1, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 } + - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } + - { tp: 4, conc-start: 1, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 } + +# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of +# minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the +# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). No +# attention_backend override is needed — the server runs on TRITON_ATTN, so +# the FlashInfer page-128/MHA limitation that forced FLASH_ATTN on Blackwell +# does not apply here. Search space mirrors the non-MTP entry trimmed at the +# extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp / +# b200-vllm-mtp precedent: spec decode pays off at low/mid concurrency while +# acceptance dilutes in big batches, and the draft weights + draft KV shave +# headroom — tp2-ep2 is dropped since its KV headroom was already thin. 
+minimaxm3-fp8-mi355x-vllm-mtp: + image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } + +# MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config. 
+minimaxm3-fp4-mi355x-vllm-disagg: + image: rocm/vllm-dev:vllm-0.23.1-rocm723-mi35x-mori-0625 + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x-disagg + precision: fp4 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 1P TP4 + 1D TP4 (2 nodes total), conc sweep 1..512 (single job, looped) + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 2P TP4 + 1D TP4 (3 nodes total), conc 128/256/512 (single job, looped) + - spec-decoding: "none" + conc-list: [ 128, 256, 512 ] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" +# MiniMax-M3 MXFP4 MI355X vLLM recipe. The pinned nightly includes upstream +# MiniMax-M3 Quark MXFP4 support (vllm-project/vllm#45794). Use the text-only +# language-model path and mirror the MXFP8 MI355X search space for a direct +# precision comparison. 
+minimaxm3-fp4-mi355x-vllm: + image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } + - { tp: 4, conc-start: 1, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 } + - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } + - { tp: 4, conc-start: 1, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 } + +# EAGLE3 speculative-decoding variant of minimaxm3-fp4-mi355x-vllm. Pair the +# amd/MiniMax-M3-MXFP4 target with Inferact/MiniMax-M3-EAGLE3 and three draft +# tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base +# FP4 sweep at extreme concurrency where speculative decoding loses value. 
+minimaxm3-fp4-mi355x-vllm-mtp: + image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } + +# MiniMax-M3 MXFP4 MI355X atom recipe: +# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md +# block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe. 
+minimaxm3-fp4-mi355x-atom: + image: rocm/atom-dev:MiniMax-M3-20260623 + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256 } + +minimaxm3-fp4-mi355x-atom-mtp: + image: rocm/atom-dev:MiniMax-M3-20260623 + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + +minimaxm3-fp8-mi355x-atom: + image: rocm/atom-dev:MiniMax-M3-20260623 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256 } + +minimaxm3-fp8-mi355x-atom-mtp: + image: rocm/atom-dev:MiniMax-M3-20260623 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + +minimaxm3-fp8-mi355x-atom-disagg: + image: rocm/atom-dev:MiniMax-M3-20260622 + model: amd/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x-disagg + precision: fp8 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + 
search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 1P1D TP4 + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + +minimaxm3-fp4-mi355x-atom-disagg: + image: rocm/atom-dev:MiniMax-M3-20260622 + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x-disagg + precision: fp4 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 1P1D TP4 + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + +# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and +# MI355X serving shape, but retain the default BF16 KV cache because this +# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100 +# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency. 
+minimaxm3-fp8-mi300x-vllm: + image: vllm/vllm-openai-rocm:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi300x + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 } + +# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of +# minimaxm3-fp8-mi300x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the +# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same TP8-only +# search space as the non-MTP MI300X entry (gfx942 192 GB is memory-tight, like +# H100), with the TP8 latency rows started at conc 1 to capture single-request +# latency — matching the H100/MI355X MTP recipes. The pinned ROCm nightly +# includes upstream SupportsEagle3 support for the AMD MiniMax-M3 model. +minimaxm3-fp8-mi300x-vllm-mtp: + image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi300x + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp } + +# MiniMax-M3 MXFP8 MI325X day-zero recipe. Reuse the dedicated ROCm image +# and serving flags validated on MI355X, with the H200 search space: TP4 and +# TP8 latency, TP4/TP8 expert parallelism, and TP8 data-parallel attention. 
+minimaxm3-fp8-mi325x-vllm: + image: vllm/vllm-openai-rocm:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi325x + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 128, conc-end: 256 } + - { tp: 8, conc-start: 1, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 256, conc-end: 512 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 32 } + - { tp: 8, conc-start: 1, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } + +# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of +# minimaxm3-fp8-mi325x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the +# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same H200-style +# search space as the non-MTP MI325X entry, trimmed at the extreme-concurrency +# end with TP-only latency rows started at conc 1 (matching the H200/MI355X MTP +# recipes). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0, +# BF16 KV on gfx942). The shipped ROCm image lacks SupportsEagle3 on the AMD +# MiniMax-M3 model, so the recipe applies that fix in-place at runtime +# (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on +# MI355X/MI300X) before serving. 
+minimaxm3-fp8-mi325x-vllm-mtp: + image: vllm/vllm-openai-rocm:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi325x + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } + +# MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the +# day-zero ROCm image. Minimal 1 prefill (TP8) + 1 decode (TP8) at conc 1 to +# validate the MoRI-IO KV-transfer disagg pipeline end-to-end for M3. Layered on +# the MoRI-patch-removal infra (#1585). No EP (TP8 only); MoE experts are +# TP-sharded as in the single-node M3 TP8 recipe. Per-worker serve flags live in +# benchmarks/multi_node/amd_utils/models_vllm.yaml (MiniMax-M3-MXFP8). 
+minimaxm3-fp8-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:nightly-556bc4e3a089378e9df2482659898192da18db15 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across + # conc 1,2,4,8,16,32,64,128,256. + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024. + - spec-decoding: "none" + conc-list: [ 64, 128, 256, 512, 1024 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2) + # feeding one full-node TP8 decode, at high conc 256,512,768,1024. + - spec-decoding: "none" + conc-list: [ 256, 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 8k1k disagg sweep across four P/D layouts (1P TP8 + 1D TP8 conc 1..1024; + # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024; 2P TP4 + 1D TP8 + # conc 256..1024). 
The multi-node eval policy (8k1k + conc >= 16) marks one + # lm-eval on the highest-max-conc layout (TP8+TP8, eval-conc=median=128) — + # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end. + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across + # conc 1,2,4,8,16,32,64,128,256. + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024. + - spec-decoding: "none" + conc-list: [ 64, 128, 256, 512, 1024 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2) + # feeding one full-node TP8 decode, at high conc 256,512,768,1024. 
+ - spec-decoding: "none" + conc-list: [ 256, 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" From ed5e87461a544b7c592774293062cec36bf494c8 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 3 Jul 2026 13:49:40 +0900 Subject: [PATCH 10/15] [AMD] remove amd-master.yaml config Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 3164 ------------------------------- 1 file changed, 3164 deletions(-) delete mode 100644 .github/configs/amd-master.yaml diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml deleted file mode 100644 index f6166699aa..0000000000 --- a/.github/configs/amd-master.yaml +++ /dev/null @@ -1,3164 +0,0 @@ -dsr1-fp4-mi355x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm700-mi35x - model: amd/DeepSeek-R1-0528-MXFP4-Preview - model-prefix: dsr1 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - # Agentic-coding sweep commented out for this image-bump PR — the - # 10-conc agentic matrix amplifies sweep cost and the bump validation - # only needs the fixed-seq-len throughput shape. Re-enable once the - # bump merges; the next agentic cron PR will pick it up. 
- # agentic-coding: - # - duration: 1800 - # search-space: - # - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256] } - -dsr1-fp4-mi355x-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-rocm700-mi35x - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - -dsr1-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 - model: amd/DeepSeek-R1-0528-MXFP4-Preview - model-prefix: dsr1 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - -dsr1-fp4-mi355x-atom-mtp: - image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 - model: amd/DeepSeek-R1-0528-MXFP4 - model-prefix: dsr1 - runner: mi355x - precision: fp4 - # WIP framework (no customers yet) - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - -dsr1-fp8-mi300x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm700-mi30x - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi300x - precision: fp8 - framework: 
sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-mi325x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm700-mi30x - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi325x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-mi355x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm700-mi35x - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 32, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-rocm700-mi35x - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - -qwen3.5-bf16-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - runner: mi355x - precision: bf16 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - 
-qwen3.5-bf16-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - runner: mi355x - precision: bf16 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - -qwen3.5-bf16-mi300x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm720-mi30x - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - runner: mi300x - precision: bf16 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -qwen3.5-bf16-mi325x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm720-mi30x - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - runner: mi325x - precision: bf16 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -qwen3.5-fp8-mi325x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm720-mi30x - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi325x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - 
search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - -qwen3.5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - -# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. 
-qwen3.5-fp8-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - -qwen3.5-fp8-mi355x-atom: - image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - -qwen3.5-fp8-mi355x-atom-mtp: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - -qwen3.5-fp8-mi355x-sglang-disagg: - image: lmsysorg/sglang-rocm:v0.5.11-rocm700-mi35x-20260511 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x-disagg - precision: fp8 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - 
search-space: - # Matches qwen3.5-fp8-mi355x-sglang TP8/EP1 low-concurrency sweep - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # 1P+1D TP8/EP1 low-concurrency sweep. - # dp-attn intentionally false (matches the 1k1k row): with - # --enable-dp-attention + --moe-a2a-backend mori, sglang auto-promotes - # moe_ep_size=tp_size=8, but is_deepep_class_backend() excludes MoRI, - # so num_shared_slots stays at the global value (1) and the - # (num_experts - num_shared_slots) % moe_ep_size assertion in - # fused_moe_triton/layer.py fires for Qwen3.5 (512 routed + 1 shared). - # Track upstream sglang for a fix; flip back to dp-attn=true once - # MoRI is added to is_deepep_class_backend() or shared-slot - # accounting is reconciled. 
- - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - -qwen3.5-fp4-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612 - model: amd/Qwen3.5-397B-A17B-MXFP4 - model-prefix: qwen3.5 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - -qwen3.5-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: amd/Qwen3.5-397B-A17B-MXFP4 - model-prefix: qwen3.5 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - -qwen3.5-fp4-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612 - model: amd/Qwen3.5-397B-A17B-MXFP4 - model-prefix: qwen3.5 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } - -qwen3.5-fp4-mi355x-sglang-disagg: - image: 
lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523 - model: amd/Qwen3.5-397B-A17B-MXFP4 - model-prefix: qwen3.5 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # 1P1D TP8/EP1, dp-attn false; MoRI conn.py overlay via job.slurm. - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - -qwen3.5-fp8-mi300x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm720-mi30x - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi300x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -glm5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } - -glm5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: mi355x - 
precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - -glm5-fp8-mi355x-sglang-disagg: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: mi355x-disagg - precision: fp8 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # 1P+1D TP8/EP1 CI smoke sweep (aligned with glm5-fp8-mi355x-sglang conc range) - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # 1P+1D TP8/EP1 CI smoke sweep; dp-attn false (NSA / MoRI path) - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - -glm5-fp8-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 
4, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 256 } - -glm5.1-fp4-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 - model: amd/GLM-5.1-MXFP4 - model-prefix: glm5.1 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - -# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. -glm5.1-fp4-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: amd/GLM-5.1-MXFP4 - model-prefix: glm5.1 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - -glm5.1-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: amd/GLM-5.1-MXFP4 - model-prefix: glm5.1 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } - -kimik2.5-int4-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-b8336c3c7c298e0878f22a7bf70f4e295b2f4e01 - model: moonshotai/Kimi-K2.5 - 
model-prefix: kimik2.5 - runner: mi355x - precision: int4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - -kimik2.5-int4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - runner: mi325x - precision: int4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -kimik2.5-int4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - runner: mi300x - precision: int4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -kimik2.5-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - -# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' -kimik2.5-fp4-mi355x-vllm-agentic: - # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin - # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm - # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and - # includes all subsequent ROCm offload work. - image: vllm/vllm-openai-rocm:v0.21.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } - # CPU offload only above the KV cliff. Lower concurrencies fit - # entirely on-GPU, so paying the offload-path overhead there would - # just slow them down without measuring anything new. - - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } - # TP=4 probe: half-node layout doubles per-GPU weight footprint - # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to - # cliff-region concurrencies on both offload modes so we can directly - # compare TP=4 vs TP=8 at the same conc points. 
- - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } - -kimik2.5-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - -gptoss-fp4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.17.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: mi300x - precision: fp4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 256 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 16 } - -gptoss-fp4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.22.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: mi325x - precision: fp4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 8 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -gptoss-fp4-mi355x-vllm: - image: 
vllm/vllm-openai-rocm:v0.22.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 4 } - - { tp: 8, conc-start: 4, conc-end: 8 } - -gptoss-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - -dsr1-fp8-mi355x-atom: - image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi355x - precision: fp8 - # WIP framework (no customers yet) - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - -dsr1-fp8-mi355x-atom-mtp: - image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 
256, spec-decoding: mtp } - -dsr1-fp8-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp8 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1536, 1024, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) - - spec-decoding: "none" - conc-list: [ 1024, 
2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - -dsr1-fp8-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp8 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1536, 1024, 512, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - 
"DECODE_MTP_SIZE=1" - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - - spec-decoding: "mtp" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - - spec-decoding: "mtp" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" - -kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: 
mi355x-disagg - precision: fp4 - framework: vllm-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - -dsr1-fp4-mi355x-sglang-disagg: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 
1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D pure TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 
- ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 2*DEP8 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - -dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # 1P1D TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - # 1P2D TP4 
- - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - -dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 1P1D pure TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 32, 64 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 640, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: 
true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 128 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 64 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 2*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - -dsv4-fp4-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.13.post1-rocm720-mi35x-20260618 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 } - - { tp: 4, dp-attn: true, conc-start: 16, conc-end: 128 } - - { tp: 4, dp-attn: false, conc-start: 1 , conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 } - - { tp: 4, dp-attn: true, 
conc-start: 16, conc-end: 128 } - - { tp: 4, dp-attn: false, conc-start: 1, conc-end: 32 } - - -# MTP variant of dsv4-fp4-mi355x-sglang. Mirrors the base search space and adds -# spec-decoding: mtp, which routes to dsv4_fp4_mi355x_sglang_mtp.sh (EAGLE -# speculative decoding), per sgl-project/sglang#26383 ([AMD][DSV4] DSV4 MTP -# graph + sparse triton attn optimizations, merged to main 2026-05-27). That PR -# fixes the ROCm HIP-radix MTP CUDA-graph bug (the false-EOS symptom in sgl -# #20404) and validates GSM8K 0.950 with MTP on. -# -# #26383 is on sglang `main`, NOT the amd/deepseek_v4 branch the rocm/sgl-dev:*-DSv4 -# builds are cut from (latest da28108 = f96ac98 + build fixes + an unrelated -# MLA-decode refactor, still pre-#26383 -> kv_score crash, run 26723126211). So we -# pin the mainline ROCm nightly, which carries #26383. Mainline omits deep_gemm, -# but the recipe detects that and routes the DSv4 fp8 wo_a / topk paths to their -# torch fallbacks (see dsv4_fp4_mi355x_sglang_mtp.sh). When a -DSv4 image carrying -# #26383 ships, bump to it; the recipe auto-restores the deep_gemm perf path. -dsv4-fp4-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260601 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp } - - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp } - - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp } - -# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm -# nightly image. 
DSv4 base ROCm support (vllm-project/vllm#40871) merged -# on 2026-05-05, so any nightly built after that includes the -# DeepseekV4ForCausalLM model class. -# -# IMPORTANT: pin to a digest-suffixed nightly tag rather than the -# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs -# files keyed on the image string and short-circuits re-import if the -# file already exists, so the floating tag silently keeps a stale build -# even after Docker Hub updates `:nightly`. -# -# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the -# rest); InferenceX classifies this as fp4 — same as the sister sglang -# and atom DSv4 mi355x entries below. Image and serving flags follow the -# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, triton_unfused MoE (required for the FP4 expert format), -# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 -# probe to validate the ROCm DP+EP path. -dsv4-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.22.0 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 512 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 512 } - -# MTP variant of dsv4-fp4-mi355x-vllm. Mirrors the base recipe's search space -# and adds spec-decoding: mtp, which routes to dsv4_fp4_mi355x_vllm_mtp.sh -# (--speculative-config '{"method":"mtp","num_speculative_tokens":2}'), per -# vllm-project/vllm#43385 (ROCm DeepSeek-V4 MTP, merged 2026-05-24, included in -# v0.22.0). Full conc 4-512 range maps the complete crossover curve: MTP wins -# at low batch (PR perf data: +75% @ conc1, +38% @ conc8) and falls behind STP -# above ~conc32 (-37% @ conc32). 
Image reuses the base entry's v0.22.0 ROCm -# build, which already contains the MTP commit. -dsv4-fp4-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:v0.22.0 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp } - -dsv4-fp4-mi355x-atom: - image: rocm/atom-dev:nightly_202606161823 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # conc4-64, TP8 - # conc128-512, DPA - # conc1024-2048, DPA TBO - - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 } - - isl: 8192 - osl: 1024 - search-space: - # conc4-64, TP8 - # conc128, DPA - # conc256-2048, DPA TBO - - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] } - - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] } - - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 } - -dsv4-fp4-mi355x-atom-mtp: - image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp } - -qwen3.5-bf16-mi325x-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-rocm720-mi30x - model: Qwen/Qwen3.5-397B-A17B - model-prefix: qwen3.5 - runner: mi325x - precision: bf16 - framework: sglang - multinode: false - scenarios: - 
fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - -dsr1-fp8-mi325x-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-rocm700-mi30x - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: mi325x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - -qwen3.5-fp8-mi325x-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-rocm720-mi30x - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi325x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - -glm5-fp8-mi325x-sglang: - image: lmsysorg/sglang:v0.5.12-rocm720-mi30x - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: mi325x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -glm5-fp8-mi325x-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-rocm720-mi30x - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: mi325x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, 
spec-decoding: mtp } - -# ============================================================================ -# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). -# Recipes that ALREADY existed on main were intentionally left at main's version -# to preserve main behavior; PR-branch modifications to those recipes are NOT -# brought in here. -# ============================================================================ - -qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } - -dsv4-fp4-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.21.0 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4] } - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } - -dsr1-fp4-mi355x-sglang-disagg-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 - model: amd/DeepSeek-R1-0528-MXFP4-v2 - model-prefix: dsr1 - runner: mi355x-disagg - precision: fp4 - framework: sglang-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # 1P1D TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - 
"DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 1P1D pure TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 
256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 128, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 1*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 64, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # 2*DEP8 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - -# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the -# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the -# image tag, so bumping sglang is just an image tag bump here. Sweeps -# DP-attention on/off and EP=8. - -# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below; -# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding. -# Image is identical to the base entry (rocm/sgl-dev DSv4 build). -# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware -# comparability. Offload sweep is none-only (SGLang has no equivalent of -# vLLM's SimpleCPUOffloadConnector path that we exercise on b200). 
-dsv4-fp4-mi355x-sglang-agentic: - image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [16, 32, 64] } - - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } - -# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm -# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged -# on 2026-05-05, so any nightly built after that includes the -# DeepseekV4ForCausalLM model class. -# -# IMPORTANT: pin to a digest-suffixed nightly tag rather than the -# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs -# files keyed on the image string and short-circuits re-import if the -# file already exists, so the floating tag silently keeps a stale build -# even after Docker Hub updates `:nightly`. -# -# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the -# rest); InferenceX classifies this as fp4 — same as the sister sglang -# and atom DSv4 mi355x entries below. Image and serving flags follow the -# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, triton_unfused MoE (required for the FP4 expert format), -# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 -# probe to validate the ROCm DP+EP path. 
- -dsv4-fp4-mi355x-atom-disagg: - image: rocm/atom-dev:nightly_202606101403 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: atom-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - # 1P1D DPA+TP8 - - isl: 8192 - osl: 1024 - search-space: - # 2P1D DPA+TP8 - - conc-list: [ 256, 512, 768, 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - # 1P1D TP8 - - conc-list: [ 4, 8, 16, 32, 64, 128 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # 1P1D TP8 - - isl: 1024 - osl: 1024 - search-space: - - conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512, 1024 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - -# MiniMax-M3 MXFP8 MI355X recipe: -# https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5 -# MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA. 
-minimaxm3-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi355x - precision: fp8 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } - - { tp: 4, conc-start: 1, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 } - - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } - - { tp: 4, conc-start: 1, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 } - -# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of -# minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the -# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). No -# attention_backend override is needed — the server runs on TRITON_ATTN, so -# the FlashInfer page-128/MHA limitation that forced FLASH_ATTN on Blackwell -# does not apply here. Search space mirrors the non-MTP entry trimmed at the -# extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp / -# b200-vllm-mtp precedent: spec decode pays off at low/mid concurrency while -# acceptance dilutes in big batches, and the draft weights + draft KV shave -# headroom — tp2-ep2 is dropped since its KV headroom was already thin. 
-minimaxm3-fp8-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi355x - precision: fp8 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - -# MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config. 
-minimaxm3-fp4-mi355x-vllm-disagg: - image: rocm/vllm-dev:vllm-0.23.1-rocm723-mi35x-mori-0625 - model: amd/MiniMax-M3-MXFP4 - model-prefix: minimaxm3 - runner: mi355x-disagg - precision: fp4 - framework: vllm-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 8192 - osl: 1024 - search-space: - # 1P TP4 + 1D TP4 (2 nodes total), conc sweep 1..512 (single job, looped) - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # 2P TP4 + 1D TP4 (3 nodes total), conc 128/256/512 (single job, looped) - - spec-decoding: "none" - conc-list: [ 128, 256, 512 ] - prefill: - num-worker: 2 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" -# MiniMax-M3 MXFP4 MI355X vLLM recipe. The pinned nightly includes upstream -# MiniMax-M3 Quark MXFP4 support (vllm-project/vllm#45794). Use the text-only -# language-model path and mirror the MXFP8 MI355X search space for a direct -# precision comparison. 
-minimaxm3-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e - model: amd/MiniMax-M3-MXFP4 - model-prefix: minimaxm3 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } - - { tp: 4, conc-start: 1, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 } - - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 } - - { tp: 4, conc-start: 1, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 } - -# EAGLE3 speculative-decoding variant of minimaxm3-fp4-mi355x-vllm. Pair the -# amd/MiniMax-M3-MXFP4 target with Inferact/MiniMax-M3-EAGLE3 and three draft -# tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base -# FP4 sweep at extreme concurrency where speculative decoding loses value. 
-minimaxm3-fp4-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e - model: amd/MiniMax-M3-MXFP4 - model-prefix: minimaxm3 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - -# MiniMax-M3 MXFP4 MI355X atom recipe: -# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md -# block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe. 
-minimaxm3-fp4-mi355x-atom: - image: rocm/atom-dev:MiniMax-M3-20260623 - model: amd/MiniMax-M3-MXFP4 - model-prefix: minimaxm3 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256 } - -minimaxm3-fp4-mi355x-atom-mtp: - image: rocm/atom-dev:MiniMax-M3-20260623 - model: amd/MiniMax-M3-MXFP4 - model-prefix: minimaxm3 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - -minimaxm3-fp8-mi355x-atom: - image: rocm/atom-dev:MiniMax-M3-20260623 - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256 } - -minimaxm3-fp8-mi355x-atom-mtp: - image: rocm/atom-dev:MiniMax-M3-20260623 - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - -minimaxm3-fp8-mi355x-atom-disagg: - image: rocm/atom-dev:MiniMax-M3-20260622 - model: amd/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi355x-disagg - precision: fp8 - framework: atom-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 8192 - osl: 1024 - 
search-space: - # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # 1P1D TP4 - - isl: 1024 - osl: 1024 - search-space: - # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - -minimaxm3-fp4-mi355x-atom-disagg: - image: rocm/atom-dev:MiniMax-M3-20260622 - model: amd/MiniMax-M3-MXFP4 - model-prefix: minimaxm3 - runner: mi355x-disagg - precision: fp4 - framework: atom-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 8192 - osl: 1024 - search-space: - # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # 1P1D TP4 - - isl: 1024 - osl: 1024 - search-space: - # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - -# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and -# MI355X serving shape, but retain the default BF16 KV cache because this -# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100 -# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency. 
-minimaxm3-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:minimax-m3 - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi300x - precision: fp8 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 } - -# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of -# minimaxm3-fp8-mi300x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the -# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same TP8-only -# search space as the non-MTP MI300X entry (gfx942 192 GB is memory-tight, like -# H100), with the TP8 latency rows started at conc 1 to capture single-request -# latency — matching the H100/MI355X MTP recipes. The pinned ROCm nightly -# includes upstream SupportsEagle3 support for the AMD MiniMax-M3 model. -minimaxm3-fp8-mi300x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi300x - precision: fp8 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp } - -# MiniMax-M3 MXFP8 MI325X day-zero recipe. Reuse the dedicated ROCm image -# and serving flags validated on MI355X, with the H200 search space: TP4 and -# TP8 latency, TP4/TP8 expert parallelism, and TP8 data-parallel attention. 
-minimaxm3-fp8-mi325x-vllm: - image: vllm/vllm-openai-rocm:minimax-m3 - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi325x - precision: fp8 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 128, conc-end: 256 } - - { tp: 8, conc-start: 1, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 256, conc-end: 512 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 32 } - - { tp: 8, conc-start: 1, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } - -# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of -# minimaxm3-fp8-mi325x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the -# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same H200-style -# search space as the non-MTP MI325X entry, trimmed at the extreme-concurrency -# end with TP-only latency rows started at conc 1 (matching the H200/MI355X MTP -# recipes). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0, -# BF16 KV on gfx942). The shipped ROCm image lacks SupportsEagle3 on the AMD -# MiniMax-M3 model, so the recipe applies that fix in-place at runtime -# (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on -# MI355X/MI300X) before serving. 
-minimaxm3-fp8-mi325x-vllm-mtp: - image: vllm/vllm-openai-rocm:minimax-m3 - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi325x - precision: fp8 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp } - - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - -# MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the -# day-zero ROCm image. Minimal 1 prefill (TP8) + 1 decode (TP8) at conc 1 to -# validate the MoRI-IO KV-transfer disagg pipeline end-to-end for M3. Layered on -# the MoRI-patch-removal infra (#1585). No EP (TP8 only); MoE experts are -# TP-sharded as in the single-node M3 TP8 recipe. Per-worker serve flags live in -# benchmarks/multi_node/amd_utils/models_vllm.yaml (MiniMax-M3-MXFP8). 
-minimaxm3-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-556bc4e3a089378e9df2482659898192da18db15 - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi355x-disagg - precision: fp8 - framework: vllm-disagg - multinode: true - disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across - # conc 1,2,4,8,16,32,64,128,256. - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024. - - spec-decoding: "none" - conc-list: [ 64, 128, 256, 512, 1024 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2) - # feeding one full-node TP8 decode, at high conc 256,512,768,1024. - - spec-decoding: "none" - conc-list: [ 256, 512, 768, 1024 ] - prefill: - num-worker: 2 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # 8k1k disagg sweep across four P/D layouts (1P TP8 + 1D TP8 conc 1..1024; - # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024; 2P TP4 + 1D TP8 - # conc 256..1024). 
The multi-node eval policy (8k1k + conc >= 16) marks one - # lm-eval on the highest-max-conc layout (TP8+TP8, eval-conc=median=128) — - # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end. - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across - # conc 1,2,4,8,16,32,64,128,256. - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024. - - spec-decoding: "none" - conc-list: [ 64, 128, 256, 512, 1024 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2) - # feeding one full-node TP8 decode, at high conc 256,512,768,1024. 
- - spec-decoding: "none" - conc-list: [ 256, 512, 768, 1024 ] - prefill: - num-worker: 2 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" From 8b6141c4076f737788ab694b88b11da1fbb1f625 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 3 Jul 2026 15:48:09 +0900 Subject: [PATCH 11/15] [AMD] refactor ATOM disagg config: split per-role flags, move model defaults to YAML - Split MODEL_TP_DP_FLAGS and MODEL_EP_DP_FLAGS into prefill/decode variants - Move BLOCK_SIZE, MEM_FRAC_STATIC, MAX_MODEL_LEN, MAX_NUM_SEQS, MAX_NUM_BATCHED_TOKENS from launch scripts into models_atom.yaml - Add hf_overrides and online_quant_config (with DPA variant) to YAML - Remove SPEC_DECODING gate; use MODEL_MTP_FLAGS + DECODE_MTP_SIZE > 0 - Add minimaxm3-fp4/fp8-mi355x-atom-disagg-mtp recipes to amd-master.yaml Co-Authored-By: Claude Opus 4.6 --- .../multi_node/amd_utils/models_atom.yaml | 47 ++++- .../multi_node/amd_utils/server_atom.sh | 81 +++++--- .../minimaxm3_fp4_mi355x_atom-disagg.sh | 15 -- .../minimaxm3_fp8_mi355x_atom-disagg.sh | 16 -- configs/amd-master.yaml | 188 +++++++++++++++++- 5 files changed, 282 insertions(+), 65 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml index 620aaf6c68..1c20e8aef8 100644 --- a/benchmarks/multi_node/amd_utils/models_atom.yaml +++ b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -9,32 +9,71 @@ # : # env: str # Space-separated KEY=VALUE pairs exported unconditionally # hf_overrides: str # JSON string passed to --hf-overrides -# tp_dp_flags: str # Parallel flags for TP+DPA case (must include --enable-dp-attention) +# tp_dp_flags: str # Shared TP+DPA flags (fallback when prefill/decode-specific keys are absent) +# prefill_tp_dp_flags: str # TP+DPA flags for prefill only (overrides tp_dp_flags) +# decode_tp_dp_flags: str # TP+DPA flags for decode 
only (overrides tp_dp_flags) # tp_dp_env: str # Space-separated KEY=VALUE pairs exported only in TP+DPA mode -# ep_dp_flags: str # Parallel flags for EP+DPA case (must include --enable-expert-parallel --enable-dp-attention) +# ep_dp_flags: str # Shared EP+DPA flags (fallback when prefill/decode-specific keys are absent) +# prefill_ep_dp_flags: str # EP+DPA flags for prefill only (overrides ep_dp_flags) +# decode_ep_dp_flags: str # EP+DPA flags for decode only (overrides ep_dp_flags) # ep_dp_env: str # Space-separated KEY=VALUE pairs exported only in EP+DPA mode # mtp_flags: str # Flags passed to SPEC_ARGS before $DECODE_MTP_SIZE (e.g. "--method mtp --num-speculative-tokens") # kv_cache_flags: str # Full --kv_cache_dtype flag string (e.g. "--kv_cache_dtype fp8", or "" for none) +# online_quant_config: str # JSON string passed to --online_quant_config (used when DPA is disabled) +# online_quant_dpa_config: str # JSON string passed to --online_quant_config when DPA is enabled (falls back to online_quant_config) +# block_size: str # --block-size value (overrides server_atom.sh default of 16) +# mem_frac_static: str # --gpu-memory-utilization value (overrides default of 0.85) +# max_model_len: str # --max-model-len value (overrides default of unset) +# max_num_seqs: str # --max-num-seqs value (overrides default of 256) +# max_num_batched_tokens: str # --max-num-batched-tokens value (overrides default of unset) DeepSeek-V4-Pro: env: "ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0" kv_cache_flags: "--kv_cache_dtype fp8" tp_dp_flags: "--enable-dp-attention --enable-tbo" + prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo" + decode_tp_dp_flags: "--enable-dp-attention --enable-tbo" tp_dp_env: "GPU_MAX_HW_QUEUES=5 ATOM_CPU_AFFINITY=1" ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" mtp_flags: "--method mtp 
--num-speculative-tokens" hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}' MiniMax-M3-MXFP4: - env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0" + env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1" kv_cache_flags: "--kv_cache_dtype fp8" tp_dp_flags: "--enable-dp-attention" + prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo prefill" + decode_tp_dp_flags: "--enable-dp-attention" ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" + hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}' + online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}' + online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}' + block_size: "128" + mem_frac_static: "0.8" + max_model_len: "32768" + max_num_seqs: "256" + max_num_batched_tokens: "32768" MiniMax-M3-MXFP8: - env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0" + env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1" kv_cache_flags: "--kv_cache_dtype fp8" tp_dp_flags: "--enable-dp-attention" + prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo prefill" + decode_tp_dp_flags: "--enable-dp-attention" ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" + decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" mtp_flags: "--method eagle3 --draft-model 
Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" + hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}' + online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}' + online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*.gate.*","*.block_sparse_moe.experts*"]}' + block_size: "128" + mem_frac_static: "0.8" + max_model_len: "32768" + max_num_seqs: "256" + max_num_batched_tokens: "32768" diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 303e0d8767..ea79a8d663 100755 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -36,8 +36,7 @@ DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" # MTP -SPEC_DECODING="${SPEC_DECODING:-}" -DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" # ATOM server ports (different from SGLang which uses 8000 for all) PREFILL_PORT="${PREFILL_PORT:-8010}" @@ -88,19 +87,48 @@ with open('${ATOM_WS_PATH}/models_atom.yaml') as f: m = yaml.safe_load(f).get('${MODEL_NAME}', {}) def sh(v): return v.replace("'", "'\\''") print(f"MODEL_ENVS='{sh(m.get('env', ''))}'") -print(f"MODEL_TP_DP_FLAGS='{sh(m.get('tp_dp_flags', ''))}'") -print(f"MODEL_EP_DP_FLAGS='{sh(m.get('ep_dp_flags', ''))}'") +_tp_dp = m.get('tp_dp_flags', '') +print(f"PREFILL_MODEL_TP_DP_FLAGS='{sh(m.get('prefill_tp_dp_flags', _tp_dp))}'") +print(f"DECODE_MODEL_TP_DP_FLAGS='{sh(m.get('decode_tp_dp_flags', _tp_dp))}'") +_ep_dp = m.get('ep_dp_flags', '') +print(f"PREFILL_MODEL_EP_DP_FLAGS='{sh(m.get('prefill_ep_dp_flags', _ep_dp))}'") +print(f"DECODE_MODEL_EP_DP_FLAGS='{sh(m.get('decode_ep_dp_flags', _ep_dp))}'") print(f"MODEL_TP_DP_ENV='{sh(m.get('tp_dp_env', ''))}'") 
print(f"MODEL_EP_DP_ENV='{sh(m.get('ep_dp_env', ''))}'") print(f"MODEL_MTP_FLAGS='{sh(m.get('mtp_flags', ''))}'") print(f"MODEL_KV_ARG='{sh(m.get('kv_cache_flags', ''))}'") print(f"_HF_OVERRIDES='{sh(m.get('hf_overrides', ''))}'") +print(f"_ONLINE_QUANT_CONFIG='{sh(m.get('online_quant_config', ''))}'") +print(f"_ONLINE_QUANT_DPA_CONFIG='{sh(m.get('online_quant_dpa_config', m.get('online_quant_config', '')))}'") +print(f"_YAML_BLOCK_SIZE='{sh(m.get('block_size', ''))}'") +print(f"_YAML_MEM_FRAC_STATIC='{sh(m.get('mem_frac_static', ''))}'") +print(f"_YAML_MAX_MODEL_LEN='{sh(m.get('max_model_len', ''))}'") +print(f"_YAML_MAX_NUM_SEQS='{sh(m.get('max_num_seqs', ''))}'") +print(f"_YAML_MAX_NUM_BATCHED_TOKENS='{sh(m.get('max_num_batched_tokens', ''))}'") PYEOF # shellcheck source=/dev/null source "$_yaml_tmp" rm -f "$_yaml_tmp" unset _yaml_tmp +# Apply YAML server-tuning defaults (env vars take precedence) +if [[ -n "$_YAML_BLOCK_SIZE" ]]; then + BLOCK_SIZE="${BLOCK_SIZE:-$_YAML_BLOCK_SIZE}" +fi +if [[ -n "$_YAML_MEM_FRAC_STATIC" ]]; then + MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-$_YAML_MEM_FRAC_STATIC}" +fi +if [[ -n "$_YAML_MAX_MODEL_LEN" ]]; then + MAX_MODEL_LEN="${MAX_MODEL_LEN:-$_YAML_MAX_MODEL_LEN}" +fi +if [[ -n "$_YAML_MAX_NUM_SEQS" ]]; then + MAX_NUM_SEQS="${MAX_NUM_SEQS:-$_YAML_MAX_NUM_SEQS}" +fi +if [[ -n "$_YAML_MAX_NUM_BATCHED_TOKENS" ]]; then + MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-$_YAML_MAX_NUM_BATCHED_TOKENS}" +fi +unset _YAML_BLOCK_SIZE _YAML_MEM_FRAC_STATIC _YAML_MAX_MODEL_LEN _YAML_MAX_NUM_SEQS _YAML_MAX_NUM_BATCHED_TOKENS + # ============================================================================= # Cluster Topology Configuration # ============================================================================= @@ -134,29 +162,41 @@ PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" + + + # Parallel args PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP 
+ONLINE_QUANT_ARG="" if [ "$PREFILL_ENABLE_DP" = "true" ]; then if [ "$PREFILL_ENABLE_EP" = "true" ]; then #EP+DPA - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS}) + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${PREFILL_MODEL_EP_DP_FLAGS}) for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done else #TP+DPA - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_TP_DP_FLAGS}) + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${PREFILL_MODEL_TP_DP_FLAGS}) for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done fi + if [[ -n "$_ONLINE_QUANT_DPA_CONFIG" ]]; then + ONLINE_QUANT_ARG="--online_quant_config '${_ONLINE_QUANT_DPA_CONFIG}'" + fi +else + if [[ -n "$_ONLINE_QUANT_CONFIG" ]]; then + ONLINE_QUANT_ARG="--online_quant_config '${_ONLINE_QUANT_CONFIG}'" + fi fi DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP if [ "$DECODE_ENABLE_DP" = "true" ]; then if [ "$DECODE_ENABLE_EP" = "true" ]; then #EP+DPA - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS}) + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${DECODE_MODEL_EP_DP_FLAGS}) for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done else #TP+DPA - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_TP_DP_FLAGS}) + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${DECODE_MODEL_TP_DP_FLAGS}) for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done fi fi unset _dp_env_pair +unset _ONLINE_QUANT_CONFIG _ONLINE_QUANT_DPA_CONFIG # HF overrides (single-quoted JSON preserved through eval) HF_OVERRIDES_ARG="" @@ -172,7 +212,7 @@ unset _env_pair # MTP args SPEC_ARGS=() -if [[ "$SPEC_DECODING" != "none" && "$SPEC_DECODING" != "" && -n "$MODEL_MTP_FLAGS" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then +if [[ -n "$MODEL_MTP_FLAGS" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE") fi @@ -203,7 +243,7 @@ Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NU Prefill args : 
${PREFILL_PARALLEL_ARGS[*]} Decode args : ${DECODE_PARALLEL_ARGS[*]} Spec args : ${SPEC_ARGS[*]} -Opt args : ${HF_OVERRIDES_ARG} +Opt args : ${HF_OVERRIDES_ARG} ${ONLINE_QUANT_ARG} ===================== INFO @@ -244,6 +284,7 @@ if [ "$NODE_RANK" -eq 0 ]; then ${MODEL_LEN_ARGS} \ --no-enable_prefix_caching \ ${HF_OVERRIDES_ARG} \ + ${ONLINE_QUANT_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ ${EXTRA_SERVER_ARGS}" @@ -334,7 +375,7 @@ if [ "$NODE_RANK" -eq 0 ]; then cd $ATOM_WS_PATH export IS_MTP="false" - if [ "$SPEC_DECODING" = "mtp" ]; then + if [[ -n "$MODEL_MTP_FLAGS" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then export IS_MTP="true" fi @@ -466,6 +507,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then ${MODEL_LEN_ARGS} \ --no-enable_prefix_caching \ ${HF_OVERRIDES_ARG} \ + ${ONLINE_QUANT_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ ${EXTRA_SERVER_ARGS}" @@ -522,21 +564,9 @@ else echo "${host_name}:${host_ip} is Decode Node (rank ${RANK})" _MAX_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) - if [[ "$_MAX_CONC" -gt 2048 ]]; then - CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048,4096]' - elif [[ "$_MAX_CONC" -gt 1024 ]]; then - CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048]' - elif [[ "$_MAX_CONC" -gt 512 ]]; then - 
CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,768,1024]' - else - CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512]' - fi + CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]' - if [[ "$BENCH_INPUT_LEN" == "1024" && "$BENCH_OUTPUT_LEN" == "1024" ]]; then - DECODE_MAX_NUM_SEQS="${_MAX_CONC}" - else - DECODE_MAX_NUM_SEQS="${MAX_NUM_SEQS}" - fi + DECODE_MAX_NUM_SEQS="${_MAX_CONC}" DECODE_CMD="python3 -m atom.entrypoints.openai_server \ --model ${MODEL_DIR}/${MODEL_NAME} \ @@ -551,6 +581,7 @@ else ${MODEL_LEN_ARGS} \ --no-enable_prefix_caching \ ${HF_OVERRIDES_ARG} \ + ${ONLINE_QUANT_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \ ${EXTRA_SERVER_ARGS}" diff --git a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh index 9b1957fa5f..1505b905de 100644 --- a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh +++ b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh @@ -60,21 +60,6 @@ else export DECODE_ENABLE_DP=false fi -# No MTP for MiniMax-M3 -export SPEC_DECODING="none" -export DECODE_MTP_SIZE=0 - -# Block size 128 -export BLOCK_SIZE="${BLOCK_SIZE:-128}" -export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}" -export 
MAX_MODEL_LEN=32768 -export MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}" -export MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}" - -# Launch jobs based on ISL/OSL -# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented -# by a list of numbers delimited by 'x'. This is because of how the underlying launch script -# expects the concurrencies. JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ diff --git a/benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh b/benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh index 505f743195..1505b905de 100644 --- a/benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh +++ b/benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh @@ -60,22 +60,6 @@ else export DECODE_ENABLE_DP=false fi -# No MTP for MiniMax-M3 -export SPEC_DECODING="none" -export DECODE_MTP_SIZE=0 - -# Block size 128 -export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-auto}" -export BLOCK_SIZE="${BLOCK_SIZE:-128}" -export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}" -export MAX_MODEL_LEN=32768 -export MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}" -export MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}" - -# Launch jobs based on ISL/OSL -# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented -# by a list of numbers delimited by 'x'. This is because of how the underlying launch script -# expects the concurrencies. 
JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml index cd05d7dc22..462c477421 100644 --- a/configs/amd-master.yaml +++ b/configs/amd-master.yaml @@ -2721,8 +2721,8 @@ minimaxm3-fp8-mi355x-atom-mtp: - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } minimaxm3-fp8-mi355x-atom-disagg: - image: rocm/atom-dev:MiniMax-M3-20260622 - model: amd/MiniMax-M3-MXFP8 + image: rocm/atom-dev:nightly_202607011530 + model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x-disagg precision: fp8 @@ -2735,7 +2735,7 @@ minimaxm3-fp8-mi355x-atom-disagg: osl: 1024 search-space: # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 4 @@ -2750,12 +2750,100 @@ minimaxm3-fp8-mi355x-atom-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + # 2P1D, DPA TP4 + - conc-list: [ 256, 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" # 1P1D TP4 - isl: 1024 osl: 1024 search-space: # 1P1D TP4 - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + +minimaxm3-fp8-mi355x-atom-disagg-mtp: + image: rocm/atom-dev:nightly_202607011530 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x-disagg + precision: fp8 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 1P1D TP4 + - 
conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + # 2P1D, DPA TP4 + - conc-list: [ 256, 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + # 1P1D TP4 + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 4 @@ -2770,9 +2858,10 @@ minimaxm3-fp8-mi355x-atom-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" minimaxm3-fp4-mi355x-atom-disagg: - image: rocm/atom-dev:MiniMax-M3-20260623 + image: rocm/atom-dev:nightly_202607011530 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x-disagg @@ -2801,6 +2890,24 @@ minimaxm3-fp4-mi355x-atom-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + # 2P1D, DPA TP4 + - conc-list: [ 256, 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" # 1P1D TP4 - isl: 1024 osl: 1024 @@ -2821,6 +2928,77 @@ minimaxm3-fp4-mi355x-atom-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + +minimaxm3-fp4-mi355x-atom-disagg-mtp: + image: rocm/atom-dev:nightly_202607011530 + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x-disagg + precision: fp4 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 
32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + # 2P1D, DPA TP4 + - conc-list: [ 256, 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + # 1P1D TP4 + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and # MI355X serving shape, but retain the default BF16 KV cache because this From a6ef155cfb2b39bc2d7e7bd4e8a2e0dc13a742c8 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 3 Jul 2026 16:06:45 +0900 Subject: [PATCH 12/15] [AMD] fix YAML server-tuning defaults never taking effect Shell defaults (BLOCK_SIZE=16, MEM_FRAC_STATIC=0.85) were set before YAML loading, so the YAML values (128, 0.8) were never substituted. Use three-tier fallback: env var > YAML > shell default. 
Co-Authored-By: Claude Opus 4.6 --- .../multi_node/amd_utils/models_atom.yaml | 2 +- .../multi_node/amd_utils/server_atom.sh | 29 +++++-------------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml index 1c20e8aef8..6e4f59f778 100644 --- a/benchmarks/multi_node/amd_utils/models_atom.yaml +++ b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -44,7 +44,7 @@ MiniMax-M3-MXFP4: env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1" kv_cache_flags: "--kv_cache_dtype fp8" tp_dp_flags: "--enable-dp-attention" - prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo prefill" + prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo" decode_tp_dp_flags: "--enable-dp-attention" ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index ea79a8d663..e2f90c6ceb 100755 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -44,12 +44,7 @@ DECODE_PORT="${DECODE_PORT:-8020}" ROUTER_PORT="${ROUTER_PORT:-8000}" HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" -# ATOM server tuning (from reference script defaults) -MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" -BLOCK_SIZE="${BLOCK_SIZE:-16}" -MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" -MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" -MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-}" +# ATOM server tuning — defaults applied after YAML load (env var > YAML > shell default) EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:-}" # Benchmark Configuration @@ -111,22 +106,12 @@ source "$_yaml_tmp" rm -f "$_yaml_tmp" unset _yaml_tmp -# Apply YAML server-tuning defaults (env vars take precedence) -if [[ -n "$_YAML_BLOCK_SIZE" ]]; then - BLOCK_SIZE="${BLOCK_SIZE:-$_YAML_BLOCK_SIZE}" -fi -if [[ -n 
"$_YAML_MEM_FRAC_STATIC" ]]; then - MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-$_YAML_MEM_FRAC_STATIC}" -fi -if [[ -n "$_YAML_MAX_MODEL_LEN" ]]; then - MAX_MODEL_LEN="${MAX_MODEL_LEN:-$_YAML_MAX_MODEL_LEN}" -fi -if [[ -n "$_YAML_MAX_NUM_SEQS" ]]; then - MAX_NUM_SEQS="${MAX_NUM_SEQS:-$_YAML_MAX_NUM_SEQS}" -fi -if [[ -n "$_YAML_MAX_NUM_BATCHED_TOKENS" ]]; then - MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-$_YAML_MAX_NUM_BATCHED_TOKENS}" -fi +# Apply server-tuning: env var > YAML > shell default +BLOCK_SIZE="${BLOCK_SIZE:-${_YAML_BLOCK_SIZE:-16}}" +MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-${_YAML_MEM_FRAC_STATIC:-0.85}}" +MAX_MODEL_LEN="${MAX_MODEL_LEN:-${_YAML_MAX_MODEL_LEN:-}}" +MAX_NUM_SEQS="${MAX_NUM_SEQS:-${_YAML_MAX_NUM_SEQS:-256}}" +MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-${_YAML_MAX_NUM_BATCHED_TOKENS:-}}" unset _YAML_BLOCK_SIZE _YAML_MEM_FRAC_STATIC _YAML_MAX_MODEL_LEN _YAML_MAX_NUM_SEQS _YAML_MAX_NUM_BATCHED_TOKENS # ============================================================================= From 8d57cde68ab1bd1d28189c452eb341f47a7e744d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 3 Jul 2026 16:15:49 +0900 Subject: [PATCH 13/15] [AMD] add perf-changelog entry for MiniMax-M3 ATOM disagg refactor (PR #2000) Co-Authored-By: Claude Opus 4.6 --- perf-changelog.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 35b796fddb..b2af93f012 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4306,14 +4306,15 @@ - config-keys: - minimaxm3-fp4-mi355x-atom-disagg + - minimaxm3-fp4-mi355x-atom-disagg-mtp + - minimaxm3-fp8-mi355x-atom-disagg + - minimaxm3-fp8-mi355x-atom-disagg-mtp description: - - "Add minimaxm3-fp4-mi355x-atom-disagg CI recipe: multi-node disaggregated PD on MI355X via ATOM for MiniMax-M3-MXFP4" - - "Image: rocm/atom-dev:MiniMax-M3-20260623; model: amd/MiniMax-M3-MXFP4; framework: atom-disagg" - - "Search space: ISL=8192 and ISL=1024, OSL=1024, 1P1D 
TP4, conc 1-512" - - "Refactor server_atom.sh to eliminate all hardcoded MODEL_NAME checks; all model-specific config (env, parallel flags, MTP flags, KV cache flags, HF overrides) now driven from models_atom.yaml" - - "Add MiniMax-M3-MXFP4 and MiniMax-M3-MXFP8 entries to models_atom.yaml with EAGLE3 MTP flags (--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3)" - - "Fix model HuggingFace path for minimaxm3-fp8-mi355x-atom-disagg: amd/MiniMax-M3-MXFP8 -> MiniMaxAI/MiniMax-M3-MXFP8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1940 + - "Refactor ATOM disagg server_atom.sh and models_atom.yaml: split MODEL_TP_DP_FLAGS and MODEL_EP_DP_FLAGS into per-role prefill/decode variants, move BLOCK_SIZE (128), MEM_FRAC_STATIC (0.8), MAX_MODEL_LEN (32768), MAX_NUM_SEQS (256), MAX_NUM_BATCHED_TOKENS (32768) from launch scripts into models_atom.yaml with env-var override support." + - "Add hf_overrides and online_quant_config (with DPA-specific variant) to models_atom.yaml. FP8 uses different online_quant_config exclude patterns when DPA is enabled (*.gate.*, *.block_sparse_moe.experts*) vs disabled (*block_sparse_moe)." + - "Add minimaxm3-fp4/fp8-mi355x-atom-disagg-mtp recipes with EAGLE3 speculative decoding (DECODE_MTP_SIZE=3) at 1P1D and 2P1D DPA TP4 search spaces for 8k1k and 1k1k." + - "Fix YAML server-tuning defaults precedence (env var > YAML > shell default) and remove redundant SPEC_DECODING gating in favor of MODEL_MTP_FLAGS + DECODE_MTP_SIZE > 0." 
+ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2000 - config-keys: - minimaxm3-fp8-mi355x-vllm-mtp From 0fd4545b755abc785c575c35eef4bfe6ab69dc27 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 3 Jul 2026 16:30:59 +0900 Subject: [PATCH 14/15] [AMD] remove hf_overrides from models_atom.yaml and server_atom.sh Co-Authored-By: Claude Opus 4.6 --- benchmarks/multi_node/amd_utils/models_atom.yaml | 4 ---- benchmarks/multi_node/amd_utils/server_atom.sh | 13 +------------ 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml index 6e4f59f778..4a854f49fd 100644 --- a/benchmarks/multi_node/amd_utils/models_atom.yaml +++ b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -8,7 +8,6 @@ # Schema: # : # env: str # Space-separated KEY=VALUE pairs exported unconditionally -# hf_overrides: str # JSON string passed to --hf-overrides # tp_dp_flags: str # Shared TP+DPA flags (fallback when prefill/decode-specific keys are absent) # prefill_tp_dp_flags: str # TP+DPA flags for prefill only (overrides tp_dp_flags) # decode_tp_dp_flags: str # TP+DPA flags for decode only (overrides tp_dp_flags) @@ -38,7 +37,6 @@ DeepSeek-V4-Pro: prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" mtp_flags: "--method mtp --num-speculative-tokens" - hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}' MiniMax-M3-MXFP4: env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1" @@ -50,7 +48,6 @@ MiniMax-M3-MXFP4: prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" - hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}' online_quant_config: 
'{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}' online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}' block_size: "128" @@ -69,7 +66,6 @@ MiniMax-M3-MXFP8: prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention" mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens" - hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}' online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}' online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*.gate.*","*.block_sparse_moe.experts*"]}' block_size: "128" diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index e2f90c6ceb..76fe119698 100755 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -92,7 +92,6 @@ print(f"MODEL_TP_DP_ENV='{sh(m.get('tp_dp_env', ''))}'") print(f"MODEL_EP_DP_ENV='{sh(m.get('ep_dp_env', ''))}'") print(f"MODEL_MTP_FLAGS='{sh(m.get('mtp_flags', ''))}'") print(f"MODEL_KV_ARG='{sh(m.get('kv_cache_flags', ''))}'") -print(f"_HF_OVERRIDES='{sh(m.get('hf_overrides', ''))}'") print(f"_ONLINE_QUANT_CONFIG='{sh(m.get('online_quant_config', ''))}'") print(f"_ONLINE_QUANT_DPA_CONFIG='{sh(m.get('online_quant_dpa_config', m.get('online_quant_config', '')))}'") print(f"_YAML_BLOCK_SIZE='{sh(m.get('block_size', ''))}'") @@ -183,13 +182,6 @@ fi unset _dp_env_pair unset _ONLINE_QUANT_CONFIG 
_ONLINE_QUANT_DPA_CONFIG -# HF overrides (single-quoted JSON preserved through eval) -HF_OVERRIDES_ARG="" -if [[ -n "$_HF_OVERRIDES" ]]; then - HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'" -fi -unset _HF_OVERRIDES - for _env_pair in ${MODEL_ENVS}; do export "$_env_pair" done @@ -228,7 +220,7 @@ Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NU Prefill args : ${PREFILL_PARALLEL_ARGS[*]} Decode args : ${DECODE_PARALLEL_ARGS[*]} Spec args : ${SPEC_ARGS[*]} -Opt args : ${HF_OVERRIDES_ARG} ${ONLINE_QUANT_ARG} +Opt args : ${ONLINE_QUANT_ARG} ===================== INFO @@ -268,7 +260,6 @@ if [ "$NODE_RANK" -eq 0 ]; then --max-num-seqs ${MAX_NUM_SEQS} \ ${MODEL_LEN_ARGS} \ --no-enable_prefix_caching \ - ${HF_OVERRIDES_ARG} \ ${ONLINE_QUANT_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ ${EXTRA_SERVER_ARGS}" @@ -491,7 +482,6 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then --max-num-seqs ${MAX_NUM_SEQS} \ ${MODEL_LEN_ARGS} \ --no-enable_prefix_caching \ - ${HF_OVERRIDES_ARG} \ ${ONLINE_QUANT_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ ${EXTRA_SERVER_ARGS}" @@ -565,7 +555,6 @@ else --max-num-seqs ${DECODE_MAX_NUM_SEQS} \ ${MODEL_LEN_ARGS} \ --no-enable_prefix_caching \ - ${HF_OVERRIDES_ARG} \ ${ONLINE_QUANT_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \ From 113e568471d00a14da2f88f7de3a73b5616fff41 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 4 Jul 2026 13:09:36 +0900 Subject: [PATCH 15/15] [AMD] fix server-tuning: YAML values must override job.slurm Docker defaults job.slurm injects BLOCK_SIZE=16, MEM_FRAC_STATIC=0.85, 
MAX_NUM_SEQS=256 as Docker env vars with hardcoded defaults. The previous env-first fallback (env > YAML > default) meant YAML values were always shadowed. Flip all five server-tuning vars to YAML > env > default so models_atom.yaml entries (e.g. block_size=128 for MiniMax-M3-MXFP4) actually take effect. Also add set -x before YAML parsing for CI debuggability. Co-Authored-By: Claude Opus 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 76fe119698..45f3053bd5 100755 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -75,6 +75,7 @@ host_name=$(hostname) # Model-Specific Configuration from YAML # ============================================================================= # Load model-specific config from YAML (single parse for all fields) +set -x _yaml_tmp=$(mktemp) python3 << PYEOF > "$_yaml_tmp" import yaml @@ -105,12 +106,14 @@ source "$_yaml_tmp" rm -f "$_yaml_tmp" unset _yaml_tmp -# Apply server-tuning: env var > YAML > shell default -BLOCK_SIZE="${BLOCK_SIZE:-${_YAML_BLOCK_SIZE:-16}}" -MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-${_YAML_MEM_FRAC_STATIC:-0.85}}" -MAX_MODEL_LEN="${MAX_MODEL_LEN:-${_YAML_MAX_MODEL_LEN:-}}" -MAX_NUM_SEQS="${MAX_NUM_SEQS:-${_YAML_MAX_NUM_SEQS:-256}}" -MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-${_YAML_MAX_NUM_BATCHED_TOKENS:-}}" +# Apply server-tuning: YAML > env var > shell default +# (job.slurm injects BLOCK_SIZE/MEM_FRAC_STATIC/MAX_NUM_SEQS with hardcoded +# defaults into the Docker env, so env-first would always shadow the YAML.) 
+BLOCK_SIZE="${_YAML_BLOCK_SIZE:-${BLOCK_SIZE:-16}}" +MEM_FRAC_STATIC="${_YAML_MEM_FRAC_STATIC:-${MEM_FRAC_STATIC:-0.85}}" +MAX_MODEL_LEN="${_YAML_MAX_MODEL_LEN:-${MAX_MODEL_LEN:-}}" +MAX_NUM_SEQS="${_YAML_MAX_NUM_SEQS:-${MAX_NUM_SEQS:-256}}" +MAX_NUM_BATCHED_TOKENS="${_YAML_MAX_NUM_BATCHED_TOKENS:-${MAX_NUM_BATCHED_TOKENS:-}}" unset _YAML_BLOCK_SIZE _YAML_MEM_FRAC_STATIC _YAML_MAX_MODEL_LEN _YAML_MAX_NUM_SEQS _YAML_MAX_NUM_BATCHED_TOKENS # =============================================================================