From 3ef13803854e2a4525dd3d522defc9282ea38091 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 26 Jun 2026 10:25:16 +0900
Subject: [PATCH 01/15] [AMD] refactor server_atom.sh and models_atom.yaml for
 model-specific ATOM config; add minimaxm3-fp4-mi355x-atom-disagg

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/env_atom.sh   | 28 ++----
 .../multi_node/amd_utils/models_atom.yaml     | 68 ++++++--------
 .../multi_node/amd_utils/server_atom.sh       | 91 ++++++++++---------
 configs/amd-master.yaml                       |  2 +-
 4 files changed, 85 insertions(+), 104 deletions(-)
diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh
index f2b9063128..089594cbe4 100644
--- a/benchmarks/multi_node/amd_utils/env_atom.sh
+++ b/benchmarks/multi_node/amd_utils/env_atom.sh
@@ -32,13 +32,6 @@ else
 fi
 export IBDEVICES
 
-export SAFETENSORS_FAST_GPU=1
-export VLLM_LOG_LEVEL=WARNING
-export ATOM_LOG_LEVEL=WARNING
-export AITER_LOG_LEVEL=WARNING
-export LOG_LEVEL=WARNING
-export LOGLEVEL=WARNING
-
 # =============================================================================
 # ATOM/mooncake-specific environment
 # =============================================================================
@@ -46,22 +39,17 @@ export LOGLEVEL=WARNING
 # mooncake RDMA KV transfer library path
 export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-}
 
-
-# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)
+# faster model loading (safetensors only)
+export SAFETENSORS_FAST_GPU=1
 
 # aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting)
+export VLLM_LOG_LEVEL=WARNING
+export ATOM_LOG_LEVEL=WARNING
 export AITER_LOG_LEVEL=WARNING
-
-if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
-    # ATOM MoE gather/scatter interleave optimization
-    export ATOM_MOE_GU_ITLV=1
-    # Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro)
-    export AITER_BF16_FP8_MOE_BOUND=0
-fi
-
-# Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf)
-# No env var needed; documented here for reference.
+export LOG_LEVEL=WARNING
+export LOGLEVEL=WARNING
 
 set +x
 
-echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES  LD_LIBRARY_PATH includes mooncake"
+# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)
+echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES  LD_LIBRARY_PATH includes mooncake"
\ No newline at end of file
diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml
index 85771eeaac..d6f12ac065 100644
--- a/benchmarks/multi_node/amd_utils/models_atom.yaml
+++ b/benchmarks/multi_node/amd_utils/models_atom.yaml
@@ -1,4 +1,4 @@
-# Model-specific SGLang server configurations for disaggregated inference.
+# Model-specific ATOM server configurations for disaggregated inference.
 #
 # Each top-level key is a MODEL_NAME value (must match the directory name under MODEL_DIR).
 #
@@ -7,50 +7,34 @@
 #
 # Schema:
 #   <model-name>:
-#     base_flags: str          # Common flags for both prefill and decode
-#     mtp_flags: str           # Appended to decode when DECODE_MTP_SIZE > 0
-#     dp_flags: str            # Appended when DP is enabled (prefill or decode)
-#     prefill:
-#       mem_fraction_static: float
-#       disable_radix_cache: bool
-#       dp:                              # Config when data-parallel attention is enabled
-#         max_running_requests: int
-#         chunked_prefill_size: str      # Can be integer or bash arithmetic expression
-#         cuda_graph_bs: str             # Space-separated values
-#       no_dp:                           # Config when data-parallel attention is disabled
-#         max_running_requests: int
-#         chunked_prefill_size: int
-#         cuda_graph_bs_range: str       # "start-end" expanded via seq
-#     decode:
-#       mem_fraction_static: float
-#       prefill_round_robin_balance: bool
-#       dp:
-#         max_running_requests: int
-#         chunked_prefill_size: str
-#         cuda_graph_bs_range: str
-#       ep_only:                         # Config when EP is enabled but DP is disabled
-#         max_running_requests: int
-#         chunked_prefill_size: int
-#         cuda_graph_bs_range: str
-#       no_dp:
-#         max_running_requests: int
-#         chunked_prefill_size: int
-#         cuda_graph_bs_range: str
+#     env:          str  # Space-separated KEY=VALUE pairs exported unconditionally (values must not contain whitespace)
+#     hf_overrides: str  # JSON string passed to --hf-overrides
+#     tp_dp_flags:  str  # Parallel flags for TP+DPA case (must include --enable-dp-attention)
+#     tp_dp_env:    str  # Space-separated KEY=VALUE pairs exported only in TP+DPA mode
+#     ep_dp_flags:  str  # Parallel flags for EP+DPA case (must include --enable-expert-parallel --enable-dp-attention)
+#     ep_dp_env:    str  # Space-separated KEY=VALUE pairs exported only in EP+DPA mode
+#     mtp_flags:    str  # Flags placed in SPEC_ARGS immediately before $DECODE_MTP_SIZE; must end with the option that takes the token count (e.g. "--method mtp --num-speculative-tokens")
+#     kv_cache_flags: str  # Full --kv_cache_dtype flag string (e.g. "--kv_cache_dtype fp8", or "" for none)
 
 DeepSeek-V4-Pro:
-  # ATOM engine (atom-disagg): server_atom.sh uses MEM_FRACTION/KV_CACHE_DTYPE/BLOCK_SIZE/MAX_NUM_SEQS
-  # directly from env vars (defaulting to 0.85/fp8/16/256). base_flags/dp_flags are not used by
-  # server_atom.sh; they are kept here for documentation and potential future use.
-  base_flags: ""
-  mtp_flags: ""
-  dp_flags: ""
+  env: "ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0"
+  kv_cache_flags: "--kv_cache_dtype fp8"
+  tp_dp_flags: "--enable-dp-attention --enable-tbo"
+  tp_dp_env: "GPU_MAX_HW_QUEUES=5 ATOM_CPU_AFFINITY=1"
+  ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
+  mtp_flags: "--method mtp --num-speculative-tokens"
+  hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}'
 
 MiniMax-M3-MXFP4:
-  base_flags: ""
-  mtp_flags: ""
-  dp_flags: ""
+  env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0"
+  kv_cache_flags: "--kv_cache_dtype fp8"
+  tp_dp_flags: "--enable-dp-attention"
+  ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
+  mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
 
 MiniMax-M3-MXFP8:
-  base_flags: ""
-  mtp_flags: ""
-  dp_flags: ""
\ No newline at end of file
+  env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0"
+  kv_cache_flags: "--kv_cache_dtype fp8"
+  tp_dp_flags: "--enable-dp-attention"
+  ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
+  mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
\ No newline at end of file
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index ccc864030a..ab3c25da22 100755
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -47,7 +47,6 @@ HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}"
 
 # ATOM server tuning (from reference script defaults)
 MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}"
-KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}"
 BLOCK_SIZE="${BLOCK_SIZE:-16}"
 MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}"
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
@@ -78,6 +77,24 @@ if [[ -z "$host_ip" ]]; then
 fi
 host_name=$(hostname)
 
+# =============================================================================
+# Model-Specific Configuration from YAML
+# =============================================================================
+# Load model-specific config from YAML (single parse for all fields)
+eval "$(python3 -c "
+import yaml
+with open('${ATOM_WS_PATH}/models_atom.yaml') as f:
+    m = yaml.safe_load(f).get('${MODEL_NAME}', {})
+print(f'MODEL_ENVS=\"{m.get(\"env\", \"\")}\"')
+print(f'MODEL_TP_DP_FLAGS=\"{m.get(\"tp_dp_flags\", \"\")}\"')
+print(f'MODEL_EP_DP_FLAGS=\"{m.get(\"ep_dp_flags\", \"\")}\"')
+print(f'MODEL_TP_DP_ENV=\"{m.get(\"tp_dp_env\", \"\")}\"')
+print(f'MODEL_EP_DP_ENV=\"{m.get(\"ep_dp_env\", \"\")}\"')
+print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"')
+print(f'MODEL_KV_ARG=\"{m.get(\"kv_cache_flags\", \"\")}\"')
+print(f'_HF_OVERRIDES=\"{m.get(\"hf_overrides\", \"\")}\"')
+")"
+
 # =============================================================================
 # Cluster Topology Configuration
 # =============================================================================
@@ -114,53 +131,48 @@ DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
 # Parallel args
 PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
 if [ "$PREFILL_ENABLE_DP" = "true" ]; then
-    if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP
-        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
-    else #TP+DPA+TBO
-        if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
-            PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo )
-            export GPU_MAX_HW_QUEUES=5
-            export ATOM_CPU_AFFINITY=1
-        else #TP+DPA 
-            PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention )
-        fi
+    if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #EP+DPA
+        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS})
+        for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
+    else #TP+DPA
+        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_TP_DP_FLAGS})
+        for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done
     fi
-fi 
+fi
 
-# (srok), split DPA & TBO cases
-DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
+DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP
 if [ "$DECODE_ENABLE_DP" = "true" ]; then
-    if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP
-        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
-    else #TP+DPA+TBO
-        if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
-            DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo )
-            export GPU_MAX_HW_QUEUES=5
-            export ATOM_CPU_AFFINITY=1
-        else #TP+DPA 
-            DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
-        fi
+    if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #EP+DPA
+        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS})
+        for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
+    else #TP+DPA
+        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_TP_DP_FLAGS})
+        for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done
     fi
-fi 
-
-# MTP args
-SPEC_ARGS=() #TP
-if [ "$SPEC_DECODING" = "mtp" ]; then
-    SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE")
 fi
+unset _dp_env_pair
 
 # HF overrides (single-quoted JSON preserved through eval)
 HF_OVERRIDES_ARG=""
-if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
-    HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"
+if [[ -n "$_HF_OVERRIDES" ]]; then
+    HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'"
 fi
+unset _HF_OVERRIDES
+
+for _env_pair in ${MODEL_ENVS}; do
+    export "$_env_pair"
+done
+unset _env_pair
 
-# KV cache dtype (skip if unset or 'auto')
-KV_CACHE_ARG=""
-if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then
-    KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}"
+# MTP args
+SPEC_ARGS=()
+if [ "$SPEC_DECODING" = "mtp" ]; then
+    SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE")
 fi
 
+# KV cache arg - full flag string from YAML
+KV_CACHE_ARG="${MODEL_KV_ARG}"
+
 # Optional model length / batched-token cap
 MODEL_LEN_ARGS=""
 if [[ -n "$MAX_MODEL_LEN" ]]; then
@@ -170,9 +182,6 @@ if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then
     MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}"
 fi
 
-if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then
-      export AITER_QUICK_REDUCE_QUANTIZATION=INT4
-fi
 
 cat <<INFO
 === Configuration ===
@@ -183,7 +192,7 @@ MODEL    : ${MODEL_NAME}
 BACKEND  : atom (PD mooncake KV transfer)
 MTP      : method=mtp num_speculative_tokens=${DECODE_MTP_SIZE}
 xP/yD    : ${xP} / ${yD}
-KV cache : dtype=${KV_CACHE_DTYPE:-auto} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC}
+KV cache : ${KV_CACHE_ARG:-none} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC}
 Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NUM_BATCHED_TOKENS:-unset}
 Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
 Decode  args : ${DECODE_PARALLEL_ARGS[*]}
@@ -582,4 +591,4 @@ else
 fi
 
 echo "Script completed successfully"
-exit 0
+exit 0
\ No newline at end of file
diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml
index 95a76ab6dd..97619caf72 100644
--- a/configs/amd-master.yaml
+++ b/configs/amd-master.yaml
@@ -2772,7 +2772,7 @@ minimaxm3-fp8-mi355x-atom-disagg:
           - "DECODE_NODES=1"
 
 minimaxm3-fp4-mi355x-atom-disagg:
-  image: rocm/atom-dev:MiniMax-M3-20260622
+  image: rocm/atom-dev:MiniMax-M3-20260623
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x-disagg

From 594f2bc65cda184b6a897a46d57a364f3b51c34d Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 26 Jun 2026 10:28:57 +0900
Subject: [PATCH 02/15] [AMD] add perf-changelog entry for
 minimaxm3-fp4-mi355x-atom-disagg and server_atom.sh refactor (PR #1940)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 6eb6ca61f6..35b796fddb 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4304,6 +4304,17 @@
     - "Pass --use-chat-template for MTP acceptance and mirror the existing MiniMax-M3 MXFP8 MI355X MTP TP/EP/DP-attention search space at 1k1k and 8k1k."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1939
 
+- config-keys:
+    - minimaxm3-fp4-mi355x-atom-disagg
+  description:
+    - "Add minimaxm3-fp4-mi355x-atom-disagg CI recipe: multi-node disaggregated PD on MI355X via ATOM for MiniMax-M3-MXFP4"
+    - "Image: rocm/atom-dev:MiniMax-M3-20260623; model: amd/MiniMax-M3-MXFP4; framework: atom-disagg"
+    - "Search space: ISL=8192 and ISL=1024, OSL=1024, 1P1D TP4, conc 1-512"
+    - "Refactor server_atom.sh to eliminate all hardcoded MODEL_NAME checks; all model-specific config (env, parallel flags, MTP flags, KV cache flags, HF overrides) now driven from models_atom.yaml"
+    - "Add MiniMax-M3-MXFP4 and MiniMax-M3-MXFP8 entries to models_atom.yaml with EAGLE3 MTP flags (--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3)"
+    - "Fix model HuggingFace path for minimaxm3-fp8-mi355x-atom-disagg: amd/MiniMax-M3-MXFP8 -> MiniMaxAI/MiniMax-M3-MXFP8"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1940
+
 - config-keys:
     - minimaxm3-fp8-mi355x-vllm-mtp
   description:

From 8740b804c24d88f0e2629bc226670b84b36819f3 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 26 Jun 2026 10:34:29 +0900
Subject: [PATCH 03/15] [AMD] add env dump in server_atom.sh and
 minimaxm3-fp4-mi355x-atom-disagg launch script

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh            | 5 +++++
 benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh | 1 -
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index ab3c25da22..8ef9a22176 100755
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -201,6 +201,11 @@ Opt     args : ${HF_OVERRIDES_ARG}
 =====================
 INFO
 
+set -x
+echo "::group::Environment Variables"
+env
+echo "::endgroup::"
+
 # =============================================================================
 # Node Role Assignment
 #
diff --git a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh
index 505f743195..9b1957fa5f 100644
--- a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh
+++ b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh
@@ -65,7 +65,6 @@ export SPEC_DECODING="none"
 export DECODE_MTP_SIZE=0
 
 # Block size 128
-export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-auto}"
 export BLOCK_SIZE="${BLOCK_SIZE:-128}"
 export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}"
 export MAX_MODEL_LEN=32768

From 7c1ef64a4a3775e543861ec25a725349585de345 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 26 Jun 2026 11:38:15 +0900
Subject: [PATCH 04/15] [AMD] fix server_atom.sh YAML loading: safe source, EP
 string compare, SPEC_DECODING guard

- Replace fragile eval "$(python3 -c "...")" with heredoc + source tempfile to
  avoid nested quote escaping issues that caused MODEL_ENVS to be empty at runtime
- Fix PREFILL/DECODE_ENABLE_EP comparison from numeric -gt 1 to string = "true"
  to match the "true"/"false" values set by launch scripts
- Fix SPEC_DECODING guard from hardcoded "mtp" to any non-none/non-empty value
  so EAGLE3 and future methods also activate SPEC_ARGS from models_atom.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../multi_node/amd_utils/server_atom.sh       | 32 +++++++++++--------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 8ef9a22176..303e0d8767 100755
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -81,19 +81,25 @@ host_name=$(hostname)
 # Model-Specific Configuration from YAML
 # =============================================================================
 # Load model-specific config from YAML (single parse for all fields)
-eval "$(python3 -c "
+_yaml_tmp=$(mktemp)
+python3 << PYEOF > "$_yaml_tmp"
 import yaml
 with open('${ATOM_WS_PATH}/models_atom.yaml') as f:
     m = yaml.safe_load(f).get('${MODEL_NAME}', {})
-print(f'MODEL_ENVS=\"{m.get(\"env\", \"\")}\"')
-print(f'MODEL_TP_DP_FLAGS=\"{m.get(\"tp_dp_flags\", \"\")}\"')
-print(f'MODEL_EP_DP_FLAGS=\"{m.get(\"ep_dp_flags\", \"\")}\"')
-print(f'MODEL_TP_DP_ENV=\"{m.get(\"tp_dp_env\", \"\")}\"')
-print(f'MODEL_EP_DP_ENV=\"{m.get(\"ep_dp_env\", \"\")}\"')
-print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"')
-print(f'MODEL_KV_ARG=\"{m.get(\"kv_cache_flags\", \"\")}\"')
-print(f'_HF_OVERRIDES=\"{m.get(\"hf_overrides\", \"\")}\"')
-")"
+def sh(v): return v.replace("'", "'\\\\''")  # four backslashes on purpose: the unquoted heredoc halves them, so Python emits the POSIX close-quote, escaped-quote, reopen-quote sequence
+print(f"MODEL_ENVS='{sh(m.get('env', ''))}'")
+print(f"MODEL_TP_DP_FLAGS='{sh(m.get('tp_dp_flags', ''))}'")
+print(f"MODEL_EP_DP_FLAGS='{sh(m.get('ep_dp_flags', ''))}'")
+print(f"MODEL_TP_DP_ENV='{sh(m.get('tp_dp_env', ''))}'")
+print(f"MODEL_EP_DP_ENV='{sh(m.get('ep_dp_env', ''))}'")
+print(f"MODEL_MTP_FLAGS='{sh(m.get('mtp_flags', ''))}'")
+print(f"MODEL_KV_ARG='{sh(m.get('kv_cache_flags', ''))}'")
+print(f"_HF_OVERRIDES='{sh(m.get('hf_overrides', ''))}'")
+PYEOF
+# shellcheck source=/dev/null
+source "$_yaml_tmp"
+rm -f "$_yaml_tmp"
+unset _yaml_tmp
 
 # =============================================================================
 # Cluster Topology Configuration
@@ -131,7 +137,7 @@ DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
 # Parallel args
 PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
 if [ "$PREFILL_ENABLE_DP" = "true" ]; then
-    if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #EP+DPA
+    if [ "$PREFILL_ENABLE_EP" = "true" ]; then #EP+DPA
         PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS})
         for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
     else #TP+DPA
@@ -142,7 +148,7 @@ fi
 
 DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP
 if [ "$DECODE_ENABLE_DP" = "true" ]; then
-    if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #EP+DPA
+    if [ "$DECODE_ENABLE_EP" = "true" ]; then #EP+DPA
         DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS})
         for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
     else #TP+DPA
@@ -166,7 +172,7 @@ unset _env_pair
 
 # MTP args
 SPEC_ARGS=()
-if [ "$SPEC_DECODING" = "mtp" ]; then
+if [[ "$SPEC_DECODING" != "none" && "$SPEC_DECODING" != "" && -n "$MODEL_MTP_FLAGS" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then
     SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE")
 fi
 

From 7cd3353d77ab4e902d10a5411e491c384ebbc180 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 26 Jun 2026 12:04:12 +0900
Subject: [PATCH 05/15] [AMD] cap minimaxm3-fp8-mi355x-atom-disagg conc to 256;
 fix missing newline in models_atom.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/models_atom.yaml | 2 +-
 configs/amd-master.yaml                          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml
index d6f12ac065..620aaf6c68 100644
--- a/benchmarks/multi_node/amd_utils/models_atom.yaml
+++ b/benchmarks/multi_node/amd_utils/models_atom.yaml
@@ -37,4 +37,4 @@ MiniMax-M3-MXFP8:
   kv_cache_flags: "--kv_cache_dtype fp8"
   tp_dp_flags: "--enable-dp-attention"
   ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
-  mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
\ No newline at end of file
+  mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml
index 97619caf72..971fe29733 100644
--- a/configs/amd-master.yaml
+++ b/configs/amd-master.yaml
@@ -2735,7 +2735,7 @@ minimaxm3-fp8-mi355x-atom-disagg:
       osl: 1024
       search-space:
       # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
         prefill:
           num-worker: 1
           tp: 4
@@ -2755,7 +2755,7 @@ minimaxm3-fp8-mi355x-atom-disagg:
       osl: 1024
       search-space:
       # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
         prefill:
           num-worker: 1
           tp: 4

From 89611257dd850465bc15e29ab16ecf87eabeb963 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 26 Jun 2026 12:11:29 +0900
Subject: [PATCH 06/15] [AMD] update amd-master.yaml: image bumps, search space
 tweaks for MiniMax-M3 ATOM recipes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 configs/amd-master.yaml | 53 +++++++----------------------------------
 1 file changed, 9 insertions(+), 44 deletions(-)

diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml
index 971fe29733..5327d95116 100644
--- a/configs/amd-master.yaml
+++ b/configs/amd-master.yaml
@@ -2645,7 +2645,7 @@ minimaxm3-fp4-mi355x-vllm-mtp:
 # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
 # block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe.
 minimaxm3-fp4-mi355x-atom:
-  image: rocm/atom-dev:MiniMax-M3-20260623
+  image: rocm/atom-dev:M3
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
@@ -2657,52 +2657,17 @@ minimaxm3-fp4-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
+      - { tp: 2, conc-start: 128, conc-end: 256 }
       - { tp: 4, conc-start: 1, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
+      - { tp: 2, conc-start: 128, conc-end: 256 }
       - { tp: 4, conc-start: 1, conc-end: 256 }
-
-minimaxm3-fp4-mi355x-atom-mtp:
-  image: rocm/atom-dev:MiniMax-M3-20260623
-  model: amd/MiniMax-M3-MXFP4
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp  }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp  }
-
-minimaxm3-fp8-mi355x-atom:
-  image: rocm/atom-dev:MiniMax-M3-20260623
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp8
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256 }
+      - { tp: 8, conc-start: 1, conc-end: 2 }
 
 minimaxm3-fp8-mi355x-atom-mtp:
-  image: rocm/atom-dev:MiniMax-M3-20260623
+  image: rocm/atom-dev:MiniMax-M3-20260622
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x
@@ -2735,7 +2700,7 @@ minimaxm3-fp8-mi355x-atom-disagg:
       osl: 1024
       search-space:
       # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
         prefill:
           num-worker: 1
           tp: 4
@@ -2755,7 +2720,7 @@ minimaxm3-fp8-mi355x-atom-disagg:
       osl: 1024
       search-space:
       # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
         prefill:
           num-worker: 1
           tp: 4
@@ -2786,7 +2751,7 @@ minimaxm3-fp4-mi355x-atom-disagg:
       osl: 1024
       search-space:
       # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
         prefill:
           num-worker: 1
           tp: 4
@@ -2806,7 +2771,7 @@ minimaxm3-fp4-mi355x-atom-disagg:
       osl: 1024
       search-space:
       # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
         prefill:
           num-worker: 1
           tp: 4

From 48b9946ae2d7ac205f737dc0438558e8ee6845a7 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 26 Jun 2026 12:16:20 +0900
Subject: [PATCH 07/15] [AMD] restore minimaxm3-fp4/fp8-mi355x-atom recipes;
 bump all ATOM images to 20260623

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 configs/amd-master.yaml | 44 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml
index 5327d95116..86dceedcb9 100644
--- a/configs/amd-master.yaml
+++ b/configs/amd-master.yaml
@@ -2645,7 +2645,7 @@ minimaxm3-fp4-mi355x-vllm-mtp:
 # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
 # block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe.
 minimaxm3-fp4-mi355x-atom:
-  image: rocm/atom-dev:M3
+  image: rocm/atom-dev:MiniMax-M3-20260623
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
@@ -2666,8 +2666,46 @@ minimaxm3-fp4-mi355x-atom:
       - { tp: 4, conc-start: 1, conc-end: 256 }
       - { tp: 8, conc-start: 1, conc-end: 2 }
 
+minimaxm3-fp4-mi355x-atom-mtp:
+  image: rocm/atom-dev:MiniMax-M3-20260623
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp  }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp  }
+
+minimaxm3-fp8-mi355x-atom:
+  image: rocm/atom-dev:MiniMax-M3-20260623
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp8
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256 }
+
 minimaxm3-fp8-mi355x-atom-mtp:
-  image: rocm/atom-dev:MiniMax-M3-20260622
+  image: rocm/atom-dev:MiniMax-M3-20260623
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x
@@ -2686,7 +2724,7 @@ minimaxm3-fp8-mi355x-atom-mtp:
       - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
 
 minimaxm3-fp8-mi355x-atom-disagg:
-  image: rocm/atom-dev:MiniMax-M3-20260622
+  image: rocm/atom-dev:MiniMax-M3-20260623
   model: amd/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x-disagg

From 7f94d30e905105d5ab8c600b32e339b37223bb19 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 26 Jun 2026 12:17:31 +0900
Subject: [PATCH 08/15] [AMD] clean up minimaxm3-fp4-mi355x-atom search space;
 revert fp8-disagg image to 20260622

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 configs/amd-master.yaml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml
index 86dceedcb9..cd05d7dc22 100644
--- a/configs/amd-master.yaml
+++ b/configs/amd-master.yaml
@@ -2657,14 +2657,11 @@ minimaxm3-fp4-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 2, conc-start: 128, conc-end: 256 }
       - { tp: 4, conc-start: 1, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 2, conc-start: 128, conc-end: 256 }
       - { tp: 4, conc-start: 1, conc-end: 256 }
-      - { tp: 8, conc-start: 1, conc-end: 2 }
 
 minimaxm3-fp4-mi355x-atom-mtp:
   image: rocm/atom-dev:MiniMax-M3-20260623
@@ -2724,7 +2721,7 @@ minimaxm3-fp8-mi355x-atom-mtp:
       - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
 
 minimaxm3-fp8-mi355x-atom-disagg:
-  image: rocm/atom-dev:MiniMax-M3-20260623
+  image: rocm/atom-dev:MiniMax-M3-20260622
   model: amd/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x-disagg

From 1aa7acea1699e6ff82c38cd58dc381d59de57b3c Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 3 Jul 2026 13:45:19 +0900
Subject: [PATCH 09/15] [AMD] add amd-master.yaml config

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 3164 +++++++++++++++++++++++++++++++
 1 file changed, 3164 insertions(+)
 create mode 100644 .github/configs/amd-master.yaml

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
new file mode 100644
index 0000000000..f6166699aa
--- /dev/null
+++ b/.github/configs/amd-master.yaml
@@ -0,0 +1,3164 @@
+dsr1-fp4-mi355x-sglang:
+  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
+  model: amd/DeepSeek-R1-0528-MXFP4-Preview
+  model-prefix: dsr1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    # Agentic-coding sweep commented out for this image-bump PR — the
+    # 10-conc agentic matrix amplifies sweep cost and the bump validation
+    # only needs the fixed-seq-len throughput shape. Re-enable once the
+    # bump merges; the next agentic cron PR will pick it up.
+    # agentic-coding:
+    # - duration: 1800
+    #   search-space:
+    #   - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256] }
+
+dsr1-fp4-mi355x-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
+dsr1-fp4-mi355x-atom:
+  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
+  model: amd/DeepSeek-R1-0528-MXFP4-Preview
+  model-prefix: dsr1
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
+
+dsr1-fp4-mi355x-atom-mtp:
+  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  runner: mi355x
+  precision: fp4
+  # WIP framework (no customers yet)
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
+dsr1-fp8-mi300x-sglang:
+  image: lmsysorg/sglang:v0.5.12-rocm700-mi30x
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi300x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+dsr1-fp8-mi325x-sglang:
+  image: lmsysorg/sglang:v0.5.12-rocm700-mi30x
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi325x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+dsr1-fp8-mi355x-sglang:
+  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 32, conc-end: 64 }
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+dsr1-fp8-mi355x-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
+qwen3.5-bf16-mi355x-sglang:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: bf16
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
+
+qwen3.5-bf16-mi355x-sglang-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: bf16
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
+qwen3.5-bf16-mi300x-sglang:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  runner: mi300x
+  precision: bf16
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+qwen3.5-bf16-mi325x-sglang:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  runner: mi325x
+  precision: bf16
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+qwen3.5-fp8-mi325x-sglang:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi325x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+qwen3.5-fp8-mi355x-sglang:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
+
+qwen3.5-fp8-mi355x-sglang-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
+# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is
+# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main.
+qwen3.5-fp8-mi355x-sglang-agentic:
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+qwen3.5-fp8-mi355x-atom:
+  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 }
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 }
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
+
+qwen3.5-fp8-mi355x-atom-mtp:
+  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
+
+qwen3.5-fp8-mi355x-sglang-disagg:
+  image: lmsysorg/sglang-rocm:v0.5.11-rocm700-mi35x-20260511
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # Matches qwen3.5-fp8-mi355x-sglang TP8/EP1 low-concurrency sweep
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P+1D TP8/EP1 low-concurrency sweep.
+      # dp-attn intentionally false (matches the 1k1k row): with
+      # --enable-dp-attention + --moe-a2a-backend mori, sglang auto-promotes
+      # moe_ep_size=tp_size=8, but is_deepep_class_backend() excludes MoRI,
+      # so num_shared_slots stays at the global value (1) and the
+      # (num_experts - num_shared_slots) % moe_ep_size assertion in
+      # fused_moe_triton/layer.py fires for Qwen3.5 (512 routed + 1 shared).
+      # Track upstream sglang for a fix; flip back to dp-attn=true once
+      # MoRI is added to is_deepep_class_backend() or shared-slot
+      # accounting is reconciled.
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+qwen3.5-fp4-mi355x-sglang:
+  image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 2, conc-start: 4, conc-end: 256 }
+      - { tp: 4, conc-start: 4, conc-end: 16 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 2, conc-start: 4, conc-end: 256 }
+      - { tp: 4, conc-start: 4, conc-end: 16 }
+
+qwen3.5-fp4-mi355x-atom:
+  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 2, conc-start: 4, conc-end: 256 }
+      - { tp: 4, conc-start: 4, conc-end: 16 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 2, conc-start: 4, conc-end: 256 }
+      - { tp: 4, conc-start: 4, conc-end: 16 }
+
+qwen3.5-fp4-mi355x-sglang-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+      - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+      - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+
+qwen3.5-fp4-mi355x-sglang-disagg:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P1D TP8/EP1, dp-attn false; MoRI conn.py overlay via job.slurm.
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+qwen3.5-fp8-mi300x-sglang:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi300x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+glm5-fp8-mi355x-sglang:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+
+glm5-fp8-mi355x-sglang-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+
+glm5-fp8-mi355x-sglang-disagg:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P+1D TP8/EP1 CI smoke sweep (aligned with glm5-fp8-mi355x-sglang conc range)
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P+1D TP8/EP1 CI smoke sweep; dp-attn false (NSA / MoRI path)
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+glm5-fp8-mi355x-atom:
+  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi355x
+  precision: fp8
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, conc-start: 4, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, conc-start: 4, conc-end: 256 }
+
+glm5.1-fp4-mi355x-sglang:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
+  model: amd/GLM-5.1-MXFP4
+  model-prefix: glm5.1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 2, conc-start: 4, conc-end: 256 }
+      - { tp: 4, conc-start: 4, conc-end: 16 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 2, conc-start: 4, conc-end: 256 }
+      - { tp: 4, conc-start: 4, conc-end: 16 }
+
+# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is
+# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main.
+glm5.1-fp4-mi355x-sglang-agentic:
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
+  model: amd/GLM-5.1-MXFP4
+  model-prefix: glm5.1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+glm5.1-fp4-mi355x-atom:
+  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
+  model: amd/GLM-5.1-MXFP4
+  model-prefix: glm5.1
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+
+kimik2.5-int4-mi355x-vllm:
+  image: vllm/vllm-openai-rocm:nightly-b8336c3c7c298e0878f22a7bf70f4e295b2f4e01
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 128 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 128 }
+
+kimik2.5-int4-mi325x-vllm:
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: mi325x
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+kimik2.5-int4-mi300x-vllm:
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: mi300x
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+kimik2.5-fp4-mi355x-vllm:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
+
+# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below;
+# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0'
+kimik2.5-fp4-mi355x-vllm-agentic:
+  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
+  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
+  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
+  # includes all subsequent ROCm offload work.
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
+      # CPU offload only above the KV cliff. Lower concurrencies fit
+      # entirely on-GPU, so paying the offload-path overhead there would
+      # just slow them down without measuring anything new.
+      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
+      # TP=4 probe: half-node layout doubles per-GPU weight footprint
+      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
+      # cliff-region concurrencies on both offload modes so we can directly
+      # compare TP=4 vs TP=8 at the same conc points.
+      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
+
+kimik2.5-fp4-mi355x-atom:
+  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 128 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 128 }
+
+gptoss-fp4-mi300x-vllm:
+  image: vllm/vllm-openai-rocm:v0.17.0
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  runner: mi300x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 1, conc-start: 64, conc-end: 256 }
+      - { tp: 2, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 8, conc-start: 1, conc-end: 16 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 2, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 8, conc-start: 1, conc-end: 16 }
+
+gptoss-fp4-mi325x-vllm:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  runner: mi325x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 2, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 2, conc-start: 4, conc-end: 8 }
+      - { tp: 4, conc-start: 4, conc-end: 8 }
+      - { tp: 8, conc-start: 4, conc-end: 16 }
+
+gptoss-fp4-mi355x-vllm:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 1, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 8 }
+      - { tp: 8, conc-start: 4, conc-end: 16 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 1, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 4, conc-end: 4 }
+      - { tp: 8, conc-start: 4, conc-end: 8 }
+
+gptoss-fp4-mi355x-atom:
+  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 1, conc-start: 16, conc-end: 256 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 1, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 }
+
+dsr1-fp8-mi355x-atom:
+  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x
+  precision: fp8
+  # WIP framework (no customers yet)
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 128 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 128 }
+
+dsr1-fp8-mi355x-atom-mtp:
+  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x
+  precision: fp8
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
+
+dsr1-fp8-mi355x-sglang-disagg:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # non-MTP configurations
+      # "Top of curve" (1 prefill worker at TP8 and 1 decode worker at DEP16)
+      - spec-decoding: "none"
+        conc-list: [ 1024, 2048 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+
+      # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)
+      - spec-decoding: "none"
+        conc-list: [ 1536, 1024, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+
+      # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
+      - spec-decoding: "none"
+        conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+
+      - spec-decoding: "none"
+        conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # non-MTP configurations
+      # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP8)
+      - spec-decoding: "none"
+        conc-list: [ 1024, 2048 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+      # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
+      - spec-decoding: "none"
+        conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+
+      - spec-decoding: "none"
+        conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+dsr1-fp8-mi355x-sglang-disagg-mtp:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # MTP configurations
+      # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16)
+      - spec-decoding: "mtp"
+        conc-list: [ 1024, 2048 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=1"
+
+      # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)
+      - spec-decoding: "mtp"
+        conc-list: [ 1536, 1024, 512, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=1"
+
+      # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
+      - spec-decoding: "mtp"
+        conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=2"
+
+      - spec-decoding: "mtp"
+        conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # MTP configurations
+      # "Top of curve" (2 prefill workers at DEP8 and 1 decode worker at DEP8)
+      - spec-decoding: "mtp"
+        conc-list: [ 1024, 2048 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+
+      # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
+      - spec-decoding: "mtp"
+        conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=2"
+
+      - spec-decoding: "mtp"
+        conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=2"
+
+kimik2.5-fp4-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x-disagg
+  precision: fp4
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total 
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+dsr1-fp4-mi355x-sglang-disagg:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # non-MTP configurations
+      # 1P1D TP8
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+      # 1P2D TP8
+      - spec-decoding: "none"
+        conc-list: [ 2, 4, 8, 16, 32 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+
+      # 1P2D TP8
+      - spec-decoding: "none" 
+        conc-list: [ 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+
+      # 1P2D TP4
+      - spec-decoding: "none" 
+        conc-list: [ 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+    
+      # 1*DEP4 + 1*DEP8
+      - spec-decoding: "none"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # non-MTP configurations
+      # 1P1D pure TP8
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+      # 1P2D TP8
+      - spec-decoding: "none"
+        conc-list: [ 2, 4, 8, 16, 32 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+
+      # 1P2D TP8
+      - spec-decoding: "none"
+        conc-list: [ 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+
+      # 1P2D TP4
+      - spec-decoding: "none"
+        conc-list: [ 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=0"
+
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "none"
+        conc-list: [ 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+      # 2*DEP8 + 1*DEP8
+      - spec-decoding: "none"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # MTP configurations
+      # 1P1D TP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1, 2, 4, 8 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1P2D TP8
+      - spec-decoding: "mtp" 
+        conc-list: [ 2, 4, 8, 16, 32 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1P2D TP8
+      - spec-decoding: "mtp" 
+        conc-list: [ 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=2"
+
+      # 1P2D TP4
+      - spec-decoding: "mtp" 
+        conc-list: [ 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=2"
+
+      # 1*DEP4 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+  
+
+dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # MTP configurations
+      # 1P1D pure TP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1, 2, 4, 8 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1P2D TP8
+      - spec-decoding: "mtp"
+        conc-list: [ 2, 4, 8, 16, 32 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1P2D TP8
+      - spec-decoding: "mtp"
+        conc-list: [ 32, 64 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 640, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 128 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 64 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      # 2*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+
+dsv4-fp4-mi355x-sglang:
+  image: lmsysorg/sglang-rocm:v0.5.13.post1-rocm720-mi35x-20260618
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
+      - { tp: 4, dp-attn: true, conc-start: 16, conc-end: 128 }
+      - { tp: 4, dp-attn: false, conc-start: 1 , conc-end: 32 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
+      - { tp: 4, dp-attn: true, conc-start: 16, conc-end: 128 }
+      - { tp: 4, dp-attn: false, conc-start: 1, conc-end: 32 }
+
+
+# MTP variant of dsv4-fp4-mi355x-sglang. Follows the base search space (TP8
+# only) and adds spec-decoding: mtp, routing to dsv4_fp4_mi355x_sglang_mtp.sh (EAGLE
+# speculative decoding), per sgl-project/sglang#26383 ([AMD][DSV4] DSV4 MTP
+# graph + sparse triton attn optimizations, merged to main 2026-05-27). That PR
+# fixes the ROCm HIP-radix MTP CUDA-graph bug (the false-EOS symptom in sgl
+# #20404) and validates GSM8K 0.950 with MTP on.
+#
+# #26383 is on sglang `main`, NOT the amd/deepseek_v4 branch the rocm/sgl-dev:*-DSv4
+# builds are cut from (latest da28108 = f96ac98 + build fixes + an unrelated
+# MLA-decode refactor, still pre-#26383 -> kv_score crash, run 26723126211). So we
+# pin the mainline ROCm nightly, which carries #26383. Mainline omits deep_gemm,
+# but the recipe detects that and routes the DSv4 fp8 wo_a / topk paths to their
+# torch fallbacks (see dsv4_fp4_mi355x_sglang_mtp.sh). When a -DSv4 image carrying
+# #26383 ships, bump to it; the recipe auto-restores the deep_gemm perf path.
+dsv4-fp4-mi355x-sglang-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260601
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp }
+      - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp }
+      - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp }
+
+# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
+# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
+# on 2026-05-05, so any nightly built after that includes the
+# DeepseekV4ForCausalLM model class.
+#
+# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
+# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
+# files keyed on the image string and short-circuits re-import if the
+# file already exists, so the floating tag silently keeps a stale build
+# even after Docker Hub updates `:nightly`.
+#
+# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
+# rest); InferenceX classifies this as fp4 — same as the sister sglang
+# and atom DSv4 mi355x entries in this file. Image and serving flags follow the
+# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
+# executor, triton_unfused MoE (required for the FP4 expert format),
+# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
+# gpu-mem-util=0.6. TP8 sweeps conc 4-512; no DEP8 (DP+EP) probe is
+# configured in this entry.
+dsv4-fp4-mi355x-vllm:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 512 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 512 }
+
+# MTP variant of dsv4-fp4-mi355x-vllm. Mirrors the base recipe's search space
+# and adds spec-decoding: mtp, which routes to dsv4_fp4_mi355x_vllm_mtp.sh
+# (--speculative-config '{"method":"mtp","num_speculative_tokens":2}'), per
+# vllm-project/vllm#43385 (ROCm DeepSeek-V4 MTP, merged 2026-05-24, included in
+# v0.22.0). Full conc 4-512 range maps the complete crossover curve: MTP wins
+# at low batch (PR perf data: +75% @ conc1, +38% @ conc8) and falls behind STP
+# above ~conc32 (-37% @ conc32). Image reuses the base entry's v0.22.0 ROCm
+# build, which already contains the MTP commit.
+dsv4-fp4-mi355x-vllm-mtp:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
+
+dsv4-fp4-mi355x-atom:
+  image: rocm/atom-dev:nightly_202606161823
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # conc4-64, TP8
+        # conc128-512, DPA
+        # conc1024-2048, DPA TBO
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        # conc4-64, TP8
+        # conc128, DPA
+        # conc256-2048, DPA TBO
+      - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
+      - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
+
+dsv4-fp4-mi355x-atom-mtp:
+  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp }
+
+qwen3.5-bf16-mi325x-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  runner: mi325x
+  precision: bf16
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
+dsr1-fp8-mi325x-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.12-rocm700-mi30x
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi325x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
+qwen3.5-fp8-mi325x-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi325x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
+glm5-fp8-mi325x-sglang:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi325x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 4, conc-end: 64 }
+
+glm5-fp8-mi325x-sglang-mtp:
+  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi325x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
+# ============================================================================
+# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
+# Recipes that ALREADY existed on main were intentionally left at main's version
+# to preserve main behavior; PR-branch modifications to those recipes are NOT
+# brought in here.
+# ============================================================================
+
+qwen3.5-fp8-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+
+dsv4-fp4-mi355x-vllm-agentic:
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
+
+dsr1-fp4-mi355x-sglang-disagg-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # MTP configurations
+      # 1P1D TP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1, 2, 4, 8 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1P2D TP8
+      - spec-decoding: "mtp" 
+        conc-list: [ 2, 4, 8, 16, 32 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1P2D TP8
+      - spec-decoding: "mtp" 
+        conc-list: [ 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=2"
+
+      # 1P2D TP4
+      - spec-decoding: "mtp" 
+        conc-list: [ 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=2"
+
+      # 1*DEP4 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # MTP configurations
+      # 1P1D pure TP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1, 2, 4, 8 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1P2D TP8
+      - spec-decoding: "mtp"
+        conc-list: [ 2, 4, 8, 16, 32 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1P2D TP8
+      - spec-decoding: "mtp"
+        conc-list: [ 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "DECODE_MTP_SIZE=2"
+
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 128, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 64, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+
+      # 2*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+      
+
+# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
+# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
+# image tag, so bumping sglang is just an image tag bump here. Sweeps
+# DP-attention on/off and EP=8.
+
+# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below;
+# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - scenarios: replaced fixed-seq-len with agentic-coding.
+# Image is identical to the base entry (rocm/sgl-dev DSv4 build).
+# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware
+# comparability. Offload sweep is none-only (SGLang has no equivalent of
+# vLLM's SimpleCPUOffloadConnector path that we exercise on b200).
+dsv4-fp4-mi355x-sglang-agentic:
+  image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
+      - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
+
+# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
+# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
+# on 2026-05-05, so any nightly built after that includes the
+# DeepseekV4ForCausalLM model class.
+#
+# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
+# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
+# files keyed on the image string and short-circuits re-import if the
+# file already exists, so the floating tag silently keeps a stale build
+# even after Docker Hub updates `:nightly`.
+#
+# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
+# rest); InferenceX classifies this as fp4 — same as the sister sglang
+# and atom DSv4 mi355x entries below. Image and serving flags follow the
+# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
+# executor, triton_unfused MoE (required for the FP4 expert format),
+# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
+# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
+# probe to validate the ROCm DP+EP path.
+
+dsv4-fp4-mi355x-atom-disagg:
+  image: rocm/atom-dev:nightly_202606101403
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+      # 8k1k: 2P1D DPA+TP8 and 1P1D TP8
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 2P1D DPA+TP8 
+      - conc-list: [ 256, 512, 768, 1024, 2048 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+      # 1P1D TP8 
+      - conc-list: [ 4, 8, 16, 32, 64, 128 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # 1P1D TP8
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+
+# MiniMax-M3 MXFP8 MI355X recipe:
+# https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5
+# MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
+minimaxm3-fp8-mi355x-vllm:
+  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
+      - { tp: 4, conc-start: 1, conc-end: 64 }
+      - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
+      - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
+      - { tp: 4, conc-start: 1, conc-end: 128 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }
+
+# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
+# minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
+# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). No
+# attention_backend override is needed — the server runs on TRITON_ATTN, so
+# the FlashInfer page-128/MHA limitation that forced FLASH_ATTN on Blackwell
+# does not apply here. Search space mirrors the non-MTP entry trimmed at the
+# extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp /
+# b200-vllm-mtp precedent: spec decode pays off at low/mid concurrency while
+# acceptance dilutes in big batches, and the draft weights + draft KV shave
+# headroom — tp2-ep2 is dropped since its KV headroom was already thin.
+minimaxm3-fp8-mi355x-vllm-mtp:
+  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+
+# MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config.
+minimaxm3-fp4-mi355x-vllm-disagg:
+  image: rocm/vllm-dev:vllm-0.23.1-rocm723-mi35x-mori-0625
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp4
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P TP4 + 1D TP4 (2 nodes total), conc sweep 1..512 (single job, looped)
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # 2P TP4 + 1D TP4 (3 nodes total), conc 128/256/512 (single job, looped)
+      - spec-decoding: "none"
+        conc-list: [ 128, 256, 512 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+# MiniMax-M3 MXFP4 MI355X vLLM recipe. The pinned nightly includes upstream
+# MiniMax-M3 Quark MXFP4 support (vllm-project/vllm#45794). Use the text-only
+# language-model path and mirror the MXFP8 MI355X search space for a direct
+# precision comparison.
+minimaxm3-fp4-mi355x-vllm:
+  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
+      - { tp: 4, conc-start: 1, conc-end: 64 }
+      - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
+      - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
+      - { tp: 4, conc-start: 1, conc-end: 128 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }
+
+# EAGLE3 speculative-decoding variant of minimaxm3-fp4-mi355x-vllm. Pair the
+# amd/MiniMax-M3-MXFP4 target with Inferact/MiniMax-M3-EAGLE3 and three draft
+# tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base
+# FP4 sweep at extreme concurrency where speculative decoding loses value.
+minimaxm3-fp4-mi355x-vllm-mtp:
+  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+
+# MiniMax-M3 MXFP4 MI355X atom recipe:
+# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
+# block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe.
+minimaxm3-fp4-mi355x-atom:
+  image: rocm/atom-dev:MiniMax-M3-20260623
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256 }
+
+minimaxm3-fp4-mi355x-atom-mtp:
+  image: rocm/atom-dev:MiniMax-M3-20260623
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp  }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp  }
+
+minimaxm3-fp8-mi355x-atom:
+  image: rocm/atom-dev:MiniMax-M3-20260623
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp8
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256 }
+
+minimaxm3-fp8-mi355x-atom-mtp:
+  image: rocm/atom-dev:MiniMax-M3-20260623
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x
+  precision: fp8
+  framework: atom
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
+
+minimaxm3-fp8-mi355x-atom-disagg:
+  image: rocm/atom-dev:MiniMax-M3-20260622
+  model: amd/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp8
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # 1P1D TP4
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+
+minimaxm3-fp4-mi355x-atom-disagg:
+  image: rocm/atom-dev:MiniMax-M3-20260622
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp4
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # 1P1D TP4
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+
+# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
+# MI355X serving shape, but retain the default BF16 KV cache because this
+# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
+# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency.
+minimaxm3-fp8-mi300x-vllm:
+  image: vllm/vllm-openai-rocm:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi300x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 128 }
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }
+
+# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
+# minimaxm3-fp8-mi300x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
+# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same TP8-only
+# search space as the non-MTP MI300X entry (gfx942 192 GB is memory-tight, like
+# H100), with the TP8 latency rows started at conc 1 to capture single-request
+# latency — matching the H100/MI355X MTP recipes. The pinned ROCm nightly
+# includes upstream SupportsEagle3 support for the AMD MiniMax-M3 model.
+minimaxm3-fp8-mi300x-vllm-mtp:
+  image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi300x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+
+# MiniMax-M3 MXFP8 MI325X day-zero recipe. Reuse the dedicated ROCm image
+# and serving flags validated on MI355X, with the H200 search space: TP4 and
+# TP8 latency, TP4/TP8 expert parallelism, and TP8 data-parallel attention.
+minimaxm3-fp8-mi325x-vllm:
+  image: vllm/vllm-openai-rocm:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi325x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 64 }
+      - { tp: 4, ep: 4, conc-start: 128, conc-end: 256 }
+      - { tp: 8, conc-start: 1, conc-end: 128 }
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 512 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 32 }
+      - { tp: 8, conc-start: 1, conc-end: 128 }
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
+
+# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
+# minimaxm3-fp8-mi325x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
+# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same H200-style
+# search space as the non-MTP MI325X entry, trimmed at the extreme-concurrency
+# end with TP-only latency rows started at conc 1 (matching the H200/MI355X MTP
+# recipes). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0,
+# BF16 KV on gfx942). The shipped ROCm image lacks SupportsEagle3 on the AMD
+# MiniMax-M3 model, so the recipe applies that fix in-place at runtime
+# (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on
+# MI355X/MI300X) before serving.
+minimaxm3-fp8-mi325x-vllm-mtp:
+  image: vllm/vllm-openai-rocm:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi325x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp }
+      - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+
+# MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) sweep on a pinned
+# ROCm nightly: four P/D layouts (TP8+TP8, TP4+TP8, TP4+TP4, 2xTP4+TP8) that
+# validate the MoRI-IO KV-transfer disagg pipeline end-to-end for M3. Layered on
+# the MoRI-patch-removal infra (#1585). No EP (ep: 1 everywhere); MoE experts are
+# TP-sharded as in the single-node M3 TP8 recipe. Per-worker serve flags live in
+# benchmarks/multi_node/amd_utils/models_vllm.yaml (MiniMax-M3-MXFP8).
+minimaxm3-fp8-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-556bc4e3a089378e9df2482659898192da18db15
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across
+      # conc 1,2,4,8,16,32,64,128,256.
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024.
+      - spec-decoding: "none"
+        conc-list: [ 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2)
+      # feeding one full-node TP8 decode, at high conc 256,512,768,1024.
+      - spec-decoding: "none"
+        conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+    # 8k1k disagg sweep across four P/D layouts (1P TP8 + 1D TP8 conc 1..1024;
+    # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024; 2P TP4 + 1D TP8
+    # conc 256..1024). The multi-node eval policy (8k1k + conc >= 16) marks one
+    # lm-eval on the highest-max-conc layout (TP8+TP8, eval-conc=median=128) —
+    # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end.
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across
+      # conc 1,2,4,8,16,32,64,128,256.
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024.
+      - spec-decoding: "none"
+        conc-list: [ 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2)
+      # feeding one full-node TP8 decode, at high conc 256,512,768,1024.
+      - spec-decoding: "none"
+        conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"

From ed5e87461a544b7c592774293062cec36bf494c8 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 3 Jul 2026 13:49:40 +0900
Subject: [PATCH 10/15] [AMD] remove amd-master.yaml config

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 3164 -------------------------------
 1 file changed, 3164 deletions(-)
 delete mode 100644 .github/configs/amd-master.yaml

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
deleted file mode 100644
index f6166699aa..0000000000
--- a/.github/configs/amd-master.yaml
+++ /dev/null
@@ -1,3164 +0,0 @@
-dsr1-fp4-mi355x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
-  model: amd/DeepSeek-R1-0528-MXFP4-Preview
-  model-prefix: dsr1
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    # Agentic-coding sweep commented out for this image-bump PR — the
-    # 10-conc agentic matrix amplifies sweep cost and the bump validation
-    # only needs the fixed-seq-len throughput shape. Re-enable once the
-    # bump merges; the next agentic cron PR will pick it up.
-    # agentic-coding:
-    # - duration: 1800
-    #   search-space:
-    #   - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256] }
-
-dsr1-fp4-mi355x-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
-  model: amd/DeepSeek-R1-0528-MXFP4
-  model-prefix: dsr1
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-
-dsr1-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
-  model: amd/DeepSeek-R1-0528-MXFP4-Preview
-  model-prefix: dsr1
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
-
-dsr1-fp4-mi355x-atom-mtp:
-  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
-  model: amd/DeepSeek-R1-0528-MXFP4
-  model-prefix: dsr1
-  runner: mi355x
-  precision: fp4
-  # WIP framework (no customers yet)
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
-dsr1-fp8-mi300x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm700-mi30x
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: mi300x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-dsr1-fp8-mi325x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm700-mi30x
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: mi325x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-dsr1-fp8-mi355x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 32, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-dsr1-fp8-mi355x-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-
-qwen3.5-bf16-mi355x-sglang:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: bf16
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
-
-qwen3.5-bf16-mi355x-sglang-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: bf16
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
-qwen3.5-bf16-mi300x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  runner: mi300x
-  precision: bf16
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-qwen3.5-bf16-mi325x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  runner: mi325x
-  precision: bf16
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-qwen3.5-fp8-mi325x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi325x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-qwen3.5-fp8-mi355x-sglang:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-
-qwen3.5-fp8-mi355x-sglang-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
-# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main.
-qwen3.5-fp8-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
-qwen3.5-fp8-mi355x-atom:
-  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
-
-qwen3.5-fp8-mi355x-atom-mtp:
-  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
-qwen3.5-fp8-mi355x-sglang-disagg:
-  image: lmsysorg/sglang-rocm:v0.5.11-rocm700-mi35x-20260511
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x-disagg
-  precision: fp8
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # Matches qwen3.5-fp8-mi355x-sglang TP8/EP1 low-concurrency sweep
-      - spec-decoding: "none"
-        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # 1P+1D TP8/EP1 low-concurrency sweep.
-      # dp-attn intentionally false (matches the 1k1k row): with
-      # --enable-dp-attention + --moe-a2a-backend mori, sglang auto-promotes
-      # moe_ep_size=tp_size=8, but is_deepep_class_backend() excludes MoRI,
-      # so num_shared_slots stays at the global value (1) and the
-      # (num_experts - num_shared_slots) % moe_ep_size assertion in
-      # fused_moe_triton/layer.py fires for Qwen3.5 (512 routed + 1 shared).
-      # Track upstream sglang for a fix; flip back to dp-attn=true once
-      # MoRI is added to is_deepep_class_backend() or shared-slot
-      # accounting is reconciled.
-      - spec-decoding: "none"
-        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-qwen3.5-fp4-mi355x-sglang:
-  image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612
-  model: amd/Qwen3.5-397B-A17B-MXFP4
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256 }
-      - { tp: 4, conc-start: 4, conc-end: 16 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256 }
-      - { tp: 4, conc-start: 4, conc-end: 16 }
-
-qwen3.5-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
-  model: amd/Qwen3.5-397B-A17B-MXFP4
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256 }
-      - { tp: 4, conc-start: 4, conc-end: 16 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256 }
-      - { tp: 4, conc-start: 4, conc-end: 16 }
-
-qwen3.5-fp4-mi355x-sglang-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612
-  model: amd/Qwen3.5-397B-A17B-MXFP4
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-      - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-      - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
-
-qwen3.5-fp4-mi355x-sglang-disagg:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523
-  model: amd/Qwen3.5-397B-A17B-MXFP4
-  model-prefix: qwen3.5
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # 1P1D TP8/EP1, dp-attn false; MoRI conn.py overlay via job.slurm.
-      - spec-decoding: "none"
-        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - spec-decoding: "none"
-        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-qwen3.5-fp8-mi300x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi300x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-glm5-fp8-mi355x-sglang:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 256 }
-
-glm5-fp8-mi355x-sglang-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
-
-glm5-fp8-mi355x-sglang-disagg:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: mi355x-disagg
-  precision: fp8
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # 1P+1D TP8/EP1 CI smoke sweep (aligned with glm5-fp8-mi355x-sglang conc range)
-      - spec-decoding: "none"
-        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # 1P+1D TP8/EP1 CI smoke sweep; dp-attn false (NSA / MoRI path)
-      - spec-decoding: "none"
-        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-glm5-fp8-mi355x-atom:
-  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: mi355x
-  precision: fp8
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 256 }
-      - { tp: 8, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 256 }
-      - { tp: 8, conc-start: 4, conc-end: 256 }
-
-glm5.1-fp4-mi355x-sglang:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
-  model: amd/GLM-5.1-MXFP4
-  model-prefix: glm5.1
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256 }
-      - { tp: 4, conc-start: 4, conc-end: 16 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256 }
-      - { tp: 4, conc-start: 4, conc-end: 16 }
-
-# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main.
-glm5.1-fp4-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
-  model: amd/GLM-5.1-MXFP4
-  model-prefix: glm5.1
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
-glm5.1-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
-  model: amd/GLM-5.1-MXFP4
-  model-prefix: glm5.1
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 4, conc-end: 256 }
-
-kimik2.5-int4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-b8336c3c7c298e0878f22a7bf70f4e295b2f4e01
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 128 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 128 }
-
-kimik2.5-int4-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: mi325x
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-kimik2.5-int4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: mi300x
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-kimik2.5-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-
-# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0'
-kimik2.5-fp4-mi355x-vllm-agentic:
-  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
-  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
-  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
-  # includes all subsequent ROCm offload work.
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
-      # CPU offload only above the KV cliff. Lower concurrencies fit
-      # entirely on-GPU, so paying the offload-path overhead there would
-      # just slow them down without measuring anything new.
-      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
-      # TP=4 probe: half-node layout doubles per-GPU weight footprint
-      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
-      # cliff-region concurrencies on both offload modes so we can directly
-      # compare TP=4 vs TP=8 at the same conc points.
-      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
-
-kimik2.5-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 128 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 128 }
-
-gptoss-fp4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.17.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  runner: mi300x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 64, conc-end: 256 }
-      - { tp: 2, conc-start: 4, conc-end: 64 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 1, conc-end: 16 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 2, conc-start: 4, conc-end: 64 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 1, conc-end: 16 }
-
-gptoss-fp4-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  runner: mi325x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 2, conc-start: 4, conc-end: 64 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 2, conc-start: 4, conc-end: 8 }
-      - { tp: 4, conc-start: 4, conc-end: 8 }
-      - { tp: 8, conc-start: 4, conc-end: 16 }
-
-gptoss-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 8 }
-      - { tp: 8, conc-start: 4, conc-end: 16 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 128 }
-      - { tp: 4, conc-start: 4, conc-end: 4 }
-      - { tp: 8, conc-start: 4, conc-end: 8 }
-
-gptoss-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 16, conc-end: 256 }
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 256 }
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 }
-
-dsr1-fp8-mi355x-atom:
-  image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: mi355x
-  precision: fp8
-  # WIP framework (no customers yet)
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
-
-dsr1-fp8-mi355x-atom-mtp:
-  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: mi355x
-  precision: fp8
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
-
-dsr1-fp8-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp8
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # non-MTP configurations
-      # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16)
-      - spec-decoding: "none"
-        conc-list: [ 1024, 2048 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-
-      # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8)
-      - spec-decoding: "none"
-        conc-list: [ 1536, 1024, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-
-      # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-      - spec-decoding: "none"
-        conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-
-      - spec-decoding: "none"
-        conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # non-MTP configurations
-      # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8)
-      - spec-decoding: "none"
-        conc-list: [ 1024, 2048 ]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-      # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
-      - spec-decoding: "none"
-        conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-
-      - spec-decoding: "none"
-        conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-dsr1-fp8-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp8
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # MTP configurations
-      # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16)
-      - spec-decoding: "mtp"
-        conc-list: [ 1024, 2048 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=1"
-
-      # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)
-      - spec-decoding: "mtp"
-        conc-list: [ 1536, 1024, 512, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=1"
-
-      # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-      - spec-decoding: "mtp"
-        conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=2"
-
-      - spec-decoding: "mtp"
-        conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=2"
-
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # MTP configurations
-      # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8)
-      - spec-decoding: "mtp"
-        conc-list: [ 1024, 2048 ]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
-
-      # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
-      - spec-decoding: "mtp"
-        conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=2"
-
-      - spec-decoding: "mtp"
-        conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=2"
-
-kimik2.5-fp4-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x-disagg
-  precision: fp4
-  framework: vllm-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total 
-      - spec-decoding: "none"
-        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - spec-decoding: "none"
-        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-
-dsr1-fp4-mi355x-sglang-disagg:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # non-MTP configurations
-      # 1P1D TP8
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-      # 1P2D TP8
-      - spec-decoding: "none"
-        conc-list: [ 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-
-      # 1P2D TP8
-      - spec-decoding: "none" 
-        conc-list: [ 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-
-      # 1P2D TP4
-      - spec-decoding: "none" 
-        conc-list: [ 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-    
-      # 1*DEP4+ 1*DEP8
-      - spec-decoding: "none"
-        conc-list: [ 1024, 2048, 4096 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # non-MTP configurations
-      # 1P1D pure TP8
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-      # 1P2D TP8
-      - spec-decoding: "none"
-        conc-list: [ 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-
-      # 1P2D TP8
-      - spec-decoding: "none"
-        conc-list: [ 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-
-      # 1P2D TP4
-      - spec-decoding: "none"
-        conc-list: [ 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=0"
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "none"
-        conc-list: [ 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-      # 2*DEP8 + 1*DEP8
-      - spec-decoding: "none"
-        conc-list: [ 1024, 2048, 4096 ]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=0"
-
-dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # MTP configurations
-      # 1P1D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1, 2, 4, 8 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp" 
-        conc-list: [ 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp" 
-        conc-list: [ 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=2"
-
-      # 1P2D TP4
-      - spec-decoding: "mtp" 
-        conc-list: [ 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=2"
-
-      # 1*DEP4+ 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1024, 2048, 4096 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
-  
-
-dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # MTP configurations
-      # 1P1D pure TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1, 2, 4, 8 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 32, 64 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 640, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 256 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 128 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 64 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 2*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1024, 2048, 4096 ]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
-
-dsv4-fp4-mi355x-sglang:
-  image: lmsysorg/sglang-rocm:v0.5.13.post1-rocm720-mi35x-20260618
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
-      - { tp: 4, dp-attn: true, conc-start: 16, conc-end: 128 }
-      - { tp: 4, dp-attn: false, conc-start: 1 , conc-end: 32 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
-      - { tp: 4, dp-attn: true, conc-start: 16, conc-end: 128 }
-      - { tp: 4, dp-attn: false, conc-start: 1, conc-end: 32 }
-
-
-# MTP variant of dsv4-fp4-mi355x-sglang. Mirrors the base search space and adds
-# spec-decoding: mtp, which routes to dsv4_fp4_mi355x_sglang_mtp.sh (EAGLE
-# speculative decoding), per sgl-project/sglang#26383 ([AMD][DSV4] DSV4 MTP
-# graph + sparse triton attn optimizations, merged to main 2026-05-27). That PR
-# fixes the ROCm HIP-radix MTP CUDA-graph bug (the false-EOS symptom in sgl
-# #20404) and validates GSM8K 0.950 with MTP on.
-#
-# #26383 is on sglang `main`, NOT the amd/deepseek_v4 branch the rocm/sgl-dev:*-DSv4
-# builds are cut from (latest da28108 = f96ac98 + build fixes + an unrelated
-# MLA-decode refactor, still pre-#26383 -> kv_score crash, run 26723126211). So we
-# pin the mainline ROCm nightly, which carries #26383. Mainline omits deep_gemm,
-# but the recipe detects that and routes the DSv4 fp8 wo_a / topk paths to their
-# torch fallbacks (see dsv4_fp4_mi355x_sglang_mtp.sh). When a -DSv4 image carrying
-# #26383 ships, bump to it; the recipe auto-restores the deep_gemm perf path.
-dsv4-fp4-mi355x-sglang-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260601
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp }
-      - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp }
-      - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp }
-
-# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
-# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
-# on 2026-05-05, so any nightly built after that includes the
-# DeepseekV4ForCausalLM model class.
-#
-# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
-# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
-# files keyed on the image string and short-circuits re-import if the
-# file already exists, so the floating tag silently keeps a stale build
-# even after Docker Hub updates `:nightly`.
-#
-# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
-# rest); InferenceX classifies this as fp4 — same as the sister sglang
-# and atom DSv4 mi355x entries below. Image and serving flags follow the
-# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
-# executor, triton_unfused MoE (required for the FP4 expert format),
-# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
-# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
-# probe to validate the ROCm DP+EP path.
-dsv4-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 512 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 512 }
-
-# MTP variant of dsv4-fp4-mi355x-vllm. Mirrors the base recipe's search space
-# and adds spec-decoding: mtp, which routes to dsv4_fp4_mi355x_vllm_mtp.sh
-# (--speculative-config '{"method":"mtp","num_speculative_tokens":2}'), per
-# vllm-project/vllm#43385 (ROCm DeepSeek-V4 MTP, merged 2026-05-24, included in
-# v0.22.0). Full conc 4-512 range maps the complete crossover curve: MTP wins
-# at low batch (PR perf data: +75% @ conc1, +38% @ conc8) and falls behind STP
-# above ~conc32 (-37% @ conc32). Image reuses the base entry's v0.22.0 ROCm
-# build, which already contains the MTP commit.
-dsv4-fp4-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
-
-dsv4-fp4-mi355x-atom:
-  image: rocm/atom-dev:nightly_202606161823
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        # conc4-64, TP8
-        # conc128-512, DPA
-        # conc1024-2048, DPA TBO
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        # conc4-64, TP8
-        # conc128, DPA
-        # conc256-2048, DPA TBO
-      - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
-      - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
-
-dsv4-fp4-mi355x-atom-mtp:
-  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1024, spec-decoding: mtp }
-
-qwen3.5-bf16-mi325x-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
-  model: Qwen/Qwen3.5-397B-A17B
-  model-prefix: qwen3.5
-  runner: mi325x
-  precision: bf16
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-
-dsr1-fp8-mi325x-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-rocm700-mi30x
-  model: deepseek-ai/DeepSeek-R1-0528
-  model-prefix: dsr1
-  runner: mi325x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-
-qwen3.5-fp8-mi325x-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi325x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-
-glm5-fp8-mi325x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: mi325x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-
-glm5-fp8-mi325x-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: mi325x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-
-# ============================================================================
-# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
-# Recipes that ALREADY existed on main were intentionally left at main's version
-# to preserve main behavior; PR-branch modifications to those recipes are NOT
-# brought in here.
-# ============================================================================
-
-qwen3.5-fp8-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
-
-dsv4-fp4-mi355x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
-
-dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # MTP configurations
-      # 1P1D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1, 2, 4, 8 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp" 
-        conc-list: [ 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp" 
-        conc-list: [ 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=2"
-
-      # 1P2D TP4
-      - spec-decoding: "mtp" 
-        conc-list: [ 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=2"
-
-      # 1*DEP4+ 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1024, 2048, 4096 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
-
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # MTP configurations
-      # 1P1D pure TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1, 2, 4, 8 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=3"
-
-      # 1P2D TP8
-      - spec-decoding: "mtp"
-        conc-list: [ 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=2"
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 128, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
-
-      # 1*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 64, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
-
-      # 2*DEP8 + 1*DEP8
-      - spec-decoding: "mtp"
-        conc-list: [ 1024, 2048, 4096 ]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
-      
-
-# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
-# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
-# image tag, so bumping sglang is just an image tag bump here. Sweeps
-# DP-attention on/off and EP=8.
-
-# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below;
-# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding.
-# Image is identical to the base entry (rocm/sgl-dev DSv4 build).
-# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware
-# comparability. Offload sweep is none-only (SGLang has no equivalent of
-# vLLM's SimpleCPUOffloadConnector path that we exercise on b200).
-dsv4-fp4-mi355x-sglang-agentic:
-  image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
-      - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
-
-# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
-# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
-# on 2026-05-05, so any nightly built after that includes the
-# DeepseekV4ForCausalLM model class.
-#
-# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
-# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
-# files keyed on the image string and short-circuits re-import if the
-# file already exists, so the floating tag silently keeps a stale build
-# even after Docker Hub updates `:nightly`.
-#
-# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
-# rest); InferenceX classifies this as fp4 — same as the sister sglang
-# and atom DSv4 mi355x entries below. Image and serving flags follow the
-# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
-# executor, triton_unfused MoE (required for the FP4 expert format),
-# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
-# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
-# probe to validate the ROCm DP+EP path.
-
-dsv4-fp4-mi355x-atom-disagg:
-  image: rocm/atom-dev:nightly_202606101403
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: atom-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-      # 1P1D DPA+TP8
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # 2P1D DPA+TP8 
-      - conc-list: [ 256, 512, 768, 1024, 2048 ]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 1
-          dp-attn: true
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: true
-          additional-settings:
-          - "DECODE_NODES=1"
-      # 1P1D TP8 
-      - conc-list: [ 4, 8, 16, 32, 64, 128 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # 1P1D TP8
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-
-# MiniMax-M3 MXFP8 MI355X recipe:
-# https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5
-# MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
-minimaxm3-fp8-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
-      - { tp: 4, conc-start: 1, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
-      - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
-      - { tp: 4, conc-start: 1, conc-end: 128 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }
-
-# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
-# minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
-# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). No
-# attention_backend override is needed — the server runs on TRITON_ATTN, so
-# the FlashInfer page-128/MHA limitation that forced FLASH_ATTN on Blackwell
-# does not apply here. Search space mirrors the non-MTP entry trimmed at the
-# extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp /
-# b200-vllm-mtp precedent: spec decode pays off at low/mid concurrency while
-# acceptance dilutes in big batches, and the draft weights + draft KV shave
-# headroom — tp2-ep2 is dropped since its KV headroom was already thin.
-minimaxm3-fp8-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
-
-# MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config.
-minimaxm3-fp4-mi355x-vllm-disagg:
-  image: rocm/vllm-dev:vllm-0.23.1-rocm723-mi35x-mori-0625
-  model: amd/MiniMax-M3-MXFP4
-  model-prefix: minimaxm3
-  runner: mi355x-disagg
-  precision: fp4
-  framework: vllm-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # 1P TP4 + 1D TP4 (2 nodes total), conc sweep 1..512 (single job, looped)
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # 2P TP4 + 1D TP4 (3 nodes total), conc 128/256/512 (single job, looped)
-      - spec-decoding: "none"
-        conc-list: [ 128, 256, 512 ]
-        prefill:
-          num-worker: 2
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-# MiniMax-M3 MXFP4 MI355X vLLM recipe. The pinned nightly includes upstream
-# MiniMax-M3 Quark MXFP4 support (vllm-project/vllm#45794). Use the text-only
-# language-model path and mirror the MXFP8 MI355X search space for a direct
-# precision comparison.
-minimaxm3-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
-  model: amd/MiniMax-M3-MXFP4
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
-      - { tp: 4, conc-start: 1, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
-      - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
-      - { tp: 4, conc-start: 1, conc-end: 128 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }
-
-# EAGLE3 speculative-decoding variant of minimaxm3-fp4-mi355x-vllm. Pair the
-# amd/MiniMax-M3-MXFP4 target with Inferact/MiniMax-M3-EAGLE3 and three draft
-# tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base
-# FP4 sweep at extreme concurrency where speculative decoding loses value.
-minimaxm3-fp4-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
-  model: amd/MiniMax-M3-MXFP4
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
-
-# MiniMax-M3 MXFP4 MI355X atom recipe:
-# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
-# block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe.
-minimaxm3-fp4-mi355x-atom:
-  image: rocm/atom-dev:MiniMax-M3-20260623
-  model: amd/MiniMax-M3-MXFP4
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256 }
-
-minimaxm3-fp4-mi355x-atom-mtp:
-  image: rocm/atom-dev:MiniMax-M3-20260623
-  model: amd/MiniMax-M3-MXFP4
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp  }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp  }
-
-minimaxm3-fp8-mi355x-atom:
-  image: rocm/atom-dev:MiniMax-M3-20260623
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp8
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256 }
-
-minimaxm3-fp8-mi355x-atom-mtp:
-  image: rocm/atom-dev:MiniMax-M3-20260623
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi355x
-  precision: fp8
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-
-minimaxm3-fp8-mi355x-atom-disagg:
-  image: rocm/atom-dev:MiniMax-M3-20260622
-  model: amd/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi355x-disagg
-  precision: fp8
-  framework: atom-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # 1P1D TP4
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-
-minimaxm3-fp4-mi355x-atom-disagg:
-  image: rocm/atom-dev:MiniMax-M3-20260622
-  model: amd/MiniMax-M3-MXFP4
-  model-prefix: minimaxm3
-  runner: mi355x-disagg
-  precision: fp4
-  framework: atom-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # 1P1D TP4
-    - isl: 1024
-      osl: 1024
-      search-space:
-      # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-
-# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
-# MI355X serving shape, but retain the default BF16 KV cache because this
-# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
-# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency.
-minimaxm3-fp8-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:minimax-m3
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi300x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 128 }
-      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }
-
-# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
-# minimaxm3-fp8-mi300x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
-# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same TP8-only
-# search space as the non-MTP MI300X entry (gfx942 192 GB is memory-tight, like
-# H100), with the TP8 latency rows started at conc 1 to capture single-request
-# latency — matching the H100/MI355X MTP recipes. The pinned ROCm nightly
-# includes upstream SupportsEagle3 support for the AMD MiniMax-M3 model.
-minimaxm3-fp8-mi300x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi300x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 128, conc-end: 256, spec-decoding: mtp }
-
-# MiniMax-M3 MXFP8 MI325X day-zero recipe. Reuse the dedicated ROCm image
-# and serving flags validated on MI355X, with the H200 search space: TP4 and
-# TP8 latency, TP4/TP8 expert parallelism, and TP8 data-parallel attention.
-minimaxm3-fp8-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:minimax-m3
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi325x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 128, conc-end: 256 }
-      - { tp: 8, conc-start: 1, conc-end: 128 }
-      - { tp: 8, ep: 8, conc-start: 256, conc-end: 512 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 32 }
-      - { tp: 8, conc-start: 1, conc-end: 128 }
-      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
-
-# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
-# minimaxm3-fp8-mi325x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
-# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same H200-style
-# search space as the non-MTP MI325X entry, trimmed at the extreme-concurrency
-# end with TP-only latency rows started at conc 1 (matching the H200/MI355X MTP
-# recipes). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0,
-# BF16 KV on gfx942). The shipped ROCm image lacks SupportsEagle3 on the AMD
-# MiniMax-M3 model, so the recipe applies that fix in-place at runtime
-# (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on
-# MI355X/MI300X) before serving.
-minimaxm3-fp8-mi325x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:minimax-m3
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi325x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp }
-      - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
-
-# MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the
-# day-zero ROCm image. Minimal 1 prefill (TP8) + 1 decode (TP8) at conc 1 to
-# validate the MoRI-IO KV-transfer disagg pipeline end-to-end for M3. Layered on
-# the MoRI-patch-removal infra (#1585). No EP (TP8 only); MoE experts are
-# TP-sharded as in the single-node M3 TP8 recipe. Per-worker serve flags live in
-# benchmarks/multi_node/amd_utils/models_vllm.yaml (MiniMax-M3-MXFP8).
-minimaxm3-fp8-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-556bc4e3a089378e9df2482659898192da18db15
-  model: MiniMaxAI/MiniMax-M3-MXFP8
-  model-prefix: minimaxm3
-  runner: mi355x-disagg
-  precision: fp8
-  framework: vllm-disagg
-  multinode: true
-  disagg: true
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across
-      # conc 1,2,4,8,16,32,64,128,256.
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024.
-      - spec-decoding: "none"
-        conc-list: [ 64, 128, 256, 512, 1024 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2)
-      # feeding one full-node TP8 decode, at high conc 256,512,768,1024.
-      - spec-decoding: "none"
-        conc-list: [ 256, 512, 768, 1024 ]
-        prefill:
-          num-worker: 2
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-    # 8k1k disagg sweep across four P/D layouts (1P TP8 + 1D TP8 conc 1..1024;
-    # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024; 2P TP4 + 1D TP8
-    # conc 256..1024). The multi-node eval policy (8k1k + conc >= 16) marks one
-    # lm-eval on the highest-max-conc layout (TP8+TP8, eval-conc=median=128) —
-    # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end.
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across
-      # conc 1,2,4,8,16,32,64,128,256.
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024.
-      - spec-decoding: "none"
-        conc-list: [ 64, 128, 256, 512, 1024 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=1"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"
-      # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2)
-      # feeding one full-node TP8 decode, at high conc 256,512,768,1024.
-      - spec-decoding: "none"
-        conc-list: [ 256, 512, 768, 1024 ]
-        prefill:
-          num-worker: 2
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "PREFILL_NODES=2"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "DECODE_NODES=1"

From 8b6141c4076f737788ab694b88b11da1fbb1f625 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 3 Jul 2026 15:48:09 +0900
Subject: [PATCH 11/15] [AMD] refactor ATOM disagg config: split per-role
 flags, move model defaults to YAML

- Split MODEL_TP_DP_FLAGS and MODEL_EP_DP_FLAGS into prefill/decode variants
- Move BLOCK_SIZE, MEM_FRAC_STATIC, MAX_MODEL_LEN, MAX_NUM_SEQS,
  MAX_NUM_BATCHED_TOKENS from launch scripts into models_atom.yaml
- Add hf_overrides and online_quant_config (with DPA variant) to YAML
- Remove SPEC_DECODING gate; use MODEL_MTP_FLAGS + DECODE_MTP_SIZE > 0
- Add minimaxm3-fp4/fp8-mi355x-atom-disagg-mtp recipes to amd-master.yaml

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../multi_node/amd_utils/models_atom.yaml     |  47 ++++-
 .../multi_node/amd_utils/server_atom.sh       |  81 +++++---
 .../minimaxm3_fp4_mi355x_atom-disagg.sh       |  15 --
 .../minimaxm3_fp8_mi355x_atom-disagg.sh       |  16 --
 configs/amd-master.yaml                       | 188 +++++++++++++++++-
 5 files changed, 282 insertions(+), 65 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml
index 620aaf6c68..1c20e8aef8 100644
--- a/benchmarks/multi_node/amd_utils/models_atom.yaml
+++ b/benchmarks/multi_node/amd_utils/models_atom.yaml
@@ -9,32 +9,71 @@
 #   <model-name>:
 #     env:          str  # Space-separated KEY=VALUE pairs exported unconditionally
 #     hf_overrides: str  # JSON string passed to --hf-overrides
-#     tp_dp_flags:  str  # Parallel flags for TP+DPA case (must include --enable-dp-attention)
+#     tp_dp_flags:  str  # Shared TP+DPA flags (fallback when prefill/decode-specific keys are absent)
+#     prefill_tp_dp_flags: str  # TP+DPA flags for prefill only (overrides tp_dp_flags)
+#     decode_tp_dp_flags:  str  # TP+DPA flags for decode only (overrides tp_dp_flags)
 #     tp_dp_env:    str  # Space-separated KEY=VALUE pairs exported only in TP+DPA mode
-#     ep_dp_flags:  str  # Parallel flags for EP+DPA case (must include --enable-expert-parallel --enable-dp-attention)
+#     ep_dp_flags:  str  # Shared EP+DPA flags (fallback when prefill/decode-specific keys are absent)
+#     prefill_ep_dp_flags: str  # EP+DPA flags for prefill only (overrides ep_dp_flags)
+#     decode_ep_dp_flags:  str  # EP+DPA flags for decode only (overrides ep_dp_flags)
 #     ep_dp_env:    str  # Space-separated KEY=VALUE pairs exported only in EP+DPA mode
 #     mtp_flags:    str  # Flags passed to SPEC_ARGS before $DECODE_MTP_SIZE (e.g. "--method mtp --num-speculative-tokens")
 #     kv_cache_flags: str  # Full --kv_cache_dtype flag string (e.g. "--kv_cache_dtype fp8", or "" for none)
+#     online_quant_config: str  # JSON string passed to --online_quant_config (used when DPA is disabled)
+#     online_quant_dpa_config: str  # JSON string passed to --online_quant_config when DPA is enabled (falls back to online_quant_config)
+#     block_size:   str  # --block-size value (overrides server_atom.sh default of 16)
+#     mem_frac_static: str  # --gpu-memory-utilization value (overrides default of 0.85)
+#     max_model_len: str  # --max-model-len value (overrides default of unset)
+#     max_num_seqs:  str  # --max-num-seqs value (overrides default of 256)
+#     max_num_batched_tokens: str  # --max-num-batched-tokens value (overrides default of unset)
 
 DeepSeek-V4-Pro:
   env: "ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0"
   kv_cache_flags: "--kv_cache_dtype fp8"
   tp_dp_flags: "--enable-dp-attention --enable-tbo"
+  prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo"
+  decode_tp_dp_flags: "--enable-dp-attention --enable-tbo"
   tp_dp_env: "GPU_MAX_HW_QUEUES=5 ATOM_CPU_AFFINITY=1"
   ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
+  prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
+  decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   mtp_flags: "--method mtp --num-speculative-tokens"
   hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}'
 
 MiniMax-M3-MXFP4:
-  env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0"
+  env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1"
   kv_cache_flags: "--kv_cache_dtype fp8"
   tp_dp_flags: "--enable-dp-attention"
+  prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo prefill"
+  decode_tp_dp_flags: "--enable-dp-attention"
   ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
+  prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
+  decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
+  hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}'
+  online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}'
+  online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}'
+  block_size: "128"
+  mem_frac_static: "0.8"
+  max_model_len: "32768"
+  max_num_seqs: "256"
+  max_num_batched_tokens: "32768"
 
 MiniMax-M3-MXFP8:
-  env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0"
+  env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1"
   kv_cache_flags: "--kv_cache_dtype fp8"
   tp_dp_flags: "--enable-dp-attention"
+  prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo prefill"
+  decode_tp_dp_flags: "--enable-dp-attention"
   ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
+  prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
+  decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
+  hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}'
+  online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}'
+  online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*.gate.*","*.block_sparse_moe.experts*"]}'
+  block_size: "128"
+  mem_frac_static: "0.8"
+  max_model_len: "32768"
+  max_num_seqs: "256"
+  max_num_batched_tokens: "32768"
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 303e0d8767..ea79a8d663 100755
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -36,8 +36,7 @@ DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
 DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
 
 # MTP
-SPEC_DECODING="${SPEC_DECODING:-}"
-DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}"
+DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}"
 
 # ATOM server ports (different from SGLang which uses 8000 for all)
 PREFILL_PORT="${PREFILL_PORT:-8010}"
@@ -88,19 +87,48 @@ with open('${ATOM_WS_PATH}/models_atom.yaml') as f:
     m = yaml.safe_load(f).get('${MODEL_NAME}', {})
 def sh(v): return v.replace("'", "'\\''")
 print(f"MODEL_ENVS='{sh(m.get('env', ''))}'")
-print(f"MODEL_TP_DP_FLAGS='{sh(m.get('tp_dp_flags', ''))}'")
-print(f"MODEL_EP_DP_FLAGS='{sh(m.get('ep_dp_flags', ''))}'")
+_tp_dp = m.get('tp_dp_flags', '')
+print(f"PREFILL_MODEL_TP_DP_FLAGS='{sh(m.get('prefill_tp_dp_flags', _tp_dp))}'")
+print(f"DECODE_MODEL_TP_DP_FLAGS='{sh(m.get('decode_tp_dp_flags', _tp_dp))}'")
+_ep_dp = m.get('ep_dp_flags', '')
+print(f"PREFILL_MODEL_EP_DP_FLAGS='{sh(m.get('prefill_ep_dp_flags', _ep_dp))}'")
+print(f"DECODE_MODEL_EP_DP_FLAGS='{sh(m.get('decode_ep_dp_flags', _ep_dp))}'")
 print(f"MODEL_TP_DP_ENV='{sh(m.get('tp_dp_env', ''))}'")
 print(f"MODEL_EP_DP_ENV='{sh(m.get('ep_dp_env', ''))}'")
 print(f"MODEL_MTP_FLAGS='{sh(m.get('mtp_flags', ''))}'")
 print(f"MODEL_KV_ARG='{sh(m.get('kv_cache_flags', ''))}'")
 print(f"_HF_OVERRIDES='{sh(m.get('hf_overrides', ''))}'")
+print(f"_ONLINE_QUANT_CONFIG='{sh(m.get('online_quant_config', ''))}'")
+print(f"_ONLINE_QUANT_DPA_CONFIG='{sh(m.get('online_quant_dpa_config', m.get('online_quant_config', '')))}'")
+print(f"_YAML_BLOCK_SIZE='{sh(m.get('block_size', ''))}'")
+print(f"_YAML_MEM_FRAC_STATIC='{sh(m.get('mem_frac_static', ''))}'")
+print(f"_YAML_MAX_MODEL_LEN='{sh(m.get('max_model_len', ''))}'")
+print(f"_YAML_MAX_NUM_SEQS='{sh(m.get('max_num_seqs', ''))}'")
+print(f"_YAML_MAX_NUM_BATCHED_TOKENS='{sh(m.get('max_num_batched_tokens', ''))}'")
 PYEOF
 # shellcheck source=/dev/null
 source "$_yaml_tmp"
 rm -f "$_yaml_tmp"
 unset _yaml_tmp
 
+# Apply YAML server-tuning defaults (env vars take precedence)
+if [[ -n "$_YAML_BLOCK_SIZE" ]]; then
+    BLOCK_SIZE="${BLOCK_SIZE:-$_YAML_BLOCK_SIZE}"
+fi
+if [[ -n "$_YAML_MEM_FRAC_STATIC" ]]; then
+    MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-$_YAML_MEM_FRAC_STATIC}"
+fi
+if [[ -n "$_YAML_MAX_MODEL_LEN" ]]; then
+    MAX_MODEL_LEN="${MAX_MODEL_LEN:-$_YAML_MAX_MODEL_LEN}"
+fi
+if [[ -n "$_YAML_MAX_NUM_SEQS" ]]; then
+    MAX_NUM_SEQS="${MAX_NUM_SEQS:-$_YAML_MAX_NUM_SEQS}"
+fi
+if [[ -n "$_YAML_MAX_NUM_BATCHED_TOKENS" ]]; then
+    MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-$_YAML_MAX_NUM_BATCHED_TOKENS}"
+fi
+unset _YAML_BLOCK_SIZE _YAML_MEM_FRAC_STATIC _YAML_MAX_MODEL_LEN _YAML_MAX_NUM_SEQS _YAML_MAX_NUM_BATCHED_TOKENS
+
 # =============================================================================
 # Cluster Topology Configuration
 # =============================================================================
@@ -134,29 +162,41 @@ PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}"
 DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
 DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
 
+
+
+
 # Parallel args
 PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
+ONLINE_QUANT_ARG=""
 if [ "$PREFILL_ENABLE_DP" = "true" ]; then
     if [ "$PREFILL_ENABLE_EP" = "true" ]; then #EP+DPA
-        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS})
+        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${PREFILL_MODEL_EP_DP_FLAGS})
         for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
     else #TP+DPA
-        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_TP_DP_FLAGS})
+        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${PREFILL_MODEL_TP_DP_FLAGS})
         for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done
     fi
+    if [[ -n "$_ONLINE_QUANT_DPA_CONFIG" ]]; then
+        ONLINE_QUANT_ARG="--online_quant_config '${_ONLINE_QUANT_DPA_CONFIG}'"
+    fi
+else
+    if [[ -n "$_ONLINE_QUANT_CONFIG" ]]; then
+        ONLINE_QUANT_ARG="--online_quant_config '${_ONLINE_QUANT_CONFIG}'"
+    fi
 fi
 
 DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP
 if [ "$DECODE_ENABLE_DP" = "true" ]; then
     if [ "$DECODE_ENABLE_EP" = "true" ]; then #EP+DPA
-        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS})
+        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${DECODE_MODEL_EP_DP_FLAGS})
         for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
     else #TP+DPA
-        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_TP_DP_FLAGS})
+        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${DECODE_MODEL_TP_DP_FLAGS})
         for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done
     fi
 fi
 unset _dp_env_pair
+unset _ONLINE_QUANT_CONFIG _ONLINE_QUANT_DPA_CONFIG
 
 # HF overrides (single-quoted JSON preserved through eval)
 HF_OVERRIDES_ARG=""
@@ -172,7 +212,7 @@ unset _env_pair
 
 # MTP args
 SPEC_ARGS=()
-if [[ "$SPEC_DECODING" != "none" && "$SPEC_DECODING" != "" && -n "$MODEL_MTP_FLAGS" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then
+if [[ -n "$MODEL_MTP_FLAGS" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then
     SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE")
 fi
 
@@ -203,7 +243,7 @@ Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NU
 Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
 Decode  args : ${DECODE_PARALLEL_ARGS[*]}
 Spec    args : ${SPEC_ARGS[*]}
-Opt     args : ${HF_OVERRIDES_ARG}
+Opt     args : ${HF_OVERRIDES_ARG} ${ONLINE_QUANT_ARG}
 =====================
 INFO
 
@@ -244,6 +284,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         ${MODEL_LEN_ARGS} \
         --no-enable_prefix_caching \
         ${HF_OVERRIDES_ARG} \
+        ${ONLINE_QUANT_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -334,7 +375,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
     cd $ATOM_WS_PATH
 
     export IS_MTP="false"
-    if [ "$SPEC_DECODING" = "mtp" ]; then
+    if [[ -n "$MODEL_MTP_FLAGS" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then
         export IS_MTP="true"
     fi
 
@@ -466,6 +507,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         ${MODEL_LEN_ARGS} \
         --no-enable_prefix_caching \
         ${HF_OVERRIDES_ARG} \
+        ${ONLINE_QUANT_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -522,21 +564,9 @@ else
     echo "${host_name}:${host_ip} is Decode Node (rank ${RANK})"
 
     _MAX_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
-    if [[ "$_MAX_CONC" -gt 2048 ]]; then
-        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048,4096]'
-    elif [[ "$_MAX_CONC" -gt 1024 ]]; then
-        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048]'
-    elif [[ "$_MAX_CONC" -gt 512 ]]; then
-        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,768,1024]'
-    else
-        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512]'
-    fi
+    CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]'
 
-    if [[ "$BENCH_INPUT_LEN" == "1024" && "$BENCH_OUTPUT_LEN" == "1024" ]]; then
-        DECODE_MAX_NUM_SEQS="${_MAX_CONC}"
-    else
-        DECODE_MAX_NUM_SEQS="${MAX_NUM_SEQS}"
-    fi
+    DECODE_MAX_NUM_SEQS="${_MAX_CONC}"
 
     DECODE_CMD="python3 -m atom.entrypoints.openai_server \
         --model ${MODEL_DIR}/${MODEL_NAME} \
@@ -551,6 +581,7 @@ else
         ${MODEL_LEN_ARGS} \
         --no-enable_prefix_caching \
         ${HF_OVERRIDES_ARG} \
+        ${ONLINE_QUANT_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
         ${EXTRA_SERVER_ARGS}"
diff --git a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh
index 9b1957fa5f..1505b905de 100644
--- a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh
+++ b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh
@@ -60,21 +60,6 @@ else
 export DECODE_ENABLE_DP=false
 fi
 
-# No MTP for MiniMax-M3
-export SPEC_DECODING="none"
-export DECODE_MTP_SIZE=0
-
-# Block size 128
-export BLOCK_SIZE="${BLOCK_SIZE:-128}"
-export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}"
-export MAX_MODEL_LEN=32768
-export MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}"
-export MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"
-
-# Launch jobs based on ISL/OSL
-# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
-# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
-# expects the concurrencies.
 JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
     $PREFILL_NUM_WORKERS \
     $DECODE_NODES \
diff --git a/benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh b/benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh
index 505f743195..1505b905de 100644
--- a/benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh
+++ b/benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh
@@ -60,22 +60,6 @@ else
 export DECODE_ENABLE_DP=false
 fi
 
-# No MTP for MiniMax-M3
-export SPEC_DECODING="none"
-export DECODE_MTP_SIZE=0
-
-# Block size 128
-export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-auto}"
-export BLOCK_SIZE="${BLOCK_SIZE:-128}"
-export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}"
-export MAX_MODEL_LEN=32768
-export MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}"
-export MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"
-
-# Launch jobs based on ISL/OSL
-# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
-# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
-# expects the concurrencies.
 JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
     $PREFILL_NUM_WORKERS \
     $DECODE_NODES \
diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml
index cd05d7dc22..462c477421 100644
--- a/configs/amd-master.yaml
+++ b/configs/amd-master.yaml
@@ -2721,8 +2721,8 @@ minimaxm3-fp8-mi355x-atom-mtp:
       - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
 
 minimaxm3-fp8-mi355x-atom-disagg:
-  image: rocm/atom-dev:MiniMax-M3-20260622
-  model: amd/MiniMax-M3-MXFP8
+  image: rocm/atom-dev:nightly_202607011530
+  model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x-disagg
   precision: fp8
@@ -2735,7 +2735,7 @@ minimaxm3-fp8-mi355x-atom-disagg:
       osl: 1024
       search-space:
       # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
         prefill:
           num-worker: 1
           tp: 4
@@ -2750,12 +2750,100 @@ minimaxm3-fp8-mi355x-atom-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+      # 2P1D, DPA TP4
+      - conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
       # 1P1D TP4
     - isl: 1024
       osl: 1024
       search-space:
       # 1P1D TP4
-      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+minimaxm3-fp8-mi355x-atom-disagg-mtp:
+  image: rocm/atom-dev:nightly_202607011530
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp8
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+      # 2P1D, DPA TP4
+      - conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+      # 1P1D TP4
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
         prefill:
           num-worker: 1
           tp: 4
@@ -2770,9 +2858,10 @@ minimaxm3-fp8-mi355x-atom-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
 
 minimaxm3-fp4-mi355x-atom-disagg:
-  image: rocm/atom-dev:MiniMax-M3-20260623
+  image: rocm/atom-dev:nightly_202607011530
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x-disagg
@@ -2801,6 +2890,24 @@ minimaxm3-fp4-mi355x-atom-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+      # 2P1D, DPA TP4
+      - conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
       # 1P1D TP4
     - isl: 1024
       osl: 1024
@@ -2821,6 +2928,77 @@ minimaxm3-fp4-mi355x-atom-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
+minimaxm3-fp4-mi355x-atom-disagg-mtp:
+  image: rocm/atom-dev:nightly_202607011530
+  model: amd/MiniMax-M3-MXFP4
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp4
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+      # 2P1D, DPA TP4
+      - conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+      # 1P1D TP4
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
 
 # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
 # MI355X serving shape, but retain the default BF16 KV cache because this

From a6ef155cfb2b39bc2d7e7bd4e8a2e0dc13a742c8 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 3 Jul 2026 16:06:45 +0900
Subject: [PATCH 12/15] [AMD] fix YAML server-tuning defaults never taking
 effect

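Shell defaults (BLOCK_SIZE=16, MEM_FRAC_STATIC=0.85, MAX_NUM_SEQS=256)
were assigned before models_atom.yaml was loaded, so the later
"${VAR:-$_YAML_VAR}" expansions always saw a non-empty value and the
YAML values (e.g. 128, 0.8) were never substituted.
Drop the early assignments and apply a single three-tier fallback after
the YAML load: env var > YAML > shell default.

Also drop the stray "prefill" argument after --enable-tbo in the
MiniMax-M3-MXFP4 prefill_tp_dp_flags (the MiniMax-M3-MXFP8 entry is not
touched by this commit).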

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../multi_node/amd_utils/models_atom.yaml     |  2 +-
 .../multi_node/amd_utils/server_atom.sh       | 29 +++++--------------
 2 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml
index 1c20e8aef8..6e4f59f778 100644
--- a/benchmarks/multi_node/amd_utils/models_atom.yaml
+++ b/benchmarks/multi_node/amd_utils/models_atom.yaml
@@ -44,7 +44,7 @@ MiniMax-M3-MXFP4:
   env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1"
   kv_cache_flags: "--kv_cache_dtype fp8"
   tp_dp_flags: "--enable-dp-attention"
-  prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo prefill"
+  prefill_tp_dp_flags: "--enable-dp-attention --enable-tbo"
   decode_tp_dp_flags: "--enable-dp-attention"
   ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index ea79a8d663..e2f90c6ceb 100755
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -44,12 +44,7 @@ DECODE_PORT="${DECODE_PORT:-8020}"
 ROUTER_PORT="${ROUTER_PORT:-8000}"
 HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}"
 
-# ATOM server tuning (from reference script defaults)
-MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}"
-BLOCK_SIZE="${BLOCK_SIZE:-16}"
-MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}"
-MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
-MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-}"
+# ATOM server tuning — defaults applied after YAML load (env var > YAML > shell default)
 EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:-}"
 
 # Benchmark Configuration
@@ -111,22 +106,12 @@ source "$_yaml_tmp"
 rm -f "$_yaml_tmp"
 unset _yaml_tmp
 
-# Apply YAML server-tuning defaults (env vars take precedence)
-if [[ -n "$_YAML_BLOCK_SIZE" ]]; then
-    BLOCK_SIZE="${BLOCK_SIZE:-$_YAML_BLOCK_SIZE}"
-fi
-if [[ -n "$_YAML_MEM_FRAC_STATIC" ]]; then
-    MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-$_YAML_MEM_FRAC_STATIC}"
-fi
-if [[ -n "$_YAML_MAX_MODEL_LEN" ]]; then
-    MAX_MODEL_LEN="${MAX_MODEL_LEN:-$_YAML_MAX_MODEL_LEN}"
-fi
-if [[ -n "$_YAML_MAX_NUM_SEQS" ]]; then
-    MAX_NUM_SEQS="${MAX_NUM_SEQS:-$_YAML_MAX_NUM_SEQS}"
-fi
-if [[ -n "$_YAML_MAX_NUM_BATCHED_TOKENS" ]]; then
-    MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-$_YAML_MAX_NUM_BATCHED_TOKENS}"
-fi
+# Apply server-tuning: env var > YAML > shell default
+BLOCK_SIZE="${BLOCK_SIZE:-${_YAML_BLOCK_SIZE:-16}}"
+MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-${_YAML_MEM_FRAC_STATIC:-0.85}}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-${_YAML_MAX_MODEL_LEN:-}}"
+MAX_NUM_SEQS="${MAX_NUM_SEQS:-${_YAML_MAX_NUM_SEQS:-256}}"
+MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-${_YAML_MAX_NUM_BATCHED_TOKENS:-}}"
 unset _YAML_BLOCK_SIZE _YAML_MEM_FRAC_STATIC _YAML_MAX_MODEL_LEN _YAML_MAX_NUM_SEQS _YAML_MAX_NUM_BATCHED_TOKENS
 
 # =============================================================================

From 8d57cde68ab1bd1d28189c452eb341f47a7e744d Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 3 Jul 2026 16:15:49 +0900
Subject: [PATCH 13/15] [AMD] add perf-changelog entry for MiniMax-M3 ATOM
 disagg refactor (PR #2000)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 35b796fddb..b2af93f012 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4306,14 +4306,15 @@
 
 - config-keys:
     - minimaxm3-fp4-mi355x-atom-disagg
+    - minimaxm3-fp4-mi355x-atom-disagg-mtp
+    - minimaxm3-fp8-mi355x-atom-disagg
+    - minimaxm3-fp8-mi355x-atom-disagg-mtp
   description:
-    - "Add minimaxm3-fp4-mi355x-atom-disagg CI recipe: multi-node disaggregated PD on MI355X via ATOM for MiniMax-M3-MXFP4"
-    - "Image: rocm/atom-dev:MiniMax-M3-20260623; model: amd/MiniMax-M3-MXFP4; framework: atom-disagg"
-    - "Search space: ISL=8192 and ISL=1024, OSL=1024, 1P1D TP4, conc 1-512"
-    - "Refactor server_atom.sh to eliminate all hardcoded MODEL_NAME checks; all model-specific config (env, parallel flags, MTP flags, KV cache flags, HF overrides) now driven from models_atom.yaml"
-    - "Add MiniMax-M3-MXFP4 and MiniMax-M3-MXFP8 entries to models_atom.yaml with EAGLE3 MTP flags (--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3)"
-    - "Fix model HuggingFace path for minimaxm3-fp8-mi355x-atom-disagg: amd/MiniMax-M3-MXFP8 -> MiniMaxAI/MiniMax-M3-MXFP8"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1940
+    - "Refactor ATOM disagg server_atom.sh and models_atom.yaml: split MODEL_TP_DP_FLAGS and MODEL_EP_DP_FLAGS into per-role prefill/decode variants, move BLOCK_SIZE (128), MEM_FRAC_STATIC (0.8), MAX_MODEL_LEN (32768), MAX_NUM_SEQS (256), MAX_NUM_BATCHED_TOKENS (32768) from launch scripts into models_atom.yaml with env-var override support."
+    - "Add hf_overrides and online_quant_config (with DPA-specific variant) to models_atom.yaml. FP8 uses different online_quant_config exclude patterns when DPA is enabled (*.gate.*, *.block_sparse_moe.experts*) vs disabled (*block_sparse_moe)."
+    - "Add minimaxm3-fp4/fp8-mi355x-atom-disagg-mtp recipes with EAGLE3 speculative decoding (DECODE_MTP_SIZE=3) at 1P1D and 2P1D DPA TP4 search spaces for 8k1k and 1k1k."
+    - "Fix YAML server-tuning defaults precedence (env var > YAML > shell default) and remove redundant SPEC_DECODING gating in favor of MODEL_MTP_FLAGS + DECODE_MTP_SIZE > 0."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2000
 
 - config-keys:
     - minimaxm3-fp8-mi355x-vllm-mtp

From 0fd4545b755abc785c575c35eef4bfe6ab69dc27 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 3 Jul 2026 16:30:59 +0900
Subject: [PATCH 14/15] [AMD] remove hf_overrides from models_atom.yaml and
 server_atom.sh

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/models_atom.yaml |  4 ----
 benchmarks/multi_node/amd_utils/server_atom.sh   | 13 +------------
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml
index 6e4f59f778..4a854f49fd 100644
--- a/benchmarks/multi_node/amd_utils/models_atom.yaml
+++ b/benchmarks/multi_node/amd_utils/models_atom.yaml
@@ -8,7 +8,6 @@
 # Schema:
 #   <model-name>:
 #     env:          str  # Space-separated KEY=VALUE pairs exported unconditionally
-#     hf_overrides: str  # JSON string passed to --hf-overrides
 #     tp_dp_flags:  str  # Shared TP+DPA flags (fallback when prefill/decode-specific keys are absent)
 #     prefill_tp_dp_flags: str  # TP+DPA flags for prefill only (overrides tp_dp_flags)
 #     decode_tp_dp_flags:  str  # TP+DPA flags for decode only (overrides tp_dp_flags)
@@ -38,7 +37,6 @@ DeepSeek-V4-Pro:
   prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   mtp_flags: "--method mtp --num-speculative-tokens"
-  hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}'
 
 MiniMax-M3-MXFP4:
   env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_FORCE_ATTN_TRITON=1"
@@ -50,7 +48,6 @@ MiniMax-M3-MXFP4:
   prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
-  hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}'
   online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}'
   online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}'
   block_size: "128"
@@ -69,7 +66,6 @@ MiniMax-M3-MXFP8:
   prefill_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   decode_ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
   mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
-  hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}'
   online_quant_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*block_sparse_moe"]}'
   online_quant_dpa_config: '{"global_quant_config":"ptpc_fp8","exclude_layer":["lm_head","model.embed_tokens","vision_tower","multi_modal_projector","patch_merge_mlp","*.gate.*","*.block_sparse_moe.experts*"]}'
   block_size: "128"
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index e2f90c6ceb..76fe119698 100755
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -92,7 +92,6 @@ print(f"MODEL_TP_DP_ENV='{sh(m.get('tp_dp_env', ''))}'")
 print(f"MODEL_EP_DP_ENV='{sh(m.get('ep_dp_env', ''))}'")
 print(f"MODEL_MTP_FLAGS='{sh(m.get('mtp_flags', ''))}'")
 print(f"MODEL_KV_ARG='{sh(m.get('kv_cache_flags', ''))}'")
-print(f"_HF_OVERRIDES='{sh(m.get('hf_overrides', ''))}'")
 print(f"_ONLINE_QUANT_CONFIG='{sh(m.get('online_quant_config', ''))}'")
 print(f"_ONLINE_QUANT_DPA_CONFIG='{sh(m.get('online_quant_dpa_config', m.get('online_quant_config', '')))}'")
 print(f"_YAML_BLOCK_SIZE='{sh(m.get('block_size', ''))}'")
@@ -183,13 +182,6 @@ fi
 unset _dp_env_pair
 unset _ONLINE_QUANT_CONFIG _ONLINE_QUANT_DPA_CONFIG
 
-# HF overrides (single-quoted JSON preserved through eval)
-HF_OVERRIDES_ARG=""
-if [[ -n "$_HF_OVERRIDES" ]]; then
-    HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'"
-fi
-unset _HF_OVERRIDES
-
 for _env_pair in ${MODEL_ENVS}; do
     export "$_env_pair"
 done
@@ -228,7 +220,7 @@ Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NU
 Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
 Decode  args : ${DECODE_PARALLEL_ARGS[*]}
 Spec    args : ${SPEC_ARGS[*]}
-Opt     args : ${HF_OVERRIDES_ARG} ${ONLINE_QUANT_ARG}
+Opt     args : ${ONLINE_QUANT_ARG}
 =====================
 INFO
 
@@ -268,7 +260,6 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --max-num-seqs ${MAX_NUM_SEQS} \
         ${MODEL_LEN_ARGS} \
         --no-enable_prefix_caching \
-        ${HF_OVERRIDES_ARG} \
         ${ONLINE_QUANT_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
@@ -491,7 +482,6 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         --max-num-seqs ${MAX_NUM_SEQS} \
         ${MODEL_LEN_ARGS} \
         --no-enable_prefix_caching \
-        ${HF_OVERRIDES_ARG} \
         ${ONLINE_QUANT_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
@@ -565,7 +555,6 @@ else
         --max-num-seqs ${DECODE_MAX_NUM_SEQS} \
         ${MODEL_LEN_ARGS} \
         --no-enable_prefix_caching \
-        ${HF_OVERRIDES_ARG} \
         ${ONLINE_QUANT_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \

From 113e568471d00a14da2f88f7de3a73b5616fff41 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Sat, 4 Jul 2026 13:09:36 +0900
Subject: [PATCH 15/15] [AMD] fix server-tuning: YAML values must override
 job.slurm Docker defaults

job.slurm injects BLOCK_SIZE=16, MEM_FRAC_STATIC=0.85, MAX_NUM_SEQS=256
as Docker env vars with hardcoded defaults. The previous env-first fallback
(env > YAML > default) meant YAML values were always shadowed. Flip all
five server-tuning vars to YAML > env > default so models_atom.yaml
entries (e.g. block_size=128 for MiniMax-M3-MXFP4) actually take effect.

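Also add `set -x` immediately before the YAML parsing step for CI
debuggability. This patch adds no matching `set +x`, so the change itself
does not switch command tracing off again afterwards.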

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 76fe119698..45f3053bd5 100755
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -75,6 +75,7 @@ host_name=$(hostname)
 # Model-Specific Configuration from YAML
 # =============================================================================
 # Load model-specific config from YAML (single parse for all fields)
+set -x
 _yaml_tmp=$(mktemp)
 python3 << PYEOF > "$_yaml_tmp"
 import yaml
@@ -105,12 +106,14 @@ source "$_yaml_tmp"
 rm -f "$_yaml_tmp"
 unset _yaml_tmp
 
-# Apply server-tuning: env var > YAML > shell default
-BLOCK_SIZE="${BLOCK_SIZE:-${_YAML_BLOCK_SIZE:-16}}"
-MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-${_YAML_MEM_FRAC_STATIC:-0.85}}"
-MAX_MODEL_LEN="${MAX_MODEL_LEN:-${_YAML_MAX_MODEL_LEN:-}}"
-MAX_NUM_SEQS="${MAX_NUM_SEQS:-${_YAML_MAX_NUM_SEQS:-256}}"
-MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-${_YAML_MAX_NUM_BATCHED_TOKENS:-}}"
+# Apply server-tuning: YAML > env var > shell default
+# (job.slurm injects BLOCK_SIZE/MEM_FRAC_STATIC/MAX_NUM_SEQS with hardcoded
+#  defaults into the Docker env, so env-first would always shadow the YAML.)
+BLOCK_SIZE="${_YAML_BLOCK_SIZE:-${BLOCK_SIZE:-16}}"
+MEM_FRAC_STATIC="${_YAML_MEM_FRAC_STATIC:-${MEM_FRAC_STATIC:-0.85}}"
+MAX_MODEL_LEN="${_YAML_MAX_MODEL_LEN:-${MAX_MODEL_LEN:-}}"
+MAX_NUM_SEQS="${_YAML_MAX_NUM_SEQS:-${MAX_NUM_SEQS:-256}}"
+MAX_NUM_BATCHED_TOKENS="${_YAML_MAX_NUM_BATCHED_TOKENS:-${MAX_NUM_BATCHED_TOKENS:-}}"
 unset _YAML_BLOCK_SIZE _YAML_MEM_FRAC_STATIC _YAML_MAX_MODEL_LEN _YAML_MAX_NUM_SEQS _YAML_MAX_NUM_BATCHED_TOKENS
 
 # =============================================================================