From f8f9b66462973906e4ec017e1567bd47001e200f Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <fangzhouai@gmail.com>
Date: Wed, 1 Jul 2026 08:46:36 +0000
Subject: [PATCH 1/4] [AMD] MiniMax-M3 FP4 MI355X vLLM STP: close high-conc gap
 vs ATOM

Add the four levers that bring the single-node MXFP4 MI355X vLLM STP recipe
to parity with the ATOM recipe at high concurrency:
  - INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +
    CAST_BF16_TO_FP16=0 + QUANTIZATION_MIN_SIZE_KB=256). The decode all-reduce
    is the biggest decode kernel; INT4 makes it ~4x cheaper.
  - fp8 KV cache (--kv-cache-dtype fp8).
  - cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4),
    which needs vllm-project/vllm#47269.
  - input-side all-reduce + RMSNorm fusion, which needs vllm-project/vllm#47270
    (automatic once merged).

Measured (amd/MiniMax-M3-MXFP4, MI355X, TP4, 8k1k, ATOM's benchmark on both):
vLLM conc32 17.21ms / conc64 25.13ms vs ATOM ref 16.74 / 25.00 (matched).
GSM8K limit=100 = 0.95 (INT4 all-reduce is accuracy-safe).

The image pin must be bumped to a nightly containing #47269 and #47270
before sweeping for the full curve.

AI assistance (Claude) was used.

Signed-off-by: Fangzhou Ai <fangzhouai@gmail.com>
---
 .../minimaxm3_fp4_mi355x_vllm.sh              | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
index 4be977a80..710b8bef8 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
@@ -4,6 +4,18 @@
 # https://huggingface.co/amd/MiniMax-M3-MXFP4#reproduction
 # Block size 128 is mandatory for MSA. This fixed-sequence benchmark uses the
 # text-only language-model path with AITER MoE (vllm-project/vllm#46419).
+#
+# High-concurrency parity with the ATOM recipe comes from four levers:
+#   * INT4 quantized all-reduce (env knobs below) -- the decode all-reduce is
+#     the biggest decode kernel; INT4 makes it ~4x cheaper (~-12% to -17% TPOT
+#     at conc 64/128/256). Works on any nightly.
+#   * fp8 KV cache (--kv-cache-dtype fp8).
+#   * cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4) --
+#     requires vllm-project/vllm#47269.
+#   * input-side all-reduce + RMSNorm fusion -- requires vllm-project/vllm#47270
+#     (automatic once merged; no flag).
+# Bump the image pin (.github/configs/amd-master.yaml) to a nightly containing
+# #47269 and #47270 before sweeping for the full curve.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -35,6 +47,13 @@ export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_USE_AITER_MOE=1
 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
+# INT4 quantized all-reduce for the (~1.5 MB) decode all-reduces, which are the
+# single biggest decode kernel at high concurrency. The MIN_SIZE_KB override is
+# required: vLLM's default INT4 quick-reduce size gate for (bf16, TP4) is 16 MB,
+# so without the override INT4 quick-reduce never fires for decode-sized tensors.
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+export VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -63,6 +82,8 @@ vllm serve "$MODEL" --port "$PORT" \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
     --moe-backend aiter \
+    --kv-cache-dtype fp8 \
+    --hf-overrides '{"text_config": {"use_index_cache": true, "index_topk_freq": 4}}' \
     --tool-call-parser minimax_m3 \
     --enable-auto-tool-choice \
     --reasoning-parser minimax_m3 > "$SERVER_LOG" 2>&1 &

From 77c5a9dc81b959dd2a83e8c97296ef5e334afc70 Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <fangzhouai@gmail.com>
Date: Wed, 1 Jul 2026 08:47:10 +0000
Subject: [PATCH 2/4] Add perf-changelog entry for minimaxm3-fp4-mi355x-vllm
 QR-INT4

Signed-off-by: Fangzhou Ai <fangzhouai@gmail.com>
---
 perf-changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index b776a5d1d..f0dcee33e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4351,3 +4351,11 @@
     - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41, which includes vllm-project/vllm PR #46380; no runtime patch needed"
     - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1966
+
+- config-keys:
+    - minimaxm3-fp4-mi355x-vllm
+  description:
+    - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM STP. Add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4), and rely on input-side all-reduce+RMSNorm fusion."
+    - "INT4 quick-reduce makes the (biggest) decode all-reduce ~4x cheaper; the MIN_SIZE_KB override is required because vLLM's default INT4 gate for (bf16, TP4) is 16MB and never fires for the ~1.5MB decode all-reduces. Local (ATOM's benchmark): conc32 17.21ms / conc64 25.13ms vs ATOM ref 16.74 / 25.00 (matched); GSM8K limit100=0.95."
+    - "index_topk_freq needs vllm-project/vllm#47269 and the AR+RMSNorm fusion needs vllm-project/vllm#47270 in the served image; bump the image pin to a nightly containing both before sweeping."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1969

From b12d92d0fc58ba6004128ba8c9e9dd71bfeedef7 Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <fangzhouai@gmail.com>
Date: Wed, 1 Jul 2026 12:56:19 +0000
Subject: [PATCH 3/4] Drop AR+RMSNorm fusion (#47270) dependency; #47269 merged

The input-side all-reduce+RMSNorm fusion (vllm-project/vllm#47270) is a
no-op on AMD: fused_allreduce_gemma_rms_norm's fused kernel is flashinfer
(NVIDIA-only), so on ROCm it falls back to plain all-reduce + GemmaRMSNorm.
Remove it from the recipe's dependency set. The cross-layer index-sharing
PR (#47269) is merged, so the sweep now only needs a nightly containing it.

Signed-off-by: Fangzhou Ai <fangzhouai@gmail.com>
---
 .../fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh   | 16 +++++++---------
 perf-changelog.yaml                              |  6 +++---
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
index 710b8bef8..f0f0ea5f5 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
@@ -5,17 +5,15 @@
 # Block size 128 is mandatory for MSA. This fixed-sequence benchmark uses the
 # text-only language-model path with AITER MoE (vllm-project/vllm#46419).
 #
-# High-concurrency parity with the ATOM recipe comes from four levers:
-#   * INT4 quantized all-reduce (env knobs below) -- the decode all-reduce is
-#     the biggest decode kernel; INT4 makes it ~4x cheaper (~-12% to -17% TPOT
-#     at conc 64/128/256). Works on any nightly.
+# High-concurrency parity with the ATOM recipe comes from three levers:
+#   * INT4 quantized all-reduce (env knobs below) -- reduces the all-reduce
+#     cost (the biggest decode kernel); measured ~-12% to -17% TPOT at conc
+#     64/128/256. Works on any nightly.
 #   * fp8 KV cache (--kv-cache-dtype fp8).
 #   * cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4) --
-#     requires vllm-project/vllm#47269.
-#   * input-side all-reduce + RMSNorm fusion -- requires vllm-project/vllm#47270
-#     (automatic once merged; no flag).
-# Bump the image pin (.github/configs/amd-master.yaml) to a nightly containing
-# #47269 and #47270 before sweeping for the full curve.
+#     requires vllm-project/vllm#47269 (merged).
+# The image pin (.github/configs/amd-master.yaml) must reference a nightly
+# containing #47269 when sweeping for the full curve.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f0dcee33e..857ed8634 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4355,7 +4355,7 @@
 - config-keys:
     - minimaxm3-fp4-mi355x-vllm
   description:
-    - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM STP. Add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4), and rely on input-side all-reduce+RMSNorm fusion."
-    - "INT4 quick-reduce makes the (biggest) decode all-reduce ~4x cheaper; the MIN_SIZE_KB override is required because vLLM's default INT4 gate for (bf16, TP4) is 16MB and never fires for the ~1.5MB decode all-reduces. Local (ATOM's benchmark): conc32 17.21ms / conc64 25.13ms vs ATOM ref 16.74 / 25.00 (matched); GSM8K limit100=0.95."
-    - "index_topk_freq needs vllm-project/vllm#47269 and the AR+RMSNorm fusion needs vllm-project/vllm#47270 in the served image; bump the image pin to a nightly containing both before sweeping."
+    - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM STP. Add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4)."
+    - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override lowers the quantization codec threshold. Local (ATOM's benchmark): conc32 17.21ms / conc64 25.13ms vs ATOM ref 16.74 / 25.00 (matched); GSM8K limit100=0.95."
+    - "index_topk_freq needs vllm-project/vllm#47269 (merged) in the served image; pin the image to a nightly containing it before sweeping."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1969

From bc73306231b23bfa7252c3e4739f361f02ff2166 Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Thu, 2 Jul 2026 07:41:42 -0500
Subject: [PATCH 4/4] fix(changelog): move MiniMax-M3 FP4 vLLM STP entry to
 EOF; restore atom entry

Addresses Claude review: the new entry was inserted inside the
minimaxm3-fp4-mi355x-atom entry (#1967), dropping its config-keys header and
producing duplicate mapping keys. Restore the atom entry and append the vllm
entry at the end of the file per repo convention. Also pin the image to the
latest nightly containing vllm-project/vllm#47269.
---
 .github/configs/amd-master.yaml |  2 +-
 perf-changelog.yaml             | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 9386d9c27..ea57b7e06 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2635,7 +2635,7 @@ minimaxm3-fp4-mi355x-vllm-disagg:
 # language-model path and mirror the MXFP8 MI355X search space for a direct
 # precision comparison.
 minimaxm3-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
+  image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 419ebf787..53137295f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4361,12 +4361,6 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1966
 
 - config-keys:
-    - minimaxm3-fp4-mi355x-vllm
-  description:
-    - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM STP. Add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4)."
-    - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override lowers the quantization codec threshold. Local (ATOM's benchmark): conc32 17.21ms / conc64 25.13ms vs ATOM ref 16.74 / 25.00 (matched); GSM8K limit100=0.95."
-    - "index_topk_freq needs vllm-project/vllm#47269 (merged) in the served image; pin the image to a nightly containing it before sweeping."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1969
     - minimaxm3-fp4-mi355x-atom
     - minimaxm3-fp4-mi355x-atom-mtp
   description:
@@ -4406,3 +4400,11 @@
   description:
     - "Bump SGLang image from lmsysorg/sglang:deepseek-v4-blackwell (digest sha256:df18bfc4...) to mainline nightly lmsysorg/sglang:nightly-dev-cu13-20260628-da802ddc."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1923
+
+- config-keys:
+    - minimaxm3-fp4-mi355x-vllm
+  description:
+    - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM STP. Add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides text_config: use_index_cache=true, index_topk_freq=4)."
+    - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override lowers the quantization codec threshold. Local (ATOM's benchmark): conc32 17.21ms / conc64 25.13ms vs ATOM ref 16.74 / 25.00 (matched); GSM8K limit100=0.95."
+    - "index_topk_freq needs vllm-project/vllm#47269 (merged) in the served image; the image is pinned to vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa, a nightly that contains it."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1969