From e493673dcdffbefd9b7858c470f6695479bea498 Mon Sep 17 00:00:00 2001
From: jiacao-amd <jiahui.cao@amd.com>
Date: Thu, 2 Jul 2026 17:57:21 +0000
Subject: [PATCH] perf(dsv4-fp4-mi355x-vllm): use AITER a16w4 MoE backend (+21%
 decode)

Switch the DeepSeek-V4-Pro FP4 MI355X vLLM recipe from
--moe-backend triton_unfused to --moe-backend aiter, and export
VLLM_ROCM_USE_AITER_MOE=1.

triton_unfused runs a W4A16 Triton dequant path; --moe-backend aiter
selects AITER_MXFP4_BF16 (a16w4), a faster AITER kernel with the same
numerics. Measured on MI355X TP=8, c64 1k/1k:
  output throughput 915 -> 1105 tok/s (+20.7%)
  mean TPOT 66.97ms -> 55.36ms

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
---
 .../fixed_seq_len/dsv4_fp4_mi355x_vllm.sh        | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh
index dc8989b3e2..887c1da078 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh
@@ -12,11 +12,14 @@ set -eo pipefail
 # same ROCm recipe while switching parallelism to vLLM's DP+EP form.
 # Image-pin details live in amd-master.yaml.
 #
-# --moe-backend triton_unfused is required for the FP4 MoE expert
-# weight format used by deepseek-ai/DeepSeek-V4-Pro. Letting --moe-backend
-# default to auto picks a backend that doesn't register the FP4 scale
-# parameters (w13_weight_scale / w2_weight_scale), so safetensors
-# loading raises KeyError.
+# --moe-backend aiter selects AITER's AITER_MXFP4_BF16 (a16w4) MoE
+# kernel, which is ~21% faster on decode than triton_unfused
+# (1105 vs 915 output tok/s at c64 1k/1k, TP=8) while staying numerically
+# stable. It requires VLLM_ROCM_USE_AITER=1 and VLLM_ROCM_USE_AITER_MOE=1
+# (both exported below).
+# Do NOT let --moe-backend default to auto: auto picks a backend that
+# doesn't register the FP4 scale parameters (w13_weight_scale /
+# w2_weight_scale), so safetensors loading raises KeyError.
 #
 # --compilation-config mode=3 with FULL_AND_PIECEWISE cudagraph mode
 # enables full CUDA graph capture for improved throughput on MI355X.
@@ -45,6 +48,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_MOE=1
 
 SERVER_LOG=/workspace/server.log
 
@@ -75,7 +79,7 @@ vllm serve $MODEL --port $PORT \
     --gpu-memory-utilization 0.8 \
     --kv-cache-dtype fp8 \
     --trust-remote-code \
-    --moe-backend triton_unfused \
+    --moe-backend aiter \
     --tokenizer-mode deepseek_v4 \
     --reasoning-parser deepseek_v4 \
     --compilation-config '{"mode":3,"cudagraph_mode":"FULL_AND_PIECEWISE"}' > $SERVER_LOG 2>&1 &