From 736efab43efa6a3b23a618fe118d35c9b3145e2d Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Fri, 3 Jul 2026 16:48:17 -0400 Subject: [PATCH 1/2] perf: update DSV4 GB300 Dynamo vLLM Pareto recipes --- .../8k1k/disagg-gb300-1p6d-dep4-tp4.yaml | 6 ++++-- ...ep4-tp4.yaml => disagg-gb300-1p9d-tep4-tp4.yaml} | 11 ++++++----- .../8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml | 11 +++++++---- .../8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml | 13 ++++++++----- .../8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml | 13 ++++++++----- .../8k1k/disagg-gb300-7p2d-dep4-dep16.yaml | 9 ++++++--- configs/nvidia-master.yaml | 6 +++--- perf-changelog.yaml | 6 ++++++ 8 files changed, 48 insertions(+), 27 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb300-1p17d-tep4-tp4.yaml => disagg-gb300-1p9d-tep4-tp4.yaml} (90%) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml index c3b25acc13..7a299d81a5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml @@ -5,7 +5,7 @@ name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4" # point at concurrency 192. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72" precision: "fp4" dynamo: @@ -48,6 +48,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" TORCH_SYMMMEM: "NVSHMEM" + VLLM_DSV4_MEGA_FP8_COMBINE: "1" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" @@ -67,7 +68,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true attention-config: '{"use_fp4_indexer_cache": true}' - moe-backend: "deep_gemm_mega_moe" + moe-backend: "deep_gemm_amxf4_mega_moe" enforce-eager: true max-model-len: 16384 max-num-seqs: 256 @@ -81,6 +82,7 @@ backend: enable-ep-weight-filter: true no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + tokenizer-mode: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p9d-tep4-tp4.yaml similarity index 90% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p9d-tep4-tp4.yaml index a2c3ab80ac..d1e895bbbf 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p9d-tep4-tp4.yaml @@ -1,12 +1,12 @@ -name: "svf-vllm-disagg-gb300-1p17d-tep4-tp4" +name: "svf-vllm-disagg-gb300-1p9d-tep4-tp4" -# Topology: 1 prefill (TEP=4) + 17 decode (TP=4). 18 GB300 nodes (1P + 17D = 72 +# Topology: 1 prefill (TEP=4) + 9 decode (TP=4). 10 GB300 nodes (1P + 9D = 40 # GPUs at 4 GPUs/node), NATS/etcd colocated on the prefill node. # Wide-decode point at concurrency 18 — each decode worker holds a # single replica. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72" precision: "fp4" dynamo: @@ -26,9 +26,9 @@ resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 17 + decode_nodes: 9 prefill_workers: 1 - decode_workers: 17 + decode_workers: 9 gpus_per_prefill: 4 gpus_per_decode: 4 @@ -73,6 +73,7 @@ backend: enable-ep-weight-filter: true no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + tokenizer-mode: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml index b97ef0d5a2..0a3643cf27 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml @@ -2,11 +2,11 @@ name: "svf-vllm-disagg-gb300-4p1d-dep4-dep8-24" # Topology: 4 prefill (DEP=4 each) + 1 decode (DEP=8). 6 GB300 nodes (4P + 2D # = 24 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. -# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on +# Max-throughput point at concurrency 4096 with deep_gemm_amxf4_mega_moe on # both workers. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72" precision: "fp4" dynamo: @@ -51,6 +51,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" TORCH_SYMMMEM: "NVSHMEM" + VLLM_DSV4_MEGA_FP8_COMBINE: "1" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" @@ -60,6 +61,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" TORCH_SYMMMEM: "NVSHMEM" + VLLM_DSV4_MEGA_FP8_COMBINE: "1" vllm_config: prefill: @@ -86,7 +88,7 @@ backend: tokenizer-mode: deepseek_v4 enable-ep-weight-filter: true enable-sleep-mode: true - moe-backend: "deep_gemm_mega_moe" + moe-backend: "deep_gemm_amxf4_mega_moe" decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -110,7 +112,8 @@ backend: tokenizer-mode: deepseek_v4 enable-ep-weight-filter: true enable-sleep-mode: true - moe-backend: "deep_gemm_mega_moe" + moe-backend: "deep_gemm_amxf4_mega_moe" + no-enable-flashinfer-autotune: true benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml index d83e6d771f..698ef94e2b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml @@ -2,11 +2,11 @@ name: "svf-vllm-disagg-gb300-5p1d-dep4-dep8-28" # Topology: 5 prefill (DEP=4 each) + 1 decode (DEP=8). 7 GB300 nodes (5P + 2D # = 28 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. -# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on +# Max-throughput point at concurrency 4096 with deep_gemm_amxf4_mega_moe on # both workers. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72" precision: "fp4" dynamo: @@ -51,6 +51,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" TORCH_SYMMMEM: "NVSHMEM" + VLLM_DSV4_MEGA_FP8_COMBINE: "1" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" @@ -60,6 +61,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" TORCH_SYMMMEM: "NVSHMEM" + VLLM_DSV4_MEGA_FP8_COMBINE: "1" vllm_config: prefill: @@ -86,7 +88,7 @@ backend: tokenizer-mode: deepseek_v4 enable-ep-weight-filter: true enable-sleep-mode: true - moe-backend: "deep_gemm_mega_moe" + moe-backend: "deep_gemm_amxf4_mega_moe" decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -98,7 +100,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 16384 - max-num-seqs: 512 + max-num-seqs: 384 max-cudagraph-capture-size: 512 trust-remote-code: true no-enable-prefix-caching: true @@ -110,7 +112,8 @@ backend: tokenizer-mode: deepseek_v4 enable-ep-weight-filter: true enable-sleep-mode: true - moe-backend: "deep_gemm_mega_moe" + moe-backend: "deep_gemm_amxf4_mega_moe" + no-enable-flashinfer-autotune: true benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml index 4b54cc13e0..85a1d4c6b6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml @@ -2,11 +2,11 @@ name: "svf-vllm-disagg-gb300-6p1d-dep4-dep8-32" # Topology: 6 prefill (DEP=4 each) + 1 decode (DEP=8). 8 GB300 nodes (6P + 2D # = 32 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. -# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on +# Max-throughput point at concurrency 4096 with deep_gemm_amxf4_mega_moe on # both workers. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72" precision: "fp4" dynamo: @@ -51,6 +51,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" TORCH_SYMMMEM: "NVSHMEM" + VLLM_DSV4_MEGA_FP8_COMBINE: "1" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" @@ -60,6 +61,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" TORCH_SYMMMEM: "NVSHMEM" + VLLM_DSV4_MEGA_FP8_COMBINE: "1" vllm_config: prefill: @@ -86,7 +88,7 @@ backend: tokenizer-mode: deepseek_v4 enable-ep-weight-filter: true enable-sleep-mode: true - moe-backend: "deep_gemm_mega_moe" + moe-backend: "deep_gemm_amxf4_mega_moe" decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -98,7 +100,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 16384 - max-num-seqs: 512 + max-num-seqs: 384 max-cudagraph-capture-size: 512 trust-remote-code: true no-enable-prefix-caching: true @@ -110,7 +112,8 @@ backend: tokenizer-mode: deepseek_v4 enable-ep-weight-filter: true enable-sleep-mode: true - moe-backend: "deep_gemm_mega_moe" + moe-backend: "deep_gemm_amxf4_mega_moe" + no-enable-flashinfer-autotune: true benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml index 43c2031a8d..53534f9697 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml @@ -5,7 +5,7 @@ name: "svf-vllm-disagg-gb300-7p2d-dep4-dep16" # Wide-EP decode max-throughput point at concurrency 3072. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72" precision: "fp4" dynamo: @@ -48,6 +48,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" TORCH_SYMMMEM: "NVSHMEM" + VLLM_DSV4_MEGA_FP8_COMBINE: "1" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" @@ -55,6 +56,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" TORCH_SYMMMEM: "NVSHMEM" + VLLM_DSV4_MEGA_FP8_COMBINE: "1" vllm_config: prefill: @@ -67,7 +69,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true attention-config: '{"use_fp4_indexer_cache": true}' - moe-backend: "deep_gemm_mega_moe" + moe-backend: "deep_gemm_amxf4_mega_moe" enforce-eager: true max-model-len: 16384 max-num-seqs: 256 @@ -81,6 +83,7 @@ backend: enable-ep-weight-filter: true no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + tokenizer-mode: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -91,7 +94,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true attention-config: '{"use_fp4_indexer_cache": true}' - moe-backend: "deep_gemm_mega_moe" + moe-backend: "deep_gemm_amxf4_mega_moe" max-model-len: 16384 max-num-seqs: 512 max-cudagraph-capture-size: 512 diff --git a/configs/nvidia-master.yaml b/configs/nvidia-master.yaml index bdb6a46029..730ddfdde4 100644 --- a/configs/nvidia-master.yaml +++ b/configs/nvidia-master.yaml @@ -9572,7 +9572,7 @@ dsv4-fp4-b300-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-ubuntu2404 + image: vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-nv @@ -9605,9 +9605,9 @@ dsv4-fp4-gb300-dynamo-vllm: ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p9d-tep4-tp4.yaml" decode: - num-worker: 17 + num-worker: 9 tp: 4 ep: 1 dp-attn: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3944e67c5f..af151d2dec 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4433,3 +4433,9 @@ - "Add --online_quant_config with ptpc_fp8 and MoE layer exclusions (*block_sparse_moe) to all scripts." - "Replace deprecated AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 and ATOM_M3_SPARSE_USE_ASM_PA=1 with ATOM_FORCE_ATTN_TRITON=1." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2001 + +- config-keys: + - dsv4-fp4-gb300-dynamo-vllm + description: + - "Refresh DSV4 8k/1k vLLM GB300 recipes with new w4a4 container and updated configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From a4d2f6b570c74aca0b8cf1e83f18d40851af36ee Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Fri, 3 Jul 2026 17:35:14 -0400 Subject: [PATCH 2/2] chore --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index af151d2dec..42c5cb2717 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4438,4 +4438,4 @@ - dsv4-fp4-gb300-dynamo-vllm description: - "Refresh DSV4 8k/1k vLLM GB300 recipes with new w4a4 container and updated configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2010