Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4"
# point at concurrency 192.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72"
precision: "fp4"

dynamo:
Expand Down Expand Up @@ -48,6 +48,7 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
VLLM_DSV4_MEGA_FP8_COMBINE: "1"
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
Expand All @@ -67,7 +68,7 @@ backend:
data-parallel-rpc-port: 13345
enable-expert-parallel: true
attention-config: '{"use_fp4_indexer_cache": true}'
moe-backend: "deep_gemm_mega_moe"
moe-backend: "deep_gemm_amxf4_mega_moe"
enforce-eager: true
max-model-len: 16384
max-num-seqs: 256
Expand All @@ -81,6 +82,7 @@ backend:
enable-ep-weight-filter: true
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
name: "svf-vllm-disagg-gb300-1p17d-tep4-tp4"
name: "svf-vllm-disagg-gb300-1p9d-tep4-tp4"

# Topology: 1 prefill (TEP=4) + 17 decode (TP=4). 18 GB300 nodes (1P + 17D = 72
# Topology: 1 prefill (TEP=4) + 9 decode (TP=4). 10 GB300 nodes (1P + 9D = 40
# GPUs at 4 GPUs/node), NATS/etcd colocated on the prefill node.
# Wide-decode point at concurrency 18 — each decode worker holds a
# single replica.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72"
precision: "fp4"

dynamo:
Expand All @@ -26,9 +26,9 @@ resources:
gpu_type: "gb300"
gpus_per_node: 4
prefill_nodes: 1
decode_nodes: 17
decode_nodes: 9
prefill_workers: 1
decode_workers: 17
decode_workers: 9
gpus_per_prefill: 4
gpus_per_decode: 4

Expand Down Expand Up @@ -73,6 +73,7 @@ backend:
enable-ep-weight-filter: true
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ name: "svf-vllm-disagg-gb300-4p1d-dep4-dep8-24"

# Topology: 4 prefill (DEP=4 each) + 1 decode (DEP=8). 6 GB300 nodes (4P + 2D
# = 24 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node.
# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on
# Max-throughput point at concurrency 4096 with deep_gemm_amxf4_mega_moe on
# both workers.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72"
precision: "fp4"

dynamo:
Expand Down Expand Up @@ -51,6 +51,7 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
VLLM_DSV4_MEGA_FP8_COMBINE: "1"

decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
Expand All @@ -60,6 +61,7 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
VLLM_DSV4_MEGA_FP8_COMBINE: "1"

vllm_config:
prefill:
Expand All @@ -86,7 +88,7 @@ backend:
tokenizer-mode: deepseek_v4
enable-ep-weight-filter: true
enable-sleep-mode: true
moe-backend: "deep_gemm_mega_moe"
moe-backend: "deep_gemm_amxf4_mega_moe"

decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
Expand All @@ -110,7 +112,8 @@ backend:
tokenizer-mode: deepseek_v4
enable-ep-weight-filter: true
enable-sleep-mode: true
moe-backend: "deep_gemm_mega_moe"
moe-backend: "deep_gemm_amxf4_mega_moe"
no-enable-flashinfer-autotune: true

benchmark:
type: "sa-bench"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ name: "svf-vllm-disagg-gb300-5p1d-dep4-dep8-28"

# Topology: 5 prefill (DEP=4 each) + 1 decode (DEP=8). 7 GB300 nodes (5P + 2D
# = 28 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node.
# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on
# Max-throughput point at concurrency 4096 with deep_gemm_amxf4_mega_moe on
# both workers.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72"
precision: "fp4"

dynamo:
Expand Down Expand Up @@ -51,6 +51,7 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
VLLM_DSV4_MEGA_FP8_COMBINE: "1"

decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
Expand All @@ -60,6 +61,7 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
VLLM_DSV4_MEGA_FP8_COMBINE: "1"

vllm_config:
prefill:
Expand All @@ -86,7 +88,7 @@ backend:
tokenizer-mode: deepseek_v4
enable-ep-weight-filter: true
enable-sleep-mode: true
moe-backend: "deep_gemm_mega_moe"
moe-backend: "deep_gemm_amxf4_mega_moe"

decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
Expand All @@ -98,7 +100,7 @@ backend:
data-parallel-rpc-port: 13345
enable-expert-parallel: true
max-model-len: 16384
max-num-seqs: 512
max-num-seqs: 384
max-cudagraph-capture-size: 512
trust-remote-code: true
no-enable-prefix-caching: true
Expand All @@ -110,7 +112,8 @@ backend:
tokenizer-mode: deepseek_v4
enable-ep-weight-filter: true
enable-sleep-mode: true
moe-backend: "deep_gemm_mega_moe"
moe-backend: "deep_gemm_amxf4_mega_moe"
no-enable-flashinfer-autotune: true

benchmark:
type: "sa-bench"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ name: "svf-vllm-disagg-gb300-6p1d-dep4-dep8-32"

# Topology: 6 prefill (DEP=4 each) + 1 decode (DEP=8). 8 GB300 nodes (6P + 2D
# = 32 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node.
# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on
# Max-throughput point at concurrency 4096 with deep_gemm_amxf4_mega_moe on
# both workers.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72"
precision: "fp4"

dynamo:
Expand Down Expand Up @@ -51,6 +51,7 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
VLLM_DSV4_MEGA_FP8_COMBINE: "1"

decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
Expand All @@ -60,6 +61,7 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
VLLM_DSV4_MEGA_FP8_COMBINE: "1"

vllm_config:
prefill:
Expand All @@ -86,7 +88,7 @@ backend:
tokenizer-mode: deepseek_v4
enable-ep-weight-filter: true
enable-sleep-mode: true
moe-backend: "deep_gemm_mega_moe"
moe-backend: "deep_gemm_amxf4_mega_moe"

decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
Expand All @@ -98,7 +100,7 @@ backend:
data-parallel-rpc-port: 13345
enable-expert-parallel: true
max-model-len: 16384
max-num-seqs: 512
max-num-seqs: 384
max-cudagraph-capture-size: 512
trust-remote-code: true
no-enable-prefix-caching: true
Expand All @@ -110,7 +112,8 @@ backend:
tokenizer-mode: deepseek_v4
enable-ep-weight-filter: true
enable-sleep-mode: true
moe-backend: "deep_gemm_mega_moe"
moe-backend: "deep_gemm_amxf4_mega_moe"
no-enable-flashinfer-autotune: true

benchmark:
type: "sa-bench"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "svf-vllm-disagg-gb300-7p2d-dep4-dep16"
# Wide-EP decode max-throughput point at concurrency 3072.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
container: "vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72"
precision: "fp4"

dynamo:
Expand Down Expand Up @@ -48,13 +48,15 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
VLLM_DSV4_MEGA_FP8_COMBINE: "1"
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
VLLM_DSV4_MEGA_FP8_COMBINE: "1"

vllm_config:
prefill:
Expand All @@ -67,7 +69,7 @@ backend:
data-parallel-rpc-port: 13345
enable-expert-parallel: true
attention-config: '{"use_fp4_indexer_cache": true}'
moe-backend: "deep_gemm_mega_moe"
moe-backend: "deep_gemm_amxf4_mega_moe"
enforce-eager: true
max-model-len: 16384
max-num-seqs: 256
Expand All @@ -81,6 +83,7 @@ backend:
enable-ep-weight-filter: true
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
Expand All @@ -91,7 +94,7 @@ backend:
data-parallel-rpc-port: 13345
enable-expert-parallel: true
attention-config: '{"use_fp4_indexer_cache": true}'
moe-backend: "deep_gemm_mega_moe"
moe-backend: "deep_gemm_amxf4_mega_moe"
max-model-len: 16384
max-num-seqs: 512
max-cudagraph-capture-size: 512
Expand Down
6 changes: 3 additions & 3 deletions configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9572,7 +9572,7 @@ dsv4-fp4-b300-dynamo-vllm:
dp-attn: true

dsv4-fp4-gb300-dynamo-vllm:
image: vllm/vllm-openai:v0.20.0-ubuntu2404
image: vllm/vllm-openai:dsv4-megamoe-mxfp4-arm64-cu130-4ba0a72
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: gb300-nv
Expand Down Expand Up @@ -9605,9 +9605,9 @@ dsv4-fp4-gb300-dynamo-vllm:
ep: 4
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml"
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p9d-tep4-tp4.yaml"
decode:
num-worker: 17
num-worker: 9
tp: 4
ep: 1
dp-attn: false
Expand Down
6 changes: 6 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4433,3 +4433,9 @@
- "Add --online_quant_config with ptpc_fp8 and MoE layer exclusions (*block_sparse_moe) to all scripts."
- "Replace deprecated AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 and ATOM_M3_SPARSE_USE_ASM_PA=1 with ATOM_FORCE_ATTN_TRITON=1."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2001

- config-keys:
- dsv4-fp4-gb300-dynamo-vllm
description:
- "Refresh DSV4 8k/1k vLLM GB300 recipes with new w4a4 container and updated configs"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2010
Comment on lines +4437 to +4441

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 The new dsv4-fp4-gb300-dynamo-vllm entry in perf-changelog.yaml (line 4441) has pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX — a template placeholder that wasn't filled in. This should be .../pull/2010 to match the actual PR number, consistent with every other entry in the file.

Extended reasoning...

What the bug is

The new entry added at the bottom of perf-changelog.yaml for dsv4-fp4-gb300-dynamo-vllm uses a literal placeholder for its pr-link:

- config-keys:
    - dsv4-fp4-gb300-dynamo-vllm
  description:
    - "Refresh DSV4 8k/1k vLLM GB300 recipes with new w4a4 container and updated configs"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX

The XXX is clearly a fill-in-the-blank the author forgot to substitute. Every other entry in this file uses a concrete numeric PR (e.g. pull/2001, pull/1990, pull/1978, pull/1975). This PR is #2010, so it should read https://github.com/SemiAnalysisAI/InferenceX/pull/2010.

Why existing tooling doesn't catch this on the PR

utils/validate_perf_changelog.py explicitly allows the placeholder on PR runs (via a PR_LINK_PLACEHOLDERS set) so that authors can open the PR before knowing the assigned number. That is why CI is green here.

Impact after merge

On main-branch runs, validate_added_pr_link() enforces CANONICAL_PR_LINK = r'https://github\.com/SemiAnalysisAI/InferenceX/pull/\d+' with fullmatch. XXX does not match \d+, so if this merges as-is, the next main-branch changelog validation will fail on this entry. Beyond that, the entry becomes useless as a tracking record — clicking the link goes to a 404, and the change loses its anchor to the PR that introduced it.

Step-by-step proof

  1. Line 4441 of the file after this PR: pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX.
  2. On PR CI, validate_added_pr_link sees the link is in PR_LINK_PLACEHOLDERS → passes.
  3. After merge to main, validate_added_pr_link runs and executes CANONICAL_PR_LINK.fullmatch(link) with the pattern ^https://github\.com/SemiAnalysisAI/InferenceX/pull/\d+$.
  4. The value .../pull/XXX does not match \d+, so fullmatch returns None → the check errors out.

Fix

Replace pull/XXX with pull/2010 on line 4441 before merging.

Severity

Nit — it does not affect runtime/benchmark behavior and is a one-line fix, but it should be corrected before merge so the changelog stays consistent and main-branch validation stays green.