From 6e78f71d439665a4cfcb839c79fa490450181a8b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 3 Jul 2026 20:00:19 +0800 Subject: [PATCH] chore: clean up srt-slurm recipes --- .../1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml | 122 ------------ .../1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml | 122 ------------ .../1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml | 127 ------------ .../1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml | 127 ------------ .../1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml | 98 ---------- .../1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml | 98 ---------- .../1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml | 101 ---------- .../1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml | 101 ---------- .../1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml | 101 ---------- .../1k1k/disagg/stp/1k1k_stp_maxtpt_3.yaml | 101 ---------- .../8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml | 98 ---------- .../8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml | 98 ---------- .../8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml | 98 ---------- .../8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml | 98 ---------- .../8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml | 98 ---------- .../8k1k/disagg/stp/8k1k_stp_lowlat_5.yaml | 98 ---------- .../8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml | 101 ---------- .../8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml | 101 ---------- .../8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml | 101 ---------- .../1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml | 173 ---------------- .../1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml | 173 ---------------- .../1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml | 185 ------------------ .../1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml | 185 ------------------ .../1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml | 185 ------------------ .../8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml | 185 ------------------ .../8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml | 185 ------------------ .../8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml | 185 ------------------ .../1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml | 148 -------------- .../1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml | 148 -------------- .../1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml | 148 -------------- .../1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml | 148 -------------- .../1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml | 148 -------------- .../1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml | 140 ------------- .../1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml | 140 ------------- .../8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml | 148 -------------- .../8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml | 148 -------------- .../8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml | 148 -------------- .../8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml | 148 -------------- .../8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml | 140 ------------- .../8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml | 140 ------------- .../8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml | 140 ------------- .../8k1k/disagg-b200-high-tpt-megamoe.yaml | 138 ------------- .../8k1k/disagg-b200-low-latency.yaml | 137 ------------- .../8k1k/disagg-b200-low-middle-curve.yaml | 138 ------------- .../8k1k/disagg-b200-max-tpt-megamoe.yaml | 138 ------------- .../8k1k/disagg-b200-mid-curve-megamoe.yaml | 138 ------------- .../8k1k/disagg-b300-low-middle-curve.yaml | 125 ------------ .../8k1k/disagg-b300-max-tpt-megamoe.yaml | 130 ------------ .../minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml | 72 ------- .../1k1k/dep2-2p3d-c6144.yaml | 72 ------- .../minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml | 72 ------- .../minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml | 74 ------- .../minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml | 70 ------- .../minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml | 72 ------- .../minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml | 68 ------- .../1k1k/tp4ep-1p1d-hi-conc.yaml | 68 ------- .../1k1k/tp4ep-1p1d.yaml | 70 ------- .../1k1k/tp4ep-1p2d.yaml | 68 ------- .../1k1k/tp4ep-1p3d-hi-conc.yaml | 68 ------- .../1k1k/tp4ep-1p3d.yaml | 68 ------- .../1k1k/tp4ep-2p3d.yaml | 72 ------- .../minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml | 75 ------- .../minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml | 68 ------- .../8k1k/tp4ep-1p1d-hi-conc.yaml | 68 ------- .../8k1k/tp4ep-1p1d.yaml | 68 ------- .../vllm/minimax-m2.5-b200-fp8/1k1k/dep8.yaml | 75 ------- .../1k1k/disagg-b200-1p1d-tp4ep.yaml | 69 ------- .../1k1k/disagg-b200-1p3d-tp4ep.yaml | 72 ------- .../1k1k/disagg-b200-1p4d-dep2-hi-conc.yaml | 74 ------- .../1k1k/disagg-b200-2p1d-dep8.yaml | 86 -------- .../1k1k/disagg-b200-2p3d-dep4-hi-conc.yaml | 74 ------- .../minimax-m2.5-b200-fp8/1k1k/tp4ep.yaml | 73 ------- .../8k1k/disagg-b200-1p1d-tp4ep-hi-conc.yaml | 69 ------- .../8k1k/disagg-b200-1p1d-tp4ep.yaml | 69 ------- .../8k1k/disagg-b200-3p2d-dep4.yaml | 76 ------- .../1k1k/disagg-b300-1p1d-tp4.yaml | 65 ------ .../1k1k/disagg-b300-1p2d-tp4.yaml | 65 ------ .../1k1k/disagg-b300-1p2d-tp4ep.yaml | 67 ------- .../1k1k/disagg-b300-2p1d-dep8.yaml | 69 ------- .../1k1k/disagg-b300-2p2d-dep4-hi-conc.yaml | 69 ------- .../1k1k/disagg-b300-2p2d-dep4.yaml | 69 ------- .../1k1k/disagg-b300-2p3d-dep2.yaml | 69 ------- .../8k1k/disagg-b300-1p1d-tp4ep-hi-conc.yaml | 67 ------- .../8k1k/disagg-b300-1p1d-tp4ep.yaml | 67 ------- .../8k1k/disagg-b300-2p1d-tp2.yaml | 68 ------- .../8k1k/disagg-b300-2p1d-tp4ep.yaml | 67 ------- .../8k1k/disagg-b300-3p1d-dep4-hi-conc.yaml | 72 ------- .../8k1k/disagg-b300-3p1d-dep4.yaml | 72 ------- .../8k1k/disagg-b300-3p1d-tp4.yaml | 68 ------- .../8k1k/disagg-b300-5p2d-dep4.yaml | 69 ------- .../minimax-m2.5-b300/1k1k/dep2-1p2d.yaml | 72 ------- .../1k1k/dep2-2p3d-c6144.yaml | 72 ------- .../minimax-m2.5-b300/1k1k/dep2-2p3d.yaml | 72 ------- .../minimax-m2.5-b300/1k1k/dep8-2p1d.yaml | 71 ------- .../vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml | 73 ------- .../vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml | 69 ------- .../minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml | 71 ------- .../minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml | 69 ------- .../vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml | 78 -------- .../minimax-m2.5-b300/8k1k/dep4-4p1d.yaml | 71 ------- .../minimax-m2.5-b300/8k1k/dep8-4p1d.yaml | 71 ------- .../vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml | 69 ------- .../minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml | 69 ------- .../minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml | 69 ------- .../vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml | 73 ------- .../1k1k/disagg-gb300-1p1d-tp4.yaml | 64 ------ .../1k1k/disagg-gb300-1p2d-tp4.yaml | 69 ------- .../1k1k/disagg-gb300-1p2d-tp4ep.yaml | 66 ------- .../1k1k/disagg-gb300-2p1d-dep8.yaml | 83 -------- .../1k1k/disagg-gb300-2p2d-dep4-hi-conc.yaml | 82 -------- .../1k1k/disagg-gb300-2p2d-dep4.yaml | 82 -------- .../1k1k/disagg-gb300-2p3d-dep2.yaml | 68 ------- .../8k1k/disagg-gb300-1p1d-tp4ep-hi-conc.yaml | 66 ------- .../8k1k/disagg-gb300-1p1d-tp4ep.yaml | 66 ------- .../8k1k/disagg-gb300-2p1d-tp2.yaml | 68 ------- .../8k1k/disagg-gb300-2p1d-tp4ep.yaml | 66 ------- .../8k1k/disagg-gb300-3p1d-dep4-hi-conc.yaml | 82 -------- .../8k1k/disagg-gb300-3p1d-dep4.yaml | 82 -------- .../8k1k/disagg-gb300-3p1d-tp4.yaml | 68 ------- .../8k1k/disagg-gb300-5p2d-dep4.yaml | 68 ------- .../1k1k/disagg-gb200-1p1d-tp4.yaml | 67 ------- .../1k1k/disagg-gb200-1p2d-tp4.yaml | 67 ------- .../1k1k/disagg-gb200-1p3d-tp4ep.yaml | 72 ------- .../1k1k/disagg-gb200-1p4d-dep2.yaml | 74 ------- .../1k1k/disagg-gb200-2p1d-dep8.yaml | 86 -------- .../1k1k/disagg-gb200-2p3d-dep4.yaml | 74 ------- .../8k1k/disagg-gb200-1p1d-tp4.yaml | 68 ------- .../8k1k/disagg-gb200-1p1d-tp4ep.yaml | 70 ------- .../8k1k/disagg-gb200-3p2d-dep4.yaml | 76 ------- .../minimax-m2.5-gb200/1k1k/dep2-1p2d.yaml | 73 ------- .../1k1k/dep2-2p3d-c6144.yaml | 73 ------- .../minimax-m2.5-gb200/1k1k/dep2-2p3d.yaml | 73 ------- .../minimax-m2.5-gb200/1k1k/dep4-3p2d.yaml | 74 ------- .../minimax-m2.5-gb200/1k1k/dep8-2p1d.yaml | 70 ------- .../minimax-m2.5-gb200/1k1k/tp4-1p1d.yaml | 72 ------- .../minimax-m2.5-gb200/1k1k/tp4-1p2d.yaml | 68 ------- .../1k1k/tp4ep-1p1d-hi-conc.yaml | 68 ------- .../minimax-m2.5-gb200/1k1k/tp4ep-1p1d.yaml | 70 ------- .../minimax-m2.5-gb200/1k1k/tp4ep-1p2d.yaml | 68 ------- .../1k1k/tp4ep-1p3d-hi-conc.yaml | 68 ------- .../minimax-m2.5-gb200/1k1k/tp4ep-1p3d.yaml | 68 ------- .../minimax-m2.5-gb200/1k1k/tp4ep-2p3d.yaml | 72 ------- .../minimax-m2.5-gb200/8k1k/dep4-2p1d.yaml | 75 ------- .../minimax-m2.5-gb200/8k1k/tp4-1p1d.yaml | 68 ------- .../8k1k/tp4ep-1p1d-hi-conc.yaml | 68 ------- .../minimax-m2.5-gb200/8k1k/tp4ep-1p1d.yaml | 68 ------- .../minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml | 73 ------- .../vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml | 73 ------- .../vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml | 70 ------- .../vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml | 72 ------- .../vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml | 68 ------- .../vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml | 70 ------- .../vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml | 68 ------- .../vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml | 70 ------- .../vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml | 70 ------- .../vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml | 68 ------- .../vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml | 68 ------- .../vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml | 68 ------- .../b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml | 81 -------- .../b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml | 81 -------- .../b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml | 79 -------- .../b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml | 81 -------- .../b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml | 79 -------- configs/nvidia-master.yaml | 82 ++++---- runners/launch_b200-dgxc.sh | 17 +- runners/launch_b300-nv.sh | 12 -- runners/launch_gb200-nv.sh | 14 -- runners/launch_gb300-nv.sh | 24 +-- 168 files changed, 50 insertions(+), 14711 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_3.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_5.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/dep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p1d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p3d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p4d-dep2-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p1d-dep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p3d-dep4-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-3p2d-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p1d-tp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p2d-tp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p2d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p1d-dep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p2d-dep4-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p2d-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p3d-dep2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-1p1d-tp4ep-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-1p1d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-2p1d-tp2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-2p1d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-dep4-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-tp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-5p2d-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-1p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d-c6144.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep8-2p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep4-4p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep8-4p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p1d-tp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p1d-dep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p3d-dep2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-tp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-5p2d-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-1p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d-c6144.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep4-3p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep8-2p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-2p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/dep4-2p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d-hi-conc.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml deleted file mode 100644 index b5fe566457..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml +++ /dev/null @@ -1,122 +0,0 @@ -# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml -# base + zip_override_mtp_lowlat[0]): 1p5d low-latency (dep4 prefill / tep8 decode, 5 decode nodes). -# One flat YAML per concrete topology, matching the 8k1k local recipe layout -# (sglang//-//disagg//...). - -name: b200-fp4-mtp-low-latency-dep4-1p-tep8-5d -model: - path: dsr1 - container: dynamo-sglang - precision: fp4 - -dynamo: - hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" - install: true - -resources: - gpu_type: b200 - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 5 - decode_workers: 5 - gpus_per_node: 8 -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_ENABLE_JIT_DEEPGEMM: 'false' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: '1' - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_ENABLE_JIT_DEEPGEMM: 'false' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: '1' - sglang_config: - prefill: - served-model-name: deepseek-ai/DeepSeek-R1 - trust-remote-code: true - quantization: modelopt_fp4 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - attention-backend: trtllm_mla - kv-cache-dtype: fp8_e4m3 - moe-runner-backend: flashinfer_trtllm - moe-dense-tp-size: 1 - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: flashinfer_trtllm - decode: - served-model-name: deepseek-ai/DeepSeek-R1 - trust-remote-code: true - quantization: modelopt_fp4 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - attention-backend: trtllm_mla - kv-cache-dtype: fp8_e4m3 - moe-runner-backend: flashinfer_trtllm - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: flashinfer_trtllm - speculative-algorithm: EAGLE - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - isl: 1024 - osl: 1024 - req_rate: inf - concurrencies: 16x512 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml deleted file mode 100644 index 77905ed598..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml +++ /dev/null @@ -1,122 +0,0 @@ -# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml -# base + zip_override_mtp_lowlat[1]): 1p6d low-latency (dep4 prefill / tep8 decode, 6 decode nodes). -# One flat YAML per concrete topology, matching the 8k1k local recipe layout -# (sglang//-//disagg//...). - -name: b200-fp4-mtp-low-latency-dep4-1p-tep8-6d -model: - path: dsr1 - container: dynamo-sglang - precision: fp4 - -dynamo: - hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" - install: true - -resources: - gpu_type: b200 - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 6 - decode_workers: 6 - gpus_per_node: 8 -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_ENABLE_JIT_DEEPGEMM: 'false' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: '1' - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_ENABLE_JIT_DEEPGEMM: 'false' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: '1' - sglang_config: - prefill: - served-model-name: deepseek-ai/DeepSeek-R1 - trust-remote-code: true - quantization: modelopt_fp4 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - attention-backend: trtllm_mla - kv-cache-dtype: fp8_e4m3 - moe-runner-backend: flashinfer_trtllm - moe-dense-tp-size: 1 - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: flashinfer_trtllm - decode: - served-model-name: deepseek-ai/DeepSeek-R1 - trust-remote-code: true - quantization: modelopt_fp4 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 8 - attention-backend: trtllm_mla - kv-cache-dtype: fp8_e4m3 - moe-runner-backend: flashinfer_trtllm - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: flashinfer_trtllm - speculative-algorithm: EAGLE - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - isl: 1024 - osl: 1024 - req_rate: inf - concurrencies: 32x64x256x512 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml deleted file mode 100644 index 7cc3a5848d..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml +++ /dev/null @@ -1,127 +0,0 @@ -# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml -# base + zip_override_mtp_maxtpt[0]): 1p1d max-throughput (dep4 prefill / dep8 decode, mem-fraction 0.75). -# One flat YAML per concrete topology, matching the 8k1k local recipe layout -# (sglang//-//disagg//...). - -name: b200-fp4-mtp-max-tpt-dep4-1p-dep8-1d -model: - path: dsr1 - container: dynamo-sglang - precision: fp4 - -dynamo: - hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" - install: true - -resources: - gpu_type: b200 - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 1 - decode_workers: 1 - gpus_per_node: 8 -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_ENABLE_JIT_DEEPGEMM: 'false' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: '1' - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_ENABLE_JIT_DEEPGEMM: 'false' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' - DYN_REQUEST_PLANE: nats - SGLANG_MOE_NVFP4_DISPATCH: '1' - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: cutlass - SGLANG_ENABLE_SPEC_V2: '1' - sglang_config: - prefill: - served-model-name: deepseek-ai/DeepSeek-R1 - trust-remote-code: true - quantization: modelopt_fp4 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - disable-cuda-graph: true - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - attention-backend: trtllm_mla - kv-cache-dtype: fp8_e4m3 - moe-runner-backend: flashinfer_trtllm - moe-dense-tp-size: 1 - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: flashinfer_trtllm - decode: - served-model-name: deepseek-ai/DeepSeek-R1 - trust-remote-code: true - quantization: modelopt_fp4 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.75 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - attention-backend: trtllm_mla - kv-cache-dtype: fp8_e4m3 - moe-runner-backend: flashinfer_trtllm - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: flashinfer_trtllm - enable-dp-attention: true - enable-dp-lm-head: true - moe-dense-tp-size: 1 - speculative-algorithm: EAGLE - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - isl: 1024 - osl: 1024 - req_rate: inf - concurrencies: 512x1024 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml deleted file mode 100644 index 17c334d5b8..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml +++ /dev/null @@ -1,127 +0,0 @@ -# Derived from the srt-slurm b200-fp4 1k1k recipe (recipes/b200-fp4/1k1k.yaml -# base + zip_override_mtp_maxtpt[1]): 1p2d max-throughput (dep4 prefill / dep8 decode, mem-fraction 0.85). -# One flat YAML per concrete topology, matching the 8k1k local recipe layout -# (sglang//-//disagg//...). - -name: b200-fp4-mtp-max-tpt-dep4-1p-dep8-2d -model: - path: dsr1 - container: dynamo-sglang - precision: fp4 - -dynamo: - hash: "5b4bc1dd70965017a737c71b19db5a0aeaa88727" - install: true - -resources: - gpu_type: b200 - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 2 - decode_workers: 2 - gpus_per_node: 8 -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_ENABLE_JIT_DEEPGEMM: 'false' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' - DYN_REQUEST_PLANE: nats - SGLANG_ENABLE_SPEC_V2: '1' - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_ENABLE_JIT_DEEPGEMM: 'false' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' - DYN_REQUEST_PLANE: nats - SGLANG_MOE_NVFP4_DISPATCH: '1' - SGLANG_FLASHINFER_FP4_GEMM_BACKEND: cutlass - SGLANG_ENABLE_SPEC_V2: '1' - sglang_config: - prefill: - served-model-name: deepseek-ai/DeepSeek-R1 - trust-remote-code: true - quantization: modelopt_fp4 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - disable-cuda-graph: true - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - enable-dp-attention: true - enable-dp-lm-head: true - attention-backend: trtllm_mla - kv-cache-dtype: fp8_e4m3 - moe-runner-backend: flashinfer_trtllm - moe-dense-tp-size: 1 - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: flashinfer_trtllm - decode: - served-model-name: deepseek-ai/DeepSeek-R1 - trust-remote-code: true - quantization: modelopt_fp4 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.85 - max-prefill-tokens: 32768 - chunked-prefill-size: 32768 - context-length: 2200 - max-running-requests: 512 - cuda-graph-max-bs: 512 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - attention-backend: trtllm_mla - kv-cache-dtype: fp8_e4m3 - moe-runner-backend: flashinfer_trtllm - stream-interval: 30 - watchdog-timeout: 1000000 - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - fp4-gemm-backend: flashinfer_trtllm - enable-dp-attention: true - enable-dp-lm-head: true - moe-dense-tp-size: 1 - speculative-algorithm: EAGLE - speculative-num-steps: 2 - speculative-eagle-topk: 1 - speculative-num-draft-tokens: 3 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - isl: 1024 - osl: 1024 - req_rate: inf - concurrencies: '512' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml deleted file mode 100644 index 8ad78c93eb..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: b200-fp8-glm5_1k1k_lowlat_0 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 8 - decode_workers: 8 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 1 - max-running-requests: 64 - cuda-graph-max-bs: 64 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: 512x256x128x64x32 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml deleted file mode 100644 index fced286164..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: b200-fp8-glm5_1k1k_lowlat_1 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 8 - decode_workers: 8 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 1 - max-running-requests: 1 - cuda-graph-max-bs: 1 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '16' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml deleted file mode 100644 index 3627b0fce3..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml +++ /dev/null @@ -1,101 +0,0 @@ -name: b200-fp8-glm5_1k1k_hightpt_0 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 8 - enable-dp-lm-head: true - enable-dp-attention: true - load-balance-method: total_tokens - max-running-requests: 2560 - cuda-graph-max-bs: 2560 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '2576' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml deleted file mode 100644 index dd18582708..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml +++ /dev/null @@ -1,101 +0,0 @@ -name: b200-fp8-glm5_1k1k_hightpt_1 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 2 - decode_workers: 2 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 8 - enable-dp-lm-head: true - enable-dp-attention: true - load-balance-method: total_tokens - max-running-requests: 1232 - cuda-graph-max-bs: 1232 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '1248' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml deleted file mode 100644 index c93f2b294a..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml +++ /dev/null @@ -1,101 +0,0 @@ -name: b200-fp8-glm5_1k1k_hightpt_2 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 3 - decode_workers: 3 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 8 - enable-dp-lm-head: true - enable-dp-attention: true - load-balance-method: total_tokens - max-running-requests: 784 - cuda-graph-max-bs: 784 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '800' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_3.yaml deleted file mode 100644 index e6ad090411..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_3.yaml +++ /dev/null @@ -1,101 +0,0 @@ -name: b200-fp8-glm5_1k1k_hightpt_3 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 4 - decode_workers: 4 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 8 - enable-dp-lm-head: true - enable-dp-attention: true - load-balance-method: total_tokens - max-running-requests: 560 - cuda-graph-max-bs: 560 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '576' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml deleted file mode 100644 index a635e99419..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: b200-fp8-glm5_8k1k_lowlat_0 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 2 - decode_workers: 2 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 1 - max-running-requests: 80 - cuda-graph-max-bs: 80 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '256' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml deleted file mode 100644 index 4fa3e72857..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: b200-fp8-glm5_8k1k_lowlat_1 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 3 - decode_workers: 3 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 1 - max-running-requests: 48 - cuda-graph-max-bs: 48 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '256' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml deleted file mode 100644 index f1404ae279..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: b200-fp8-glm5_8k1k_lowlat_2 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 4 - decode_workers: 4 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 1 - max-running-requests: 34 - cuda-graph-max-bs: 34 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '200' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml deleted file mode 100644 index 1b0bff9b51..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: b200-fp8-glm5_8k1k_lowlat_3 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 5 - decode_workers: 5 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 1 - max-running-requests: 22 - cuda-graph-max-bs: 22 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '128' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml deleted file mode 100644 index 1fa1e8f6c0..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: b200-fp8-glm5_8k1k_lowlat_4 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 7 - decode_workers: 7 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 1 - max-running-requests: 8 - cuda-graph-max-bs: 8 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '64' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_5.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_5.yaml deleted file mode 100644 index 6115cbf7a1..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_5.yaml +++ /dev/null @@ -1,98 +0,0 @@ -name: b200-fp8-glm5_8k1k_lowlat_5 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 8 - decode_workers: 8 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 1 - max-running-requests: 1 - cuda-graph-max-bs: 1 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '12' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml deleted file mode 100644 index ae824b4a7a..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml +++ /dev/null @@ -1,101 +0,0 @@ -name: b200-fp8-glm5_8k1k_hightpt_0 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 2 - prefill_workers: 2 - decode_nodes: 1 - decode_workers: 1 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 8 - enable-dp-lm-head: true - enable-dp-attention: true - load-balance-method: total_tokens - max-running-requests: 544 - cuda-graph-max-bs: 544 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '560' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml deleted file mode 100644 index 12844af4a3..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml +++ /dev/null @@ -1,101 +0,0 @@ -name: b200-fp8-glm5_8k1k_hightpt_1 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 1 - decode_workers: 1 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 8 - enable-dp-lm-head: true - enable-dp-attention: true - load-balance-method: total_tokens - max-running-requests: 224 - cuda-graph-max-bs: 224 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '240' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml deleted file mode 100644 index 1e8d7599ae..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml +++ /dev/null @@ -1,101 +0,0 @@ -name: b200-fp8-glm5_8k1k_hightpt_2 -model: - path: glm5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 -resources: - gpu_type: b200 - gpus_per_node: 8 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 2 - decode_workers: 2 -frontend: - type: dynamo -dynamo: - version: "1.1.0" -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - NCCL_CUMEM_ENABLE: '1' - DYN_REQUEST_PLANE: nats - sglang_config: - prefill: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 65536 - max-prefill-tokens: 8192 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - decode: - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - mem-fraction-static: 0.8 - context-length: 9600 - tensor-parallel-size: 8 - expert-parallel-size: 1 - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - enable-flashinfer-allreduce-fusion: true - weight-loader-prefetch-checkpoints: true - disable-radix-cache: true - stream-interval: 30 - model-loader-extra-config: '{"enable_multithread_load": true}' - data-parallel-size: 8 - enable-dp-lm-head: true - enable-dp-attention: true - load-balance-method: total_tokens - max-running-requests: 208 - cuda-graph-max-bs: 208 -health_check: - max_attempts: 360 - interval_seconds: 10 -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '224' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml deleted file mode 100644 index 32cfbd4b72..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml +++ /dev/null @@ -1,173 +0,0 @@ -name: "gb300-fp4-glm5_1k1k_lowlat_0" - -# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152). -# Upstream uses a single combined file with `zip_override_*` arrays -# expanded by srtctl across zip indices. We split into one flat yaml -# per concrete topology to match the InferenceX dsv4 sglang convention -# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the -# prefill sglang_config are inlined here verbatim from the upstream -# `base:` block; the decode block is the upstream base plus the -# topology-specific override from this zip index. - -model: - path: "glm-5-fp4" - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: "fp4" - -# Released dynamo wheel; unlike hash-based sources, this recipe does not -# require a persistent /configs/dynamo-wheels build cache. -dynamo: - version: "1.1.0" - -slurm: - time_limit: "03:00:00" - -# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU -# default that turns dynamo install + sglang weight load into a serial -# crawl; mem=0 grants whole-node memory. -sbatch_directives: - cpus-per-task: "144" - mem: "0" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 17 - decode_workers: 17 - gpus_per_decode: 4 - -frontend: - type: dynamo - -backend: - type: sglang - - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" - SGLANG_MOE_NVFP4_DISPATCH: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: "nixl" - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: "total_tokens" - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_trtllm" - fp4-gemm-backend: "flashinfer_cutlass" - - # Other flags - # disable-shared-experts-fusion: true - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: "nixl" - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_cutedsl" - fp4-gemm-backend: "flashinfer_cutlass" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - # disable-shared-experts-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - # Parallelism (override from upstream zip_override_*_lowlat) - tensor-parallel-size: 4 - expert-parallel-size: 1 - data-parallel-size: 1 - enable-flashinfer-allreduce-fusion: true - - moe-runner-backend: "flashinfer_trtllm" - max-running-requests: 32 - cuda-graph-max-bs: 32 - - - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x256x128x64" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml deleted file mode 100644 index cf7ab32ee5..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml +++ /dev/null @@ -1,173 +0,0 @@ -name: "gb300-fp4-glm5_1k1k_lowlat_1" - -# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152). -# Upstream uses a single combined file with `zip_override_*` arrays -# expanded by srtctl across zip indices. We split into one flat yaml -# per concrete topology to match the InferenceX dsv4 sglang convention -# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the -# prefill sglang_config are inlined here verbatim from the upstream -# `base:` block; the decode block is the upstream base plus the -# topology-specific override from this zip index. - -model: - path: "glm-5-fp4" - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: "fp4" - -# Released dynamo wheel; unlike hash-based sources, this recipe does not -# require a persistent /configs/dynamo-wheels build cache. -dynamo: - version: "1.1.0" - -slurm: - time_limit: "03:00:00" - -# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU -# default that turns dynamo install + sglang weight load into a serial -# crawl; mem=0 grants whole-node memory. -sbatch_directives: - cpus-per-task: "144" - mem: "0" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 17 - decode_workers: 17 - gpus_per_decode: 4 - -frontend: - type: dynamo - -backend: - type: sglang - - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" - SGLANG_MOE_NVFP4_DISPATCH: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: "nixl" - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: "total_tokens" - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_trtllm" - fp4-gemm-backend: "flashinfer_cutlass" - - # Other flags - # disable-shared-experts-fusion: true - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: "nixl" - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_cutedsl" - fp4-gemm-backend: "flashinfer_cutlass" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - # disable-shared-experts-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - # Parallelism (override from upstream zip_override_*_lowlat) - tensor-parallel-size: 4 - expert-parallel-size: 1 - data-parallel-size: 1 - enable-flashinfer-allreduce-fusion: true - - moe-runner-backend: "flashinfer_trtllm" - max-running-requests: 1 - cuda-graph-max-bs: 1 - - - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml deleted file mode 100644 index 9cadc4c6f3..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: "gb300-fp4-glm5_1k1k_maxtpt_0" - -# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152). -# Upstream uses a single combined file with `zip_override_*` arrays -# expanded by srtctl across zip indices. We split into one flat yaml -# per concrete topology to match the InferenceX dsv4 sglang convention -# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the -# prefill sglang_config are inlined here verbatim from the upstream -# `base:` block; the decode block is the upstream base plus the -# topology-specific override from this zip index. - -model: - path: "glm-5-fp4" - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: "fp4" - -# Released dynamo wheel; unlike hash-based sources, this recipe does not -# require a persistent /configs/dynamo-wheels build cache. -dynamo: - version: "1.1.0" - -slurm: - time_limit: "03:00:00" - -# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU -# default that turns dynamo install + sglang weight load into a serial -# crawl; mem=0 grants whole-node memory. -sbatch_directives: - cpus-per-task: "144" - mem: "0" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 3 - prefill_workers: 3 - gpus_per_prefill: 4 - decode_nodes: 8 - decode_workers: 1 - gpus_per_decode: 32 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 - -backend: - type: sglang - - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" - SGLANG_MOE_NVFP4_DISPATCH: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: "nixl" - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: "total_tokens" - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_trtllm" - fp4-gemm-backend: "flashinfer_cutlass" - - # Other flags - # disable-shared-experts-fusion: true - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: "nixl" - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_cutedsl" - fp4-gemm-backend: "flashinfer_cutlass" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - # disable-shared-experts-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - # Parallelism (override from upstream zip_override_*_hightpt) - tensor-parallel-size: 32 - expert-parallel-size: 32 - data-parallel-size: 32 - - # dp - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - - # ep - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: "static" - - moe-a2a-backend: "deepep" - deepep-mode: "low_latency" - deepep-config: "/configs/deepep_config.json" - max-running-requests: 16384 - cuda-graph-max-bs: 512 - - - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "16500" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml deleted file mode 100644 index 73d8a2e307..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: "gb300-fp4-glm5_1k1k_maxtpt_1" - -# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152). -# Upstream uses a single combined file with `zip_override_*` arrays -# expanded by srtctl across zip indices. We split into one flat yaml -# per concrete topology to match the InferenceX dsv4 sglang convention -# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the -# prefill sglang_config are inlined here verbatim from the upstream -# `base:` block; the decode block is the upstream base plus the -# topology-specific override from this zip index. - -model: - path: "glm-5-fp4" - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: "fp4" - -# Released dynamo wheel; unlike hash-based sources, this recipe does not -# require a persistent /configs/dynamo-wheels build cache. -dynamo: - version: "1.1.0" - -slurm: - time_limit: "03:00:00" - -# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU -# default that turns dynamo install + sglang weight load into a serial -# crawl; mem=0 grants whole-node memory. -sbatch_directives: - cpus-per-task: "144" - mem: "0" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - prefill_workers: 2 - gpus_per_prefill: 4 - decode_nodes: 8 - decode_workers: 1 - gpus_per_decode: 32 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 - -backend: - type: sglang - - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" - SGLANG_MOE_NVFP4_DISPATCH: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: "nixl" - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: "total_tokens" - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_trtllm" - fp4-gemm-backend: "flashinfer_cutlass" - - # Other flags - # disable-shared-experts-fusion: true - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: "nixl" - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_cutedsl" - fp4-gemm-backend: "flashinfer_cutlass" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - # disable-shared-experts-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - # Parallelism (override from upstream zip_override_*_hightpt) - tensor-parallel-size: 32 - expert-parallel-size: 32 - data-parallel-size: 32 - - # dp - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - - # ep - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: "static" - - moe-a2a-backend: "deepep" - deepep-mode: "low_latency" - deepep-config: "/configs/deepep_config.json" - max-running-requests: 8192 - cuda-graph-max-bs: 256 - - - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "8300" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml deleted file mode 100644 index b7086cfc0f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: "gb300-fp4-glm5_1k1k_maxtpt_2" - -# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152). -# Upstream uses a single combined file with `zip_override_*` arrays -# expanded by srtctl across zip indices. We split into one flat yaml -# per concrete topology to match the InferenceX dsv4 sglang convention -# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the -# prefill sglang_config are inlined here verbatim from the upstream -# `base:` block; the decode block is the upstream base plus the -# topology-specific override from this zip index. - -model: - path: "glm-5-fp4" - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: "fp4" - -# Released dynamo wheel; unlike hash-based sources, this recipe does not -# require a persistent /configs/dynamo-wheels build cache. -dynamo: - version: "1.1.0" - -slurm: - time_limit: "03:00:00" - -# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU -# default that turns dynamo install + sglang weight load into a serial -# crawl; mem=0 grants whole-node memory. -sbatch_directives: - cpus-per-task: "144" - mem: "0" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 8 - decode_workers: 1 - gpus_per_decode: 32 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 - -backend: - type: sglang - - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" - SGLANG_MOE_NVFP4_DISPATCH: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: "nixl" - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: "total_tokens" - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_trtllm" - fp4-gemm-backend: "flashinfer_cutlass" - - # Other flags - # disable-shared-experts-fusion: true - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: "nixl" - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_cutedsl" - fp4-gemm-backend: "flashinfer_cutlass" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - # disable-shared-experts-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - # Parallelism (override from upstream zip_override_*_hightpt) - tensor-parallel-size: 32 - expert-parallel-size: 32 - data-parallel-size: 32 - - # dp - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - - # ep - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: "static" - - moe-a2a-backend: "deepep" - deepep-mode: "low_latency" - deepep-config: "/configs/deepep_config.json" - max-running-requests: 2304 - cuda-graph-max-bs: 72 - - - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2500x1024x512x256" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml deleted file mode 100644 index 56669629df..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: "gb300-fp4-glm5_8k1k_maxtpt_0" - -# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152). -# Upstream uses a single combined file with `zip_override_*` arrays -# expanded by srtctl across zip indices. We split into one flat yaml -# per concrete topology to match the InferenceX dsv4 sglang convention -# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the -# prefill sglang_config are inlined here verbatim from the upstream -# `base:` block; the decode block is the upstream base plus the -# topology-specific override from this zip index. - -model: - path: "glm-5-fp4" - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: "fp4" - -# Released dynamo wheel; unlike hash-based sources, this recipe does not -# require a persistent /configs/dynamo-wheels build cache. -dynamo: - version: "1.1.0" - -slurm: - time_limit: "03:00:00" - -# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU -# default that turns dynamo install + sglang weight load into a serial -# crawl; mem=0 grants whole-node memory. -sbatch_directives: - cpus-per-task: "144" - mem: "0" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 5 - prefill_workers: 5 - gpus_per_prefill: 4 - decode_nodes: 8 - decode_workers: 1 - gpus_per_decode: 32 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 - -backend: - type: sglang - - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" - SGLANG_MOE_NVFP4_DISPATCH: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: "nixl" - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: "total_tokens" - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_trtllm" - fp4-gemm-backend: "flashinfer_cutlass" - - # Other flags - # disable-shared-experts-fusion: true - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: "nixl" - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_cutedsl" - fp4-gemm-backend: "flashinfer_cutlass" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - # disable-shared-experts-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - # Parallelism (override from upstream zip_override_*_hightpt) - tensor-parallel-size: 32 - expert-parallel-size: 32 - data-parallel-size: 32 - - # dp - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - - # ep - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: "static" - - moe-a2a-backend: "deepep" - deepep-mode: "low_latency" - deepep-config: "/configs/deepep_config.json" - max-running-requests: 4096 - cuda-graph-max-bs: 4096 - - - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2048" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml deleted file mode 100644 index 7fa40fa423..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: "gb300-fp4-glm5_8k1k_maxtpt_1" - -# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152). -# Upstream uses a single combined file with `zip_override_*` arrays -# expanded by srtctl across zip indices. We split into one flat yaml -# per concrete topology to match the InferenceX dsv4 sglang convention -# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the -# prefill sglang_config are inlined here verbatim from the upstream -# `base:` block; the decode block is the upstream base plus the -# topology-specific override from this zip index. - -model: - path: "glm-5-fp4" - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: "fp4" - -# Released dynamo wheel; unlike hash-based sources, this recipe does not -# require a persistent /configs/dynamo-wheels build cache. -dynamo: - version: "1.1.0" - -slurm: - time_limit: "03:00:00" - -# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU -# default that turns dynamo install + sglang weight load into a serial -# crawl; mem=0 grants whole-node memory. -sbatch_directives: - cpus-per-task: "144" - mem: "0" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 7 - prefill_workers: 7 - gpus_per_prefill: 4 - decode_nodes: 8 - decode_workers: 1 - gpus_per_decode: 32 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 - -backend: - type: sglang - - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" - SGLANG_MOE_NVFP4_DISPATCH: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: "nixl" - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: "total_tokens" - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_trtllm" - fp4-gemm-backend: "flashinfer_cutlass" - - # Other flags - # disable-shared-experts-fusion: true - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: "nixl" - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_cutedsl" - fp4-gemm-backend: "flashinfer_cutlass" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - # disable-shared-experts-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - # Parallelism (override from upstream zip_override_*_hightpt) - tensor-parallel-size: 32 - expert-parallel-size: 32 - data-parallel-size: 32 - - # dp - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - - # ep - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: "static" - - moe-a2a-backend: "deepep" - deepep-mode: "low_latency" - deepep-config: "/configs/deepep_config.json" - max-running-requests: 4096 - cuda-graph-max-bs: 4096 - - - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "3072" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml deleted file mode 100644 index 2b6ef93511..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: "gb300-fp4-glm5_8k1k_maxtpt_2" - -# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152). -# Upstream uses a single combined file with `zip_override_*` arrays -# expanded by srtctl across zip indices. We split into one flat yaml -# per concrete topology to match the InferenceX dsv4 sglang convention -# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the -# prefill sglang_config are inlined here verbatim from the upstream -# `base:` block; the decode block is the upstream base plus the -# topology-specific override from this zip index. - -model: - path: "glm-5-fp4" - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: "fp4" - -# Released dynamo wheel; unlike hash-based sources, this recipe does not -# require a persistent /configs/dynamo-wheels build cache. -dynamo: - version: "1.1.0" - -slurm: - time_limit: "03:00:00" - -# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU -# default that turns dynamo install + sglang weight load into a serial -# crawl; mem=0 grants whole-node memory. -sbatch_directives: - cpus-per-task: "144" - mem: "0" - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 10 - prefill_workers: 10 - gpus_per_prefill: 4 - decode_nodes: 8 - decode_workers: 1 - gpus_per_decode: 32 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 - -backend: - type: sglang - - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" - PYTHONUNBUFFERED: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - MC_TE_METRIC: "true" - MC_FORCE_MNNVL: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" - SGLANG_MOE_NVFP4_DISPATCH: "1" - - sglang_config: - prefill: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "prefill" - disaggregation-transfer-backend: "nixl" - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: "total_tokens" - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_trtllm" - fp4-gemm-backend: "flashinfer_cutlass" - - # Other flags - # disable-shared-experts-fusion: true - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: "GLM-5-FP4" - trust-remote-code: true - - quantization: "modelopt_fp4" - kv-cache-dtype: "fp8_e4m3" - - # Disaggregation mode - disaggregation-mode: "decode" - disaggregation-transfer-backend: "nixl" - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: "trtllm" - nsa-prefill-backend: "trtllm" - moe-runner-backend: "flashinfer_cutedsl" - fp4-gemm-backend: "flashinfer_cutlass" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - # disable-shared-experts-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - # Parallelism (override from upstream zip_override_*_hightpt) - tensor-parallel-size: 32 - expert-parallel-size: 32 - data-parallel-size: 32 - - # dp - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - - # ep - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: "static" - - moe-a2a-backend: "deepep" - deepep-mode: "low_latency" - deepep-config: "/configs/deepep_config.json" - max-running-requests: 4096 - cuda-graph-max-bs: 4096 - - - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml deleted file mode 100644 index 33da57e947..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: gb300-fp8-glm5_1k1k_hightpt_0 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 12 - prefill_workers: 12 - decode_nodes: 6 - decode_workers: 1 -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 24 - expert-parallel-size: 24 - data-parallel-size: 24 - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: static - moe-a2a-backend: deepep - deepep-mode: low_latency - deepep-config: /configs/deepep_config.json - max-running-requests: 8192 - cuda-graph-max-bs: 512 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '8192' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml deleted file mode 100644 index 03cb7e671f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: gb300-fp8-glm5_1k1k_hightpt_1 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 10 - prefill_workers: 10 - decode_nodes: 8 - decode_workers: 1 -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 32 - expert-parallel-size: 32 - data-parallel-size: 32 - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: static - moe-a2a-backend: deepep - deepep-mode: low_latency - deepep-config: /configs/deepep_config.json - max-running-requests: 8192 - cuda-graph-max-bs: 256 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '7500' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml deleted file mode 100644 index c06206c81c..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: gb300-fp8-glm5_1k1k_hightpt_2 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 8 - prefill_workers: 8 - decode_nodes: 10 - decode_workers: 1 -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 40 - expert-parallel-size: 40 - data-parallel-size: 40 - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - ep-num-redundant-experts: 24 - ep-dispatch-algorithm: static - moe-a2a-backend: deepep - deepep-mode: low_latency - deepep-config: /configs/deepep_config.json - max-running-requests: 7200 - cuda-graph-max-bs: 180 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '7300' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml deleted file mode 100644 index 9517724799..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: gb300-fp8-glm5_1k1k_hightpt_3 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 6 - prefill_workers: 6 - decode_nodes: 12 - decode_workers: 1 -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 48 - expert-parallel-size: 48 - data-parallel-size: 48 - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: static - moe-a2a-backend: deepep - deepep-mode: low_latency - deepep-config: /configs/deepep_config.json - max-running-requests: 6144 - cuda-graph-max-bs: 128 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '6500' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml deleted file mode 100644 index 9a1f320a59..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: gb300-fp8-glm5_1k1k_hightpt_4 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 4 - prefill_workers: 4 - decode_nodes: 14 - decode_workers: 1 -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 56 - expert-parallel-size: 56 - data-parallel-size: 56 - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - ep-num-redundant-experts: 24 - ep-dispatch-algorithm: static - moe-a2a-backend: deepep - deepep-mode: low_latency - deepep-config: /configs/deepep_config.json - max-running-requests: 5600 - cuda-graph-max-bs: 100 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '5700' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml deleted file mode 100644 index 3ace5647c5..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml +++ /dev/null @@ -1,140 +0,0 @@ -name: gb300-fp8-glm5_1k1k_lowlat_0 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 17 - decode_workers: 17 -frontend: - type: dynamo -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 4 - expert-parallel-size: 1 - data-parallel-size: 1 - enable-flashinfer-allreduce-fusion: true - moe-runner-backend: flashinfer_trtllm - max-running-requests: 32 - cuda-graph-max-bs: 32 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: 512x256x128x64 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml deleted file mode 100644 index 965b6f1485..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml +++ /dev/null @@ -1,140 +0,0 @@ -name: gb300-fp8-glm5_1k1k_lowlat_1 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 17 - decode_workers: 17 -frontend: - type: dynamo -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 4 - expert-parallel-size: 1 - data-parallel-size: 1 - enable-flashinfer-allreduce-fusion: true - moe-runner-backend: flashinfer_trtllm - max-running-requests: 1 - cuda-graph-max-bs: 1 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 1024 - osl: 1024 - concurrencies: '32' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml deleted file mode 100644 index 150e62233f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: gb300-fp8-glm5_8k1k_hightpt_0 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 14 - prefill_workers: 14 - decode_nodes: 4 - decode_workers: 1 -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 16 - expert-parallel-size: 16 - data-parallel-size: 16 - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: static - moe-a2a-backend: deepep - deepep-mode: low_latency - deepep-config: /configs/deepep_config.json - max-running-requests: 2800 - cuda-graph-max-bs: 175 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '2800' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml deleted file mode 100644 index 6393069c87..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: gb300-fp8-glm5_8k1k_hightpt_1 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 12 - prefill_workers: 12 - decode_nodes: 6 - decode_workers: 1 -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 24 - expert-parallel-size: 24 - data-parallel-size: 24 - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: static - moe-a2a-backend: deepep - deepep-mode: low_latency - deepep-config: /configs/deepep_config.json - max-running-requests: 1680 - cuda-graph-max-bs: 70 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '1700' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml deleted file mode 100644 index 56b11ed8a9..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: gb300-fp8-glm5_8k1k_hightpt_2 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 10 - prefill_workers: 10 - decode_nodes: 8 - decode_workers: 1 -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 32 - expert-parallel-size: 32 - data-parallel-size: 32 - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - ep-num-redundant-experts: 32 - ep-dispatch-algorithm: static - moe-a2a-backend: deepep - deepep-mode: low_latency - deepep-config: /configs/deepep_config.json - max-running-requests: 1280 - cuda-graph-max-bs: 40 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '1300' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml deleted file mode 100644 index 13fb0f3267..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml +++ /dev/null @@ -1,148 +0,0 @@ -name: gb300-fp8-glm5_8k1k_hightpt_3 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 8 - prefill_workers: 8 - decode_nodes: 10 - decode_workers: 1 -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 9 -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 40 - expert-parallel-size: 40 - data-parallel-size: 40 - enable-dp-lm-head: true - enable-dp-attention: true - moe-dense-tp-size: 1 - ep-num-redundant-experts: 24 - ep-dispatch-algorithm: static - moe-a2a-backend: deepep - deepep-mode: low_latency - deepep-config: /configs/deepep_config.json - max-running-requests: 880 - cuda-graph-max-bs: 22 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '900' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml deleted file mode 100644 index 8065160bd7..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml +++ /dev/null @@ -1,140 +0,0 @@ -name: gb300-fp8-glm5_8k1k_lowlat_0 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 9 - decode_workers: 9 -frontend: - type: dynamo -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 4 - expert-parallel-size: 1 - data-parallel-size: 1 - enable-flashinfer-allreduce-fusion: true - moe-runner-backend: flashinfer_trtllm - max-running-requests: 15 - cuda-graph-max-bs: 15 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '150' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml deleted file mode 100644 index 33f0324b69..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml +++ /dev/null @@ -1,140 +0,0 @@ -name: gb300-fp8-glm5_8k1k_lowlat_1 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 17 - decode_workers: 17 -frontend: - type: dynamo -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 4 - expert-parallel-size: 1 - data-parallel-size: 1 - enable-flashinfer-allreduce-fusion: true - moe-runner-backend: flashinfer_trtllm - max-running-requests: 8 - cuda-graph-max-bs: 8 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: 128x64x32 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml deleted file mode 100644 index 64d4c701a4..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml +++ /dev/null @@ -1,140 +0,0 @@ -name: gb300-fp8-glm5_8k1k_lowlat_2 - -model: - path: glm-5-fp8 - container: "lmsysorg/sglang:v0.5.11-cu130" - precision: fp8 - -resources: - gpu_type: gb300 - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - decode_nodes: 17 - decode_workers: 17 -frontend: - type: dynamo -dynamo: - version: 1.1.0 - -backend: - prefill_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - - decode_environment: - TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' - PYTHONUNBUFFERED: '1' - DYN_SKIP_SGLANG_LOG_FORMATTING: '1' - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' - MC_TE_METRIC: 'true' - MC_FORCE_MNNVL: '1' - NCCL_MNNVL_ENABLE: '1' - NCCL_CUMEM_ENABLE: '1' - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' - SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' - DYN_REQUEST_PLANE: nats - # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). - # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' - - sglang_config: - prefill: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: prefill - disaggregation-transfer-backend: nixl - - # Size limits - max-running-requests: 256 - cuda-graph-max-bs: 256 - mem-fraction-static: 0.7 - context-length: 9600 - chunked-prefill-size: 32768 - max-prefill-tokens: 8192 - - # Parallelism - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 1 - enable-dp-attention: true - enable-dp-lm-head: true - load-balance-method: total_tokens - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - moe-runner-backend: flashinfer_trtllm - - # Other flags - enable-flashinfer-allreduce-fusion: true - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - - decode: - # Model configuration - served-model-name: GLM-5-FP8 - trust-remote-code: true - - quantization: fp8 - kv-cache-dtype: fp8_e4m3 - - # Disaggregation mode - disaggregation-mode: decode - disaggregation-transfer-backend: nixl - - # Memory and token limits - mem-fraction-static: 0.8 - context-length: 9600 - - # Backend - nsa-decode-backend: trtllm - nsa-prefill-backend: trtllm - # moe-runner-backend: "cutedsl" - - # Detokenizer - skip-tokenizer-init: true - stream-interval: 30 - - # Other flags - disable-radix-cache: true - weight-loader-prefetch-checkpoints: true - model-loader-extra-config: '{"enable_multithread_load": true}' - tensor-parallel-size: 4 - expert-parallel-size: 1 - data-parallel-size: 1 - enable-flashinfer-allreduce-fusion: true - moe-runner-backend: flashinfer_trtllm - max-running-requests: 1 - cuda-graph-max-bs: 1 -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: sa-bench - req_rate: inf - isl: 8192 - osl: 1024 - concurrencies: '24' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml deleted file mode 100644 index cdbe0668b9..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml +++ /dev/null @@ -1,138 +0,0 @@ -name: "svf-vllm-disagg-b200-high-tpt-megamoe" - -# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-high-tpt-megamoe.yaml -# -# B200 adaptation of the GB200 recipe below. Each prefill/decode worker uses -# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.1 to -# match nvidia-master.yaml image (which the launch script registers as -# the alias key in srtslurm.yaml). Upstream variants ship either the -# non-dynamo floating tag or a sha256 pin. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache model loads. -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.23.0" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - -frontend: - type: dynamo - enable_multiple_frontends: false -backend: - type: vllm - connector: null - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - enforce-eager: true - max-model-len: 9280 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.9 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - tokenizer-mode: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - max-model-len: 9280 - max-num-seqs: 512 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 512 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512x1024" - req_rate: "inf" - use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.23.0" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.23.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml deleted file mode 100644 index 7549794136..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml +++ /dev/null @@ -1,137 +0,0 @@ -name: "svf-vllm-disagg-b200-low-latency" - -# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-latency.yaml -# -# B200 adaptation of the GB200 recipe below. Each prefill/decode worker uses -# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.1 to -# match nvidia-master.yaml image (which the launch script registers as -# the alias key in srtslurm.yaml). Upstream variants ship either the -# non-dynamo floating tag or a sha256 pin. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache model loads. -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.23.0" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - -frontend: - type: dynamo - enable_multiple_frontends: false -backend: - type: vllm - connector: null - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 - # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" - tokenizer-mode: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 -# data-parallel-size: 8 -# data-parallel-rpc-port: 13345 -# enable-expert-parallel: true - max-model-len: 16384 - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x16x32x64x128" - req_rate: "inf" - use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" - -identity: - container: - image: "vllm/vllm-openai:v0.23.0" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.23.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml deleted file mode 100644 index 533ad0bf88..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml +++ /dev/null @@ -1,138 +0,0 @@ -name: "svf-vllm-disagg-b200-low-middle-curve" - -# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-middle-curve.yaml -# -# B200 adaptation of the GB200 recipe below. Each prefill/decode worker uses -# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.1 to -# match nvidia-master.yaml image (which the launch script registers as -# the alias key in srtslurm.yaml). Upstream variants ship either the -# non-dynamo floating tag or a sha256 pin. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache model loads. -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.23.0" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 4 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - -frontend: - type: dynamo - enable_multiple_frontends: false -backend: - type: vllm - connector: null - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 - # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" - tokenizer-mode: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 -# data-parallel-size: 8 -# data-parallel-rpc-port: 13345 -# enable-expert-parallel: true - max-model-len: 16384 - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" - req_rate: "inf" - use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" - -identity: - container: - image: "vllm/vllm-openai:v0.23.0" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.23.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml deleted file mode 100644 index eb4c5308b5..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml +++ /dev/null @@ -1,138 +0,0 @@ -name: "svf-vllm-disagg-b200-max-tpt-megamoe" - -# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-max-tpt-megamoe.yaml -# -# B200 adaptation of the GB200 recipe below. Each prefill/decode worker uses -# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.1 to -# match nvidia-master.yaml image (which the launch script registers as -# the alias key in srtslurm.yaml). Upstream variants ship either the -# non-dynamo floating tag or a sha256 pin. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache model loads. -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.23.0" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 3 - decode_nodes: 1 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - -frontend: - type: dynamo - enable_multiple_frontends: false -backend: - type: vllm - connector: null - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - enforce-eager: true - max-model-len: 9280 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.9 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - tokenizer-mode: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - max-model-len: 9280 - max-num-seqs: 512 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 512 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "8192x12345" - req_rate: "inf" - use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.23.0" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.23.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml deleted file mode 100644 index c17605b1c8..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml +++ /dev/null @@ -1,138 +0,0 @@ -name: "svf-vllm-disagg-b200-mid-curve-megamoe" - -# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-mid-curve-megamoe.yaml -# -# B200 adaptation of the GB200 recipe below. Each prefill/decode worker uses -# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.1 to -# match nvidia-master.yaml image (which the launch script registers as -# the alias key in srtslurm.yaml). Upstream variants ship either the -# non-dynamo floating tag or a sha256 pin. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache model loads. -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.23.0" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - -frontend: - type: dynamo - enable_multiple_frontends: false -backend: - type: vllm - connector: null - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - enforce-eager: true - max-model-len: 9280 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.9 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - tokenizer-mode: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - max-model-len: 9280 - max-num-seqs: 512 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 512 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512x1024" - req_rate: "inf" - use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.23.0" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.23.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml deleted file mode 100644 index 1ba829a513..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml +++ /dev/null @@ -1,125 +0,0 @@ -name: "svf-vllm-disagg-b300-low-middle-curve" - -# B300 adaptation of the DSV4 GB200/B200 vLLM disagg recipe. Each worker uses -# one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.23.0" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 4 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - attention-config: '{"use_fp4_indexer_cache": true}' - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 16 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 - tokenizer-mode: deepseek_v4 - reasoning-parser: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - max-model-len: 16384 - max-num-seqs: 256 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - reasoning-parser: deepseek_v4 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" - req_rate: "inf" - use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" - -identity: - container: - image: "vllm/vllm-openai:v0.23.0" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.23.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml deleted file mode 100644 index cb20003ecd..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml +++ /dev/null @@ -1,130 +0,0 @@ -name: "svf-vllm-disagg-b300-max-tpt-megamoe" - -# B300 adaptation of the DSV4 GB200/B200 vLLM disagg recipe. Each worker uses -# one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.23.0" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 3 - decode_nodes: 1 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - NCCL_CUMEM_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - attention-config: '{"use_fp4_indexer_cache": true}' - enforce-eager: true - max-model-len: 9280 - max-num-seqs: 16 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' - gpu-memory-utilization: 0.85 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - tokenizer-mode: deepseek_v4 - reasoning-parser: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - attention-config: '{"use_fp4_indexer_cache": true}' - max-model-len: 9280 - max-num-seqs: 512 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 512 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' - gpu-memory-utilization: 0.85 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - reasoning-parser: deepseek_v4 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096" - req_rate: "inf" - use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.23.0" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.23.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml deleted file mode 100644 index badf45403e..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-2xdep2" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 2 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2048x4096x8192" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml deleted file mode 100644 index c3c994bca2..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2-c6144" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "6144" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml deleted file mode 100644 index 5b352e35f8..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml deleted file mode 100644 index b7809a9e24..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-1k1k-3p2xdep4" - -# Rate-matched dep4 at 1k/1k. -# Measured X_dep4/P = 56.8k / 38k = 1.49; 3P:2D ratio = 1.5 ✓ - - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml deleted file mode 100644 index 683f4c72d2..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-2p1xdep8" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048x4096" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml deleted file mode 100644 index bc6a6a1ac4..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4-1p1d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "16" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml deleted file mode 100644 index 5d7072ea5f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4-1p2d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32x64" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml deleted file mode 100644 index 23ec9444c8..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p1d-hi-conc" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml deleted file mode 100644 index 4a56ab27ee..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p1d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32x64x128" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml deleted file mode 100644 index 87c928c63e..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p2d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml deleted file mode 100644 index e828387150..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p3d-hi-conc" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml deleted file mode 100644 index 268a585359..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p3d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "64x128x256x512" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml deleted file mode 100644 index 0d83e2e63a..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-1k1k-2p3xtp4ep" - -# Better-matched tp4ep at 1k/1k. -# Measured X_tp4ep/P = 24.1k / 38k = 0.63; 2P:3D ratio = 0.67 ✓ - - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256x1024" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml deleted file mode 100644 index 0a867e508f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml +++ /dev/null @@ -1,75 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-8k1k-2p1xdep4" - -# Rate-matched dep4 at 8k/1k. -# Measured X_dep4_8k = 13.6k tok/s; rate-match ratio = X*8/P_8k = 13.6*8/58 = 1.88 -# 2P:1D = 2.0, much closer to optimum than 4P:1D (2× over-prefilled). - - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512x1024x2048" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml deleted file mode 100644 index 75c7b9d737..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-8k1k-1p1xtp4" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml deleted file mode 100644 index c43abe5958..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-8k1k-1p1xtp4ep-hi-conc" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512x1024" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml deleted file mode 100644 index 3d295e2904..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-8k1k-1p1xtp4ep" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32x64" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/dep8.yaml deleted file mode 100644 index 504da7e4d0..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/dep8.yaml +++ /dev/null @@ -1,75 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-fp8-decode-focus-dep8" - -# Over-prefilled (4P:1D-dep8) at 1k/1k to measure X_dep8_fp8_gb200. -# 4P × 48k = 192k vs dep8 X ≈ 90k → 2.1× buffer. - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 4 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p1d-tp4ep.yaml deleted file mode 100644 index 1003a1c541..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p1d-tp4ep.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-1p1d-tp4ep" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p3d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p3d-tp4ep.yaml deleted file mode 100644 index f79e03c991..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p3d-tp4ep.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-1p3d-tp4ep" - -# Rate-matched tp4ep for FP8 GB200 1k/1k. -# X_tp4ep_fp8_gb200 = 17.9k tok/s; P_per_worker = 48k; ideal X/P = 0.37; 1P:3D = 0.33 ✓ - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 3 - prefill_workers: 1 - decode_workers: 3 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p4d-dep2-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p4d-dep2-hi-conc.yaml deleted file mode 100644 index 8168374450..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-1p4d-dep2-hi-conc.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-1p4d-dep2-hi-conc" - -# Rate-matched dep2 for FP8 GB200 1k/1k. -# X_dep2_fp8_gb200 = 12.7k tok/s; P_per_worker = 48k; ideal X/P = 0.27; 1P:4D = 0.25 ✓ - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 2 - gpus_per_decode: 2 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p1d-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p1d-dep8.yaml deleted file mode 100644 index 92855acbeb..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p1d-dep8.yaml +++ /dev/null @@ -1,86 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-2p1d-dep8" - -# model: -# path: "minimax-m2.5-fp8" -# container: "v0.18.1" -# precision: "fp8" - -# dynamo: -# version: 1.0.1 -# install: true - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024" - # warmup_prompts: 1 - # use_chat_template: false - # req_rate: "inf" - # random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p3d-dep4-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p3d-dep4-hi-conc.yaml deleted file mode 100644 index eb66c2041c..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/disagg-b200-2p3d-dep4-hi-conc.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-2p3d-dep4-hi-conc" - -# Rate-matched dep4 for FP8 GB200 1k/1k. -# X_dep4_fp8_gb200 = 30.9k tok/s; P_per_worker = 48k; ideal X/P = 0.64; 2P:3D = 0.67 ✓ - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 3 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/tp4ep.yaml deleted file mode 100644 index dc54a69606..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/1k1k/tp4ep.yaml +++ /dev/null @@ -1,73 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-fp8-decode-focus-tp4ep" - -# Over-prefilled (1P:1D-tp4ep, high conc) at 1k/1k to measure X_tp4ep_fp8_gb200. -# 1P × 48k = 48k vs tp4ep X ≈ 18-24k → 2-2.7× buffer. - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep-hi-conc.yaml deleted file mode 100644 index bf89bdc574..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep-hi-conc.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-1p1d-tp4ep-hi-conc" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep.yaml deleted file mode 100644 index 6268370cc0..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-1p1d-tp4ep.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-1p1d-tp4ep" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "16x32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-3p2d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-3p2d-dep4.yaml deleted file mode 100644 index d67da94d0f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/8k1k/disagg-b200-3p2d-dep4.yaml +++ /dev/null @@ -1,76 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b200-8k1k-3p2d-dep4" - -# Rate-matched dep4 for FP8 GB200 8k/1k. -# X_dep4_fp8_gb200_8k ≈ 9.8k tok/s (from 5p2d-dep4 saturation); -# P_per_worker_8k = 57k; ratio = X*8/P = 78.4/57 = 1.38; 3P:2D = 1.5 ✓ (closest int fit) - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b200" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - max-num-batched-tokens: 16384 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p1d-tp4.yaml deleted file mode 100644 index e42e7b9419..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p1d-tp4.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-1p1d-tp4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "8x16x32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p2d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p2d-tp4.yaml deleted file mode 100644 index 6d8513fecc..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p2d-tp4.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-1p2d-tp4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32x64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p2d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p2d-tp4ep.yaml deleted file mode 100644 index 927ac087eb..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-1p2d-tp4ep.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-1p2d-tp4ep" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p1d-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p1d-dep8.yaml deleted file mode 100644 index 20cb76d530..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p1d-dep8.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-2p1d-dep8" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p2d-dep4-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p2d-dep4-hi-conc.yaml deleted file mode 100644 index f2f23c6b5e..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p2d-dep4-hi-conc.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-2p2d-dep4-hi-conc" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p2d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p2d-dep4.yaml deleted file mode 100644 index 9edc59915a..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p2d-dep4.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-2p2d-dep4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p3d-dep2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p3d-dep2.yaml deleted file mode 100644 index 38c7c467af..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/1k1k/disagg-b300-2p3d-dep2.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-2p3d-dep2" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-1p1d-tp4ep-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-1p1d-tp4ep-hi-conc.yaml deleted file mode 100644 index b42a58dfb5..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-1p1d-tp4ep-hi-conc.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-1p1d-tp4ep-hi-conc" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-1p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-1p1d-tp4ep.yaml deleted file mode 100644 index 095e737761..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-1p1d-tp4ep.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-1p1d-tp4ep" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "16x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-2p1d-tp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-2p1d-tp2.yaml deleted file mode 100644 index 8efa46ea41..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-2p1d-tp2.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-2p1d-tp2" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 2 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 2 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32" - use_chat_template: false - req_rate: "inf" - random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-2p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-2p1d-tp4ep.yaml deleted file mode 100644 index b1a7d92814..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-2p1d-tp4ep.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-2p1d-tp4ep" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-dep4-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-dep4-hi-conc.yaml deleted file mode 100644 index 4e859cd7e6..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-dep4-hi-conc.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-3p1d-dep4-hi-conc" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024x2048" - use_chat_template: false - req_rate: "inf" - random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-dep4.yaml deleted file mode 100644 index c6f47d49f8..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-dep4.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-3p1d-dep4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" - use_chat_template: false - req_rate: "inf" - random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-tp4.yaml deleted file mode 100644 index 13b073cfb0..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-3p1d-tp4.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-3p1d-tp4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64" - use_chat_template: false - req_rate: "inf" - random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-5p2d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-5p2d-dep4.yaml deleted file mode 100644 index 6ece6ec6fc..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8/8k1k/disagg-b300-5p2d-dep4.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-5p2d-dep4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 5 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-1p2d.yaml deleted file mode 100644 index d6e6dc53c6..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-1p2d.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-decode-2xdep2" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 2 -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d-c6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d-c6144.yaml deleted file mode 100644 index 3fd24aa253..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d-c6144.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-decode-2p3xdep2-c6144" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "6144x8192" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d.yaml deleted file mode 100644 index bc68f6d59e..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-decode-2p3xdep2" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2048x4096" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep8-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep8-2p1d.yaml deleted file mode 100644 index 516e51f113..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep8-2p1d.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-decode-2p1xdep8" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x1536x2048x4096" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml deleted file mode 100644 index 726b5a63b1..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml +++ /dev/null @@ -1,73 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp4-1p1d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x16" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml deleted file mode 100644 index 77329ffcc9..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp4-1p2d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "8x16" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml deleted file mode 100644 index 4f25aee385..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp4ep-1p1d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32x64x128" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml deleted file mode 100644 index 8da4cb7ca1..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp4ep-1p3d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "64x128x256x1024" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml deleted file mode 100644 index 757eeed973..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml +++ /dev/null @@ -1,78 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp8-1p1d" - -# B300-only: full-node TP=8 decode (the 8 GPUs of a single B300 node). -# Cousin of tp4-1p1d.yaml but exercises the wider TP that B300's per-node -# GPU count makes available. Only the smallest concurrencies (1,4,8) — -# this topology is decode-latency focused, not throughput. - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep4-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep4-4p1d.yaml deleted file mode 100644 index 258e9ba4f7..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep4-4p1d.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-8k1k-4p1xdep4" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 4 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "384x512" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep8-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep8-4p1d.yaml deleted file mode 100644 index 1f41e52e24..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep8-4p1d.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-8k1k-4p1xdep8" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 4 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "384" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml deleted file mode 100644 index 91761b75fc..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-8k1k-1p1xtp4" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2x4x8x16" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml deleted file mode 100644 index 76b000e8c2..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-8k1k-1p1xtp4ep" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32x128" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml deleted file mode 100644 index b34025ee2b..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-8k1k-2p1xtp4ep" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64x128x256x512" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml deleted file mode 100644 index ea276c25ac..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml +++ /dev/null @@ -1,73 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-b300-8k1k-1p1xtp8" - -# B300-only: full-node TP=8 decode at 8k input. Cousin of tp4-1p1d.yaml -# but exercises the wider TP that B300's per-node GPU count makes -# available. Smallest concurrencies only (1,4,8). - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - allow_prefill_decode_colocation: true - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p1d-tp4.yaml deleted file mode 100644 index 4475c45485..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p1d-tp4.yaml +++ /dev/null @@ -1,64 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-1p1d-tp4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "8x16x32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4.yaml deleted file mode 100644 index 005d3ab451..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4.yaml +++ /dev/null @@ -1,69 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-1p2d-tp4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32x64x128x256x512" - # warmup_prompts: 1 - # use_chat_template: false - # req_rate: "inf" - # random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4ep.yaml deleted file mode 100644 index 42e2bbff7d..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4ep.yaml +++ /dev/null @@ -1,66 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-1p2d-tp4ep" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p1d-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p1d-dep8.yaml deleted file mode 100644 index dadaea41cd..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p1d-dep8.yaml +++ /dev/null @@ -1,83 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-2p1d-dep8" - -# model: -# path: "minimax-m2.5-fp8" -# container: "v0.18.1" -# precision: "fp8" - -# dynamo: -# version: 1.0.1 -# install: true - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256x512x1024" - # warmup_prompts: 1 - # use_chat_template: false - # req_rate: "inf" - # random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4-hi-conc.yaml deleted file mode 100644 index 95a6f4032f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4-hi-conc.yaml +++ /dev/null @@ -1,82 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-2p2d-dep4-hi-conc" - -# model: -# path: "minimax-m2.5-fp8" -# container: "v0.18.1" -# precision: "fp8" - -# dynamo: -# version: 1.0.1 -# install: true - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" - # use_chat_template: false - # req_rate: "inf" - # random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4.yaml deleted file mode 100644 index 90d14b5b0b..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4.yaml +++ /dev/null @@ -1,82 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-2p2d-dep4" - -# model: -# path: "minimax-m2.5-fp8" -# container: "v0.18.1" -# precision: "fp8" - -# dynamo: -# version: 1.0.1 -# install: true - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024x2048" - # use_chat_template: false - # req_rate: "inf" - # random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p3d-dep2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p3d-dep2.yaml deleted file mode 100644 index ef4bfc846f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p3d-dep2.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-2p3d-dep2" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep-hi-conc.yaml deleted file mode 100644 index f9e9ccf793..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep-hi-conc.yaml +++ /dev/null @@ -1,66 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-1p1d-tp4ep-hi-conc" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep.yaml deleted file mode 100644 index 76e72c229c..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep.yaml +++ /dev/null @@ -1,66 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-1p1d-tp4ep" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "16x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp2.yaml deleted file mode 100644 index f71458a70b..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp2.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-2p1d-tp2" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 2 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 2 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32" - use_chat_template: false - req_rate: "inf" - random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp4ep.yaml deleted file mode 100644 index 668cf185bd..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp4ep.yaml +++ /dev/null @@ -1,66 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-2p1d-tp4ep" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4-hi-conc.yaml deleted file mode 100644 index 94b866d954..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4-hi-conc.yaml +++ /dev/null @@ -1,82 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-3p1d-dep4-hi-conc" - -# model: -# path: "minimax-m2.5-fp8" -# container: "v0.18.1" -# precision: "fp8" - -# dynamo: -# version: 1.0.1 -# install: true - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024x2048" - use_chat_template: false - req_rate: "inf" - random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4.yaml deleted file mode 100644 index 9bb6081db5..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4.yaml +++ /dev/null @@ -1,82 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-3p1d-dep4" - -# model: -# path: "minimax-m2.5-fp8" -# container: "v0.18.1" -# precision: "fp8" - -# dynamo: -# version: 1.0.1 -# install: true - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" - use_chat_template: false - req_rate: "inf" - random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-tp4.yaml deleted file mode 100644 index b638c03512..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-tp4.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-3p1d-tp4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64" - use_chat_template: false - req_rate: "inf" - random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-5p2d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-5p2d-dep4.yaml deleted file mode 100644 index ed2a9cdd4c..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-5p2d-dep4.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-5p2d-dep4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 5 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml deleted file mode 100644 index 120a35e45f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-1p1d-tp4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml deleted file mode 100644 index 6b5e76e429..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-1p2d-tp4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2x32x64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml deleted file mode 100644 index 765562d0c8..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-1p3d-tp4ep" - -# Rate-matched tp4ep for FP8 GB200 1k/1k. -# X_tp4ep_fp8_gb200 = 17.9k tok/s; P_per_worker = 48k; ideal X/P = 0.37; 1P:3D = 0.33 ✓ - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 3 - prefill_workers: 1 - decode_workers: 3 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml deleted file mode 100644 index aeeb8a0125..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-1p4d-dep2" - -# Rate-matched dep2 for FP8 GB200 1k/1k. -# X_dep2_fp8_gb200 = 12.7k tok/s; P_per_worker = 48k; ideal X/P = 0.27; 1P:4D = 0.25 ✓ - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 2 - gpus_per_decode: 2 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml deleted file mode 100644 index 83bc7aeb26..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml +++ /dev/null @@ -1,86 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-2p1d-dep8" - -# model: -# path: "minimax-m2.5-fp8" -# container: "v0.18.1" -# precision: "fp8" - -# dynamo: -# version: 1.0.1 -# install: true - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - - - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024" - # warmup_prompts: 1 - # use_chat_template: false - # req_rate: "inf" - # random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml deleted file mode 100644 index 5340192221..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-2p3d-dep4" - -# Rate-matched dep4 for FP8 GB200 1k/1k. -# X_dep4_fp8_gb200 = 30.9k tok/s; P_per_worker = 48k; ideal X/P = 0.64; 2P:3D = 0.67 ✓ - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 3 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml deleted file mode 100644 index 847c4b1386..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-8k1k-1p1d-tp4" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - max-num-batched-tokens: 16384 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x4x8x16x32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml deleted file mode 100644 index 61497185aa..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-1p1d-tp4ep" - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - max-num-batched-tokens: 16384 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml deleted file mode 100644 index c927571469..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml +++ /dev/null @@ -1,76 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-8k1k-3p2d-dep4" - -# Rate-matched dep4 for FP8 GB200 8k/1k. -# X_dep4_fp8_gb200_8k ≈ 9.8k tok/s (from 5p2d-dep4 saturation); -# P_per_worker_8k = 57k; ratio = X*8/P = 78.4/57 = 1.38; 3P:2D = 1.5 ✓ (closest int fit) - -model: - path: "minimax-m2.5-fp8" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - max-num-batched-tokens: 16384 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024x2048x4096" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-1p2d.yaml deleted file mode 100644 index 4decc38b1e..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-1p2d.yaml +++ /dev/null @@ -1,73 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-2xdep2" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 2 - spread_workers: true - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2048x4096x8192" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d-c6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d-c6144.yaml deleted file mode 100644 index e99cb27250..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d-c6144.yaml +++ /dev/null @@ -1,73 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-2p3xdep2-c6144" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 3 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 - spread_workers: true - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "6144" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d.yaml deleted file mode 100644 index 50d054f1d9..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep2-2p3d.yaml +++ /dev/null @@ -1,73 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-2p3xdep2" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 3 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 - spread_workers: true - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep4-3p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep4-3p2d.yaml deleted file mode 100644 index 8016fa160e..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep4-3p2d.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-1k1k-3p2xdep4" - -# Rate-matched dep4 at 1k/1k. -# Measured X_dep4/P = 56.8k / 38k = 1.49; 3P:2D ratio = 1.5 ✓ - - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep8-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep8-2p1d.yaml deleted file mode 100644 index f9b97131c5..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/dep8-2p1d.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-2p1xdep8" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048x4096" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p1d.yaml deleted file mode 100644 index 7b4a98182e..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p1d.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-focus-tp4-1p1d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "16" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p2d.yaml deleted file mode 100644 index 71aafe4e5f..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4-1p2d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-focus-tp4-1p2d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32x64" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d-hi-conc.yaml deleted file mode 100644 index f8efd4f25b..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d-hi-conc.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-focus-tp4ep-1p1d-hi-conc" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d.yaml deleted file mode 100644 index 345f34f351..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p1d.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-focus-tp4ep-1p1d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32x64x128" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p2d.yaml deleted file mode 100644 index 4194388aca..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p2d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-focus-tp4ep-1p2d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d-hi-conc.yaml deleted file mode 100644 index fe47c7834b..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d-hi-conc.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-focus-tp4ep-1p3d-hi-conc" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 3 - prefill_workers: 1 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d.yaml deleted file mode 100644 index 05cbf316c5..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-1p3d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-decode-focus-tp4ep-1p3d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 3 - prefill_workers: 1 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "64x128x256x512" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-2p3d.yaml deleted file mode 100644 index 60a6639f56..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/1k1k/tp4ep-2p3d.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-1k1k-2p3xtp4ep" - -# Better-matched tp4ep at 1k/1k. -# Measured X_tp4ep/P = 24.1k / 38k = 0.63; 2P:3D ratio = 0.67 ✓ - - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 3 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256x1024" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/dep4-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/dep4-2p1d.yaml deleted file mode 100644 index 07bab8f319..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/dep4-2p1d.yaml +++ /dev/null @@ -1,75 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-8k1k-2p1xdep4" - -# Rate-matched dep4 at 8k/1k. -# Measured X_dep4_8k = 13.6k tok/s; rate-match ratio = X*8/P_8k = 13.6*8/58 = 1.88 -# 2P:1D = 2.0, much closer to optimum than 4P:1D (2× over-prefilled). - - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512x1024x2048" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4-1p1d.yaml deleted file mode 100644 index a03d6b69ab..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4-1p1d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-8k1k-1p1xtp4" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d-hi-conc.yaml deleted file mode 100644 index f38890f1ee..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d-hi-conc.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-8k1k-1p1xtp4ep-hi-conc" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512x1024" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d.yaml deleted file mode 100644 index 946bc64b00..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200/8k1k/tp4ep-1p1d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb200-8k1k-1p1xtp4ep" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32x64" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml deleted file mode 100644 index c7f7e28afb..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml +++ /dev/null @@ -1,73 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2-c6144" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 3 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 - spread_workers: true - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "6144x8192" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml deleted file mode 100644 index adaf6f271e..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml +++ /dev/null @@ -1,73 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 3 - prefill_workers: 2 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 2 - spread_workers: true - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2048" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml deleted file mode 100644 index 28427e002d..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-decode-2p1xdep8" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 128 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048x4096" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml deleted file mode 100644 index eee93c9f8b..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4-1p1d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - UCX_RCACHE_MAX_UNRELEASED: "1024" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2x4x16" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml deleted file mode 100644 index 10ba980cac..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4-1p2d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x64" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml deleted file mode 100644 index ebff26fb00..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4ep-1p1d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32x64x128" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml deleted file mode 100644 index 5353e4dd02..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4ep-1p3d" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 3 - prefill_workers: 1 - decode_workers: 3 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 2048 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "64x128x256x512x1024" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml deleted file mode 100644 index d3c777618d..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-8k1k-4p1xdep4" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 4 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml deleted file mode 100644 index a56c095afd..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-8k1k-4p1xdep8" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 4 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - max-num-seqs: 864 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024x2048" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml deleted file mode 100644 index a92975c57a..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-8k1k-1p1xtp4" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: false - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2x4x8x16" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml deleted file mode 100644 index 53daeafbdb..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-8k1k-1p1xtp4ep" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32x64x128x256" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml deleted file mode 100644 index 163d412f5b..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: "minimax-m2.5-vllm-disagg-gb300-8k1k-2p1xtp4ep" - -model: - path: "minimax-m2.5-nvfp4" - container: "vllm/vllm-openai:v0.20.1" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -setup_script: install-deps.sh - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 1 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - trust-remote-code: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - kv-cache-dtype: "fp8" - tensor-parallel-size: 4 - enable-expert-parallel: true - no-enable-prefix-caching: true - max-model-len: 9280 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - gpu-memory-utilization: 0.90 - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64x128" - random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml deleted file mode 100644 index bc4c449b29..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml +++ /dev/null @@ -1,81 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep8-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml deleted file mode 100644 index cf8736e143..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml +++ /dev/null @@ -1,81 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-dep8-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml deleted file mode 100644 index 9572688b2c..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml +++ /dev/null @@ -1,79 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 8 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml deleted file mode 100644 index 6f765ab746..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml +++ /dev/null @@ -1,81 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-4p3d-fp8-dep2-dep4-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 4 - decode_workers: 3 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 512 # Per DP rank: 3 workers x DP4 = 12 ranks. - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml deleted file mode 100644 index d40a335829..0000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml +++ /dev/null @@ -1,79 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tep8-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 5 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 8 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x64" - req_rate: "inf" diff --git a/configs/nvidia-master.yaml b/configs/nvidia-master.yaml index 46154341c5..bdb6a46029 100644 --- a/configs/nvidia-master.yaml +++ b/configs/nvidia-master.yaml @@ -7902,7 +7902,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml" + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 5 tp: 8 @@ -7916,7 +7916,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml" + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 6 tp: 8 @@ -7930,7 +7930,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_0.yaml" + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" decode: num-worker: 1 tp: 8 @@ -7944,7 +7944,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/dsr1/b200-fp4/1k1k/disagg/mtp/1k1k_mtp_maxtpt_1.yaml" + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" decode: num-worker: 2 tp: 8 @@ -10618,7 +10618,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_1k1k_hightpt[0]" decode: num-worker: 1 tp: 8 @@ -10631,7 +10631,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_1k1k_hightpt[1]" decode: num-worker: 2 tp: 8 @@ -10644,7 +10644,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_1k1k_hightpt[2]" decode: num-worker: 3 tp: 8 @@ -10657,7 +10657,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_maxtpt_3.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_1k1k_hightpt[3]" decode: num-worker: 4 tp: 8 @@ -10670,7 +10670,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_1k1k_lowlat[0]" decode: num-worker: 8 tp: 8 @@ -10683,7 +10683,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_1k1k_lowlat[1]" decode: num-worker: 8 tp: 8 @@ -10699,7 +10699,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_8k1k_hightpt[0]" decode: num-worker: 1 tp: 8 @@ -10712,7 +10712,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_8k1k_hightpt[1]" decode: num-worker: 1 tp: 8 @@ -10725,7 +10725,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_8k1k_hightpt[2]" decode: num-worker: 2 tp: 8 @@ -10738,7 +10738,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_8k1k_lowlat[0]" decode: num-worker: 2 tp: 8 @@ -10751,7 +10751,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_8k1k_lowlat[1]" decode: num-worker: 3 tp: 8 @@ -10764,7 +10764,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_8k1k_lowlat[2]" decode: num-worker: 4 tp: 8 @@ -10777,7 +10777,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_8k1k_lowlat[3]" decode: num-worker: 5 tp: 8 @@ -10790,7 +10790,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_8k1k_lowlat[4]" decode: num-worker: 7 tp: 8 @@ -10803,7 +10803,7 @@ glm5-fp8-b200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_5.yaml" + - "CONFIG_FILE=recipes/b200-fp8/glm5.yaml:zip_override_8k1k_lowlat[5]" decode: num-worker: 8 tp: 8 @@ -11109,7 +11109,7 @@ glm5-fp4-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml" + - "CONFIG_FILE=recipes/gb300-fp4/glm5.yaml:zip_override_8k1k_hightpt[0]" decode: num-worker: 1 tp: 32 @@ -11123,7 +11123,7 @@ glm5-fp4-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml" + - "CONFIG_FILE=recipes/gb300-fp4/glm5.yaml:zip_override_8k1k_hightpt[1]" decode: num-worker: 1 tp: 32 @@ -11137,7 +11137,7 @@ glm5-fp4-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml" + - "CONFIG_FILE=recipes/gb300-fp4/glm5.yaml:zip_override_8k1k_hightpt[2]" decode: num-worker: 1 tp: 32 @@ -11229,7 +11229,7 @@ glm5-fp4-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml" + - "CONFIG_FILE=recipes/gb300-fp4/glm5.yaml:zip_override_1k1k_hightpt[0]" decode: num-worker: 1 tp: 32 @@ -11243,7 +11243,7 @@ glm5-fp4-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml" + - "CONFIG_FILE=recipes/gb300-fp4/glm5.yaml:zip_override_1k1k_hightpt[1]" decode: num-worker: 1 tp: 32 @@ -11257,7 +11257,7 @@ glm5-fp4-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml" + - "CONFIG_FILE=recipes/gb300-fp4/glm5.yaml:zip_override_1k1k_hightpt[2]" decode: num-worker: 1 tp: 32 @@ -11275,7 +11275,7 @@ glm5-fp4-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml" + - "CONFIG_FILE=recipes/gb300-fp4/glm5.yaml:zip_override_1k1k_lowlat[0]" decode: num-worker: 17 tp: 4 @@ -11289,7 +11289,7 @@ glm5-fp4-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml" + - "CONFIG_FILE=recipes/gb300-fp4/glm5.yaml:zip_override_1k1k_lowlat[1]" decode: num-worker: 17 tp: 4 @@ -11318,7 +11318,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_8k1k_hightpt[0]" decode: num-worker: 1 tp: 16 @@ -11331,7 +11331,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_8k1k_hightpt[1]" decode: num-worker: 1 tp: 24 @@ -11344,7 +11344,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_8k1k_hightpt[2]" decode: num-worker: 1 tp: 32 @@ -11357,7 +11357,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_8k1k_hightpt[3]" decode: num-worker: 1 tp: 40 @@ -11374,7 +11374,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_8k1k_lowlat[0]" decode: num-worker: 9 tp: 4 @@ -11387,7 +11387,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_8k1k_lowlat[1]" decode: num-worker: 17 tp: 4 @@ -11400,7 +11400,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_8k1k_lowlat[2]" decode: num-worker: 17 tp: 4 @@ -11417,7 +11417,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_1k1k_hightpt[0]" decode: num-worker: 1 tp: 24 @@ -11430,7 +11430,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_1k1k_hightpt[1]" decode: num-worker: 1 tp: 32 @@ -11443,7 +11443,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_1k1k_hightpt[2]" decode: num-worker: 1 tp: 40 @@ -11456,7 +11456,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_1k1k_hightpt[3]" decode: num-worker: 1 tp: 48 @@ -11469,7 +11469,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_1k1k_hightpt[4]" decode: num-worker: 1 tp: 56 @@ -11486,7 +11486,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_1k1k_lowlat[0]" decode: num-worker: 17 tp: 4 @@ -11499,7 +11499,7 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml" + - "CONFIG_FILE=recipes/gb300-fp8/glm5.yaml:zip_override_1k1k_lowlat[1]" decode: num-worker: 17 tp: 4 diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 72de4fc2df..7fe590cb32 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -113,24 +113,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then git checkout aflowers/vllm-gb200-v0.20.0 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 - elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" || exit 1 - git checkout main - mkdir -p recipes/vllm/minimax-m2.5-b200-fp4 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4" recipes/vllm/minimax-m2.5-b200-fp4 - elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" || exit 1 - git checkout main - mkdir -p recipes/vllm/minimax-m2.5-b200-fp8 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8" recipes/vllm/minimax-m2.5-b200-fp8 elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 - git checkout sa-submission-q2-2026 - mkdir -p recipes/sglang/glm5/b200-fp8 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8" recipes/sglang/glm5/b200-fp8 + git checkout main elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 @@ -233,6 +219,7 @@ containers: dynamo-trtllm: "${SQUASH_FILE}" dynamo-sglang: "${SQUASH_FILE}" dynamo-vllm: "${SQUASH_FILE}" + sglang-v0.5.11-cu130: "${SQUASH_FILE}" "${IMAGE}": "${SQUASH_FILE}" nginx-sqsh: "${NGINX_SQUASH_FILE}" use_exclusive_sbatch_directive: true diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index bc94b22712..d8b3e3d86e 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -76,18 +76,6 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git checkout aflowers/vllm-gb200-v0.20.0 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 -elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" || exit 1 - git checkout main - mkdir -p recipes/vllm/minimax-m2.5 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300" recipes/vllm/minimax-m2.5 -elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" || exit 1 - git checkout main - mkdir -p recipes/vllm/minimax-m2.5-fp8 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8" recipes/vllm/minimax-m2.5-fp8 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && ( $PRECISION == "fp4" || $PRECISION == "fp8" ) ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index a179f7f4fc..ba5fffa83e 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -307,20 +307,6 @@ elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "qwen3.5" ]]; then cd "$SRT_REPO_DIR" mkdir -p recipes/sglang/qwen3.5 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5" recipes/sglang/qwen3.5 -elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 - cd "$SRT_REPO_DIR" || exit 1 - git checkout main || exit 1 - if [[ $PRECISION == "fp8" ]]; then - mkdir -p recipes/vllm/minimax-m2.5-gb200-fp8 || exit 1 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8" recipes/vllm/minimax-m2.5-gb200-fp8 || exit 1 - elif [[ $PRECISION == "fp4" ]]; then - mkdir -p recipes/vllm/minimax-m2.5-gb200 || exit 1 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200" recipes/vllm/minimax-m2.5-gb200 || exit 1 - else - echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2 - exit 1 - fi elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 cd "$SRT_REPO_DIR" || exit 1 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 10d1b19287..aaf48134fa 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -159,29 +159,18 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 - mkdir -p recipes/sglang/glm5 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5" recipes/sglang/glm5 + git checkout main + if [[ $PRECISION == "fp4" ]]; then + mkdir -p recipes/sglang/glm5/gb300-fp4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4" recipes/sglang/glm5/gb300-fp4 + fi elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "qwen3.5" ]]; then - # Same srt-slurm tooling as glm5: NVIDIA/srt-slurm @ sa-submission-q2-2026. - # Overlay our version-controlled Qwen3.5 recipes on top (upstream has none). + # Overlay our version-controlled Qwen3.5 recipes onto the submission branch. git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 mkdir -p recipes/sglang/qwen3.5 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5" recipes/sglang/qwen3.5 -elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout main - mkdir -p recipes/vllm/minimax-m2.5-fp8 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-fp8" recipes/vllm/minimax-m2.5-fp8 -elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout main - mkdir -p recipes/vllm/minimax-m2.5 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5" recipes/vllm/minimax-m2.5 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -265,6 +254,7 @@ model_paths: containers: dynamo-trtllm: ${SQUASH_FILE} dynamo-sglang: ${SQUASH_FILE} + v0.5.11: ${SQUASH_FILE} "${IMAGE}": ${SQUASH_FILE} nginx-sqsh: ${NGINX_SQUASH_FILE} use_segment_sbatch_directive: false