From f7fbad8cb1c10c07d0558184889898ffc70108a6 Mon Sep 17 00:00:00 2001 From: wzhao18 Date: Thu, 2 Jul 2026 19:34:04 -0700 Subject: [PATCH 1/2] Add --- .../single_node/agentic/kimik2.5_fp4_b300.sh | 111 +++++++++++++++--- configs/nvidia-master.yaml | 13 +- perf-changelog.yaml | 6 + 3 files changed, 103 insertions(+), 27 deletions(-) diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh index b99299b12..02f1cd586 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh @@ -7,7 +7,7 @@ set -x # Required env vars: # MODEL, TP, CONC, KV_OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR # -# KV_OFFLOADING=dram requires KV_OFFLOAD_BACKEND=native. +# KV_OFFLOADING=dram requires KV_OFFLOAD_BACKEND=native or KV_OFFLOAD_BACKEND=mooncake. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -37,39 +37,112 @@ install_agentic_deps # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" +MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log" mkdir -p "$RESULT_DIR" +SERVER_PID="" +MOONCAKE_MASTER_PID="" + +cleanup_agentic_services() { + local exit_code=$? + trap - EXIT INT TERM + set +e + stop_background_process_tree "$SERVER_PID" "vLLM server" 60 + stop_background_process_tree "$MOONCAKE_MASTER_PID" "Mooncake master" + exit "$exit_code" +} +trap cleanup_agentic_services EXIT +trap 'exit 130' INT +trap 'exit 143' TERM + OFFLOAD_ARGS=() -PREFIX_CACHE_ARGS=() - -if require_agentic_kv_offload_backend native; then - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS=( - --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_GB" - --disable-hybrid-kv-cache-manager - ) + +if agentic_kv_offload_enabled; then + case "$KV_OFFLOAD_BACKEND" in + native) + export VLLM_USE_SIMPLE_KV_OFFLOAD=1 + OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + mooncake) + { set +x; } 2>/dev/null + unset VLLM_USE_SIMPLE_KV_OFFLOAD + + PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) + + MOONCAKE_VERSION=0.3.11.post1 + agentic_pip_install --quiet --no-cache-dir --no-deps \ + --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION" + + MOONCAKE_MASTER_PORT=$((PORT + 12000)) + MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json" + cat > "$MOONCAKE_CONFIG_PATH" < "$MOONCAKE_MASTER_LOG" 2>&1 & + MOONCAKE_MASTER_PID=$! + sleep 2 + if ! kill -0 "$MOONCAKE_MASTER_PID" 2>/dev/null; then + echo "Mooncake master died during startup." >&2 + cat "$MOONCAKE_MASTER_LOG" >&2 + exit 1 + fi + OFFLOAD_ARGS=( + --kv-transfer-config + '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both"}' + ) + ;; + *) echo "Error: unsupported KV_OFFLOAD_BACKEND value '$KV_OFFLOAD_BACKEND' (expected one of: native, mooncake)" >&2; exit 1 ;; + esac fi echo "Starting vllm server..." export PYTHONNOUSERSITE=1 +DCP_ARGS=() +if [[ "$CONC" -ge 16 ]]; then + DCP_ARGS=(--decode-context-parallel-size "$TP") +fi + { set +x; } 2>/dev/null VLLM_CMD=( vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" - --tensor-parallel-size="$TP" - --gpu-memory-utilization 0.90 - --max-num-seqs "$CONC" - --reasoning-parser kimi_k2 - --tool-call-parser kimi_k2 - --compilation_config.pass_config.fuse_allreduce_rms true --kv-cache-dtype fp8 - --max-cudagraph-capture-size 2048 - --stream-interval 20 --trust-remote-code - "${PREFIX_CACHE_ARGS[@]}" + --block-size 64 + --language-model-only + --attention-config '{"mla_prefill_backend":"TRTLLM_RAGGED","use_prefill_query_quantization":true}' + --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + --max-cudagraph-capture-size 2048 + --max-num-batched-tokens 16384 + --stream-interval 10 + --enable-prefix-caching + --tensor-parallel-size "$TP" + "${DCP_ARGS[@]}" "${OFFLOAD_ARGS[@]}" ) printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" diff --git a/configs/nvidia-master.yaml b/configs/nvidia-master.yaml index 46154341c..c48e81bb2 100644 --- a/configs/nvidia-master.yaml +++ b/configs/nvidia-master.yaml @@ -2965,12 +2965,7 @@ dsr1-fp8-b300-sglang-mtp: - { tp: 8, ep: 1, conc-start: 1, conc-end: 512, spec-decoding: mtp } kimik2.5-fp4-b300-vllm-agentic: - # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM - # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the - # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted - # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the - # INT4 B300 sister already uses successfully. - image: vllm/vllm-openai:v0.22.0 + image: vllm/vllm-openai:nightly-09663abde0f50944a8d5ea30120666024b503faa model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: cluster:b300-nv @@ -2981,8 +2976,10 @@ kimik2.5-fp4-b300-vllm-agentic: agentic-coding: - dram-utilization: 0.80 search-space: - - { tp: 8, ep: 1, kv-offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - - { tp: 8, ep: 1, kv-offloading: dram, kv-offload-backend: native, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + - { tp: 8, ep: 1, kv-offloading: none, conc-list: [1, 2, 4] } + - { tp: 4, ep: 1, kv-offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80] } + - { tp: 4, ep: 1, kv-offloading: dram, kv-offload-backend: native, conc-list: [64, 72, 80] } + - { tp: 4, ep: 1, kv-offloading: dram, kv-offload-backend: mooncake, conc-list: [64, 72, 80] } dsr1-fp8-b200-trt: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 943f4b390..6a7ecbd8f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4440,3 +4440,9 @@ - "Update Minimax M3 b200 vllm image tag" - "Update search space to cover more configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1978 + +- config-keys: + - kimik2.5-fp4-b300-vllm-agentic + description: + - "Update kimi k2.5 agentx B300" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/ From 9a47359f035c462160b9efa52a7c67ccba568738 Mon Sep 17 00:00:00 2001 From: wzhao18 Date: Thu, 2 Jul 2026 19:34:53 -0700 Subject: [PATCH 2/2] update --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 6a7ecbd8f..3f823c520 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4445,4 +4445,4 @@ - kimik2.5-fp4-b300-vllm-agentic description: - "Update kimi k2.5 agentx B300" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/ + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1998