From 3a8a68fae01fd8ea397651b2eb2b8e1a7bc3e1a4 Mon Sep 17 00:00:00 2001 From: Albert Cheng Date: Tue, 2 Jun 2026 14:15:57 -0700 Subject: [PATCH 01/27] Add GitHub Action to collect SPEED-Bench AL matrix Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro SPEED-Bench acceptance-length matrix (thinking on/off x MTP 1-8) on self-hosted B300 runners, optionally opening a PR that updates benchmarks/speedbench-reference-al.yaml. - benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh: per (thinking, MTP) cell, serve vLLM, run SPEED-Bench, derive AL from /metrics, and emit the YAML matrix. Serves from MODEL_PATH (the local pre-staged weights resolved by the launcher), falling back to MODEL for a standalone local run. Carries a temporary --chat-template-kwargs shim until vllm-project/vllm#44244 lands in the benchmark image (idempotent, applied only for thinking-on cells). - runners/launch_b300-nv.sh: add opt-in BENCH_SCRIPT_OVERRIDE and SALLOC_TIME_LIMIT hooks; both default to the prior behavior. - .github/workflows/speedbench-al.yml: workflow_dispatch entry point; MODEL is the HF id so the launcher resolves the staged MODEL_PATH. --- .github/workflows/speedbench-al.yml | 200 +++++++++++ .../dsv4_fp4_b300_vllm_speedbench_matrix.sh | 337 ++++++++++++++++++ runners/launch_b300-nv.sh | 10 +- 3 files changed, 546 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/speedbench-al.yml create mode 100755 benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml new file mode 100644 index 0000000000..771e53e6c4 --- /dev/null +++ b/.github/workflows/speedbench-al.yml @@ -0,0 +1,200 @@ +name: SpeedBench AL Collection + +# Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro SPEED-Bench +# acceptance-length (AL) matrix: thinking_on/off x MTP levels. 
Produces the +# golden reference consumed by the synthetic-acceptance framework and (optionally) +# opens a PR updating benchmarks/speedbench-reference-al.yaml. + +on: + workflow_dispatch: + inputs: + runner: + description: "Self-hosted GPU runner label (B300)" + required: false + type: string + default: 'b300' + image: + description: "vLLM container image" + required: false + type: string + default: 'vllm/vllm-openai:v0.21.0' + mtp-list: + description: "Space-separated MTP levels (num_speculative_tokens)" + required: false + type: string + default: '1 2 3 4 5 6 7 8' + thinking-modes: + description: "Space-separated thinking modes to collect" + required: false + type: string + default: 'off on' + category: + description: "SPEED-Bench category" + required: false + type: string + default: 'coding' + output-len: + description: "Per-request output length" + required: false + type: string + default: '4096' + thinking-kwargs: + description: "chat_template_kwargs JSON for thinking-on cells (match golden config)" + required: false + type: string + default: '{"thinking": true, "reasoning_effort": "high"}' + salloc-time: + description: "Slurm allocation minutes (16 server starts ~ several hours)" + required: false + type: string + default: '480' + open-pr: + description: "Open a PR updating benchmarks/speedbench-reference-al.yaml" + required: false + type: boolean + default: true + ref: + description: "Git ref (branch/sha) to checkout" + required: false + type: string + +permissions: + contents: read + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + # Drive the dsv4 single-node path in runners/launch_b300-nv.sh. MODEL is the + # HF id; its basename (DeepSeek-V4-Pro) is in the launcher's STAGED_MODELS, so + # the launcher resolves MODEL_PATH to the pre-staged local weights and mounts + # them. The collector serves from MODEL_PATH (see SERVE_MODEL), so no download. 
+ MODEL: deepseek-ai/DeepSeek-V4-Pro + MODEL_PREFIX: dsv4 + PRECISION: fp4 + FRAMEWORK: vllm + EXP_NAME: dsv4_speedbench + IMAGE: ${{ inputs.image }} + TP: '8' + EP_SIZE: '1' + DP_ATTENTION: 'false' + SPEC_DECODING: mtp + # Run the AL-matrix collector instead of the auto-selected throughput script. + BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh + SALLOC_TIME_LIMIT: ${{ inputs.salloc-time }} + # Matrix-collector tunables (propagated into the container via srun --export=ALL). + MTP_LIST: ${{ inputs.mtp-list }} + THINKING_MODES: ${{ inputs.thinking-modes }} + CATEGORY: ${{ inputs.category }} + SPEEDBENCH_OUTPUT_LEN: ${{ inputs.output-len }} + CHAT_TEMPLATE_KWARGS_ON: ${{ inputs.thinking-kwargs }} + OUT_YAML: /workspace/speedbench-reference-al.yaml + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache + +jobs: + collect-al: + runs-on: ${{ inputs.runner }} + timeout-minutes: 600 + name: "SpeedBench AL matrix | ${{ inputs.category }} | mtp=[${{ inputs.mtp-list }}] | thinking=[${{ inputs.thinking-modes }}]" + steps: + - name: Resource cleanup (pre-run) + run: &resource-cleanup | + # Cleanup Docker resources + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + echo "[Docker] Cleaning up resources ..." + docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done + fi + + # Cleanup SLURM resources + if command -v squeue >/dev/null 2>&1; then + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + fi + + # Cleanup AL-matrix outputs from a prior job on this runner so a stale + # matrix from a previous run is never picked up as this job's output. 
+ rm -rf "${{ github.workspace }}/speedbench_results" 2>/dev/null || true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.sha }} + clean: true + submodules: true + + - name: Cleanup stale outputs (pre-run) + run: | + rm -f speedbench-reference-al.yaml || true + rm -f gpu_metrics.csv || true + rm -rf speed_bench_data || true + + - name: Collect AL matrix + env: + RUNNER_NAME: ${{ runner.name }} + run: | + set -euo pipefail + bash ./runners/launch_${RUNNER_NAME%%_*}.sh + + if [ ! -f "speedbench-reference-al.yaml" ]; then + echo "AL collection failed: speedbench-reference-al.yaml not produced." >&2 + exit 1 + fi + echo "### SpeedBench AL matrix" >> "$GITHUB_STEP_SUMMARY" + echo '```yaml' >> "$GITHUB_STEP_SUMMARY" + cat speedbench-reference-al.yaml >> "$GITHUB_STEP_SUMMARY" + echo '```' >> "$GITHUB_STEP_SUMMARY" + + - name: Upload AL matrix artifact + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: speedbench-reference-al + path: speedbench-reference-al.yaml + if-no-files-found: warn + + - name: Open PR updating reference yaml + if: ${{ inputs.open-pr && success() }} + env: + GH_TOKEN: ${{ secrets.REPO_PAT }} + run: | + set -euo pipefail + cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml + + BRANCH="speedbench-al/auto-${{ github.run_id }}" + git config user.name "github-actions" + git config user.email "github-actions@github.com" + git checkout -b "$BRANCH" + git add benchmarks/speedbench-reference-al.yaml + if git diff --cached --quiet; then + echo "No change in reference yaml; skipping PR." 
+ exit 0 + fi + git commit -m "Update SpeedBench AL reference matrix (auto, run ${{ github.run_id }})" + git push -u origin "$BRANCH" + gh pr create \ + --title "Update SpeedBench AL reference matrix (auto)" \ + --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Category: \`${{ inputs.category }}\`, MTP: \`${{ inputs.mtp-list }}\`, thinking: \`${{ inputs.thinking-modes }}\`, output_len: \`${{ inputs.output-len }}\`. Please review the measured values before merging." \ + --base main \ + --head "$BRANCH" + + - name: Upload server logs + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: speedbench_server_logs + path: speedbench_results/server_*.log + if-no-files-found: ignore + + - name: Resource cleanup (post-run) + if: always() + run: *resource-cleanup \ No newline at end of file diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh new file mode 100755 index 0000000000..572801b2c3 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh @@ -0,0 +1,337 @@ +#!/usr/bin/env bash + +# DSV4-Pro B300 vLLM SPEED-Bench AL matrix collector. +# +# Produces the golden acceptance-length (AL) reference matrix consumed by the +# synthetic-acceptance framework: for each thinking mode (on/off) and each MTP +# level (num_speculative_tokens), measure the AL on a single SPEED-Bench +# category (default: coding) and emit a YAML matrix identical in shape to +# benchmarks/speedbench-reference-al.yaml. +# +# This is the "AL distribution collection" script wired into the +# speedbench-al.yml GitHub Action (workflow_dispatch / push-button). 
+# +# Usage (inside the vLLM container, on a B300 node): +# export MODEL=/data/models/dsv4-pro +# bash benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh +# +# Tunables (env): +# MTP_LIST space-separated MTP levels (default "1 2 3 4 5 6 7 8") +# THINKING_MODES space-separated: off|on (default "off on") +# CATEGORY SPEED-Bench category (default coding) +# SPEEDBENCH_OUTPUT_LEN per-request output len (default 4096) +# OUT_YAML output matrix path (default $RESULTS_DIR/speedbench-reference-al.yaml) + +set -uo pipefail +source "$(dirname "$0")/../benchmark_lib.sh" + +MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsv4-pro)}" +# Serve from the local weights dir resolved by the launcher (MODEL_PATH points +# at the pre-staged copy, e.g. /scratch/models/DeepSeek-V4-Pro). Falls back to +# MODEL for a standalone local run where MODEL is itself a path. A leading "/" +# makes the download guard below a no-op. +SERVE_MODEL="${MODEL_PATH:-$MODEL}" +TP="${TP:-8}" +DP_ATTENTION="${DP_ATTENTION:-false}" +EP_SIZE="${EP_SIZE:-1}" +PORT="${PORT:-8888}" + +MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}" +THINKING_MODES="${THINKING_MODES:-off on}" +CATEGORY="${CATEGORY:-coding}" +SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}" +CONCURRENCY="${CONCURRENCY:-1}" +TEMPERATURE="${TEMPERATURE:-1.0}" +# thinking-on chat_template_kwargs. MUST match the production/golden config: +# the reference matrix (benchmarks/speedbench-reference-al.yaml) was measured +# with reasoning_effort=high. 
+[[ -n "${CHAT_TEMPLATE_KWARGS_ON:-}" ]] || CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}'  # not "${VAR:-{...}}": the first '}' ends the expansion and a caller-supplied value gets a stray '}' appended (invalid JSON)
+
+SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}"
+RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}"
+OUT_YAML="${OUT_YAML:-$RESULTS_DIR/speedbench-reference-al.yaml}"
+
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+mkdir -p "$RESULTS_DIR"
+nvidia-smi
+if [[ "$SERVE_MODEL" != /* ]]; then hf download "$SERVE_MODEL"; fi
+
+# ---- Download SPEED-Bench dataset ----
+echo "=== Downloading SPEED-Bench dataset ==="
+pip install -q datasets tiktoken
+curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \
+    | python3 - --config qualitative --output_dir "$SPEEDBENCH_DIR"
+
+if [[ ! -f "$SPEEDBENCH_DIR/qualitative.jsonl" ]]; then
+    echo "CRITICAL: SPEED-Bench download failed — $SPEEDBENCH_DIR/qualitative.jsonl not found"
+    exit 1
+fi
+
+# ---- Temporary shim: add a real --chat-template-kwargs CLI option ----
+# Upstream gap (until vllm-project/vllm#44244 lands): speed_bench/CustomDataset
+# pre-renders the chat template client-side WITHOUT chat_template_kwargs and
+# posts to /v1/completions, so thinking mode cannot be enabled via --extra-body
+# or --default-chat-template-kwargs. This wires a proper --chat-template-kwargs
+# option through get_samples into CustomDataset.sample's apply_chat_template.
+# TODO: delete this whole block once #44244 is released in the benchmark image;
+# the patch is idempotent (marker check) so it is safe to leave until then.
+apply_chat_template_kwargs_shim() { + echo "=== Patching vLLM benchmark to add --chat-template-kwargs (temporary shim) ===" + python3 - <<'PYEOF' +import vllm.benchmarks.serve as S +import vllm.benchmarks.datasets.datasets as D + +def patch(mod, edits, marker): + f = mod.__file__ + src = open(f).read() + if marker in src: + print("already patched:", f) + return + for old, new in edits: + n = src.count(old) + assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..." + src = src.replace(old, new, 1) + open(f, "w").write(src) + print("patched OK ->", f) + +# Edit 1: serve.py -- declare the --chat-template-kwargs argument before --extra-body +serve_old = ''' parser.add_argument( + "--extra-body",''' +serve_new = ''' parser.add_argument( + "--chat-template-kwargs", + type=json.loads, + default=None, + help="JSON dict forwarded to apply_chat_template during " + "client-side prompt rendering, e.g. to enable reasoning mode.", + ) + parser.add_argument( + "--extra-body",''' +patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"') + +# Edit 2: datasets.py -- forward args.chat_template_kwargs into the speed_bench .sample() call +disp_old = ''' output_len=args.speed_bench_output_len, + enable_multimodal_chat=args.enable_multimodal_chat,''' +disp_new = ''' output_len=args.speed_bench_output_len, + chat_template_kwargs=args.chat_template_kwargs, + enable_multimodal_chat=args.enable_multimodal_chat,''' + +# Edit 3: datasets.py -- forward chat_template_kwargs into CustomDataset.sample's template call +samp_old = ''' # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +samp_new = ''' # apply template + if not skip_chat_template: + _ctk = kwargs.get("chat_template_kwargs") or {} + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + 
add_generation_prompt=True, + tokenize=False, + **_ctk, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +patch(D, [(disp_old, disp_new), (samp_old, samp_new)], + marker="chat_template_kwargs=args.chat_template_kwargs") +PYEOF +} + +# Apply the shim once if any thinking-on cell is requested. +if [[ " $THINKING_MODES " == *" on "* ]]; then + if ! apply_chat_template_kwargs_shim; then + echo "CRITICAL: --chat-template-kwargs shim failed — aborting" + exit 1 + fi +fi + +PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1) +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") +fi +EP_ARGS=() +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi +MOE_ARGS=() +if [ "${DP_ATTENTION}" = "true" ]; then + MOE_ARGS=(--moe-backend deep_gemm_mega_moe) +fi + +fetch_metric() { + local port="$1" name="$2" + curl -s "http://localhost:${port}/metrics" \ + | grep -oP "${name}\\{[^}]*\\} \\K[0-9.]+" || echo "0" +} + +SERVER_PID="" +# List all descendant PIDs of $1 recursively, matched by PARENT pid. This can +# never include this script (the script is an ancestor of the server, not a +# descendant), so it avoids the self-kill a name-based `pkill -f vllm` caused +# (the script filename contains "vllm"). +_descendants() { + local pid="$1" child + for child in $(pgrep -P "$pid" 2>/dev/null || true); do + echo "$child" + _descendants "$child" + done +} +cleanup_server() { + if [[ -n "$SERVER_PID" ]]; then + # Snapshot the server's worker/EngineCore subprocesses BEFORE killing the + # parent: once the parent dies the children reparent to init and the tree + # link is lost. Killing the captured PIDs guarantees no orphaned worker + # survives to hold GPU memory and OOM the next server start. 
+ local descendants + descendants=$(_descendants "$SERVER_PID") + kill "$SERVER_PID" 2>/dev/null || true + wait "$SERVER_PID" 2>/dev/null || true + local pid + for pid in $descendants; do + kill -9 "$pid" 2>/dev/null || true + done + # Wait for GPU memory to actually free before the next server starts. + local waited=0 + while [[ $waited -lt 120 ]]; do + local used + used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -rn | head -1) + if [[ -z "$used" || "$used" -lt 2000 ]]; then break; fi + sleep 3; waited=$((waited + 3)) + done + SERVER_PID="" + fi +} +trap 'cleanup_server' EXIT + +start_gpu_monitor + +# Per-cell AL is collected into associative arrays keyed by "mode_mtp". +declare -A AL_RESULT + +run_cell() { + local mode="$1" mtp="$2" + local think_args=() + if [[ "$mode" == "on" ]]; then + think_args=(--chat-template-kwargs "$CHAT_TEMPLATE_KWARGS_ON") + fi + + echo "" + echo "==========================================" + echo " Cell: thinking=$mode MTP=$mtp category=$CATEGORY" + echo "==========================================" + + local serve_args=( + --host 0.0.0.0 --port "$PORT" + "${PARALLEL_ARGS[@]}" + --pipeline-parallel-size 1 + --kv-cache-dtype fp8 + --trust-remote-code + --block-size 256 + --no-enable-prefix-caching + "${EP_ARGS[@]}" + "${MOE_ARGS[@]}" + --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + --attention_config.use_fp4_indexer_cache True + --tokenizer-mode deepseek_v4 + --tool-call-parser deepseek_v4 + --enable-auto-tool-choice + --reasoning-parser deepseek_v4 + --max-cudagraph-capture-size 2048 + --max-model-len 16384 + --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $mtp}" + ) + + local server_log="$RESULTS_DIR/server_${mode}_mtp${mtp}.log" + vllm serve "$SERVE_MODEL" "${serve_args[@]}" > "$server_log" 2>&1 & + SERVER_PID=$! + + if ! 
wait_for_server_ready --port "$PORT" --server-log "$server_log" --server-pid "$SERVER_PID"; then + echo " -> server failed to start (thinking=$mode mtp=$mtp), recording N/A" + AL_RESULT["${mode}_${mtp}"]="N/A" + cleanup_server + return + fi + + local acc_before drf_before acc_after drf_after + acc_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total") + drf_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total") + + vllm bench serve \ + --model "$SERVE_MODEL" \ + --port "$PORT" \ + --dataset-name speed_bench \ + --dataset-path "$SPEEDBENCH_DIR" \ + --speed-bench-category "$CATEGORY" \ + --speed-bench-output-len "$SPEEDBENCH_OUTPUT_LEN" \ + --num-prompts -1 \ + --max-concurrency "$CONCURRENCY" \ + --save-result \ + --result-dir "$RESULTS_DIR" \ + --result-filename "speedbench_${mode}_mtp${mtp}" \ + --trust-remote-code \ + --tokenizer-mode deepseek_v4 \ + --temperature "$TEMPERATURE" \ + "${think_args[@]}" + + acc_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total") + drf_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total") + + local delta_acc delta_drf al + delta_acc=$(awk "BEGIN {printf \"%d\", $acc_after - $acc_before}") + delta_drf=$(awk "BEGIN {printf \"%d\", $drf_after - $drf_before}") + if [[ "$delta_drf" -gt 0 ]]; then + al=$(awk "BEGIN {printf \"%.2f\", 1 + ($delta_acc / $delta_drf)}") + else + al="N/A" + fi + echo " -> thinking=$mode MTP=$mtp AL=$al (accepted=$delta_acc drafts=$delta_drf)" + AL_RESULT["${mode}_${mtp}"]="$al" + + cleanup_server +} + +for mode in $THINKING_MODES; do + for mtp in $MTP_LIST; do + run_cell "$mode" "$mtp" + done +done + +stop_gpu_monitor + +# ---- Emit the YAML matrix ---- +emit_mode_block() { + local mode="$1" + for mtp in $MTP_LIST; do + echo " $mtp: ${AL_RESULT[${mode}_${mtp}]:-N/A}" + done +} + +{ + echo "# Acceptance Length (AL) reference values measured with SPEED-Bench." 
+ echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | output_len: $SPEEDBENCH_OUTPUT_LEN" + echo "# thinking_on chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_ON" + echo "# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens." + echo "# Auto-generated by dsv4_fp4_b300_vllm_speedbench_matrix.sh (speedbench-al.yml)." + echo "#" + echo "# key = num_speculative_tokens (MTP level); value = golden AL" + echo "deepseek-v4-pro:" + if [[ " $THINKING_MODES " == *" on "* ]]; then + echo " thinking_on:" + emit_mode_block on + fi + if [[ " $THINKING_MODES " == *" off "* ]]; then + echo " thinking_off:" + emit_mode_block off + fi +} > "$OUT_YAML" + +echo "" +echo "==========================================" +echo " SPEED-Bench AL matrix written to: $OUT_YAML" +echo "==========================================" +cat "$OUT_YAML" diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 67e8b48cce..e6bdf1a0da 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -334,6 +334,12 @@ else BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh" fi + # Allow callers (e.g. the speedbench-al.yml AL-collection workflow) to run a + # specific script instead of the auto-selected throughput benchmark. + if [[ -n "${BENCH_SCRIPT_OVERRIDE:-}" ]]; then + BENCH_SCRIPT="$BENCH_SCRIPT_OVERRIDE" + fi + LOCK_FILE="${SQUASH_FILE}.lock" # TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell @@ -379,7 +385,9 @@ else fi ) - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + # Default 180 min; AL-matrix collection (16 server starts) needs longer and + # overrides via SALLOC_TIME_LIMIT. 
+ salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) srun --jobid=$JOB_ID \ From bab431dac7c1813491b42f595644dc8b1cd876a7 Mon Sep 17 00:00:00 2001 From: Albert Cheng Date: Wed, 3 Jun 2026 14:05:58 -0700 Subject: [PATCH 02/27] speedbench-al: default open-pr to false (artifact-only by default) Make the workflow default to Option 1 (upload the AL matrix as an artifact for manual review/paste) rather than auto-opening a PR. The auto-PR path stays available as an opt-in (open-pr: true), but keeping it off by default avoids exposing a write-scoped PAT on the self-hosted runner and matches the repo's artifact-collection convention. --- .github/workflows/speedbench-al.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml index 771e53e6c4..bbd7a9d7b9 100644 --- a/.github/workflows/speedbench-al.yml +++ b/.github/workflows/speedbench-al.yml @@ -49,10 +49,10 @@ on: type: string default: '480' open-pr: - description: "Open a PR updating benchmarks/speedbench-reference-al.yaml" + description: "Open a PR updating benchmarks/speedbench-reference-al.yaml (default off: artifact-only, paste values in manually)" required: false type: boolean - default: true + default: false ref: description: "Git ref (branch/sha) to checkout" required: false From d595d49ab93175ff1c807a8af4ca61dd85464cc2 Mon Sep 17 00:00:00 2001 From: Albert Cheng Date: Thu, 4 Jun 2026 10:17:28 -0700 Subject: [PATCH 03/27] speedbench-al: parameterize model + relocate collector script Address review: - Model is now a workflow input (model + model-prefix, default deepseek-ai/DeepSeek-V4-Pro / dsv4). MODEL, MODEL_PREFIX, EXP_NAME, BENCH_SCRIPT_OVERRIDE, artifact names and the Create-PR branch/title/body are all derived from those inputs. 
The emitted YAML top-level key is now derived from the model (MODEL_KEY, defaults to the model basename lowercased). - Move the collector to benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh and fix its benchmark_lib.sh source path (../ -> ../../) for the deeper dir. --- .github/workflows/speedbench-al.yml | 46 ++++++++++++------- .../dsv4_fp4_b300_vllm.sh} | 13 ++++-- 2 files changed, 38 insertions(+), 21 deletions(-) rename benchmarks/single_node/{dsv4_fp4_b300_vllm_speedbench_matrix.sh => speedbench/dsv4_fp4_b300_vllm.sh} (95%) diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml index bbd7a9d7b9..ea4baea5c9 100644 --- a/.github/workflows/speedbench-al.yml +++ b/.github/workflows/speedbench-al.yml @@ -1,9 +1,10 @@ name: SpeedBench AL Collection -# Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro SPEED-Bench -# acceptance-length (AL) matrix: thinking_on/off x MTP levels. Produces the -# golden reference consumed by the synthetic-acceptance framework and (optionally) -# opens a PR updating benchmarks/speedbench-reference-al.yaml. +# Push-button (workflow_dispatch) collection of a SPEED-Bench acceptance-length +# (AL) matrix: thinking_on/off x MTP levels, for the given model (defaults to +# DeepSeek-V4-Pro). Produces the golden reference consumed by the +# synthetic-acceptance framework and (optionally) opens a PR updating +# benchmarks/speedbench-reference-al.yaml. 
on: workflow_dispatch: @@ -13,6 +14,16 @@ on: required: false type: string default: 'b300' + model: + description: "HF model id (basename must be in launcher STAGED_MODELS for pre-staged local weights)" + required: false + type: string + default: 'deepseek-ai/DeepSeek-V4-Pro' + model-prefix: + description: "Model prefix; drives launcher MODEL_PATH resolution, exp name, collector script, and artifact names" + required: false + type: string + default: 'dsv4' image: description: "vLLM container image" required: false @@ -64,22 +75,22 @@ permissions: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_CACHE: '/mnt/hf_hub_cache/' - # Drive the dsv4 single-node path in runners/launch_b300-nv.sh. MODEL is the - # HF id; its basename (DeepSeek-V4-Pro) is in the launcher's STAGED_MODELS, so + # Drive the single-node path in runners/launch_b300-nv.sh. MODEL is the HF id; + # its basename (e.g. DeepSeek-V4-Pro) must be in the launcher's STAGED_MODELS so # the launcher resolves MODEL_PATH to the pre-staged local weights and mounts # them. The collector serves from MODEL_PATH (see SERVE_MODEL), so no download. - MODEL: deepseek-ai/DeepSeek-V4-Pro - MODEL_PREFIX: dsv4 + MODEL: ${{ inputs.model }} + MODEL_PREFIX: ${{ inputs.model-prefix }} PRECISION: fp4 FRAMEWORK: vllm - EXP_NAME: dsv4_speedbench + EXP_NAME: ${{ inputs.model-prefix }}_speedbench IMAGE: ${{ inputs.image }} TP: '8' EP_SIZE: '1' DP_ATTENTION: 'false' SPEC_DECODING: mtp # Run the AL-matrix collector instead of the auto-selected throughput script. - BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh + BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/speedbench/${{ inputs.model-prefix }}_fp4_b300_vllm.sh SALLOC_TIME_LIMIT: ${{ inputs.salloc-time }} # Matrix-collector tunables (propagated into the container via srun --export=ALL). 
MTP_LIST: ${{ inputs.mtp-list }} @@ -158,7 +169,7 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: speedbench-reference-al + name: speedbench-reference-al-${{ inputs.model-prefix }} path: speedbench-reference-al.yaml if-no-files-found: warn @@ -168,9 +179,12 @@ jobs: GH_TOKEN: ${{ secrets.REPO_PAT }} run: | set -euo pipefail + # NOTE: the reference yaml is keyed by model at the top level. This + # overwrites it with the current model's matrix; when more than one + # model is collected, replace this cp with a per-model-key YAML merge. cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml - BRANCH="speedbench-al/auto-${{ github.run_id }}" + BRANCH="speedbench-al/${{ inputs.model-prefix }}-auto-${{ github.run_id }}" git config user.name "github-actions" git config user.email "github-actions@github.com" git checkout -b "$BRANCH" @@ -179,11 +193,11 @@ jobs: echo "No change in reference yaml; skipping PR." exit 0 fi - git commit -m "Update SpeedBench AL reference matrix (auto, run ${{ github.run_id }})" + git commit -m "Update SpeedBench AL reference matrix for ${{ inputs.model }} (auto, run ${{ github.run_id }})" git push -u origin "$BRANCH" gh pr create \ - --title "Update SpeedBench AL reference matrix (auto)" \ - --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Category: \`${{ inputs.category }}\`, MTP: \`${{ inputs.mtp-list }}\`, thinking: \`${{ inputs.thinking-modes }}\`, output_len: \`${{ inputs.output-len }}\`. Please review the measured values before merging." \ + --title "Update SpeedBench AL reference matrix for ${{ inputs.model-prefix }} (auto)" \ + --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Model: \`${{ inputs.model }}\`, category: \`${{ inputs.category }}\`, MTP: \`${{ inputs.mtp-list }}\`, thinking: \`${{ inputs.thinking-modes }}\`, output_len: \`${{ inputs.output-len }}\`. 
Please review the measured values before merging." \ --base main \ --head "$BRANCH" @@ -191,7 +205,7 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: speedbench_server_logs + name: speedbench_server_logs-${{ inputs.model-prefix }} path: speedbench_results/server_*.log if-no-files-found: ignore diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh similarity index 95% rename from benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh rename to benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh index 572801b2c3..7e39c32b3c 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh +++ b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh @@ -13,7 +13,7 @@ # # Usage (inside the vLLM container, on a B300 node): # export MODEL=/data/models/dsv4-pro -# bash benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh +# bash benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh # # Tunables (env): # MTP_LIST space-separated MTP levels (default "1 2 3 4 5 6 7 8") @@ -23,7 +23,7 @@ # OUT_YAML output matrix path (default $RESULTS_DIR/speedbench-reference-al.yaml) set -uo pipefail -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsv4-pro)}" # Serve from the local weights dir resolved by the launcher (MODEL_PATH points @@ -39,6 +39,9 @@ PORT="${PORT:-8888}" MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}" THINKING_MODES="${THINKING_MODES:-off on}" CATEGORY="${CATEGORY:-coding}" +# Top-level key in the emitted YAML matrix. Derived from the model by the +# workflow (e.g. deepseek-v4-pro); falls back to the model basename, lowercased. 
+MODEL_KEY="${MODEL_KEY:-$(basename "$SERVE_MODEL" | tr '[:upper:]' '[:lower:]')}" SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}" CONCURRENCY="${CONCURRENCY:-1}" TEMPERATURE="${TEMPERATURE:-1.0}" @@ -315,11 +318,11 @@ emit_mode_block() { echo "# Acceptance Length (AL) reference values measured with SPEED-Bench." echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | output_len: $SPEEDBENCH_OUTPUT_LEN" echo "# thinking_on chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_ON" - echo "# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens." - echo "# Auto-generated by dsv4_fp4_b300_vllm_speedbench_matrix.sh (speedbench-al.yml)." + echo "# Measured on $MODEL_KEY (B300, vLLM MTP), per num_speculative_tokens." + echo "# Auto-generated by benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh (speedbench-al.yml)." echo "#" echo "# key = num_speculative_tokens (MTP level); value = golden AL" - echo "deepseek-v4-pro:" + echo "${MODEL_KEY}:" if [[ " $THINKING_MODES " == *" on "* ]]; then echo " thinking_on:" emit_mode_block on From b2dd50adb1e542c2c933a0cb90b4e1dbe9032196 Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:36:10 -0400 Subject: [PATCH 04/27] feat: add SpeedBench AL eval validation --- benchmarks/benchmark_lib.sh | 214 +++++++++++++ .../fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh | 1 + .../fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh | 1 + utils/collect_eval_results.py | 88 +++++- utils/evals/EVALS.md | 6 +- utils/evals/speedbench_al.py | 298 ++++++++++++++++++ utils/evals/test_speedbench_al.py | 122 +++++++ utils/evals/validate_scores.py | 37 +++ 8 files changed, 752 insertions(+), 15 deletions(-) create mode 100644 utils/evals/speedbench_al.py create mode 100644 utils/evals/test_speedbench_al.py diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e3080b4bfa..7b277cd28e 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -694,6 +694,219 @@ 
setup_eval_context() { export EVAL_MAX_MODEL_LEN } +# ------------------------------ +# SpeedBench acceptance-length eval helpers +# ------------------------------ + +_prometheus_metric_sum() { + local port="$1" + local name="$2" + local metrics + metrics=$(curl -fsS "http://0.0.0.0:${port}/metrics" 2>/dev/null) || return 1 + awk -v name="$name" ' + /^#/ { next } + { + metric = $1 + sub(/\{.*/, "", metric) + if (metric == name && $NF ~ /^-?([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][-+]?[0-9]+)?$/) { + sum += $NF + found = 1 + } + } + END { + if (found) { + printf "%.10f\n", sum + } else { + exit 1 + } + } + ' <<< "$metrics" +} + +_speedbench_write_eval_result() { + local output="$1" + local mode="$2" + local mtp="$3" + local al="${4:-}" + local accepted="${5:-}" + local drafts="${6:-}" + local error="${7:-}" + local speedbench_model="${MODEL_NAME:-${MODEL:-}}" + + local record_cmd=( + python3 "$(pwd)/utils/evals/speedbench_al.py" + record + --output "$output" + --reference-yaml "benchmarks/speedbench-reference-al.yaml" + --model "$speedbench_model" + --model-prefix "${MODEL_PREFIX:-}" + --thinking-mode "$mode" + --num-speculative-tokens "$mtp" + --category "coding" + --output-len "4096" + --temperature "1.0" + --threshold-ratio "0.90" + ) + if [[ -n "$al" ]]; then + record_cmd+=(--acceptance-length "$al") + fi + if [[ -n "$accepted" ]]; then + record_cmd+=(--accepted-tokens "$accepted") + fi + if [[ -n "$drafts" ]]; then + record_cmd+=(--draft-tokens "$drafts") + fi + if [[ -n "$error" ]]; then + record_cmd+=(--error "$error") + fi + "${record_cmd[@]}" || true +} + +_speedbench_reference_available() { + local mode="$1" + local mtp="$2" + local reference="benchmarks/speedbench-reference-al.yaml" + local speedbench_model="${MODEL_NAME:-${MODEL:-}}" + [[ -f "$reference" ]] || return 1 + python3 "$(pwd)/utils/evals/speedbench_al.py" resolve \ + --reference-yaml "$reference" \ + --model "$speedbench_model" \ + --model-prefix "${MODEL_PREFIX:-}" \ + --thinking-mode "$mode" \ + 
--num-speculative-tokens "$mtp" \ + --threshold-ratio "0.90" >/dev/null +} + +_speedbench_prepare_dataset() { + local speedbench_dir="$1" + if [[ -f "$speedbench_dir/qualitative.jsonl" ]]; then + return 0 + fi + mkdir -p "$speedbench_dir" + python3 -m pip install -q datasets tiktoken + curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \ + | python3 - --config qualitative --output_dir "$speedbench_dir" + [[ -f "$speedbench_dir/qualitative.jsonl" ]] +} + +run_speedbench_al_eval() { + local port="${PORT:-8888}" + while [[ $# -gt 0 ]]; do + case $1 in + --port) port="$2"; shift 2 ;; + *) + if [[ $# -gt 1 && "$2" != --* ]]; then + shift 2 + else + shift + fi + ;; + esac + done + + local mtp="${SPEEDBENCH_NUM_SPEC_TOKENS:-${NUM_SPEC_TOKENS:-${SPECULATIVE_DRAFT_TOKENS:-2}}}" + local default_thinking_mode="off" + if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then + default_thinking_mode="on" + fi + local mode="$default_thinking_mode" + + if [[ "${SPEC_DECODING:-none}" != "mtp" ]]; then + echo "SpeedBench AL eval: skipping non-MTP config (SPEC_DECODING=${SPEC_DECODING:-none})" + return 0 + fi + + if [[ -z "${EVAL_RESULT_DIR:-}" ]]; then + EVAL_RESULT_DIR="$(mktemp -d /tmp/eval_out-XXXXXX)" + export EVAL_RESULT_DIR + fi + + # TODO: Add unified support for SGLang, TRT-LLM, and disagg (Dynamo). + if ! command -v vllm >/dev/null 2>&1; then + local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json" + echo "SpeedBench AL eval: vllm CLI is not available for SpeedBench client" >&2 + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "vllm CLI is not available for SpeedBench client" + return 0 + fi + + local speedbench_dir="${SPEEDBENCH_DIR:-$(pwd)/speed_bench_data}" + if ! 
_speedbench_prepare_dataset "$speedbench_dir"; then + local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json" + echo "SpeedBench AL eval: SpeedBench dataset download failed" >&2 + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "SpeedBench dataset download failed" + return 0 + fi + + local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json" + if ! _speedbench_reference_available "$mode" "$mtp"; then + echo "SpeedBench AL eval: no reference for mode=${mode} mtp=${mtp}" >&2 + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "No SpeedBench AL reference for this eval cell" + return 0 + fi + + local think_args=() + if [[ "$mode" == "on" ]]; then + think_args=(--chat-template-kwargs '{"thinking": true, "reasoning_effort": "high"}') + fi + + local accepted_before="" draft_before="" + accepted_before=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_accepted_tokens_total" 2>/dev/null || true) + draft_before=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_drafts_total" 2>/dev/null || true) + accepted_before="${accepted_before:-0}" + draft_before="${draft_before:-0}" + + local raw_result_dir + raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)" + local bench_rc=0 + local speedbench_model="${MODEL_NAME:-${MODEL:-}}" + local bench_cmd=( + vllm bench serve + --model "$speedbench_model" + --port "$port" + --dataset-name speed_bench + --dataset-path "$speedbench_dir" + --speed-bench-category coding + --speed-bench-output-len 4096 + --num-prompts -1 + --max-concurrency 1 + --save-result + --result-dir "$raw_result_dir" + --result-filename "speedbench_al_${mode}_mtp${mtp}" + --trust-remote-code + --tokenizer-mode deepseek_v4 + --temperature 1.0 + "${think_args[@]}" + ) + + echo "SpeedBench AL eval: running mode=${mode} mtp=${mtp}" + "${bench_cmd[@]}" || bench_rc=$? 
+ if [[ "$bench_rc" -ne 0 ]]; then + echo "SpeedBench AL eval: vllm bench serve failed with exit code ${bench_rc}" >&2 + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "vllm bench serve failed with exit code ${bench_rc}" + rm -rf "$raw_result_dir" || true + return 0 + fi + + local accepted_after="" draft_after="" al="" delta_acc="" delta_draft="" + accepted_after=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_accepted_tokens_total" 2>/dev/null || true) + draft_after=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_drafts_total" 2>/dev/null || true) + if [[ -n "$accepted_after" && -n "$draft_after" ]]; then + delta_acc=$(awk "BEGIN {printf \"%d\", ${accepted_after} - ${accepted_before}}") + delta_draft=$(awk "BEGIN {printf \"%d\", ${draft_after} - ${draft_before}}") + if [[ "$delta_draft" -gt 0 ]]; then + al=$(awk "BEGIN {printf \"%.4f\", 1 + (${delta_acc} / ${delta_draft})}") + fi + fi + + if [[ -z "$al" ]]; then + echo "SpeedBench AL eval: could not collect speculative acceptance metrics from server" >&2 + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "$delta_acc" "$delta_draft" "Could not collect speculative acceptance metrics from server" + else + _speedbench_write_eval_result "$output" "$mode" "$mtp" "$al" "$delta_acc" "$delta_draft" + fi + rm -rf "$raw_result_dir" || true +} + run_lm_eval() { local port="${PORT:-8888}" local tasks_dir="${EVAL_TASKS_DIR:-utils/evals/gsm8k.yaml}" @@ -876,6 +1089,7 @@ run_eval() { fi local eval_rc=0 + run_speedbench_al_eval "${forwarded[@]}" || true case "$framework" in lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? 
;; *) echo "Unknown framework '${framework}'"; eval_rc=1 ;; diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh index 6846223e8e..0f4eeb8600 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh @@ -65,6 +65,7 @@ fi # use 2 speculative tokens for all configs for now NUM_SPEC_TOKENS=2 +export SPEEDBENCH_NUM_SPEC_TOKENS="$NUM_SPEC_TOKENS" # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh index a5e7dd28cb..c2a3741250 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh @@ -66,6 +66,7 @@ fi # use 2 speculative tokens for all configs for now NUM_SPEC_TOKENS=2 +export SPEEDBENCH_NUM_SPEC_TOKENS="$NUM_SPEC_TOKENS" start_gpu_monitor diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 18917447ec..f4bca741f6 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -36,8 +36,8 @@ def find_eval_sets(root: Path) -> List[Path]: return out -def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: - """Return (lm_eval_json) if present. +def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], List[Path]]: + """Return (lm_eval_json, speedbench_al_jsons) if present. Checks immediate directory for result JSONs. 
""" @@ -46,7 +46,7 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: ] lm_path = None - le_path = None + speedbench_paths: List[Path] = [] for p in immediate_jsons: data = load_json(p) @@ -57,8 +57,12 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]: # lm-eval harness - pick latest if multiple if lm_path is None or p.stat().st_mtime > lm_path.stat().st_mtime: lm_path = p - - return lm_path, le_path + + if 'speedbench_al_eval_version' in data: + speedbench_paths.append(p) + + speedbench_paths.sort() + return lm_path, speedbench_paths def extract_lm_metrics(json_path: Path) -> List[Dict[str, Any]]: @@ -145,6 +149,32 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]: return extracted +def extract_speedbench_al_metrics(json_path: Path) -> List[Dict[str, Any]]: + """Extract a compact SpeedBench AL result as an eval metric row.""" + data = load_json(json_path) or {} + if 'speedbench_al_eval_version' not in data: + return [] + + mode = data.get('thinking_mode', 'unknown') + mtp = data.get('num_speculative_tokens', 'unknown') + task_label = f"speedbench_al/{mode}/mtp{mtp}" + return [{ + 'metric_type': 'speedbench_al', + 'task': 'speedbench_al', + 'task_label': task_label, + 'acceptance_length': data.get('acceptance_length'), + 'reference_acceptance_length': data.get('reference_acceptance_length'), + 'min_acceptance_length': data.get('min_acceptance_length'), + 'threshold_ratio': data.get('threshold_ratio'), + 'thinking_mode': mode, + 'num_speculative_tokens': mtp, + 'passed': data.get('passed'), + 'error': data.get('error'), + 'model': data.get('model'), + 'source': str(json_path), + }] + + def pct(x: Any) -> str: """Format value as percentage.""" try: @@ -222,7 +252,7 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: 'dp_attention': str(dp_attention).lower(), 'prefill_dp_attention': str(prefill_dp_attention).lower(), 'decode_dp_attention': 
str(decode_dp_attention).lower(), - 'task': m.get('task', 'unknown'), + 'task': m.get('task_label') or m.get('task', 'unknown'), 'em_strict': m.get('strict'), 'em_strict_se': m.get('strict_se'), 'em_flexible': m.get('flex'), @@ -232,7 +262,18 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: } # Add universal score field (primary metric for unified comparison) - if m.get('strict') is not None: + if m.get('metric_type') == 'speedbench_al': + row['score'] = m.get('acceptance_length') + row['score_name'] = 'acceptance_length' + row['score_se'] = None + row['speedbench_reference_acceptance_length'] = m.get('reference_acceptance_length') + row['speedbench_min_acceptance_length'] = m.get('min_acceptance_length') + row['speedbench_threshold_ratio'] = m.get('threshold_ratio') + row['speedbench_thinking_mode'] = m.get('thinking_mode') + row['speedbench_num_speculative_tokens'] = m.get('num_speculative_tokens') + row['speedbench_passed'] = m.get('passed') + row['speedbench_error'] = m.get('error') + elif m.get('strict') is not None: row['score'] = m.get('strict') row['score_name'] = 'em_strict' row['score_se'] = m.get('strict_se') @@ -248,6 +289,24 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: return row +def score_cell(r: Dict[str, Any]) -> str: + """Format the primary score for lm-eval and non-percentage eval rows.""" + if r.get('score_name') == 'acceptance_length': + score = r.get('score') + minimum = r.get('speedbench_min_acceptance_length') + passed = r.get('speedbench_passed') + if score is None: + return 'FAIL' + try: + status = 'PASS' if passed else 'FAIL' + if minimum is None: + return f"{float(score):.2f} ({status})" + return f"{float(score):.2f} >= {float(minimum):.2f} ({status})" + except Exception: + return str(score) + return f"{pct(r['score'])}{se(r['score_se'])}" + + def main(): if len(sys.argv) < 3: print('Usage: collect_eval_results.py [sort_by: model_prefix|hw]') @@ -259,13 +318,14 @@ def main(): 
rows: List[Dict[str, Any]] = [] for d in find_eval_sets(root): meta = load_json(d / 'meta_env.json') or {} - lm_path, le_path = detect_eval_jsons(d) + lm_path, speedbench_paths = detect_eval_jsons(d) - # Extract metrics (prefer lm-eval) - returns list for multi-task support + metrics_list: List[Dict[str, Any]] = [] + # Extract metrics - lm-eval returns one row per task. if lm_path: - metrics_list = extract_lm_metrics(lm_path) - else: - continue + metrics_list.extend(extract_lm_metrics(lm_path)) + for speedbench_path in speedbench_paths: + metrics_list.extend(extract_speedbench_al_metrics(speedbench_path)) if not metrics_list: continue @@ -332,7 +392,7 @@ def main(): r['conc'], r['dp_attention'], r['task'], - f"{pct(r['score'])}{se(r['score_se'])}", + score_cell(r), f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", r['n_eff'] or '', @@ -367,7 +427,7 @@ def main(): r['decode_num_workers'], r['conc'], r['task'], - f"{pct(r['score'])}{se(r['score_se'])}", + score_cell(r), f"{pct(r['em_strict'])}{se(r['em_strict_se'])}", f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}", r['n_eff'] or '', diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index a59bdb40c3..9de27e765f 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -34,6 +34,7 @@ All benchmark scripts in `benchmarks/` follow one of two flows: # 3. run_benchmark_serving (skipped automatically when EVAL_ONLY=true) # 4. Run evals: if [ "${RUN_EVAL}" = "true" ]; then + # MTP evals also run SpeedBench AL validation first when a reference exists. 
run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary # Writes meta_env.json and moves artifacts fi @@ -51,6 +52,7 @@ Key eval functions in `benchmarks/benchmark_lib.sh`: | Function | Description | |----------|-------------| | `run_eval` | Unified entrypoint - dispatches to framework-specific runner | +| `run_speedbench_al_eval` | Runs SpeedBench on MTP eval jobs, records measured acceptance length, and defers threshold failure to `validate_scores.py` | | `run_lm_eval` | Runs lm-eval harness against the OpenAI-compatible endpoint | | `append_lm_eval_summary` | Writes `meta_env.json` and moves eval artifacts to workspace | | `_install_lm_eval_deps` | Installs lm-eval dependencies | @@ -131,9 +133,11 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results | | `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by `compute_eval_context_length`) | | `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval | +| `SPEEDBENCH_DIR` | `$(pwd)/speed_bench_data` | Prepared SpeedBench dataset directory; resolves to `/workspace/speed_bench_data` or `/ix/speed_bench_data` through the runner's container workdir | +| `SPEEDBENCH_NUM_SPEC_TOKENS` | script-provided or `2` | MTP level used to select the reference AL row | ### Score validation -`utils/evals/validate_scores.py` checks eval results against thresholds in `utils/evals/thresholds.json`. Runs as a separate workflow step after artifact upload so results are preserved even if validation fails. +`utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails. 
### Adding a new eval task diff --git a/utils/evals/speedbench_al.py b/utils/evals/speedbench_al.py new file mode 100644 index 0000000000..4a5fd5d6d0 --- /dev/null +++ b/utils/evals/speedbench_al.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +"""SpeedBench acceptance-length reference and result helpers.""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Any + + +MODEL_PREFIX_ALIASES = { + "dsv4": "deepseek-v4-pro", +} + + +def _parse_scalar(value: str) -> Any: + value = value.strip() + if value in {"", "null", "None", "~"}: + return None + if value in {"N/A", "NA", "n/a", "na"}: + return None + if (value.startswith('"') and value.endswith('"')) or ( + value.startswith("'") and value.endswith("'") + ): + return value[1:-1] + try: + if re.match(r"^-?\d+$", value): + return int(value) + return float(value) + except ValueError: + return value + + +def _load_simple_reference_yaml(path: Path) -> dict[str, Any]: + """Parse the simple nested mapping emitted by the SpeedBench AL workflow.""" + data: dict[str, Any] = {} + current_model: str | None = None + current_mode: str | None = None + + for raw_line in path.read_text().splitlines(): + line = raw_line.split("#", 1)[0].rstrip() + if not line.strip(): + continue + indent = len(raw_line) - len(raw_line.lstrip(" ")) + if ":" not in line: + continue + key, value = line.strip().split(":", 1) + key = key.strip().strip("'\"") + value = value.strip() + + if indent == 0: + current_model = key + data.setdefault(current_model, {}) + current_mode = None + elif indent == 2 and current_model is not None: + current_mode = key + data[current_model].setdefault(current_mode, {}) + elif indent == 4 and current_model is not None and current_mode is not None: + data[current_model][current_mode][key] = _parse_scalar(value) + + return data + + +def load_reference(path: Path) -> dict[str, Any]: + try: + import yaml # type: ignore + except ImportError: + 
return _load_simple_reference_yaml(path) + + with path.open() as f: + loaded = yaml.safe_load(f) or {} + if not isinstance(loaded, dict): + raise ValueError(f"{path} must contain a mapping at the top level") + return loaded + + +def normalize_key(value: str) -> str: + value = value.strip().split("/")[-1].lower() + value = value.replace("_", "-") + value = re.sub(r"[^a-z0-9.+-]+", "-", value) + return value.strip("-") + + +def model_candidates(model: str, model_prefix: str | None = None) -> list[str]: + candidates: list[str] = [] + if model_prefix: + prefix = normalize_key(model_prefix) + candidates.append(MODEL_PREFIX_ALIASES.get(prefix, prefix)) + if model: + normalized = normalize_key(model) + candidates.append(MODEL_PREFIX_ALIASES.get(normalized, normalized)) + seen = set() + out = [] + for candidate in candidates: + if candidate and candidate not in seen: + out.append(candidate) + seen.add(candidate) + return out + + +def normalize_mode(thinking_mode: str) -> str: + mode = thinking_mode.strip().lower().replace("-", "_") + if mode == "on": + return "thinking_on" + if mode == "off": + return "thinking_off" + raise ValueError("SpeedBench thinking mode must be 'on' or 'off'") + + +def lookup_reference( + reference: dict[str, Any], + model: str, + model_prefix: str | None, + thinking_mode: str, + num_speculative_tokens: int, +) -> tuple[str, str, float]: + normalized_reference = {normalize_key(str(k)): v for k, v in reference.items()} + mode_key = normalize_mode(thinking_mode) + token_key = str(num_speculative_tokens) + + for candidate in model_candidates(model, model_prefix): + model_block = normalized_reference.get(candidate) + if not isinstance(model_block, dict): + continue + mode_block = model_block.get(mode_key) + if not isinstance(mode_block, dict): + continue + value = mode_block.get(num_speculative_tokens, mode_block.get(token_key)) + if value is None: + continue + try: + return candidate, mode_key, float(value) + except (TypeError, ValueError): + continue 
+ + candidates = ", ".join(model_candidates(model, model_prefix)) or "" + raise KeyError( + "No SpeedBench AL reference for " + f"model candidates [{candidates}], mode {mode_key}, MTP {num_speculative_tokens}" + ) + + +def _optional_float(value: str | None) -> float | None: + if value in {None, "", "None", "null", "N/A"}: + return None + return float(value) + + +def _optional_int(value: str | None) -> int | None: + if value in {None, "", "None", "null", "N/A"}: + return None + return int(float(value)) + + +def build_result(args: argparse.Namespace) -> dict[str, Any]: + reference_al: float | None = None + min_acceptance_length: float | None = None + model_key: str | None = None + mode_key = normalize_mode(args.thinking_mode) + error: str | None = args.error + + if args.reference_yaml: + reference_path = Path(args.reference_yaml) + if reference_path.exists(): + try: + model_key, mode_key, reference_al = lookup_reference( + load_reference(reference_path), + args.model, + args.model_prefix, + args.thinking_mode, + args.num_speculative_tokens, + ) + min_acceptance_length = reference_al * args.threshold_ratio + except Exception as exc: # noqa: BLE001 - recorded for CI artifacts + error = error or str(exc) + else: + error = error or f"Reference YAML not found: {reference_path}" + + acceptance_length = _optional_float(args.acceptance_length) + accepted_tokens = _optional_int(args.accepted_tokens) + draft_tokens = _optional_int(args.draft_tokens) + passed = ( + error is None + and acceptance_length is not None + and min_acceptance_length is not None + and acceptance_length >= min_acceptance_length + ) + + result = { + "speedbench_al_eval_version": 1, + "task": "speedbench_al", + "model": args.model, + "model_key": model_key, + "model_prefix": args.model_prefix, + "thinking_mode": mode_key, + "num_speculative_tokens": args.num_speculative_tokens, + "category": args.category, + "output_len": args.output_len, + "temperature": args.temperature, + "acceptance_length": 
acceptance_length, + "accepted_tokens": accepted_tokens, + "draft_tokens": draft_tokens, + "reference_acceptance_length": reference_al, + "threshold_ratio": args.threshold_ratio, + "min_acceptance_length": min_acceptance_length, + "passed": passed, + } + if error: + result["error"] = error + return result + + +def cmd_resolve(args: argparse.Namespace) -> int: + model_key, mode_key, reference_al = lookup_reference( + load_reference(Path(args.reference_yaml)), + args.model, + args.model_prefix, + args.thinking_mode, + args.num_speculative_tokens, + ) + payload = { + "model_key": model_key, + "thinking_mode": mode_key, + "num_speculative_tokens": args.num_speculative_tokens, + "reference_acceptance_length": reference_al, + "threshold_ratio": args.threshold_ratio, + "min_acceptance_length": reference_al * args.threshold_ratio, + } + print(json.dumps(payload, sort_keys=True)) + return 0 + + +def cmd_record(args: argparse.Namespace) -> int: + result = build_result(args) + output = Path(args.output) + output.write_text(json.dumps(result, indent=2, sort_keys=True) + "\n") + status = "PASS" if result["passed"] else "FAIL" + actual = result.get("acceptance_length") + minimum = result.get("min_acceptance_length") + print( + f"{status}: SpeedBench AL {actual} " + f"(min {minimum}, mode {result['thinking_mode']}, " + f"mtp {result['num_speculative_tokens']})" + ) + if args.exit_status and not result["passed"]: + return 1 + return 0 + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers(dest="command", required=True) + + resolve = subparsers.add_parser("resolve", help="Resolve a reference AL cell") + resolve.add_argument("--reference-yaml", required=True) + resolve.add_argument("--model", required=True) + resolve.add_argument("--model-prefix", default="") + resolve.add_argument("--thinking-mode", required=True) + resolve.add_argument("--num-speculative-tokens", type=int, required=True) + 
resolve.add_argument("--threshold-ratio", type=float, default=0.90) + resolve.set_defaults(func=cmd_resolve) + + record = subparsers.add_parser("record", help="Write a compact AL eval result") + record.add_argument("--output", required=True) + record.add_argument("--reference-yaml", default="") + record.add_argument("--model", required=True) + record.add_argument("--model-prefix", default="") + record.add_argument("--thinking-mode", required=True) + record.add_argument("--num-speculative-tokens", type=int, required=True) + record.add_argument("--category", default="coding") + record.add_argument("--output-len", type=int, default=4096) + record.add_argument("--temperature", type=float, default=1.0) + record.add_argument("--threshold-ratio", type=float, default=0.90) + record.add_argument("--acceptance-length", default=None) + record.add_argument("--accepted-tokens", default=None) + record.add_argument("--draft-tokens", default=None) + record.add_argument("--error", default=None) + record.add_argument("--exit-status", action="store_true") + record.set_defaults(func=cmd_record) + + return parser + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + try: + return args.func(args) + except Exception as exc: # noqa: BLE001 - CLI should print concise failures + print(f"ERROR: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py new file mode 100644 index 0000000000..ab480cf782 --- /dev/null +++ b/utils/evals/test_speedbench_al.py @@ -0,0 +1,122 @@ +import argparse +import json +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from collect_eval_results import build_row, extract_speedbench_al_metrics, score_cell +from speedbench_al import build_result, load_reference, lookup_reference +from validate_scores import 
validate_speedbench_al + + +def test_lookup_reference_uses_model_prefix_alias(tmp_path: Path) -> None: + ref = tmp_path / "speedbench-reference-al.yaml" + ref.write_text( + """ +deepseek-v4-pro: + thinking_on: + 2: 2.75 + thinking_off: + 2: 2.40 +""" + ) + + data = load_reference(ref) + model_key, mode_key, value = lookup_reference( + data, + model="deepseek-ai/DeepSeek-V4-Pro", + model_prefix="dsv4", + thinking_mode="on", + num_speculative_tokens=2, + ) + + assert model_key == "deepseek-v4-pro" + assert mode_key == "thinking_on" + assert value == 2.75 + + +def test_build_result_records_threshold_pass(tmp_path: Path) -> None: + ref = tmp_path / "speedbench-reference-al.yaml" + ref.write_text( + """ +deepseek-v4-pro: + thinking_on: + 2: 2.50 +""" + ) + args = argparse.Namespace( + reference_yaml=str(ref), + model="deepseek-ai/DeepSeek-V4-Pro", + model_prefix="dsv4", + thinking_mode="on", + num_speculative_tokens=2, + category="coding", + output_len=4096, + temperature=1.0, + threshold_ratio=0.90, + acceptance_length="2.30", + accepted_tokens="13", + draft_tokens="10", + error=None, + ) + + result = build_result(args) + + assert result["reference_acceptance_length"] == 2.50 + assert result["min_acceptance_length"] == 2.25 + assert result["passed"] is True + + +def test_validate_speedbench_al_fails_below_minimum() -> None: + ok, checked = validate_speedbench_al( + { + "speedbench_al_eval_version": 1, + "task": "speedbench_al", + "thinking_mode": "thinking_on", + "num_speculative_tokens": 2, + "acceptance_length": 2.0, + "min_acceptance_length": 2.25, + "passed": False, + }, + "results_speedbench_al.json", + ) + + assert checked == 1 + assert ok is False + + +def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None: + result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json" + result_path.write_text( + json.dumps( + { + "speedbench_al_eval_version": 1, + "task": "speedbench_al", + "model": "deepseek-ai/DeepSeek-V4-Pro", + 
"thinking_mode": "thinking_on", + "num_speculative_tokens": 2, + "acceptance_length": 2.3, + "reference_acceptance_length": 2.5, + "min_acceptance_length": 2.25, + "threshold_ratio": 0.9, + "passed": True, + } + ) + ) + metrics = extract_speedbench_al_metrics(result_path) + row = build_row( + { + "infmax_model_prefix": "dsv4", + "hw": "b300", + "framework": "vllm", + "precision": "fp4", + "spec_decoding": "mtp", + }, + metrics[0], + ) + + assert row["task"] == "speedbench_al/thinking_on/mtp2" + assert row["score_name"] == "acceptance_length" + assert score_cell(row) == "2.30 >= 2.25 (PASS)" diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py index 85433ec4bf..c85becd06b 100644 --- a/utils/evals/validate_scores.py +++ b/utils/evals/validate_scores.py @@ -23,6 +23,35 @@ def load_thresholds(path: str) -> dict[str, float]: return json.load(f) +def validate_speedbench_al(data: dict, source: str) -> tuple[bool, int]: + """Validate a compact SpeedBench AL result JSON.""" + if "speedbench_al_eval_version" not in data: + return False, 0 + + actual = data.get("acceptance_length") + minimum = data.get("min_acceptance_length") + passed = data.get("passed") + label = ( + f"{data.get('task', 'speedbench_al')} " + f"{data.get('thinking_mode', 'unknown')} " + f"mtp{data.get('num_speculative_tokens', 'unknown')}" + ) + + if passed is True: + print(f"PASS: {label} AL = {float(actual):.4f} (>= {float(minimum):.4f})") + return True, 1 + + if isinstance(actual, (int, float)) and isinstance(minimum, (int, float)): + print( + f"FAIL: {label} AL = {actual:.4f} (< {minimum:.4f})", + file=sys.stderr, + ) + else: + error = data.get("error", "missing acceptance length or threshold") + print(f"FAIL: {label} in {source}: {error}", file=sys.stderr) + return False, 1 + + def main() -> int: parser = argparse.ArgumentParser(description="Validate eval scores") parser.add_argument( @@ -63,6 +92,14 @@ def main() -> int: for f in sorted(glob.glob(args.results_glob)): with 
open(f) as fh: data = json.load(fh) + + speedbench_ok, speedbench_checked = validate_speedbench_al(data, f) + if speedbench_checked: + checked += speedbench_checked + if not speedbench_ok: + failed = True + continue + for task, metrics in data.get("results", {}).items(): min_score = thresholds.get(task, args.min_score) for name, val in metrics.items(): From 4d72cdb0faa4e849ca7de7b19ab53cd37d8e5941 Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Fri, 5 Jun 2026 11:43:39 -0700 Subject: [PATCH 05/27] test: add SpeedBench AL reference handling --- benchmarks/speedbench-reference-al.yaml | 29 +++++++++++++++++++++++++ utils/collect_eval_results.py | 8 ++++--- utils/evals/test_speedbench_al.py | 29 ++++++++++++++++++++++++- 3 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 benchmarks/speedbench-reference-al.yaml diff --git a/benchmarks/speedbench-reference-al.yaml b/benchmarks/speedbench-reference-al.yaml new file mode 100644 index 0000000000..b3dbf441d1 --- /dev/null +++ b/benchmarks/speedbench-reference-al.yaml @@ -0,0 +1,29 @@ +# Acceptance Length (AL) reference values measured with SPEED-Bench. +# dataset: coding | temperature: 1.0 | output_len: 4096 +# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens. +# +# Two modes are reported: +# thinking_on - reasoning enabled; this is the PRODUCTION configuration and +# the golden reference used for synthetic-acceptance modeling. +# thinking_off - reasoning disabled; provided for comparison only. 
+# +# key = num_speculative_tokens (MTP level); value = golden AL +deepseek-v4-pro: + thinking_on: + 1: 1.79 + 2: 2.27 + 3: 2.47 + 4: 2.54 + 5: 2.52 + 6: 2.54 + 7: 2.54 + 8: 2.56 + thinking_off: + 1: 1.92 + 2: 2.60 + 3: 2.97 + 4: 3.04 + 5: 3.13 + 6: 3.08 + 7: 3.13 + 8: 3.12 diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index f4bca741f6..f6a8c2031a 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -41,9 +41,11 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], List[Path]]: Checks immediate directory for result JSONs. """ - immediate_jsons = list(d.glob('results*.json')) + [ - p for p in d.glob('*.json') if p.name != 'meta_env.json' - ] + immediate_jsons = sorted( + set(d.glob('results*.json')).union( + p for p in d.glob('*.json') if p.name != 'meta_env.json' + ) + ) lm_path = None speedbench_paths: List[Path] = [] diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py index ab480cf782..2665609c3e 100644 --- a/utils/evals/test_speedbench_al.py +++ b/utils/evals/test_speedbench_al.py @@ -6,7 +6,12 @@ sys.path.insert(0, str(Path(__file__).resolve().parent)) sys.path.insert(0, str(Path(__file__).resolve().parents[1])) -from collect_eval_results import build_row, extract_speedbench_al_metrics, score_cell +from collect_eval_results import ( + build_row, + detect_eval_jsons, + extract_speedbench_al_metrics, + score_cell, +) from speedbench_al import build_result, load_reference, lookup_reference from validate_scores import validate_speedbench_al @@ -120,3 +125,25 @@ def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None: assert row["task"] == "speedbench_al/thinking_on/mtp2" assert row["score_name"] == "acceptance_length" assert score_cell(row) == "2.30 >= 2.25 (PASS)" + + +def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> None: + result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json" + result_path.write_text( + 
json.dumps( + { + "speedbench_al_eval_version": 1, + "task": "speedbench_al", + "thinking_mode": "thinking_on", + "num_speculative_tokens": 2, + "acceptance_length": 2.3, + "min_acceptance_length": 2.25, + "passed": True, + } + ) + ) + + lm_path, speedbench_paths = detect_eval_jsons(tmp_path) + + assert lm_path is None + assert speedbench_paths == [result_path] From f40d6f276de716cfdc554f1604e5528629557318 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Fri, 5 Jun 2026 12:48:15 -0700 Subject: [PATCH 06/27] Add multi-framework SpeedBench AL metrics --- benchmarks/benchmark_lib.sh | 396 ++++++++++++++++-- .../multi_node/amd_utils/server_sglang.sh | 13 + .../multi_node/amd_utils/server_vllm.sh | 12 + .../single_node/dsv4_fp4_mi355x_sglang_mtp.sh | 2 + .../fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh | 1 + .../fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh | 2 + .../fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh | 1 + .../fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh | 1 + runners/launch_b300-nv.sh | 2 +- utils/collect_eval_results.py | 10 + utils/evals/EVALS.md | 6 + utils/evals/speedbench_al.py | 12 + utils/evals/test_speedbench_al.py | 18 + 13 files changed, 439 insertions(+), 37 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 7b277cd28e..0b917ddb09 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -698,29 +698,286 @@ setup_eval_context() { # SpeedBench acceptance-length eval helpers # ------------------------------ -_prometheus_metric_sum() { - local port="$1" - local name="$2" - local metrics - metrics=$(curl -fsS "http://0.0.0.0:${port}/metrics" 2>/dev/null) || return 1 +_prometheus_metric_values_from_text() { + local name="$1" awk -v name="$name" ' /^#/ { next } { metric = $1 sub(/\{.*/, "", metric) if (metric == name && $NF ~ /^-?([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][-+]?[0-9]+)?$/) { - sum += $NF + print $NF found = 1 } } END { - if (found) { - printf "%.10f\n", sum - } else { + if (!found) { + exit 1 + } + } + 
' +} + +_prometheus_metric_values_url() { + local url="$1" + local name="$2" + local metrics + metrics=$(curl -fsS --max-time "${SPEEDBENCH_METRICS_CURL_TIMEOUT:-10}" "$url" 2>/dev/null) || return 1 + _prometheus_metric_values_from_text "$name" <<< "$metrics" +} + +_prometheus_metric_sum_url() { + local url="$1" + local name="$2" + local values + values=$(_prometheus_metric_values_url "$url" "$name") || return 1 + awk ' + { sum += $1; found = 1 } + END { + if (!found) { exit 1 } + printf "%.10f\n", sum + } + ' <<< "$values" +} + +_prometheus_metric_avg_url() { + local url="$1" + local name="$2" + local values + values=$(_prometheus_metric_values_url "$url" "$name") || return 1 + awk ' + { sum += $1; count += 1 } + END { + if (count == 0) { + exit 1 + } + printf "%.10f\n", sum / count + } + ' <<< "$values" +} + +_prometheus_metric_sum() { + local port="$1" + local name="$2" + _prometheus_metric_sum_url "http://0.0.0.0:${port}/metrics" "$name" +} + +_speedbench_normalize_metrics_url() { + local endpoint="$1" + endpoint="${endpoint%,}" + endpoint="${endpoint%/}" + [[ -z "$endpoint" ]] && return 0 + + if [[ "$endpoint" =~ ^https?:// ]]; then + if [[ "$endpoint" == */metrics || "$endpoint" == */metrics\?* ]]; then + echo "$endpoint" + else + echo "${endpoint}/metrics" + fi + elif [[ "$endpoint" =~ ^[0-9]+$ ]]; then + echo "http://0.0.0.0:${endpoint}/metrics" + elif [[ "$endpoint" =~ ^:[0-9]+$ ]]; then + echo "http://0.0.0.0${endpoint}/metrics" + elif [[ "$endpoint" == */metrics || "$endpoint" == */metrics\?* ]]; then + echo "http://${endpoint}" + else + echo "http://${endpoint}/metrics" + fi +} + +_speedbench_metric_urls() { + local port="$1" + local raw="${SPEEDBENCH_DECODE_METRICS_URLS:-${SPEEDBENCH_METRICS_URLS:-}}" + local endpoint + + if [[ -n "$raw" ]]; then + for endpoint in ${raw//,/ }; do + _speedbench_normalize_metrics_url "$endpoint" + done + return 0 + fi + + raw="${SPEEDBENCH_METRICS_PORTS:-}" + if [[ -n "$raw" ]]; then + for endpoint in ${raw//,/ }; do + 
_speedbench_normalize_metrics_url "$endpoint" + done + return 0 + fi + + echo "http://0.0.0.0:${port}/metrics" +} + +_speedbench_metric_sum() { + local port="$1" + local name="$2" + local url value + local total="0" + local found=0 + + while IFS= read -r url; do + [[ -z "$url" ]] && continue + value=$(_prometheus_metric_sum_url "$url" "$name" 2>/dev/null || true) + if [[ -n "$value" ]]; then + total=$(awk -v a="$total" -v b="$value" 'BEGIN { printf "%.10f", a + b }') + found=1 + fi + done < <(_speedbench_metric_urls "$port") + + [[ "$found" -eq 1 ]] || return 1 + awk -v total="$total" 'BEGIN { printf "%.10f\n", total }' +} + +_speedbench_metric_avg() { + local port="$1" + local name="$2" + local url value + local total="0" + local count=0 + + while IFS= read -r url; do + [[ -z "$url" ]] && continue + while IFS= read -r value; do + [[ -z "$value" ]] && continue + total=$(awk -v a="$total" -v b="$value" 'BEGIN { printf "%.10f", a + b }') + count=$((count + 1)) + done < <(_prometheus_metric_values_url "$url" "$name" 2>/dev/null || true) + done < <(_speedbench_metric_urls "$port") + + [[ "$count" -gt 0 ]] || return 1 + awk -v total="$total" -v count="$count" 'BEGIN { printf "%.10f\n", total / count }' +} + +_speedbench_metric_endpoint_count() { + local port="$1" + local url count=0 + while IFS= read -r url; do + [[ -n "$url" ]] && count=$((count + 1)) + done < <(_speedbench_metric_urls "$port") + echo "$count" +} + +_speedbench_metric_delta() { + local before="$1" + local after="$2" + [[ -n "$before" && -n "$after" ]] || return 1 + awk -v before="$before" -v after="$after" ' + BEGIN { + delta = after - before + if (delta < 0) { + delta = after + } + printf "%.10f\n", delta } - ' <<< "$metrics" + ' +} + +_speedbench_round_metric() { + local value="$1" + [[ -n "$value" ]] || return 1 + awk -v value="$value" 'BEGIN { printf "%.0f\n", value }' +} + +_speedbench_metrics_framework() { + local fw="${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-vllm}}" + fw="${fw,,}" + if [[ 
"$fw" == "dynamo" ]]; then + local inner="${SPEEDBENCH_DYNAMO_BACKEND_FRAMEWORK:-${DYNAMO_BACKEND_FRAMEWORK:-${DYNAMO_BACKEND:-}}}" + [[ -n "$inner" ]] && fw="dynamo-${inner,,}" + fi + + case "$fw" in + vllm|dynamo-vllm) + echo "vllm" + ;; + sglang|dynamo-sglang) + echo "sglang" + ;; + trt|trtllm|tensorrt-llm|tensorrt_llm|dynamo-trt|dynamo-trtllm|dynamo-tensorrt-llm|dynamo-tensorrt_llm) + echo "trtllm" + ;; + *) + echo "$fw" + ;; + esac +} + +_speedbench_metric_source_base() { + local framework="$1" + local configured="${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-$framework}}" + configured="${configured,,}" + if [[ "$configured" == dynamo* ]]; then + echo "dynamo-${framework}-prometheus" + else + echo "${framework}-prometheus" + fi +} + +_speedbench_spec_counter_metric() { + local framework="$1" + local kind="$2" + case "${framework}:${kind}" in + vllm:accepted) + echo "vllm:spec_decode_num_accepted_tokens_total" + ;; + vllm:proposed) + echo "vllm:spec_decode_num_draft_tokens_total" + ;; + vllm:verify) + echo "vllm:spec_decode_num_drafts_total" + ;; + trtllm:accepted) + echo "trtllm_spec_decode_num_accepted_tokens_total" + ;; + trtllm:proposed) + echo "trtllm_spec_decode_num_draft_tokens_total" + ;; + sglang:verify) + echo "sglang:spec_verify_calls_total" + ;; + *) + return 1 + ;; + esac +} + +_speedbench_spec_gauge_metric() { + local framework="$1" + local kind="$2" + case "${framework}:${kind}" in + trtllm:acceptance_length) + echo "trtllm_spec_decode_acceptance_length" + ;; + sglang:acceptance_length) + echo "sglang:spec_accept_length" + ;; + sglang:draft_tokens_per_step) + echo "sglang:spec_num_draft_tokens" + ;; + *) + return 1 + ;; + esac +} + +_speedbench_spec_counter_sum() { + local framework="$1" + local port="$2" + local kind="$3" + local metric + metric=$(_speedbench_spec_counter_metric "$framework" "$kind") || return 1 + _speedbench_metric_sum "$port" "$metric" +} + +_speedbench_spec_gauge_avg() { + local framework="$1" + local port="$2" + local 
kind="$3" + local metric + metric=$(_speedbench_spec_gauge_metric "$framework" "$kind") || return 1 + _speedbench_metric_avg "$port" "$metric" } _speedbench_write_eval_result() { @@ -729,8 +986,11 @@ _speedbench_write_eval_result() { local mtp="$3" local al="${4:-}" local accepted="${5:-}" - local drafts="${6:-}" - local error="${7:-}" + local verify_steps="${6:-}" + local proposed_drafts="${7:-}" + local framework="${8:-${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-}}}" + local metric_source="${9:-}" + local error="${10:-}" local speedbench_model="${MODEL_NAME:-${MODEL:-}}" local record_cmd=( @@ -747,14 +1007,24 @@ _speedbench_write_eval_result() { --temperature "1.0" --threshold-ratio "0.90" ) + if [[ -n "$framework" ]]; then + record_cmd+=(--framework "$framework") + fi + if [[ -n "$metric_source" ]]; then + record_cmd+=(--metric-source "$metric_source") + fi if [[ -n "$al" ]]; then record_cmd+=(--acceptance-length "$al") fi if [[ -n "$accepted" ]]; then record_cmd+=(--accepted-tokens "$accepted") fi - if [[ -n "$drafts" ]]; then - record_cmd+=(--draft-tokens "$drafts") + if [[ -n "$verify_steps" ]]; then + record_cmd+=(--draft-tokens "$verify_steps") + record_cmd+=(--verify-steps "$verify_steps") + fi + if [[ -n "$proposed_drafts" ]]; then + record_cmd+=(--proposed-draft-tokens "$proposed_drafts") fi if [[ -n "$error" ]]; then record_cmd+=(--error "$error") @@ -821,26 +1091,40 @@ run_speedbench_al_eval() { export EVAL_RESULT_DIR fi - # TODO: Add unified support for SGLang, TRT-LLM, and disagg (Dynamo). 
+ local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json" + local metrics_framework result_framework metric_source_base metrics_endpoint_count + metrics_framework=$(_speedbench_metrics_framework) + result_framework="${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-$metrics_framework}}" + metric_source_base=$(_speedbench_metric_source_base "$metrics_framework") + metrics_endpoint_count=$(_speedbench_metric_endpoint_count "$port") + + case "$metrics_framework" in + vllm|sglang|trtllm) + ;; + *) + echo "SpeedBench AL eval: unsupported speculative metrics framework=${metrics_framework}" >&2 + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "Unsupported speculative metrics framework: ${metrics_framework}" + return 0 + ;; + esac + + echo "SpeedBench AL eval: metrics framework=${metrics_framework}, endpoints=${metrics_endpoint_count}" if ! command -v vllm >/dev/null 2>&1; then - local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json" echo "SpeedBench AL eval: vllm CLI is not available for SpeedBench client" >&2 - _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "vllm CLI is not available for SpeedBench client" + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "vllm CLI is not available for SpeedBench client" return 0 fi local speedbench_dir="${SPEEDBENCH_DIR:-$(pwd)/speed_bench_data}" if ! 
_speedbench_prepare_dataset "$speedbench_dir"; then - local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json" echo "SpeedBench AL eval: SpeedBench dataset download failed" >&2 - _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "SpeedBench dataset download failed" + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "SpeedBench dataset download failed" return 0 fi - local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json" if ! _speedbench_reference_available "$mode" "$mtp"; then echo "SpeedBench AL eval: no reference for mode=${mode} mtp=${mtp}" >&2 - _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "No SpeedBench AL reference for this eval cell" + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "No SpeedBench AL reference for this eval cell" return 0 fi @@ -849,11 +1133,13 @@ run_speedbench_al_eval() { think_args=(--chat-template-kwargs '{"thinking": true, "reasoning_effort": "high"}') fi - local accepted_before="" draft_before="" - accepted_before=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_accepted_tokens_total" 2>/dev/null || true) - draft_before=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_drafts_total" 2>/dev/null || true) + local accepted_before="" proposed_before="" verify_before="" + accepted_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true) + proposed_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "proposed" 2>/dev/null || true) + verify_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "verify" 2>/dev/null || true) accepted_before="${accepted_before:-0}" - draft_before="${draft_before:-0}" + proposed_before="${proposed_before:-0}" + verify_before="${verify_before:-0}" local raw_result_dir raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)" @@ -882,27 
+1168,65 @@ run_speedbench_al_eval() { "${bench_cmd[@]}" || bench_rc=$? if [[ "$bench_rc" -ne 0 ]]; then echo "SpeedBench AL eval: vllm bench serve failed with exit code ${bench_rc}" >&2 - _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "vllm bench serve failed with exit code ${bench_rc}" + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "vllm bench serve failed with exit code ${bench_rc}" rm -rf "$raw_result_dir" || true return 0 fi - local accepted_after="" draft_after="" al="" delta_acc="" delta_draft="" - accepted_after=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_accepted_tokens_total" 2>/dev/null || true) - draft_after=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_drafts_total" 2>/dev/null || true) - if [[ -n "$accepted_after" && -n "$draft_after" ]]; then - delta_acc=$(awk "BEGIN {printf \"%d\", ${accepted_after} - ${accepted_before}}") - delta_draft=$(awk "BEGIN {printf \"%d\", ${draft_after} - ${draft_before}}") - if [[ "$delta_draft" -gt 0 ]]; then - al=$(awk "BEGIN {printf \"%.4f\", 1 + (${delta_acc} / ${delta_draft})}") + local accepted_after="" proposed_after="" verify_after="" + local al="" delta_acc="" delta_proposed="" delta_verify="" metric_source="" + accepted_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true) + proposed_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "proposed" 2>/dev/null || true) + verify_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "verify" 2>/dev/null || true) + + if [[ -n "$accepted_after" ]]; then + delta_acc=$(_speedbench_round_metric "$(_speedbench_metric_delta "$accepted_before" "$accepted_after")") + fi + if [[ -n "$proposed_after" ]]; then + delta_proposed=$(_speedbench_round_metric "$(_speedbench_metric_delta "$proposed_before" "$proposed_after")") + fi + if [[ -n "$verify_after" ]]; then + delta_verify=$(_speedbench_round_metric 
"$(_speedbench_metric_delta "$verify_before" "$verify_after")") + fi + + if [[ "$metrics_framework" == "vllm" && -n "$delta_acc" && -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then + al=$(awk -v accepted="$delta_acc" -v verify="$delta_verify" 'BEGIN { printf "%.4f", 1 + (accepted / verify) }') + metric_source="${metric_source_base}-counters-endpoints${metrics_endpoint_count}" + elif [[ "$metrics_framework" == "trtllm" ]]; then + al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true) + if [[ -n "$al" ]]; then + metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}" + if [[ -n "$delta_acc" || -n "$delta_proposed" ]]; then + metric_source="${metric_source}+token-counters" + fi + fi + elif [[ "$metrics_framework" == "sglang" ]]; then + al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true) + if [[ -n "$al" ]]; then + metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}" + fi + if [[ -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then + local draft_depth="" + draft_depth=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "draft_tokens_per_step" 2>/dev/null || true) + if [[ -n "$draft_depth" ]]; then + delta_proposed=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')") + fi + if [[ -n "$al" ]]; then + delta_acc=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v al="$al" 'BEGIN { value = verify * (al - 1); if (value < 0) value = 0; printf "%.10f\n", value }')") + metric_source="${metric_source:-${metric_source_base}-gauge-endpoints${metrics_endpoint_count}}+derived-token-counters" + fi fi fi if [[ -z "$al" ]]; then echo "SpeedBench AL eval: could not collect speculative acceptance metrics from server" >&2 - 
_speedbench_write_eval_result "$output" "$mode" "$mtp" "" "$delta_acc" "$delta_draft" "Could not collect speculative acceptance metrics from server" + local metric_error="Could not collect speculative acceptance metrics from server" + if [[ "${FRAMEWORK:-}" == dynamo* && -z "${SPEEDBENCH_DECODE_METRICS_URLS:-}${SPEEDBENCH_METRICS_URLS:-}${SPEEDBENCH_METRICS_PORTS:-}" ]]; then + metric_error="${metric_error}; for Dynamo/disagg set SPEEDBENCH_DECODE_METRICS_URLS or SPEEDBENCH_METRICS_PORTS to decode-worker /metrics endpoints" + fi + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "$delta_acc" "$delta_verify" "$delta_proposed" "$result_framework" "$metric_source_base" "$metric_error" else - _speedbench_write_eval_result "$output" "$mode" "$mtp" "$al" "$delta_acc" "$delta_draft" + _speedbench_write_eval_result "$output" "$mode" "$mtp" "$al" "$delta_acc" "$delta_verify" "$delta_proposed" "$result_framework" "$metric_source" fi rm -rf "$raw_result_dir" || true } diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index c28ccab41a..0d307f6e4d 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -565,6 +565,19 @@ if [ "$NODE_RANK" -eq 0 ]; then export EVAL_MAX_MODEL_LEN="$prefill_context_length" fi + speedbench_decode_metric_urls="" + for i in $(seq 0 $((yD - 1))); do + decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) + speedbench_decode_metric_urls+="${speedbench_decode_metric_urls:+,}http://${IP_ARRAY[$decode_idx]}:8000/metrics" + done + if [[ -z "$speedbench_decode_metric_urls" ]]; then + speedbench_decode_metric_urls="http://${NODE0_ADDR}:8000/metrics" + fi + export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}" + if [[ "${SPEC_DECODING:-none}" == "mtp" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then + export 
SPEEDBENCH_NUM_SPEC_TOKENS="${SPEEDBENCH_NUM_SPEC_TOKENS:-$DECODE_MTP_SIZE}" + fi + if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" else diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index d61fe03592..a2bed048d0 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -344,6 +344,18 @@ if [ "$NODE_RANK" -eq 0 ]; then export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) fi + speedbench_decode_metric_urls="" + for decode_ip in ${DECODE_ARGS}; do + speedbench_decode_metric_urls+="${speedbench_decode_metric_urls:+,}http://${decode_ip}:${SERVER_PORT}/metrics" + done + if [[ -z "$speedbench_decode_metric_urls" ]]; then + speedbench_decode_metric_urls="http://${NODE0_ADDR}:${SERVER_PORT}/metrics" + fi + export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}" + if [[ "${SPEC_DECODING:-none}" == "mtp" ]]; then + export SPEEDBENCH_NUM_SPEC_TOKENS="${SPEEDBENCH_NUM_SPEC_TOKENS:-${DECODE_MTP_SIZE:-${NUM_SPEC_TOKENS:-2}}}" + fi + if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" else diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_mi355x_sglang_mtp.sh index 3addce4526..13c639c98f 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_sglang_mtp.sh @@ -166,6 +166,7 @@ SPEC_FLAGS=( --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 ) +export SPEEDBENCH_NUM_SPEC_TOKENS=3 if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS+=( --dp "$TP" @@ -178,6 +179,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then --speculative-eagle-topk 1 
--speculative-num-draft-tokens 3 ) + export SPEEDBENCH_NUM_SPEC_TOKENS=2 fi if [ "${EP_SIZE:-1}" -gt 1 ]; then PARALLEL_ARGS+=(--ep-size "$EP_SIZE") diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh index 9e5c88212b..e51d4043d8 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh @@ -57,6 +57,7 @@ EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml" MOE_BACKEND="TRTLLM" MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}" +export SPEEDBENCH_NUM_SPEC_TOKENS="$MTP" MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}" diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh index 672d48f4b3..bfb38953b8 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh @@ -88,6 +88,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 ) + export SPEEDBENCH_NUM_SPEC_TOKENS=1 PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention @@ -107,6 +108,7 @@ else --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 ) + export SPEEDBENCH_NUM_SPEC_TOKENS=3 PARALLEL_ARGS=( --moe-runner-backend flashinfer_mxfp4 --disable-flashinfer-autotune diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh index 507b96e346..e4664dcd59 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh @@ -65,6 +65,7 @@ EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml" MOE_BACKEND="TRTLLM" MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}" +export 
SPEEDBENCH_NUM_SPEC_TOKENS="$MTP" MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}" diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh index 788eff5b8b..e8d4ffde79 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh @@ -20,6 +20,7 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi SERVER_LOG="$PWD/server.log" +export SPEEDBENCH_NUM_SPEC_TOKENS=3 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index fc0ac297f0..5eaf15302a 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -2,7 +2,7 @@ # System-specific configuration for B300 NV Slurm cluster SLURM_PARTITION="batch_1" -SLURM_ACCOUNT="benchmark" +SLURM_ACCOUNT="restricted" set -x diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index f6a8c2031a..45b464329d 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -170,6 +170,11 @@ def extract_speedbench_al_metrics(json_path: Path) -> List[Dict[str, Any]]: 'threshold_ratio': data.get('threshold_ratio'), 'thinking_mode': mode, 'num_speculative_tokens': mtp, + 'speedbench_framework': data.get('framework'), + 'speedbench_metric_source': data.get('metric_source'), + 'speedbench_accepted_tokens': data.get('accepted_tokens'), + 'speedbench_verify_steps': data.get('verify_steps', data.get('draft_tokens')), + 'speedbench_proposed_draft_tokens': data.get('proposed_draft_tokens'), 'passed': data.get('passed'), 'error': data.get('error'), 'model': data.get('model'), @@ -273,6 +278,11 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: row['speedbench_threshold_ratio'] = m.get('threshold_ratio') 
row['speedbench_thinking_mode'] = m.get('thinking_mode') row['speedbench_num_speculative_tokens'] = m.get('num_speculative_tokens') + row['speedbench_framework'] = m.get('speedbench_framework') + row['speedbench_metric_source'] = m.get('speedbench_metric_source') + row['speedbench_accepted_tokens'] = m.get('speedbench_accepted_tokens') + row['speedbench_verify_steps'] = m.get('speedbench_verify_steps') + row['speedbench_proposed_draft_tokens'] = m.get('speedbench_proposed_draft_tokens') row['speedbench_passed'] = m.get('passed') row['speedbench_error'] = m.get('error') elif m.get('strict') is not None: diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 9de27e765f..65fca90183 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -135,6 +135,12 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval | | `SPEEDBENCH_DIR` | `$(pwd)/speed_bench_data` | Prepared SpeedBench dataset directory; resolves to `/workspace/speed_bench_data` or `/ix/speed_bench_data` through the runner's container workdir | | `SPEEDBENCH_NUM_SPEC_TOKENS` | script-provided or `2` | MTP level used to select the reference AL row | +| `SPEEDBENCH_METRICS_FRAMEWORK` | `FRAMEWORK` or `vllm` | Override speculative metrics parser. Supports `vllm`, `sglang`, `trtllm`/`trt`, and `dynamo-*` variants | +| `SPEEDBENCH_DECODE_METRICS_URLS` | unset | Comma/space-separated decode worker Prometheus `/metrics` URLs for disaggregated runs | +| `SPEEDBENCH_METRICS_URLS` | unset | Generic comma/space-separated Prometheus endpoints when decode-specific naming is not applicable | +| `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied | + +SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM records its acceptance-length gauge and token counters because it does not expose verify steps through Prometheus. 
SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values. ### Score validation `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails. diff --git a/utils/evals/speedbench_al.py b/utils/evals/speedbench_al.py index 4a5fd5d6d0..a9167e6565 100644 --- a/utils/evals/speedbench_al.py +++ b/utils/evals/speedbench_al.py @@ -182,6 +182,10 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]: acceptance_length = _optional_float(args.acceptance_length) accepted_tokens = _optional_int(args.accepted_tokens) draft_tokens = _optional_int(args.draft_tokens) + verify_steps = _optional_int(getattr(args, "verify_steps", None)) + proposed_draft_tokens = _optional_int(getattr(args, "proposed_draft_tokens", None)) + if verify_steps is None: + verify_steps = draft_tokens passed = ( error is None and acceptance_length is not None @@ -200,8 +204,12 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]: "category": args.category, "output_len": args.output_len, "temperature": args.temperature, + "framework": getattr(args, "framework", ""), + "metric_source": getattr(args, "metric_source", ""), "acceptance_length": acceptance_length, "accepted_tokens": accepted_tokens, + "verify_steps": verify_steps, + "proposed_draft_tokens": proposed_draft_tokens, "draft_tokens": draft_tokens, "reference_acceptance_length": reference_al, "threshold_ratio": args.threshold_ratio, @@ -274,9 +282,13 @@ def build_parser() -> argparse.ArgumentParser: record.add_argument("--output-len", type=int, default=4096) record.add_argument("--temperature", type=float, default=1.0) 
record.add_argument("--threshold-ratio", type=float, default=0.90) + record.add_argument("--framework", default="") + record.add_argument("--metric-source", default="") record.add_argument("--acceptance-length", default=None) record.add_argument("--accepted-tokens", default=None) record.add_argument("--draft-tokens", default=None) + record.add_argument("--verify-steps", default=None) + record.add_argument("--proposed-draft-tokens", default=None) record.add_argument("--error", default=None) record.add_argument("--exit-status", action="store_true") record.set_defaults(func=cmd_record) diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py index 2665609c3e..8663058c30 100644 --- a/utils/evals/test_speedbench_al.py +++ b/utils/evals/test_speedbench_al.py @@ -64,6 +64,10 @@ def test_build_result_records_threshold_pass(tmp_path: Path) -> None: acceptance_length="2.30", accepted_tokens="13", draft_tokens="10", + verify_steps="10", + proposed_draft_tokens="20", + framework="vllm", + metric_source="vllm-prometheus-counters-endpoints1", error=None, ) @@ -71,6 +75,10 @@ def test_build_result_records_threshold_pass(tmp_path: Path) -> None: assert result["reference_acceptance_length"] == 2.50 assert result["min_acceptance_length"] == 2.25 + assert result["framework"] == "vllm" + assert result["metric_source"] == "vllm-prometheus-counters-endpoints1" + assert result["verify_steps"] == 10 + assert result["proposed_draft_tokens"] == 20 assert result["passed"] is True @@ -103,6 +111,11 @@ def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None: "thinking_mode": "thinking_on", "num_speculative_tokens": 2, "acceptance_length": 2.3, + "framework": "sglang", + "metric_source": "sglang-prometheus-gauge-endpoints1+derived-token-counters", + "accepted_tokens": 13, + "verify_steps": 10, + "proposed_draft_tokens": 20, "reference_acceptance_length": 2.5, "min_acceptance_length": 2.25, "threshold_ratio": 0.9, @@ -124,6 +137,11 @@ def 
test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None: assert row["task"] == "speedbench_al/thinking_on/mtp2" assert row["score_name"] == "acceptance_length" + assert row["speedbench_framework"] == "sglang" + assert row["speedbench_metric_source"] == "sglang-prometheus-gauge-endpoints1+derived-token-counters" + assert row["speedbench_accepted_tokens"] == 13 + assert row["speedbench_verify_steps"] == 10 + assert row["speedbench_proposed_draft_tokens"] == 20 assert score_cell(row) == "2.30 >= 2.25 (PASS)" From f2aba4c32a1b62810b9972cc791c3a2d24c3c155 Mon Sep 17 00:00:00 2001 From: "Albert Cheng (Engrg-Hardware 1)" Date: Mon, 8 Jun 2026 11:12:06 -0700 Subject: [PATCH 07/27] speedbench-al: fix --chat-template-kwargs default quoting so thinking-on cells run --- benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh index 7e39c32b3c..2a77dcb361 100755 --- a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh @@ -48,7 +48,8 @@ TEMPERATURE="${TEMPERATURE:-1.0}" # thinking-on chat_template_kwargs. MUST match the production/golden config: # the reference matrix (benchmarks/speedbench-reference-al.yaml) was measured # with reasoning_effort=high. 
-CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-{\"thinking\": true, \"reasoning_effort\": \"high\"}}" +DEFAULT_CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}' +CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-$DEFAULT_CHAT_TEMPLATE_KWARGS_ON}" SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}" RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}" From c12acba250c7882bea72882857ff58304edefbc3 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 9 Jun 2026 13:25:02 -0700 Subject: [PATCH 08/27] Apply SpeedBench chat-template shim to eval helper --- benchmarks/benchmark_lib.sh | 72 +++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 0b917ddb09..e54ca6d235 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1059,6 +1059,73 @@ _speedbench_prepare_dataset() { [[ -f "$speedbench_dir/qualitative.jsonl" ]] } +_speedbench_apply_chat_template_kwargs_shim() { + echo "SpeedBench AL eval: patching vLLM benchmark --chat-template-kwargs support if needed" + python3 - <<'PYEOF' +import vllm.benchmarks.serve as S +import vllm.benchmarks.datasets.datasets as D + + +def patch(mod, edits, marker): + f = mod.__file__ + with open(f) as handle: + src = handle.read() + if marker in src: + print("already patched:", f) + return + for old, new in edits: + n = src.count(old) + assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..." + src = src.replace(old, new, 1) + with open(f, "w") as handle: + handle.write(src) + print("patched OK ->", f) + + +serve_old = ''' parser.add_argument( + "--extra-body",''' +serve_new = ''' parser.add_argument( + "--chat-template-kwargs", + type=json.loads, + default=None, + help="JSON dict forwarded to apply_chat_template during " + "client-side prompt rendering, e.g. 
to enable reasoning mode.", + ) + parser.add_argument( + "--extra-body",''' +patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"') + +disp_old = ''' output_len=args.speed_bench_output_len, + enable_multimodal_chat=args.enable_multimodal_chat,''' +disp_new = ''' output_len=args.speed_bench_output_len, + chat_template_kwargs=args.chat_template_kwargs, + enable_multimodal_chat=args.enable_multimodal_chat,''' + +samp_old = ''' # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +samp_new = ''' # apply template + if not skip_chat_template: + _ctk = kwargs.get("chat_template_kwargs") or {} + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + **_ctk, + ) + + prompt_len = len(tokenizer(prompt).input_ids)''' +patch(D, [(disp_old, disp_new), (samp_old, samp_new)], + marker="chat_template_kwargs=args.chat_template_kwargs") +PYEOF +} + run_speedbench_al_eval() { local port="${PORT:-8888}" while [[ $# -gt 0 ]]; do @@ -1130,6 +1197,11 @@ run_speedbench_al_eval() { local think_args=() if [[ "$mode" == "on" ]]; then + if ! 
_speedbench_apply_chat_template_kwargs_shim; then + echo "SpeedBench AL eval: --chat-template-kwargs shim failed" >&2 + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "--chat-template-kwargs shim failed" + return 0 + fi think_args=(--chat-template-kwargs '{"thinking": true, "reasoning_effort": "high"}') fi From fa83900b0e9e2407222dedd7f3095bcfaf98464d Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 08:17:04 -0700 Subject: [PATCH 09/27] Add native SpeedBench client fallback --- benchmarks/benchmark_lib.sh | 102 ++++++++----- utils/evals/speedbench_client.py | 242 ++++++++++++++++++++++++++++++ utils/evals/test_speedbench_al.py | 53 +++++++ 3 files changed, 361 insertions(+), 36 deletions(-) create mode 100644 utils/evals/speedbench_client.py diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e54ca6d235..54c2867b50 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1176,11 +1176,6 @@ run_speedbench_al_eval() { esac echo "SpeedBench AL eval: metrics framework=${metrics_framework}, endpoints=${metrics_endpoint_count}" - if ! command -v vllm >/dev/null 2>&1; then - echo "SpeedBench AL eval: vllm CLI is not available for SpeedBench client" >&2 - _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "vllm CLI is not available for SpeedBench client" - return 0 - fi local speedbench_dir="${SPEEDBENCH_DIR:-$(pwd)/speed_bench_data}" if ! _speedbench_prepare_dataset "$speedbench_dir"; then @@ -1195,14 +1190,23 @@ run_speedbench_al_eval() { return 0 fi + local thinking_kwargs='{"thinking": true, "reasoning_effort": "high"}' + local client="${SPEEDBENCH_CLIENT:-auto}" + local use_vllm_client=0 + if [[ "$client" != "openai" && "$client" != "native" ]] && command -v vllm >/dev/null 2>&1; then + use_vllm_client=1 + fi + local think_args=() if [[ "$mode" == "on" ]]; then - if ! 
_speedbench_apply_chat_template_kwargs_shim; then - echo "SpeedBench AL eval: --chat-template-kwargs shim failed" >&2 - _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "--chat-template-kwargs shim failed" - return 0 + if [[ "$use_vllm_client" -eq 1 ]]; then + if ! _speedbench_apply_chat_template_kwargs_shim; then + echo "SpeedBench AL eval: --chat-template-kwargs shim failed" >&2 + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "--chat-template-kwargs shim failed" + return 0 + fi + think_args=(--chat-template-kwargs "$thinking_kwargs") fi - think_args=(--chat-template-kwargs '{"thinking": true, "reasoning_effort": "high"}') fi local accepted_before="" proposed_before="" verify_before="" @@ -1213,35 +1217,62 @@ run_speedbench_al_eval() { proposed_before="${proposed_before:-0}" verify_before="${verify_before:-0}" - local raw_result_dir - raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)" local bench_rc=0 local speedbench_model="${MODEL_NAME:-${MODEL:-}}" - local bench_cmd=( - vllm bench serve - --model "$speedbench_model" - --port "$port" - --dataset-name speed_bench - --dataset-path "$speedbench_dir" - --speed-bench-category coding - --speed-bench-output-len 4096 - --num-prompts -1 - --max-concurrency 1 - --save-result - --result-dir "$raw_result_dir" - --result-filename "speedbench_al_${mode}_mtp${mtp}" - --trust-remote-code - --tokenizer-mode deepseek_v4 - --temperature 1.0 - "${think_args[@]}" - ) - echo "SpeedBench AL eval: running mode=${mode} mtp=${mtp}" - "${bench_cmd[@]}" || bench_rc=$? 
- if [[ "$bench_rc" -ne 0 ]]; then - echo "SpeedBench AL eval: vllm bench serve failed with exit code ${bench_rc}" >&2 - _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "vllm bench serve failed with exit code ${bench_rc}" + if [[ "$use_vllm_client" -eq 1 ]]; then + local raw_result_dir + raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)" + local bench_cmd=( + vllm bench serve + --model "$speedbench_model" + --port "$port" + --dataset-name speed_bench + --dataset-path "$speedbench_dir" + --speed-bench-category coding + --speed-bench-output-len 4096 + --num-prompts -1 + --max-concurrency 1 + --save-result + --result-dir "$raw_result_dir" + --result-filename "speedbench_al_${mode}_mtp${mtp}" + --trust-remote-code + --tokenizer-mode deepseek_v4 + --temperature 1.0 + "${think_args[@]}" + ) + "${bench_cmd[@]}" || bench_rc=$? rm -rf "$raw_result_dir" || true + else + export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" + local native_cmd=( + python3 "$(pwd)/utils/evals/speedbench_client.py" + --model "$speedbench_model" + --base-url "http://0.0.0.0:${port}" + --dataset-path "$speedbench_dir" + --category coding + --output-len 4096 + --temperature 1.0 + --thinking-mode "$mode" + --timeout "${SPEEDBENCH_CLIENT_TIMEOUT:-1800}" + --retries "${SPEEDBENCH_CLIENT_RETRIES:-2}" + ) + if [[ -n "${SPEEDBENCH_CLIENT_ENDPOINT:-}" ]]; then + native_cmd+=(--endpoint "$SPEEDBENCH_CLIENT_ENDPOINT") + elif [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then + native_cmd+=(--endpoint completions) + fi + if [[ "$mode" == "on" ]]; then + native_cmd+=(--thinking-kwargs "$thinking_kwargs") + fi + if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then + native_cmd+=(--dsv4) + fi + "${native_cmd[@]}" || bench_rc=$? 
+ fi + if [[ "$bench_rc" -ne 0 ]]; then + echo "SpeedBench AL eval: client failed with exit code ${bench_rc}" >&2 + _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "SpeedBench client failed with exit code ${bench_rc}" return 0 fi @@ -1300,7 +1331,6 @@ run_speedbench_al_eval() { else _speedbench_write_eval_result "$output" "$mode" "$mtp" "$al" "$delta_acc" "$delta_verify" "$delta_proposed" "$result_framework" "$metric_source" fi - rm -rf "$raw_result_dir" || true } run_lm_eval() { diff --git a/utils/evals/speedbench_client.py b/utils/evals/speedbench_client.py new file mode 100644 index 0000000000..5ad869dd90 --- /dev/null +++ b/utils/evals/speedbench_client.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +"""Small OpenAI-compatible client for SpeedBench AL eval load. + +This intentionally avoids importing vLLM benchmark code so the eval can run in +TensorRT-LLM and SGLang runtime images. +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +from pathlib import Path +from typing import Any +from urllib.error import HTTPError, URLError +from urllib.request import Request, urlopen + + +def _load_dsv4_encoder(): + bench_serving_dir = Path(__file__).resolve().parents[1] / "bench_serving" + sys.path.insert(0, str(bench_serving_dir)) + from encoding_dsv4 import encode_messages # type: ignore + + return encode_messages + + +def _load_speedbench_requests( + dataset_path: Path, + category: str, + num_prompts: int, +) -> list[list[dict[str, Any]]]: + jsonl_path = dataset_path / "qualitative.jsonl" + if not jsonl_path.is_file(): + raise FileNotFoundError(f"missing SpeedBench JSONL: {jsonl_path}") + + requests: list[list[dict[str, Any]]] = [] + with jsonl_path.open(encoding="utf-8") as handle: + for line in handle: + if not line.strip(): + continue + row = json.loads(line) + if category and row.get("category") != category: + continue + messages = 
row.get("messages") + if not isinstance(messages, list) or not messages: + continue + requests.append(messages) + if num_prompts > 0 and len(requests) >= num_prompts: + break + + if not requests: + raise ValueError(f"no SpeedBench prompts found for category={category!r}") + return requests + + +def _json_post( + url: str, + payload: dict[str, Any], + timeout: int, + retries: int, +) -> dict[str, Any]: + body = json.dumps(payload).encode("utf-8") + headers = {"Content-Type": "application/json"} + api_key = os.environ.get("OPENAI_API_KEY") + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + last_error: Exception | None = None + for attempt in range(retries + 1): + request = Request(url, data=body, headers=headers, method="POST") + try: + with urlopen(request, timeout=timeout) as response: + raw = response.read().decode("utf-8") + return json.loads(raw) if raw else {} + except HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + message = f"HTTP {exc.code} from {url}: {detail[:1000]}" + last_error = RuntimeError(message) + if exc.code < 500: + break + except URLError as exc: + last_error = exc + except TimeoutError as exc: + last_error = exc + + if attempt < retries: + time.sleep(min(2**attempt, 10)) + + assert last_error is not None + raise last_error + + +def _chat_payload( + messages: list[dict[str, Any]], + model: str, + output_len: int, + temperature: float, + thinking_mode: str, + thinking_kwargs: dict[str, Any], +) -> dict[str, Any]: + payload: dict[str, Any] = { + "model": model, + "messages": messages, + "max_tokens": output_len, + "temperature": temperature, + "stream": False, + } + if thinking_mode == "on" and thinking_kwargs: + payload["chat_template_kwargs"] = thinking_kwargs + if "reasoning_effort" in thinking_kwargs: + payload["reasoning_effort"] = thinking_kwargs["reasoning_effort"] + return payload + + +def _completion_payload( + messages: list[dict[str, Any]], + model: str, + output_len: int, + temperature: 
float, + thinking_mode: str, + thinking_kwargs: dict[str, Any], + dsv4: bool, +) -> dict[str, Any]: + if dsv4: + encode_messages = _load_dsv4_encoder() + prompt = encode_messages( + messages, + thinking_mode="thinking" if thinking_mode == "on" else "chat", + reasoning_effort=thinking_kwargs.get("reasoning_effort"), + ) + else: + first = messages[0] + prompt = first.get("content", "") if isinstance(first, dict) else str(first) + + return { + "model": model, + "prompt": prompt, + "max_tokens": output_len, + "temperature": temperature, + "stream": False, + } + + +def run(args: argparse.Namespace) -> int: + dataset_path = Path(args.dataset_path) + prompts = _load_speedbench_requests(dataset_path, args.category, args.num_prompts) + base_url = args.base_url.rstrip("/") + chat_url = f"{base_url}/v1/chat/completions" + completions_url = f"{base_url}/v1/completions" + thinking_kwargs = json.loads(args.thinking_kwargs) if args.thinking_kwargs else {} + + failures = 0 + resolved_endpoint = args.endpoint + for index, messages in enumerate(prompts, start=1): + endpoint_attempts = ["chat", "completions"] if resolved_endpoint == "auto" else [resolved_endpoint] + last_error: Exception | None = None + success = False + for endpoint in endpoint_attempts: + if endpoint == "completions": + payload = _completion_payload( + messages, + args.model, + args.output_len, + args.temperature, + args.thinking_mode, + thinking_kwargs, + args.dsv4, + ) + url = completions_url + else: + payload = _chat_payload( + messages, + args.model, + args.output_len, + args.temperature, + args.thinking_mode, + thinking_kwargs, + ) + url = chat_url + + try: + _json_post(url, payload, timeout=args.timeout, retries=args.retries) + except Exception as exc: + last_error = exc + if resolved_endpoint == "auto" and endpoint == "chat": + print( + "SpeedBench client chat endpoint failed; trying completions " + f"fallback: {exc}", + file=sys.stderr, + ) + continue + break + else: + if resolved_endpoint == "auto": + 
resolved_endpoint = endpoint + print( + f"SpeedBench client request {index}/{len(prompts)} " + f"completed via {endpoint}", + flush=True, + ) + success = True + break + + if success: + continue + + if last_error is None: + last_error = RuntimeError("no SpeedBench endpoint attempts were made") + failures += 1 + print( + f"SpeedBench client request {index}/{len(prompts)} failed: {last_error}", + file=sys.stderr, + ) + if failures > args.max_failures: + return 1 + + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--model", required=True) + parser.add_argument("--base-url", required=True) + parser.add_argument("--dataset-path", required=True) + parser.add_argument("--category", default="coding") + parser.add_argument("--output-len", type=int, default=4096) + parser.add_argument("--temperature", type=float, default=1.0) + parser.add_argument("--thinking-mode", choices=["on", "off"], default="off") + parser.add_argument("--thinking-kwargs", default="") + parser.add_argument("--endpoint", choices=["auto", "chat", "completions"], default="auto") + parser.add_argument("--num-prompts", type=int, default=-1) + parser.add_argument("--timeout", type=int, default=1800) + parser.add_argument("--retries", type=int, default=2) + parser.add_argument("--max-failures", type=int, default=0) + parser.add_argument("--dsv4", action="store_true") + return run(parser.parse_args()) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py index 8663058c30..bd7db0def2 100644 --- a/utils/evals/test_speedbench_al.py +++ b/utils/evals/test_speedbench_al.py @@ -13,6 +13,11 @@ score_cell, ) from speedbench_al import build_result, load_reference, lookup_reference +from speedbench_client import ( + _chat_payload, + _completion_payload, + _load_speedbench_requests, +) from validate_scores import validate_speedbench_al @@ -165,3 +170,51 @@ def 
test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> Non assert lm_path is None assert speedbench_paths == [result_path] + + +def test_speedbench_client_loads_coding_and_builds_dsv4_payloads(tmp_path: Path) -> None: + dataset = tmp_path / "speed_bench_data" + dataset.mkdir() + (dataset / "qualitative.jsonl").write_text( + "\n".join( + [ + json.dumps( + { + "category": "coding", + "messages": [{"role": "user", "content": "Write fizzbuzz."}], + } + ), + json.dumps( + { + "category": "math", + "messages": [{"role": "user", "content": "Solve 2+2."}], + } + ), + ] + ) + ) + + prompts = _load_speedbench_requests(dataset, "coding", -1) + chat = _chat_payload( + prompts[0], + model="deepseek-ai/DeepSeek-V4-Pro", + output_len=4096, + temperature=1.0, + thinking_mode="on", + thinking_kwargs={"thinking": True, "reasoning_effort": "high"}, + ) + completions = _completion_payload( + prompts[0], + model="deepseek-ai/DeepSeek-V4-Pro", + output_len=4096, + temperature=1.0, + thinking_mode="on", + thinking_kwargs={"thinking": True, "reasoning_effort": "high"}, + dsv4=True, + ) + + assert len(prompts) == 1 + assert chat["chat_template_kwargs"]["thinking"] is True + assert chat["reasoning_effort"] == "high" + assert "" in completions["prompt"] + assert completions["max_tokens"] == 4096 From 60c19dd56db1c5bde1552b42ac1a4f2a230feffa Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 08:35:04 -0700 Subject: [PATCH 10/27] Use shared GB200 workspace for DSV4 Dynamo --- runners/launch_gb200-nv.sh | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 45ef3a952a..4c5ca8601e 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -4,6 +4,11 @@ set -x +USE_SHARED_GB200_WORKSPACE=false +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "dsv4" ]]; then + USE_SHARED_GB200_WORKSPACE=true +fi + # MODEL_PATH: Override with pre-downloaded 
paths on GB200 runner # The yaml files specify HuggingFace model IDs for portability, but we use # local paths to avoid repeated downloading on the shared GB200 cluster. @@ -74,15 +79,15 @@ export SLURM_ACCOUNT="benchmark" NGINX_IMAGE="nginx:1.27.4" -# === Cluster diagnostic probe (minimax only) === +# === Cluster diagnostic probe for watchtower-hosted GB200 jobs === # The gb200-nv_* runners may be hosted on different physical clusters # (e.g., the legacy NVIDIA Lustre cluster vs Oracle Cloud "watchtower"). # Print enough info to identify the layout, then pick a writable # squash dir on a path that's also visible to compute nodes. Falls # back to the legacy sa-shared path so other configs are untouched. SQUASH_DIR="/mnt/lustre01/users-public/sa-shared" -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then - echo "=== cluster diagnostic (minimax sweep) ===" +if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" ]]; then + echo "=== cluster diagnostic (shared GB200 workspace) ===" echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)" echo "HOME=$HOME" echo "HOSTNAME=$(hostname -f 2>/dev/null || hostname)" @@ -202,7 +207,7 @@ SRT_REPO_DIR="srt-slurm" # cross-mounted to compute nodes. Put the srt-slurm workspace and staged # InferenceX checkout on a writable shared-FS path that compute can see. # Per-run-unique paths avoid races between parallel sweep jobs. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" ]]; then SHARED_BASE="" for cand in \ /mnt/lustre01/users-public/sa-shared/gha-runs \ @@ -295,7 +300,7 @@ source $HOME/.local/bin/env # under a head-node-only path, .venv/bin/python3 becomes a broken # symlink on compute. Pin the venv to /usr/bin/python3 — a system # path that exists at the same location on both head and compute. 
-if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then +if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" && -x /usr/bin/python3 ]]; then uv venv --seed --python /usr/bin/python3 else uv venv --seed @@ -312,10 +317,10 @@ echo "Configs available at: $SRT_REPO_DIR/" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" -# Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path +# Watchtower-hosted jobs: SRT_REPO_DIR was moved to a shared-FS path # above so srtctl's outputs/ directory (which lives under # SRTCTL_ROOT) is visible to compute nodes. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" ]]; then SRTCTL_ROOT="$SRT_REPO_DIR" fi echo "Creating srtslurm.yaml configuration..." @@ -357,7 +362,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" # can't see. Stage the relevant subset to shared FS and repoint # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already # on shared FS) and .git (not needed in container) for speed. 
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" ]]; then SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}" mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1 rsync -a --delete \ From 081cbca98b36c49b87172eee2e411ab5fd04637e Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 08:58:15 -0700 Subject: [PATCH 11/27] Enable metrics for DSV4 SGLang MTP --- benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh index bfb38953b8..7fabb03f47 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh @@ -131,6 +131,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --model-path $MODEL_PATH --served-model-name $MODEL \ --host 0.0.0.0 \ --port $PORT \ + --enable-metrics \ --trust-remote-code \ --tp $TP \ --ep-size $EP_SIZE \ From 4cf5bbf91e74483372d9bafa6c87764af582b125 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 11:18:57 -0700 Subject: [PATCH 12/27] Enable TRT-LLM spec metrics for DSV4 MTP --- benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh | 2 ++ benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh index e51d4043d8..781bfe9337 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh @@ -77,6 +77,8 @@ cuda_graph_config: max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG print_iter_log: true +return_perf_metrics: true +enable_iter_perf_stats: true kv_cache_config: 
tokens_per_block: 128 dtype: fp8 diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh index e4664dcd59..50c5908750 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh @@ -85,6 +85,8 @@ cuda_graph_config: max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG print_iter_log: true +return_perf_metrics: true +enable_iter_perf_stats: true kv_cache_config: tokens_per_block: 128 dtype: fp8 From 63bf3ebaa218955bb5cfc89bc5399d1101802c71 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 11:46:56 -0700 Subject: [PATCH 13/27] Use TRT-LLM Prometheus metrics endpoint for SpeedBench --- benchmarks/benchmark_lib.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 54c2867b50..9a5d0636c3 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1163,6 +1163,9 @@ run_speedbench_al_eval() { metrics_framework=$(_speedbench_metrics_framework) result_framework="${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-$metrics_framework}}" metric_source_base=$(_speedbench_metric_source_base "$metrics_framework") + if [[ "$metrics_framework" == "trtllm" && -z "${SPEEDBENCH_DECODE_METRICS_URLS:-}${SPEEDBENCH_METRICS_URLS:-}${SPEEDBENCH_METRICS_PORTS:-}" ]]; then + export SPEEDBENCH_METRICS_URLS="http://0.0.0.0:${port}/prometheus/metrics" + fi metrics_endpoint_count=$(_speedbench_metric_endpoint_count "$port") case "$metrics_framework" in From 4a4fbf13e40c753c65798f7817c816d8cbcda5b5 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 12:42:24 -0700 Subject: [PATCH 14/27] Use TRT-LLM JSON stats for SpeedBench fallback --- benchmarks/benchmark_lib.sh | 134 ++++++++++++++++++++++++++++++++++++ utils/evals/EVALS.md | 3 +- 2 files changed, 136 insertions(+), 1 
deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 9a5d0636c3..1beb29a008 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -859,6 +859,133 @@ _speedbench_metric_endpoint_count() { echo "$count" } +_speedbench_trtllm_json_metrics_urls() { + local port="$1" + local raw="${SPEEDBENCH_TRTLLM_JSON_METRICS_URLS:-}" + local endpoint url + + if [[ -n "$raw" ]]; then + for endpoint in ${raw//,/ }; do + _speedbench_normalize_metrics_url "$endpoint" + done + return 0 + fi + + while IFS= read -r url; do + [[ -z "$url" ]] && continue + echo "$url" | sed -E 's#/prometheus/metrics([?].*)?$#/metrics#' + done < <(_speedbench_metric_urls "$port") +} + +_speedbench_trtllm_json_spec_metrics() { + local port="$1" + local mtp="$2" + local urls=() + local url + + while IFS= read -r url; do + [[ -n "$url" ]] && urls+=("$url") + done < <(_speedbench_trtllm_json_metrics_urls "$port") + + [[ "${#urls[@]}" -gt 0 ]] || return 1 + + python3 - "$mtp" "${urls[@]}" <<'PY' +import json +import os +import sys +import urllib.request + + +def number(value, default=0.0): + try: + if value is None: + return default + return float(value) + except (TypeError, ValueError): + return default + + +def stats_from_payload(payload): + if isinstance(payload, list): + return payload + if isinstance(payload, dict): + return [payload] + return [] + + +try: + mtp = float(sys.argv[1]) +except (IndexError, ValueError): + mtp = 0.0 + +timeout = float(os.environ.get("SPEEDBENCH_METRICS_CURL_TIMEOUT", "10")) +total_draft = 0.0 +total_accepted = 0.0 +total_requests = 0.0 +weighted_acceptance_length = 0.0 +unweighted_acceptance_length = 0.0 +unweighted_count = 0 +used_endpoints = 0 + +for url in sys.argv[2:]: + try: + with urllib.request.urlopen(url, timeout=timeout) as response: + payload = json.load(response) + except Exception as exc: # noqa: BLE001 - diagnostics for CI logs + print(f"SpeedBench AL eval: TRT-LLM JSON metrics fetch failed for {url}: 
{exc}", file=sys.stderr) + continue + + endpoint_had_spec = False + for stat in stats_from_payload(payload): + if not isinstance(stat, dict): + continue + spec = stat.get("specDecodingStats") + if not isinstance(spec, dict): + continue + + draft = number(spec.get("numDraftTokens")) + if draft <= 0: + continue + + accepted = number(spec.get("numAcceptedTokens")) + requests = number(spec.get("numRequestsWithDraftTokens")) + acceptance_length = number(spec.get("acceptanceLength"), default=-1.0) + + total_draft += draft + total_accepted += accepted + endpoint_had_spec = True + + if acceptance_length > 0: + if requests > 0: + total_requests += requests + weighted_acceptance_length += acceptance_length * requests + else: + unweighted_acceptance_length += acceptance_length + unweighted_count += 1 + + if endpoint_had_spec: + used_endpoints += 1 + +if total_requests > 0: + acceptance_length = weighted_acceptance_length / total_requests +elif unweighted_count > 0: + acceptance_length = unweighted_acceptance_length / unweighted_count +elif total_draft > 0 and mtp > 0: + acceptance_length = 1.0 + (total_accepted / (total_draft / mtp)) +else: + sys.exit(1) + +verify_steps = round(total_draft / mtp) if total_draft > 0 and mtp > 0 else 0 +print( + f"{acceptance_length:.4f}\t" + f"{int(round(total_accepted))}\t" + f"{int(verify_steps)}\t" + f"{int(round(total_draft))}\t" + f"{used_endpoints}" +) +PY +} + _speedbench_metric_delta() { local before="$1" local after="$2" @@ -1305,6 +1432,13 @@ run_speedbench_al_eval() { if [[ -n "$delta_acc" || -n "$delta_proposed" ]]; then metric_source="${metric_source}+token-counters" fi + else + local trt_json_metrics="" trt_json_endpoints="" + trt_json_metrics=$(_speedbench_trtllm_json_spec_metrics "$port" "$mtp" 2>/dev/null || true) + if [[ -n "$trt_json_metrics" ]]; then + IFS=$'\t' read -r al delta_acc delta_verify delta_proposed trt_json_endpoints <<< "$trt_json_metrics" + 
metric_source="trtllm-json-iteration-stats-endpoints${trt_json_endpoints}" + fi fi elif [[ "$metrics_framework" == "sglang" ]]; then al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true) diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 65fca90183..4177bb2fc8 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -139,8 +139,9 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | `SPEEDBENCH_DECODE_METRICS_URLS` | unset | Comma/space-separated decode worker Prometheus `/metrics` URLs for disaggregated runs | | `SPEEDBENCH_METRICS_URLS` | unset | Generic comma/space-separated Prometheus endpoints when decode-specific naming is not applicable | | `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied | +| `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stats `/metrics` endpoints used when Prometheus spec metrics are unavailable | -SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM records its acceptance-length gauge and token counters because it does not expose verify steps through Prometheus. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values. +SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON iteration stats from `/metrics` when the Prometheus spec series are unavailable. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values. 
### Score validation `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails. From de360bf04ffd190387c430bcaafcb5230c225eff Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 13:27:45 -0700 Subject: [PATCH 15/27] Use TRT-LLM decoded-token metric as AL fallback --- benchmarks/benchmark_lib.sh | 116 +++++++++++++++++++++++++++++++++++- utils/evals/EVALS.md | 2 +- 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 1beb29a008..ef6c8604d5 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -986,6 +986,106 @@ print( PY } +_speedbench_trtllm_avg_decoded_al() { + local port="$1" + local value + value=$(_speedbench_metric_avg "$port" "trtllm_avg_decoded_tokens_per_iter" 2>/dev/null || true) + [[ -n "$value" ]] || return 1 + awk -v value="$value" ' + BEGIN { + if (value < 1.0) { + exit 1 + } + printf "%.4f\n", value + } + ' +} + +_speedbench_trtllm_json_avg_decoded_al() { + local port="$1" + local urls=() + local url + + while IFS= read -r url; do + [[ -n "$url" ]] && urls+=("$url") + done < <(_speedbench_trtllm_json_metrics_urls "$port") + + [[ "${#urls[@]}" -gt 0 ]] || return 1 + + python3 - "${urls[@]}" <<'PY' +import json +import os +import sys +import urllib.request + + +def number(value, default=0.0): + try: + if value is None: + return default + return float(value) + except (TypeError, ValueError): + return default + + +def stats_from_payload(payload): + if isinstance(payload, list): + return payload + if isinstance(payload, dict): + return [payload] + return [] + + +timeout = float(os.environ.get("SPEEDBENCH_METRICS_CURL_TIMEOUT", "10")) +weighted_total = 0.0 +total_requests = 0.0 +unweighted_total = 0.0 
+unweighted_count = 0 +used_endpoints = 0 + +for url in sys.argv[1:]: + try: + with urllib.request.urlopen(url, timeout=timeout) as response: + payload = json.load(response) + except Exception as exc: # noqa: BLE001 - diagnostics for CI logs + print(f"SpeedBench AL eval: TRT-LLM JSON metrics fetch failed for {url}: {exc}", file=sys.stderr) + continue + + endpoint_had_avg = False + for stat in stats_from_payload(payload): + if not isinstance(stat, dict): + continue + ifb = stat.get("inflightBatchingStats") + if not isinstance(ifb, dict): + continue + + avg_decoded = number(ifb.get("avgNumDecodedTokensPerIter"), default=-1.0) + if avg_decoded < 1.0: + continue + + gen_requests = number(ifb.get("numGenRequests")) + endpoint_had_avg = True + if gen_requests > 0: + weighted_total += avg_decoded * gen_requests + total_requests += gen_requests + else: + unweighted_total += avg_decoded + unweighted_count += 1 + + if endpoint_had_avg: + used_endpoints += 1 + +if total_requests > 0: + acceptance_length = weighted_total / total_requests +elif unweighted_count > 0: + acceptance_length = unweighted_total / unweighted_count +else: + sys.exit(1) + +print(f"{acceptance_length:.4f}\t{used_endpoints}") +PY +} + _speedbench_metric_delta() { local before="$1" local after="$2" @@ -1434,11 +1534,25 @@ run_speedbench_al_eval() { fi else local trt_json_metrics="" trt_json_endpoints="" - trt_json_metrics=$(_speedbench_trtllm_json_spec_metrics "$port" "$mtp" 2>/dev/null || true) + trt_json_metrics=$(_speedbench_trtllm_json_spec_metrics "$port" "$mtp" || true) if [[ -n "$trt_json_metrics" ]]; then IFS=$'\t' read -r al delta_acc delta_verify delta_proposed trt_json_endpoints <<< "$trt_json_metrics" metric_source="trtllm-json-iteration-stats-endpoints${trt_json_endpoints}" fi + if [[ -z "$al" ]]; then + al=$(_speedbench_trtllm_avg_decoded_al "$port" || true) + if [[ -n "$al" ]]; then + metric_source="${metric_source_base}-avg-decoded-tokens-endpoints${metrics_endpoint_count}" + fi + fi + if [[ 
-z "$al" ]]; then + local trt_json_avg_metrics="" trt_json_avg_endpoints="" + trt_json_avg_metrics=$(_speedbench_trtllm_json_avg_decoded_al "$port" || true) + if [[ -n "$trt_json_avg_metrics" ]]; then + IFS=$'\t' read -r al trt_json_avg_endpoints <<< "$trt_json_avg_metrics" + metric_source="trtllm-json-avg-decoded-tokens-endpoints${trt_json_avg_endpoints}" + fi + fi fi elif [[ "$metrics_framework" == "sglang" ]]; then al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true) diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 4177bb2fc8..6541800b32 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -141,7 +141,7 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied | | `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stats `/metrics` endpoints used when Prometheus spec metrics are unavailable | -SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON iteration stats from `/metrics` when the Prometheus spec series are unavailable. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values. +SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. 
Some TRT-LLM MTP configurations expose `avgNumDecodedTokensPerIter` without `specDecodingStats`; for those, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values. ### Score validation `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails. From cc4c7e3d7ae0757f09a12fff17b3433e5d302aa2 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 13:42:38 -0700 Subject: [PATCH 16/27] Collect Dynamo SpeedBench AL from decode logs --- runners/launch_b200-dgxc.sh | 3 + runners/launch_gb200-nv.sh | 3 + utils/evals/EVALS.md | 2 +- utils/evals/dynamo_speedbench_al_from_logs.py | 215 ++++++++++++++++++ utils/evals/test_speedbench_al.py | 36 +++ .../write_dynamo_speedbench_al_from_logs.sh | 57 +++++ 6 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 utils/evals/dynamo_speedbench_al_from_logs.py create mode 100644 utils/evals/write_dynamo_speedbench_al_from_logs.sh diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 9eeed2af6e..1d4b716a69 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -359,6 +359,9 @@ EOF # Collect eval results if eval was requested if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then EVAL_DIR="$LOGS_DIR/eval_results" + if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then + bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" 
"$LOGS_DIR" "$GITHUB_WORKSPACE" + fi if [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" shopt -s nullglob diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 4c5ca8601e..ee149e7c41 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -503,6 +503,9 @@ fi # Collect eval results if eval was requested if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then EVAL_DIR="$LOGS_DIR/eval_results" + if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then + bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE" + fi if [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" shopt -s nullglob diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 6541800b32..63a96cb29a 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -141,7 +141,7 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied | | `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stats `/metrics` endpoints used when Prometheus spec metrics are unavailable | -SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations expose `avgNumDecodedTokensPerIter` without `specDecodingStats`; for those, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. 
Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values. +SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations expose `avgNumDecodedTokensPerIter` without `specDecodingStats`; for those, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints when available, summing counters and averaging gauge-only AL values. The NVIDIA srt-slurm Dynamo eval path also writes a SpeedBench AL artifact from decode-worker `SpecDecoding metrics` log counters when the router eval path does not expose decode-worker metrics endpoints to the benchmarker. ### Score validation `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails. 
diff --git a/utils/evals/dynamo_speedbench_al_from_logs.py b/utils/evals/dynamo_speedbench_al_from_logs.py new file mode 100644 index 0000000000..0cf71ac4d8 --- /dev/null +++ b/utils/evals/dynamo_speedbench_al_from_logs.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +"""Build a SpeedBench AL result from Dynamo decode-worker spec logs.""" + +from __future__ import annotations + +import argparse +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +from speedbench_al import build_result, cmd_record + + +SPEC_LINE_RE = re.compile( + r"SpecDecoding metrics:\s*" + r"Mean acceptance length:\s*(?P<acceptance_length>[0-9]+(?:\.[0-9]+)?)" + r".*?" + r"Accepted:\s*(?P<accepted>[0-9]+)\s*tokens,\s*" + r"Drafted:\s*(?P<drafted>[0-9]+)\s*tokens" +) +WORKER_RE = re.compile(r"_decode_w(?P<worker>[0-9]+)\.out$") + + +@dataclass(frozen=True) +class LogMetrics: + path: Path + worker: str + samples: int + accepted_tokens: int + proposed_draft_tokens: int + + +@dataclass(frozen=True) +class AggregatedMetrics: + workers: int + samples: int + accepted_tokens: int + proposed_draft_tokens: int + verify_steps: int + acceptance_length: float + selected_logs: tuple[Path, ...] 
+ + +def _decode_log_files(logs_dir: Path) -> Iterable[Path]: + if not logs_dir.is_dir(): + return [] + return sorted(logs_dir.rglob("*_decode_w*.out")) + + +def parse_decode_log(path: Path) -> LogMetrics | None: + match = WORKER_RE.search(path.name) + if not match: + return None + + samples = 0 + accepted = 0 + drafted = 0 + try: + lines = path.read_text(errors="ignore").splitlines() + except OSError: + return None + + for line in lines: + parsed = SPEC_LINE_RE.search(line) + if not parsed: + continue + samples += 1 + accepted += int(parsed.group("accepted")) + drafted += int(parsed.group("drafted")) + + if samples == 0 or drafted <= 0: + return None + + return LogMetrics( + path=path, + worker=match.group("worker"), + samples=samples, + accepted_tokens=accepted, + proposed_draft_tokens=drafted, + ) + + +def select_decode_worker_logs(logs_dir: Path) -> list[LogMetrics]: + by_worker: dict[str, LogMetrics] = {} + for path in _decode_log_files(logs_dir): + metrics = parse_decode_log(path) + if metrics is None: + continue + current = by_worker.get(metrics.worker) + if current is None: + by_worker[metrics.worker] = metrics + continue + if (metrics.samples, metrics.proposed_draft_tokens) > ( + current.samples, + current.proposed_draft_tokens, + ): + by_worker[metrics.worker] = metrics + return [by_worker[k] for k in sorted(by_worker, key=int)] + + +def aggregate_log_metrics(logs_dir: Path, mtp: int) -> AggregatedMetrics | None: + if mtp <= 0: + raise ValueError("mtp must be positive") + + selected = select_decode_worker_logs(logs_dir) + if not selected: + return None + + accepted = sum(item.accepted_tokens for item in selected) + proposed = sum(item.proposed_draft_tokens for item in selected) + samples = sum(item.samples for item in selected) + if proposed <= 0: + return None + + verify_steps = round(proposed / mtp) + acceptance_length = 1.0 + (accepted / (proposed / mtp)) + + return AggregatedMetrics( + workers=len(selected), + samples=samples, + 
accepted_tokens=accepted, + proposed_draft_tokens=proposed, + verify_steps=verify_steps, + acceptance_length=acceptance_length, + selected_logs=tuple(item.path for item in selected), + ) + + +def _record_args(args: argparse.Namespace, metrics: AggregatedMetrics | None) -> argparse.Namespace: + record = argparse.Namespace( + output=args.output, + reference_yaml=args.reference_yaml, + model=args.model, + model_prefix=args.model_prefix, + thinking_mode=args.thinking_mode, + num_speculative_tokens=args.num_speculative_tokens, + category=args.category, + output_len=args.output_len, + temperature=args.temperature, + threshold_ratio=args.threshold_ratio, + framework=args.framework, + metric_source=args.metric_source, + acceptance_length=None, + accepted_tokens=None, + draft_tokens=None, + verify_steps=None, + proposed_draft_tokens=None, + error=None, + exit_status=False, + ) + if metrics is None: + record.error = ( + "Could not parse Dynamo speculative acceptance metrics from decode-worker logs" + ) + return record + + record.metric_source = ( + f"{args.metric_source}-workers{metrics.workers}-samples{metrics.samples}" + ) + record.acceptance_length = f"{metrics.acceptance_length:.4f}" + record.accepted_tokens = str(metrics.accepted_tokens) + record.verify_steps = str(metrics.verify_steps) + record.draft_tokens = str(metrics.verify_steps) + record.proposed_draft_tokens = str(metrics.proposed_draft_tokens) + return record + + +def cmd_from_logs(args: argparse.Namespace) -> int: + metrics = aggregate_log_metrics(Path(args.logs_dir), args.num_speculative_tokens) + record_args = _record_args(args, metrics) + result = build_result(record_args) + rc = cmd_record(record_args) + + if metrics is not None: + print("Dynamo SpeedBench AL log aggregation:") + for path in metrics.selected_logs: + print(f" selected {path}") + if not result.get("passed"): + return 0 + return rc + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description=__doc__) + 
parser.add_argument("--logs-dir", required=True) + parser.add_argument("--output", required=True) + parser.add_argument("--reference-yaml", required=True) + parser.add_argument("--model", required=True) + parser.add_argument("--model-prefix", default="") + parser.add_argument("--thinking-mode", required=True) + parser.add_argument("--num-speculative-tokens", type=int, required=True) + parser.add_argument("--category", default="coding") + parser.add_argument("--output-len", type=int, default=4096) + parser.add_argument("--temperature", type=float, default=1.0) + parser.add_argument("--threshold-ratio", type=float, default=0.90) + parser.add_argument("--framework", default="dynamo") + parser.add_argument("--metric-source", default="dynamo-decode-log-counters") + parser.set_defaults(func=cmd_from_logs) + return parser + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + try: + return args.func(args) + except Exception as exc: # noqa: BLE001 - CLI should record a concise failure + print(f"ERROR: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py index bd7db0def2..81175ac20b 100644 --- a/utils/evals/test_speedbench_al.py +++ b/utils/evals/test_speedbench_al.py @@ -12,6 +12,7 @@ extract_speedbench_al_metrics, score_cell, ) +from dynamo_speedbench_al_from_logs import aggregate_log_metrics from speedbench_al import build_result, load_reference, lookup_reference from speedbench_client import ( _chat_payload, @@ -172,6 +173,41 @@ def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> Non assert speedbench_paths == [result_path] +def test_dynamo_log_parser_aggregates_decode_workers(tmp_path: Path) -> None: + def write_log(name: str, rows: list[tuple[float, int, int]]) -> None: + lines = [] + for al, accepted, drafted in rows: + lines.append( + "INFO metrics.log: SpecDecoding metrics: " + f"Mean acceptance 
length: {al}, " + "Accepted throughput: 1.0 tokens/s, " + "Drafted throughput: 1.0 tokens/s, " + f"Accepted: {accepted} tokens, Drafted: {drafted} tokens, " + "Per-position acceptance rate: 0.9, 0.7, " + "Avg Draft acceptance rate: 80.0%" + ) + (tmp_path / name).write_text("\n".join(lines)) + + write_log("node-a_decode_w0.out", [(2.0, 10, 20)]) + write_log("node-b_decode_w0.out", [(2.5, 15, 20), (2.5, 5, 10)]) + write_log("node-c_decode_w1.out", [(2.0, 10, 20)]) + write_log("node-d_decode_w1.out", []) + + metrics = aggregate_log_metrics(tmp_path, mtp=2) + + assert metrics is not None + assert metrics.workers == 2 + assert metrics.samples == 3 + assert metrics.accepted_tokens == 30 + assert metrics.proposed_draft_tokens == 50 + assert metrics.verify_steps == 25 + assert metrics.acceptance_length == 2.2 + assert [p.name for p in metrics.selected_logs] == [ + "node-b_decode_w0.out", + "node-c_decode_w1.out", + ] + + def test_speedbench_client_loads_coding_and_builds_dsv4_payloads(tmp_path: Path) -> None: dataset = tmp_path / "speed_bench_data" dataset.mkdir() diff --git a/utils/evals/write_dynamo_speedbench_al_from_logs.sh b/utils/evals/write_dynamo_speedbench_al_from_logs.sh new file mode 100644 index 0000000000..b33d0bdcd3 --- /dev/null +++ b/utils/evals/write_dynamo_speedbench_al_from_logs.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +set -u + +logs_dir="${1:-}" +workspace="${2:-${GITHUB_WORKSPACE:-$(pwd)}}" + +if [[ -z "$logs_dir" ]]; then + echo "Dynamo SpeedBench AL: missing logs directory argument" >&2 + exit 0 +fi + +if [[ "${FRAMEWORK:-}" != dynamo* || "${SPEC_DECODING:-none}" != "mtp" ]]; then + echo "Dynamo SpeedBench AL: skipping FRAMEWORK=${FRAMEWORK:-unknown} SPEC_DECODING=${SPEC_DECODING:-none}" + exit 0 +fi + +mtp="${SPEEDBENCH_NUM_SPEC_TOKENS:-${NUM_SPEC_TOKENS:-${SPECULATIVE_DRAFT_TOKENS:-}}}" +if [[ -z "$mtp" && -n "${CONFIG_FILE:-}" ]]; then + config_path="${CONFIG_FILE%%:*}" + if [[ -f "$config_path" ]]; then + mtp="$(sed -n 
's/.*num_speculative_tokens[^0-9]*\([0-9][0-9]*\).*/\1/p' "$config_path" | head -1)" + fi +fi +mtp="${mtp:-2}" + +mode="${SPEEDBENCH_THINKING_MODE:-}" +if [[ -z "$mode" ]]; then + if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then + mode="on" + else + mode="off" + fi +fi + +model_name="${MODEL_NAME:-${MODEL:-}}" +if [[ -z "$model_name" ]]; then + model_name="${SERVED_MODEL_NAME:-unknown}" +fi + +output="${workspace}/results_speedbench_al_${mode}_mtp${mtp}.json" +metric_source="dynamo-decode-log-counters" +if [[ -n "${FRAMEWORK:-}" ]]; then + metric_source="${FRAMEWORK}-decode-log-counters" +fi + +echo "Dynamo SpeedBench AL: parsing decode logs from $logs_dir" +python3 "${workspace}/utils/evals/dynamo_speedbench_al_from_logs.py" \ + --logs-dir "$logs_dir" \ + --output "$output" \ + --reference-yaml "${workspace}/benchmarks/speedbench-reference-al.yaml" \ + --model "$model_name" \ + --model-prefix "${MODEL_PREFIX:-}" \ + --thinking-mode "$mode" \ + --num-speculative-tokens "$mtp" \ + --framework "${FRAMEWORK:-dynamo}" \ + --metric-source "$metric_source" || true From 2aef667c4432bbc36c1a551f2aa4f9e2fc39aee7 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 19:03:39 -0700 Subject: [PATCH 17/27] Use TRT-LLM server logs for SpeedBench AL fallback --- benchmarks/benchmark_lib.sh | 31 +++++ utils/evals/EVALS.md | 3 +- utils/evals/test_speedbench_al.py | 51 +++++++ utils/evals/trtllm_speedbench_al_from_log.py | 138 +++++++++++++++++++ 4 files changed, 222 insertions(+), 1 deletion(-) create mode 100644 utils/evals/trtllm_speedbench_al_from_log.py diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ef6c8604d5..affc88aea7 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1086,6 +1086,19 @@ print(f"{acceptance_length:.4f}\t{used_endpoints}") PY } +_speedbench_trtllm_server_log_metrics() { + local mtp="$1" + local start_offset="${2:-0}" + local log_path="${SPEEDBENCH_TRTLLM_SERVER_LOG:-${SERVER_LOG:-}}" + + 
[[ -n "$log_path" && -f "$log_path" ]] || return 1 + + python3 "$(pwd)/utils/evals/trtllm_speedbench_al_from_log.py" \ + --log "$log_path" \ + --num-speculative-tokens "$mtp" \ + --start-offset "$start_offset" +} + _speedbench_metric_delta() { local before="$1" local after="$2" @@ -1447,6 +1460,16 @@ run_speedbench_al_eval() { proposed_before="${proposed_before:-0}" verify_before="${verify_before:-0}" + local trt_server_log_offset="0" + if [[ "$metrics_framework" == "trtllm" ]]; then + local trt_server_log="${SPEEDBENCH_TRTLLM_SERVER_LOG:-${SERVER_LOG:-}}" + if [[ -n "$trt_server_log" && -f "$trt_server_log" ]]; then + trt_server_log_offset=$(wc -c < "$trt_server_log" 2>/dev/null || true) + trt_server_log_offset="${trt_server_log_offset//[!0-9]/}" + trt_server_log_offset="${trt_server_log_offset:-0}" + fi + fi + local bench_rc=0 local speedbench_model="${MODEL_NAME:-${MODEL:-}}" echo "SpeedBench AL eval: running mode=${mode} mtp=${mtp}" @@ -1539,6 +1562,14 @@ run_speedbench_al_eval() { IFS=$'\t' read -r al delta_acc delta_verify delta_proposed trt_json_endpoints <<< "$trt_json_metrics" metric_source="trtllm-json-iteration-stats-endpoints${trt_json_endpoints}" fi + if [[ -z "$al" ]]; then + local trt_log_metrics="" trt_log_samples="" + trt_log_metrics=$(_speedbench_trtllm_server_log_metrics "$mtp" "$trt_server_log_offset" || true) + if [[ -n "$trt_log_metrics" ]]; then + IFS=$'\t' read -r al delta_acc delta_verify delta_proposed trt_log_samples <<< "$trt_log_metrics" + metric_source="trtllm-server-log-generation-tokens-samples${trt_log_samples}" + fi + fi if [[ -z "$al" ]]; then al=$(_speedbench_trtllm_avg_decoded_al "$port" || true) if [[ -n "$al" ]]; then diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 63a96cb29a..6aeeb585d5 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -140,8 +140,9 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | `SPEEDBENCH_METRICS_URLS` | unset | Generic comma/space-separated Prometheus 
endpoints when decode-specific naming is not applicable | | `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied | | `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stats `/metrics` endpoints used when Prometheus spec metrics are unavailable | +| `SPEEDBENCH_TRTLLM_SERVER_LOG` | `SERVER_LOG` | Optional TRT-LLM `print_iter_log` file used to derive SpeedBench AL from generation-token iteration logs when spec metrics are unavailable | -SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations expose `avgNumDecodedTokensPerIter` without `specDecodingStats`; for those, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints when available, summing counters and averaging gauge-only AL values. The NVIDIA srt-slurm Dynamo eval path also writes a SpeedBench AL artifact from decode-worker `SpecDecoding metrics` log counters when the router eval path does not expose decode-worker metrics endpoints to the benchmarker. +SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. 
Some TRT-LLM MTP configurations enable `print_iter_log` but do not expose `specDecodingStats`; for those, SpeedBench records the server-log byte offset before running SpeedBench and derives accepted/proposed/verify counters from the new `num_generation_tokens` iteration lines. If neither exact spec stats nor server logs are available, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints when available, summing counters and averaging gauge-only AL values. The NVIDIA srt-slurm Dynamo eval path also writes a SpeedBench AL artifact from decode-worker `SpecDecoding metrics` log counters when the router eval path does not expose decode-worker metrics endpoints to the benchmarker. ### Score validation `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails. 
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py index 81175ac20b..0975b9af97 100644 --- a/utils/evals/test_speedbench_al.py +++ b/utils/evals/test_speedbench_al.py @@ -19,6 +19,7 @@ _completion_payload, _load_speedbench_requests, ) +from trtllm_speedbench_al_from_log import parse_trtllm_iteration_log from validate_scores import validate_speedbench_al @@ -208,6 +209,56 @@ def write_log(name: str, rows: list[tuple[float, int, int]]) -> None: ] +def test_trtllm_log_parser_reads_generation_tokens_after_offset(tmp_path: Path) -> None: + log_path = tmp_path / "server.log" + prefix = "previous eval traffic\n" + body = "\n".join( + [ + "[TRT-LLM] [I] iter = 1, num_scheduled_requests: 1, " + "states = {'num_ctx_requests': 1, 'num_ctx_tokens': 1024, 'num_generation_tokens': 0}", + "[TRT-LLM] [I] iter = 2, num_scheduled_requests: 1, " + "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 3}", + "[TRT-LLM] [I] iter = 3, num_scheduled_requests: 1, " + "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 2}", + 'INFO: 127.0.0.1:1 - "GET /prometheus/metrics HTTP/1.1" 200 OK', + "[TRT-LLM] [I] iter = 4, num_scheduled_requests: 32, " + "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 96}", + ] + ) + log_path.write_text(prefix + body) + + metrics = parse_trtllm_iteration_log(log_path, mtp=2, start_offset=len(prefix)) + + assert metrics is not None + assert metrics.samples == 2 + assert metrics.generated_tokens == 5 + assert metrics.accepted_tokens == 3 + assert metrics.verify_steps == 2 + assert metrics.proposed_draft_tokens == 4 + assert metrics.acceptance_length == 2.5 + + +def test_trtllm_log_parser_can_infer_batched_steps(tmp_path: Path) -> None: + log_path = tmp_path / "server.log" + log_path.write_text( + "[TRT-LLM] [I] iter = 10, num_scheduled_requests: 28, " + "states = {'num_ctx_requests': 9, 'num_ctx_tokens': 9345, 'num_generation_tokens': 57}" + ) + + 
metrics = parse_trtllm_iteration_log( + log_path, + mtp=2, + stop_at_metrics_get=False, + ) + + assert metrics is not None + assert metrics.samples == 1 + assert metrics.verify_steps == 19 + assert metrics.accepted_tokens == 38 + assert metrics.proposed_draft_tokens == 38 + assert metrics.acceptance_length == 3.0 + + def test_speedbench_client_loads_coding_and_builds_dsv4_payloads(tmp_path: Path) -> None: dataset = tmp_path / "speed_bench_data" dataset.mkdir() diff --git a/utils/evals/trtllm_speedbench_al_from_log.py b/utils/evals/trtllm_speedbench_al_from_log.py new file mode 100644 index 0000000000..c63a933915 --- /dev/null +++ b/utils/evals/trtllm_speedbench_al_from_log.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +"""Parse TRT-LLM iteration logs into SpeedBench AL counters.""" + +from __future__ import annotations + +import argparse +import math +import re +import sys +from dataclasses import dataclass +from pathlib import Path + + +GEN_TOKENS_RE = re.compile(r"'num_generation_tokens':\s*(?P<tokens>[0-9]+)") +ITER_LINE_RE = re.compile(r"\biter\s*=\s*[0-9]+,.*\bstates\s*=") +METRICS_GET_RE = re.compile(r'GET\s+/(?:prometheus/)?metrics\b') + + +@dataclass(frozen=True) +class TrtLogMetrics: + samples: int + accepted_tokens: int + proposed_draft_tokens: int + verify_steps: int + generated_tokens: int + + @property + def acceptance_length(self) -> float: + return self.generated_tokens / self.verify_steps + + +def _read_log_suffix(path: Path, start_offset: int) -> list[str]: + with path.open("rb") as f: + if start_offset > 0: + f.seek(start_offset) + return f.read().decode(errors="ignore").splitlines() + + +def parse_trtllm_iteration_log( + path: Path, + mtp: int, + start_offset: int = 0, + stop_at_metrics_get: bool = True, +) -> TrtLogMetrics | None: + if mtp <= 0: + raise ValueError("mtp must be positive") + if not path.is_file(): + return None + + samples = 0 + accepted = 0 + proposed = 0 + verify_steps = 0 + generated = 0 + max_tokens_per_step = mtp + 1 + + for line in 
_read_log_suffix(path, start_offset): + if samples and stop_at_metrics_get and METRICS_GET_RE.search(line): + break + if not ITER_LINE_RE.search(line): + continue + match = GEN_TOKENS_RE.search(line) + if not match: + continue + + gen_tokens = int(match.group("tokens")) + if gen_tokens <= 0: + continue + + # SpeedBench AL is issued at max-concurrency=1 today, where each + # generation iteration is one verification step. Keep a batched fallback + # for postmortem logs by assuming no step can emit more than mtp + 1 + # tokens per active request. + steps = max(1, math.ceil(gen_tokens / max_tokens_per_step)) + samples += 1 + verify_steps += steps + generated += gen_tokens + accepted += max(gen_tokens - steps, 0) + proposed += steps * mtp + + if samples == 0 or verify_steps <= 0: + return None + + return TrtLogMetrics( + samples=samples, + accepted_tokens=accepted, + proposed_draft_tokens=proposed, + verify_steps=verify_steps, + generated_tokens=generated, + ) + + +def cmd_tsv(args: argparse.Namespace) -> int: + metrics = parse_trtllm_iteration_log( + Path(args.log), + args.num_speculative_tokens, + start_offset=max(args.start_offset, 0), + stop_at_metrics_get=not args.no_stop_at_metrics_get, + ) + if metrics is None: + return 1 + + print( + f"{metrics.acceptance_length:.4f}\t" + f"{metrics.accepted_tokens}\t" + f"{metrics.verify_steps}\t" + f"{metrics.proposed_draft_tokens}\t" + f"{metrics.samples}" + ) + return 0 + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--log", required=True) + parser.add_argument("--num-speculative-tokens", type=int, required=True) + parser.add_argument("--start-offset", type=int, default=0) + parser.add_argument( + "--no-stop-at-metrics-get", + action="store_true", + help="Do not stop parsing at the next /metrics request after samples appear.", + ) + parser.set_defaults(func=cmd_tsv) + return parser + + +def main() -> int: + parser = build_parser() + args = 
parser.parse_args() + try: + return args.func(args) + except Exception as exc: # noqa: BLE001 - CLI should return concise diagnostics + print(f"ERROR: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) From 816dd1a84bbccbd914449fb0792aefabae324ddf Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 20:50:39 -0700 Subject: [PATCH 18/27] Capture GB200 srt-slurm bootstrap logs on early failure --- runners/launch_gb200-nv.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index ee149e7c41..67329f1cc2 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -415,6 +415,18 @@ while ! ls "$LOG_FILE" &>/dev/null; do if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then echo "ERROR: Job $JOB_ID failed before creating log file" scontrol show job "$JOB_ID" + BOOTSTRAP_LOG="outputs/sweep_${JOB_ID}.bootstrap.log" + if [ -f "$BOOTSTRAP_LOG" ]; then + echo "Bootstrap log from $BOOTSTRAP_LOG:" + cat "$BOOTSTRAP_LOG" + if [ -n "${GITHUB_WORKSPACE:-}" ]; then + mkdir -p "$GITHUB_WORKSPACE/LOGS" + cp "$BOOTSTRAP_LOG" "$GITHUB_WORKSPACE/LOGS/" || true + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$GITHUB_WORKSPACE/LOGS" . || true + fi + else + echo "Bootstrap log not found at $BOOTSTRAP_LOG" + fi exit 1 fi echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." 
From 9cbb11cd0ee2f43840d37bd50390af0f51c144f3 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 10 Jun 2026 20:55:46 -0700 Subject: [PATCH 19/27] Parse Dynamo SGLang SpeedBench AL from decode logs --- runners/launch_gb300-cw.sh | 3 + utils/evals/dynamo_speedbench_al_from_logs.py | 102 +++++++++++++----- utils/evals/test_speedbench_al.py | 25 +++++ 3 files changed, 105 insertions(+), 25 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 6a5c50e381..a92d4bc388 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -394,6 +394,9 @@ fi if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then EVAL_DIR="$LOGS_DIR/eval_results" + if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then + bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE" + fi if [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" shopt -s nullglob diff --git a/utils/evals/dynamo_speedbench_al_from_logs.py b/utils/evals/dynamo_speedbench_al_from_logs.py index 0cf71ac4d8..f70992537c 100644 --- a/utils/evals/dynamo_speedbench_al_from_logs.py +++ b/utils/evals/dynamo_speedbench_al_from_logs.py @@ -20,6 +20,10 @@ r"Accepted:\s*(?P[0-9]+)\s*tokens,\s*" r"Drafted:\s*(?P[0-9]+)\s*tokens" ) +SGLANG_ACCEPT_LINE_RE = re.compile( + r"\baccept len:\s*(?P[0-9]+(?:\.[0-9]+)?)" + r"\s*,\s*accept rate:\s*(?P[0-9]+(?:\.[0-9]+)?)" +) WORKER_RE = re.compile(r"_decode_w(?P[0-9]+)\.out$") @@ -28,18 +32,25 @@ class LogMetrics: path: Path worker: str samples: int + acceptance_length_samples: int + acceptance_length_total: float accepted_tokens: int proposed_draft_tokens: int + @property + def has_counter_metrics(self) -> bool: + return self.proposed_draft_tokens > 0 + @dataclass(frozen=True) class AggregatedMetrics: workers: int samples: int - accepted_tokens: int - proposed_draft_tokens: int - verify_steps: int + accepted_tokens: int | None 
+ proposed_draft_tokens: int | None + verify_steps: int | None acceptance_length: float + has_counter_metrics: bool selected_logs: tuple[Path, ...] @@ -55,6 +66,8 @@ def parse_decode_log(path: Path) -> LogMetrics | None: return None samples = 0 + acceptance_length_samples = 0 + acceptance_length_total = 0.0 accepted = 0 drafted = 0 try: @@ -65,18 +78,26 @@ def parse_decode_log(path: Path) -> LogMetrics | None: for line in lines: parsed = SPEC_LINE_RE.search(line) if not parsed: + sglang_parsed = SGLANG_ACCEPT_LINE_RE.search(line) + if not sglang_parsed: + continue + samples += 1 + acceptance_length_samples += 1 + acceptance_length_total += float(sglang_parsed.group("al")) continue samples += 1 accepted += int(parsed.group("accepted")) drafted += int(parsed.group("drafted")) - if samples == 0 or drafted <= 0: + if samples == 0 or (drafted <= 0 and acceptance_length_samples == 0): return None return LogMetrics( path=path, worker=match.group("worker"), samples=samples, + acceptance_length_samples=acceptance_length_samples, + acceptance_length_total=acceptance_length_total, accepted_tokens=accepted, proposed_draft_tokens=drafted, ) @@ -92,9 +113,16 @@ def select_decode_worker_logs(logs_dir: Path) -> list[LogMetrics]: if current is None: by_worker[metrics.worker] = metrics continue - if (metrics.samples, metrics.proposed_draft_tokens) > ( + if ( + metrics.has_counter_metrics, + metrics.samples, + metrics.proposed_draft_tokens, + metrics.acceptance_length_samples, + ) > ( + current.has_counter_metrics, current.samples, current.proposed_draft_tokens, + current.acceptance_length_samples, ): by_worker[metrics.worker] = metrics return [by_worker[k] for k in sorted(by_worker, key=int)] @@ -108,23 +136,42 @@ def aggregate_log_metrics(logs_dir: Path, mtp: int) -> AggregatedMetrics | None: if not selected: return None - accepted = sum(item.accepted_tokens for item in selected) - proposed = sum(item.proposed_draft_tokens for item in selected) - samples = sum(item.samples for item 
in selected) - if proposed <= 0: - return None + counter_logs = [item for item in selected if item.has_counter_metrics] + if counter_logs: + accepted = sum(item.accepted_tokens for item in counter_logs) + proposed = sum(item.proposed_draft_tokens for item in counter_logs) + samples = sum(item.samples for item in counter_logs) + verify_steps = round(proposed / mtp) + acceptance_length = 1.0 + (accepted / (proposed / mtp)) + + return AggregatedMetrics( + workers=len(counter_logs), + samples=samples, + accepted_tokens=accepted, + proposed_draft_tokens=proposed, + verify_steps=verify_steps, + acceptance_length=acceptance_length, + has_counter_metrics=True, + selected_logs=tuple(item.path for item in counter_logs), + ) - verify_steps = round(proposed / mtp) - acceptance_length = 1.0 + (accepted / (proposed / mtp)) + al_logs = [item for item in selected if item.acceptance_length_samples > 0] + al_samples = sum(item.acceptance_length_samples for item in al_logs) + if al_samples <= 0: + return None + acceptance_length = ( + sum(item.acceptance_length_total for item in al_logs) / al_samples + ) return AggregatedMetrics( - workers=len(selected), - samples=samples, - accepted_tokens=accepted, - proposed_draft_tokens=proposed, - verify_steps=verify_steps, + workers=len(al_logs), + samples=al_samples, + accepted_tokens=None, + proposed_draft_tokens=None, + verify_steps=None, acceptance_length=acceptance_length, - selected_logs=tuple(item.path for item in selected), + has_counter_metrics=False, + selected_logs=tuple(item.path for item in al_logs), ) @@ -156,14 +203,19 @@ def _record_args(args: argparse.Namespace, metrics: AggregatedMetrics | None) -> ) return record - record.metric_source = ( - f"{args.metric_source}-workers{metrics.workers}-samples{metrics.samples}" - ) + metric_source = args.metric_source + if ( + not metrics.has_counter_metrics + and metric_source.endswith("-decode-log-counters") + ): + metric_source = metric_source[: -len("counters")] + "accept-length" + 
record.metric_source = f"{metric_source}-workers{metrics.workers}-samples{metrics.samples}" record.acceptance_length = f"{metrics.acceptance_length:.4f}" - record.accepted_tokens = str(metrics.accepted_tokens) - record.verify_steps = str(metrics.verify_steps) - record.draft_tokens = str(metrics.verify_steps) - record.proposed_draft_tokens = str(metrics.proposed_draft_tokens) + if metrics.has_counter_metrics: + record.accepted_tokens = str(metrics.accepted_tokens) + record.verify_steps = str(metrics.verify_steps) + record.draft_tokens = str(metrics.verify_steps) + record.proposed_draft_tokens = str(metrics.proposed_draft_tokens) return record diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py index 0975b9af97..b02cb5bd98 100644 --- a/utils/evals/test_speedbench_al.py +++ b/utils/evals/test_speedbench_al.py @@ -209,6 +209,31 @@ def write_log(name: str, rows: list[tuple[float, int, int]]) -> None: ] +def test_dynamo_log_parser_reads_sglang_accept_length_samples(tmp_path: Path) -> None: + (tmp_path / "node-a_decode_w0.out").write_text( + "\n".join( + [ + "Decode batch, #running-req: 1, accept len: 2.10, accept rate: 0.37,", + "Decode batch, #running-req: 1, accept len: 2.30, accept rate: 0.43,", + ] + ) + ) + (tmp_path / "node-b_decode_w1.out").write_text( + "Decode batch, #running-req: 1, accept len: 2.50, accept rate: 0.50," + ) + + metrics = aggregate_log_metrics(tmp_path, mtp=4) + + assert metrics is not None + assert metrics.workers == 2 + assert metrics.samples == 3 + assert round(metrics.acceptance_length, 4) == 2.3 + assert metrics.has_counter_metrics is False + assert metrics.accepted_tokens is None + assert metrics.verify_steps is None + assert metrics.proposed_draft_tokens is None + + def test_trtllm_log_parser_reads_generation_tokens_after_offset(tmp_path: Path) -> None: log_path = tmp_path / "server.log" prefix = "previous eval traffic\n" From 046f3042e791edaa56c3a27fbe7a7b29d43de44e Mon Sep 17 00:00:00 2001 From: 
jasonlizhengjian Date: Wed, 10 Jun 2026 20:57:43 -0700 Subject: [PATCH 20/27] Read GB200 bootstrap log from Slurm stderr path --- runners/launch_gb200-nv.sh | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 67329f1cc2..7ebb2ac7e3 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -414,8 +414,24 @@ LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" while ! ls "$LOG_FILE" &>/dev/null; do if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then echo "ERROR: Job $JOB_ID failed before creating log file" - scontrol show job "$JOB_ID" - BOOTSTRAP_LOG="outputs/sweep_${JOB_ID}.bootstrap.log" + JOB_INFO=$(scontrol show job "$JOB_ID" 2>&1 || true) + echo "$JOB_INFO" + BOOTSTRAP_LOG="" + BOOTSTRAP_CANDIDATES=("outputs/sweep_${JOB_ID}.bootstrap.log") + SCONTROL_STDERR=$(printf '%s\n' "$JOB_INFO" | awk '{ for (i = 1; i <= NF; i++) if ($i ~ /^StdErr=/) { sub(/^StdErr=/, "", $i); print $i; exit } }') + if [ -n "$SCONTROL_STDERR" ]; then + BOOTSTRAP_CANDIDATES+=("$SCONTROL_STDERR") + fi + for candidate in "${BOOTSTRAP_CANDIDATES[@]}"; do + [ -n "$candidate" ] || continue + for _ in 1 2 3; do + if [ -f "$candidate" ]; then + BOOTSTRAP_LOG="$candidate" + break 2 + fi + sleep 2 + done + done if [ -f "$BOOTSTRAP_LOG" ]; then echo "Bootstrap log from $BOOTSTRAP_LOG:" cat "$BOOTSTRAP_LOG" @@ -425,7 +441,7 @@ while ! ls "$LOG_FILE" &>/dev/null; do tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$GITHUB_WORKSPACE/LOGS" . || true fi else - echo "Bootstrap log not found at $BOOTSTRAP_LOG" + echo "Bootstrap log not found. 
Tried: ${BOOTSTRAP_CANDIDATES[*]}" fi exit 1 fi From 16e7aa572da9b1ae300eaaebda67f600908f68a5 Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Fri, 26 Jun 2026 08:09:19 -0700 Subject: [PATCH 21/27] update SGL metrics gathering method --- benchmarks/benchmark_lib.sh | 45 +++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index affc88aea7..31cdc013b6 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1177,6 +1177,9 @@ _speedbench_spec_counter_metric() { sglang:verify) echo "sglang:spec_verify_calls_total" ;; + sglang:completion) + echo "sglang:generation_tokens_total" + ;; *) return 1 ;; @@ -1439,6 +1442,9 @@ run_speedbench_al_eval() { if [[ "$client" != "openai" && "$client" != "native" ]] && command -v vllm >/dev/null 2>&1; then use_vllm_client=1 fi + if [[ "$metrics_framework" == "sglang" ]]; then + use_vllm_client=0 + fi local think_args=() if [[ "$mode" == "on" ]]; then @@ -1452,13 +1458,15 @@ run_speedbench_al_eval() { fi fi - local accepted_before="" proposed_before="" verify_before="" + local accepted_before="" proposed_before="" verify_before="" completion_before="" accepted_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true) proposed_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "proposed" 2>/dev/null || true) verify_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "verify" 2>/dev/null || true) + completion_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "completion" 2>/dev/null || true) accepted_before="${accepted_before:-0}" proposed_before="${proposed_before:-0}" verify_before="${verify_before:-0}" + completion_before="${completion_before:-0}" local trt_server_log_offset="0" if [[ "$metrics_framework" == "trtllm" ]]; then @@ -1529,11 +1537,12 @@ run_speedbench_al_eval() { return 0 fi - 
local accepted_after="" proposed_after="" verify_after="" - local al="" delta_acc="" delta_proposed="" delta_verify="" metric_source="" + local accepted_after="" proposed_after="" verify_after="" completion_after="" + local al="" delta_acc="" delta_proposed="" delta_verify="" delta_completion="" metric_source="" accepted_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true) proposed_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "proposed" 2>/dev/null || true) verify_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "verify" 2>/dev/null || true) + completion_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "completion" 2>/dev/null || true) if [[ -n "$accepted_after" ]]; then delta_acc=$(_speedbench_round_metric "$(_speedbench_metric_delta "$accepted_before" "$accepted_after")") @@ -1544,6 +1553,9 @@ run_speedbench_al_eval() { if [[ -n "$verify_after" ]]; then delta_verify=$(_speedbench_round_metric "$(_speedbench_metric_delta "$verify_before" "$verify_after")") fi + if [[ -n "$completion_after" ]]; then + delta_completion=$(_speedbench_round_metric "$(_speedbench_metric_delta "$completion_before" "$completion_after")") + fi if [[ "$metrics_framework" == "vllm" && -n "$delta_acc" && -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then al=$(awk -v accepted="$delta_acc" -v verify="$delta_verify" 'BEGIN { printf "%.4f", 1 + (accepted / verify) }') @@ -1586,17 +1598,32 @@ run_speedbench_al_eval() { fi fi elif [[ "$metrics_framework" == "sglang" ]]; then - al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true) - if [[ -n "$al" ]]; then - metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}" - fi - if [[ -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then - local draft_depth="" + local draft_depth="" + if [[ -n "$delta_completion" && "$delta_completion" -gt 0 && -n 
"$delta_verify" && "$delta_verify" -gt 0 ]]; then + al=$(awk -v completion="$delta_completion" -v verify="$delta_verify" 'BEGIN { printf "%.4f", completion / verify }') + delta_acc=$(_speedbench_round_metric "$(awk -v completion="$delta_completion" -v verify="$delta_verify" 'BEGIN { value = completion - verify; if (value < 0) value = 0; printf "%.10f\n", value }')") draft_depth=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "draft_tokens_per_step" 2>/dev/null || true) if [[ -n "$draft_depth" ]]; then delta_proposed=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')") + elif [[ -n "$mtp" ]]; then + delta_proposed=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v mtp="$mtp" 'BEGIN { value = verify * mtp; if (value < 0) value = 0; printf "%.10f\n", value }')") fi + metric_source="${metric_source_base}-generation-counter+verify-counter-endpoints${metrics_endpoint_count}" + fi + if [[ -z "$al" ]]; then + al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true) if [[ -n "$al" ]]; then + metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}" + fi + fi + if [[ -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then + if [[ -z "$draft_depth" ]]; then + draft_depth=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "draft_tokens_per_step" 2>/dev/null || true) + fi + if [[ -n "$draft_depth" ]]; then + delta_proposed="${delta_proposed:-$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")}" + fi + if [[ -n "$al" && "$metric_source" != *"generation-counter+verify-counter"* ]]; then delta_acc=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v al="$al" 'BEGIN { value = verify * (al - 1); if (value 
< 0) value = 0; printf "%.10f\n", value }')") metric_source="${metric_source:-${metric_source_base}-gauge-endpoints${metrics_endpoint_count}}+derived-token-counters" fi From a48fc820a08e677f528d308334047639c88412e8 Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Mon, 29 Jun 2026 11:42:40 -0700 Subject: [PATCH 22/27] fix: validate SpeedBench AL within golden tolerance --- benchmarks/benchmark_lib.sh | 6 ++- utils/collect_eval_results.py | 12 +++++- utils/evals/EVALS.md | 2 +- utils/evals/dynamo_speedbench_al_from_logs.py | 4 +- utils/evals/speedbench_al.py | 43 ++++++++++++++++--- utils/evals/test_speedbench_al.py | 42 ++++++++++++++---- utils/evals/validate_scores.py | 26 ++++++++--- 7 files changed, 111 insertions(+), 24 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 31cdc013b6..a9db11e62d 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1248,7 +1248,8 @@ _speedbench_write_eval_result() { --category "coding" --output-len "4096" --temperature "1.0" - --threshold-ratio "0.90" + --threshold-ratio "0.95" + --max-threshold-ratio "1.05" ) if [[ -n "$framework" ]]; then record_cmd+=(--framework "$framework") @@ -1287,7 +1288,8 @@ _speedbench_reference_available() { --model-prefix "${MODEL_PREFIX:-}" \ --thinking-mode "$mode" \ --num-speculative-tokens "$mtp" \ - --threshold-ratio "0.90" >/dev/null + --threshold-ratio "0.95" \ + --max-threshold-ratio "1.05" >/dev/null } _speedbench_prepare_dataset() { diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py index 45b464329d..cca611c6f9 100644 --- a/utils/collect_eval_results.py +++ b/utils/collect_eval_results.py @@ -167,7 +167,9 @@ def extract_speedbench_al_metrics(json_path: Path) -> List[Dict[str, Any]]: 'acceptance_length': data.get('acceptance_length'), 'reference_acceptance_length': data.get('reference_acceptance_length'), 'min_acceptance_length': data.get('min_acceptance_length'), + 
'max_acceptance_length': data.get('max_acceptance_length'), 'threshold_ratio': data.get('threshold_ratio'), + 'max_threshold_ratio': data.get('max_threshold_ratio'), 'thinking_mode': mode, 'num_speculative_tokens': mtp, 'speedbench_framework': data.get('framework'), @@ -275,7 +277,9 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]: row['score_se'] = None row['speedbench_reference_acceptance_length'] = m.get('reference_acceptance_length') row['speedbench_min_acceptance_length'] = m.get('min_acceptance_length') + row['speedbench_max_acceptance_length'] = m.get('max_acceptance_length') row['speedbench_threshold_ratio'] = m.get('threshold_ratio') + row['speedbench_max_threshold_ratio'] = m.get('max_threshold_ratio') row['speedbench_thinking_mode'] = m.get('thinking_mode') row['speedbench_num_speculative_tokens'] = m.get('num_speculative_tokens') row['speedbench_framework'] = m.get('speedbench_framework') @@ -306,14 +310,18 @@ def score_cell(r: Dict[str, Any]) -> str: if r.get('score_name') == 'acceptance_length': score = r.get('score') minimum = r.get('speedbench_min_acceptance_length') + maximum = r.get('speedbench_max_acceptance_length') passed = r.get('speedbench_passed') if score is None: return 'FAIL' try: status = 'PASS' if passed else 'FAIL' - if minimum is None: + if minimum is None or maximum is None: return f"{float(score):.2f} ({status})" - return f"{float(score):.2f} >= {float(minimum):.2f} ({status})" + return ( + f"{float(score):.2f} in " + f"[{float(minimum):.2f}, {float(maximum):.2f}] ({status})" + ) except Exception: return str(score) return f"{pct(r['score'])}{se(r['score_se'])}" diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index 6aeeb585d5..f42ccbdb9b 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -145,7 +145,7 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. 
TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations enable `print_iter_log` but do not expose `specDecodingStats`; for those, SpeedBench records the server-log byte offset before running SpeedBench and derives accepted/proposed/verify counters from the new `num_generation_tokens` iteration lines. If neither exact spec stats nor server logs are available, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints when available, summing counters and averaging gauge-only AL values. The NVIDIA srt-slurm Dynamo eval path also writes a SpeedBench AL artifact from decode-worker `SpecDecoding metrics` log counters when the router eval path does not expose decode-worker metrics endpoints to the benchmarker. ### Score validation -`utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails. +`utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the inclusive range from 95% to 105% of the golden AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails. 
### Adding a new eval task diff --git a/utils/evals/dynamo_speedbench_al_from_logs.py b/utils/evals/dynamo_speedbench_al_from_logs.py index f70992537c..e2135953e9 100644 --- a/utils/evals/dynamo_speedbench_al_from_logs.py +++ b/utils/evals/dynamo_speedbench_al_from_logs.py @@ -187,6 +187,7 @@ def _record_args(args: argparse.Namespace, metrics: AggregatedMetrics | None) -> output_len=args.output_len, temperature=args.temperature, threshold_ratio=args.threshold_ratio, + max_threshold_ratio=args.max_threshold_ratio, framework=args.framework, metric_source=args.metric_source, acceptance_length=None, @@ -246,7 +247,8 @@ def build_parser() -> argparse.ArgumentParser: parser.add_argument("--category", default="coding") parser.add_argument("--output-len", type=int, default=4096) parser.add_argument("--temperature", type=float, default=1.0) - parser.add_argument("--threshold-ratio", type=float, default=0.90) + parser.add_argument("--threshold-ratio", type=float, default=0.95) + parser.add_argument("--max-threshold-ratio", type=float, default=1.05) parser.add_argument("--framework", default="dynamo") parser.add_argument("--metric-source", default="dynamo-decode-log-counters") parser.set_defaults(func=cmd_from_logs) diff --git a/utils/evals/speedbench_al.py b/utils/evals/speedbench_al.py index a9167e6565..94a17c6132 100644 --- a/utils/evals/speedbench_al.py +++ b/utils/evals/speedbench_al.py @@ -15,6 +15,14 @@ "dsv4": "deepseek-v4-pro", } +DEFAULT_MIN_THRESHOLD_RATIO = 0.95 +DEFAULT_MAX_THRESHOLD_RATIO = 1.05 + + +def scaled_threshold(reference: float, ratio: float) -> float: + """Scale an AL reference without leaking binary float noise into boundaries.""" + return round(reference * ratio, 10) + def _parse_scalar(value: str) -> Any: value = value.strip() @@ -158,6 +166,7 @@ def _optional_int(value: str | None) -> int | None: def build_result(args: argparse.Namespace) -> dict[str, Any]: reference_al: float | None = None min_acceptance_length: float | None = None + 
max_acceptance_length: float | None = None model_key: str | None = None mode_key = normalize_mode(args.thinking_mode) error: str | None = args.error @@ -173,7 +182,12 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]: args.thinking_mode, args.num_speculative_tokens, ) - min_acceptance_length = reference_al * args.threshold_ratio + min_acceptance_length = scaled_threshold( + reference_al, args.threshold_ratio + ) + max_acceptance_length = scaled_threshold( + reference_al, args.max_threshold_ratio + ) except Exception as exc: # noqa: BLE001 - recorded for CI artifacts error = error or str(exc) else: @@ -190,7 +204,9 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]: error is None and acceptance_length is not None and min_acceptance_length is not None + and max_acceptance_length is not None and acceptance_length >= min_acceptance_length + and acceptance_length <= max_acceptance_length ) result = { @@ -213,7 +229,9 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]: "draft_tokens": draft_tokens, "reference_acceptance_length": reference_al, "threshold_ratio": args.threshold_ratio, + "max_threshold_ratio": args.max_threshold_ratio, "min_acceptance_length": min_acceptance_length, + "max_acceptance_length": max_acceptance_length, "passed": passed, } if error: @@ -235,7 +253,11 @@ def cmd_resolve(args: argparse.Namespace) -> int: "num_speculative_tokens": args.num_speculative_tokens, "reference_acceptance_length": reference_al, "threshold_ratio": args.threshold_ratio, - "min_acceptance_length": reference_al * args.threshold_ratio, + "max_threshold_ratio": args.max_threshold_ratio, + "min_acceptance_length": scaled_threshold(reference_al, args.threshold_ratio), + "max_acceptance_length": scaled_threshold( + reference_al, args.max_threshold_ratio + ), } print(json.dumps(payload, sort_keys=True)) return 0 @@ -248,9 +270,10 @@ def cmd_record(args: argparse.Namespace) -> int: status = "PASS" if result["passed"] else "FAIL" actual = 
result.get("acceptance_length") minimum = result.get("min_acceptance_length") + maximum = result.get("max_acceptance_length") print( f"{status}: SpeedBench AL {actual} " - f"(min {minimum}, mode {result['thinking_mode']}, " + f"(range [{minimum}, {maximum}], mode {result['thinking_mode']}, " f"mtp {result['num_speculative_tokens']})" ) if args.exit_status and not result["passed"]: @@ -268,7 +291,12 @@ def build_parser() -> argparse.ArgumentParser: resolve.add_argument("--model-prefix", default="") resolve.add_argument("--thinking-mode", required=True) resolve.add_argument("--num-speculative-tokens", type=int, required=True) - resolve.add_argument("--threshold-ratio", type=float, default=0.90) + resolve.add_argument( + "--threshold-ratio", type=float, default=DEFAULT_MIN_THRESHOLD_RATIO + ) + resolve.add_argument( + "--max-threshold-ratio", type=float, default=DEFAULT_MAX_THRESHOLD_RATIO + ) resolve.set_defaults(func=cmd_resolve) record = subparsers.add_parser("record", help="Write a compact AL eval result") @@ -281,7 +309,12 @@ def build_parser() -> argparse.ArgumentParser: record.add_argument("--category", default="coding") record.add_argument("--output-len", type=int, default=4096) record.add_argument("--temperature", type=float, default=1.0) - record.add_argument("--threshold-ratio", type=float, default=0.90) + record.add_argument( + "--threshold-ratio", type=float, default=DEFAULT_MIN_THRESHOLD_RATIO + ) + record.add_argument( + "--max-threshold-ratio", type=float, default=DEFAULT_MAX_THRESHOLD_RATIO + ) record.add_argument("--framework", default="") record.add_argument("--metric-source", default="") record.add_argument("--acceptance-length", default=None) diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py index b02cb5bd98..751de1cdd7 100644 --- a/utils/evals/test_speedbench_al.py +++ b/utils/evals/test_speedbench_al.py @@ -67,8 +67,9 @@ def test_build_result_records_threshold_pass(tmp_path: Path) -> None: category="coding", 
output_len=4096, temperature=1.0, - threshold_ratio=0.90, - acceptance_length="2.30", + threshold_ratio=0.95, + max_threshold_ratio=1.05, + acceptance_length="2.50", accepted_tokens="13", draft_tokens="10", verify_steps="10", @@ -81,7 +82,8 @@ def test_build_result_records_threshold_pass(tmp_path: Path) -> None: result = build_result(args) assert result["reference_acceptance_length"] == 2.50 - assert result["min_acceptance_length"] == 2.25 + assert result["min_acceptance_length"] == 2.375 + assert result["max_acceptance_length"] == 2.625 assert result["framework"] == "vllm" assert result["metric_source"] == "vllm-prometheus-counters-endpoints1" assert result["verify_steps"] == 10 @@ -97,7 +99,8 @@ def test_validate_speedbench_al_fails_below_minimum() -> None: "thinking_mode": "thinking_on", "num_speculative_tokens": 2, "acceptance_length": 2.0, - "min_acceptance_length": 2.25, + "min_acceptance_length": 2.375, + "max_acceptance_length": 2.625, "passed": False, }, "results_speedbench_al.json", @@ -107,6 +110,25 @@ def test_validate_speedbench_al_fails_below_minimum() -> None: assert ok is False +def test_validate_speedbench_al_fails_above_maximum() -> None: + ok, checked = validate_speedbench_al( + { + "speedbench_al_eval_version": 1, + "task": "speedbench_al", + "thinking_mode": "thinking_on", + "num_speculative_tokens": 2, + "acceptance_length": 2.7, + "min_acceptance_length": 2.375, + "max_acceptance_length": 2.625, + "passed": True, + }, + "results_speedbench_al.json", + ) + + assert checked == 1 + assert ok is False + + def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None: result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json" result_path.write_text( @@ -117,15 +139,17 @@ def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None: "model": "deepseek-ai/DeepSeek-V4-Pro", "thinking_mode": "thinking_on", "num_speculative_tokens": 2, - "acceptance_length": 2.3, + "acceptance_length": 2.5, "framework": 
"sglang", "metric_source": "sglang-prometheus-gauge-endpoints1+derived-token-counters", "accepted_tokens": 13, "verify_steps": 10, "proposed_draft_tokens": 20, "reference_acceptance_length": 2.5, - "min_acceptance_length": 2.25, - "threshold_ratio": 0.9, + "min_acceptance_length": 2.375, + "max_acceptance_length": 2.625, + "threshold_ratio": 0.95, + "max_threshold_ratio": 1.05, "passed": True, } ) @@ -149,7 +173,8 @@ def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None: assert row["speedbench_accepted_tokens"] == 13 assert row["speedbench_verify_steps"] == 10 assert row["speedbench_proposed_draft_tokens"] == 20 - assert score_cell(row) == "2.30 >= 2.25 (PASS)" + assert row["speedbench_max_acceptance_length"] == 2.625 + assert score_cell(row) == "2.50 in [2.38, 2.62] (PASS)" def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> None: @@ -163,6 +188,7 @@ def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> Non "num_speculative_tokens": 2, "acceptance_length": 2.3, "min_acceptance_length": 2.25, + "max_acceptance_length": 2.75, "passed": True, } ) diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py index c85becd06b..4486f07839 100644 --- a/utils/evals/validate_scores.py +++ b/utils/evals/validate_scores.py @@ -30,6 +30,7 @@ def validate_speedbench_al(data: dict, source: str) -> tuple[bool, int]: actual = data.get("acceptance_length") minimum = data.get("min_acceptance_length") + maximum = data.get("max_acceptance_length") passed = data.get("passed") label = ( f"{data.get('task', 'speedbench_al')} " @@ -37,17 +38,32 @@ def validate_speedbench_al(data: dict, source: str) -> tuple[bool, int]: f"mtp{data.get('num_speculative_tokens', 'unknown')}" ) - if passed is True: - print(f"PASS: {label} AL = {float(actual):.4f} (>= {float(minimum):.4f})") + values_are_numeric = all( + isinstance(value, (int, float)) for value in (actual, minimum, maximum) + ) + within_range = 
values_are_numeric and minimum <= actual <= maximum + + if passed is True and within_range: + print( + f"PASS: {label} AL = {float(actual):.4f} " + f"(range [{float(minimum):.4f}, {float(maximum):.4f}])" + ) return True, 1 - if isinstance(actual, (int, float)) and isinstance(minimum, (int, float)): + if values_are_numeric: + if actual < minimum: + comparison = "below" + elif actual > maximum: + comparison = "above" + else: + comparison = "marked failed" print( - f"FAIL: {label} AL = {actual:.4f} (< {minimum:.4f})", + f"FAIL: {label} AL = {actual:.4f} ({comparison}; " + f"expected [{minimum:.4f}, {maximum:.4f}])", file=sys.stderr, ) else: - error = data.get("error", "missing acceptance length or threshold") + error = data.get("error", "missing acceptance length or validation bounds") print(f"FAIL: {label} in {source}: {error}", file=sys.stderr) return False, 1 From ba7fa14b5f80258d219b6d02561958671843294e Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Mon, 29 Jun 2026 12:20:58 -0700 Subject: [PATCH 23/27] fix: use canonical golden AL distribution --- .github/workflows/speedbench-al.yml | 25 ++++++++++------ benchmarks/benchmark_lib.sh | 23 +++++++++++++-- .../speedbench/dsr1_fp4_b300_vllm.sh | 2 +- .../speedbench/dsv4_fp4_b300_vllm.sh | 4 +-- .../speedbench/glm5_fp4_b300_vllm.sh | 2 +- .../speedbench/kimik2.5_fp4_b300_vllm.sh | 2 +- .../speedbench/minimaxm3_fp4_b300_vllm.sh | 2 +- .../speedbench/qwen3.5_fp4_b300_vllm.sh | 2 +- benchmarks/speedbench-reference-al.yaml | 29 ------------------- utils/evals/test_speedbench_al.py | 15 ++++++++++ .../write_dynamo_speedbench_al_from_logs.sh | 15 +++++++++- 11 files changed, 73 insertions(+), 48 deletions(-) delete mode 100644 benchmarks/speedbench-reference-al.yaml diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml index 84a92e57a4..781228dee2 100644 --- a/.github/workflows/speedbench-al.yml +++ b/.github/workflows/speedbench-al.yml @@ -3,8 +3,8 @@ 
name: SpeedBench AL Collection # Push-button (workflow_dispatch) collection of a SPEED-Bench acceptance-length # (AL) matrix: thinking_on/off x MTP levels, for the given model (defaults to # DeepSeek-V4-Pro). Produces the golden reference consumed by the -# synthetic-acceptance framework and (optionally) opens a PR updating -# benchmarks/speedbench-reference-al.yaml. +# synthetic-acceptance framework and can optionally open a PR updating the +# model's YAML under golden_al_distribution/. on: workflow_dispatch: @@ -60,7 +60,7 @@ on: type: string default: '480' open-pr: - description: "Open a PR updating benchmarks/speedbench-reference-al.yaml (default off: artifact-only, paste values in manually)" + description: "Open a PR updating the model's golden_al_distribution YAML (default off: artifact-only)" required: false type: boolean default: false @@ -179,16 +179,23 @@ jobs: GH_TOKEN: ${{ secrets.REPO_PAT }} run: | set -euo pipefail - # NOTE: the reference yaml is keyed by model at the top level. This - # overwrites it with the current model's matrix; when more than one - # model is collected, replace this cp with a per-model-key YAML merge. 
- cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml + case "${{ inputs.model-prefix }}" in + dsv4) GOLDEN_AL_PATH="golden_al_distribution/dsv4_mtp.yaml" ;; + qwen3.5) GOLDEN_AL_PATH="golden_al_distribution/qwen3.5_mtp.yaml" ;; + kimik2.5) GOLDEN_AL_PATH="golden_al_distribution/kimik2.5_eagle3.yaml" ;; + minimaxm3) GOLDEN_AL_PATH="golden_al_distribution/minimaxm3_eagle3.yaml" ;; + *) + echo "No committed golden AL path for model prefix: ${{ inputs.model-prefix }}" >&2 + exit 1 + ;; + esac + cp speedbench-reference-al.yaml "$GOLDEN_AL_PATH" BRANCH="speedbench-al/${{ inputs.model-prefix }}-auto-${{ github.run_id }}" git config user.name "github-actions" git config user.email "github-actions@github.com" git checkout -b "$BRANCH" - git add benchmarks/speedbench-reference-al.yaml + git add "$GOLDEN_AL_PATH" if git diff --cached --quiet; then echo "No change in reference yaml; skipping PR." exit 0 @@ -222,4 +229,4 @@ jobs: - name: Resource cleanup (post-run) if: always() - run: *resource-cleanup \ No newline at end of file + run: *resource-cleanup diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 52eea300ed..340e3a9341 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1223,6 +1223,22 @@ _speedbench_spec_gauge_avg() { _speedbench_metric_avg "$port" "$metric" } +_speedbench_reference_yaml() { + if [[ -n "${SPEEDBENCH_REFERENCE_YAML:-}" ]]; then + echo "$SPEEDBENCH_REFERENCE_YAML" + return 0 + fi + + case "${MODEL_PREFIX:-}" in + dsv4) + echo "golden_al_distribution/dsv4_mtp.yaml" + ;; + *) + return 1 + ;; + esac +} + _speedbench_write_eval_result() { local output="$1" local mode="$2" @@ -1235,12 +1251,14 @@ _speedbench_write_eval_result() { local metric_source="${9:-}" local error="${10:-}" local speedbench_model="${MODEL_NAME:-${MODEL:-}}" + local reference + reference=$(_speedbench_reference_yaml 2>/dev/null || true) local record_cmd=( python3 "$(pwd)/utils/evals/speedbench_al.py" record 
--output "$output" - --reference-yaml "benchmarks/speedbench-reference-al.yaml" + --reference-yaml "$reference" --model "$speedbench_model" --model-prefix "${MODEL_PREFIX:-}" --thinking-mode "$mode" @@ -1279,8 +1297,9 @@ _speedbench_write_eval_result() { _speedbench_reference_available() { local mode="$1" local mtp="$2" - local reference="benchmarks/speedbench-reference-al.yaml" + local reference local speedbench_model="${MODEL_NAME:-${MODEL:-}}" + reference=$(_speedbench_reference_yaml) || return 1 [[ -f "$reference" ]] || return 1 python3 "$(pwd)/utils/evals/speedbench_al.py" resolve \ --reference-yaml "$reference" \ diff --git a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh index d0357c6b43..d032164e67 100755 --- a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh @@ -5,7 +5,7 @@ # Produces the golden acceptance-length (AL) reference matrix consumed by the # synthetic-acceptance framework: for each MTP level (num_speculative_tokens), # measure the REAL AL on a single SPEED-Bench category (default: coding) and emit -# a YAML matrix identical in shape to benchmarks/speedbench-reference-al.yaml. +# a YAML matrix identical in shape to the files under golden_al_distribution/. # This measures real MTP acceptance; the synthetic value is injected downstream # by the throughput recipe, not here. 
# diff --git a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh index b8550a3502..97421eaa5a 100755 --- a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh @@ -6,7 +6,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP # level (num_speculative_tokens), measure the AL on a single SPEED-Bench # category (default: coding) and emit a YAML matrix identical in shape to -# benchmarks/speedbench-reference-al.yaml. +# golden_al_distribution/dsv4_mtp.yaml. # # This is the "AL distribution collection" script wired into the # speedbench-al.yml GitHub Action (workflow_dispatch / push-button). @@ -46,7 +46,7 @@ SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}" CONCURRENCY="${CONCURRENCY:-1}" TEMPERATURE="${TEMPERATURE:-1.0}" # thinking-on chat_template_kwargs. MUST match the production/golden config: -# the reference matrix (benchmarks/speedbench-reference-al.yaml) was measured +# the reference matrix (golden_al_distribution/dsv4_mtp.yaml) was measured # with reasoning_effort=high. DEFAULT_CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}' CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-$DEFAULT_CHAT_TEMPLATE_KWARGS_ON}" diff --git a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh index 6265500b94..09577d802f 100755 --- a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh @@ -6,7 +6,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP # level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench # category (default: coding) and emit a YAML matrix identical in shape to -# benchmarks/speedbench-reference-al.yaml. This measures real MTP acceptance; +# a model-specific YAML under golden_al_distribution/. 
This measures real MTP acceptance; # the synthetic value is injected downstream by the throughput recipe, not here. # # Filename *_fp4_* matches both the speedbench-al.yml path convention diff --git a/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh index 890c059f9d..137e4415c1 100755 --- a/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh @@ -7,7 +7,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each # EAGLE3 speculative-token count, measure the REAL AL on a single SPEED-Bench # category (default: coding) and emit a YAML matrix identical in shape to -# benchmarks/speedbench-reference-al.yaml. +# golden_al_distribution/kimik2.5_eagle3.yaml. # # Kimi-K2.5 uses the lightseekorg/kimi-k2.5-eagle3-mla draft head (MLA # variant, recommended by official docs). The draft model is downloaded diff --git a/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh index dac39fb538..71dfed4656 100755 --- a/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh @@ -7,7 +7,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each # EAGLE3 level (num_speculative_tokens), measure the REAL AL on a single # SPEED-Bench category (default: coding) and emit a YAML matrix identical in -# shape to benchmarks/speedbench-reference-al.yaml. This measures real EAGLE3 +# shape to the files under golden_al_distribution/. This measures real EAGLE3 # acceptance; the synthetic value is injected downstream by the throughput # recipe, not here. 
# diff --git a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh index bf2bda7c8d..64dd01178c 100755 --- a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh @@ -6,7 +6,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP # level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench # category (default: coding) and emit a YAML matrix identical in shape to -# benchmarks/speedbench-reference-al.yaml. This measures real MTP acceptance; +# golden_al_distribution/qwen3.5_mtp.yaml. This measures real MTP acceptance; # the synthetic value is injected downstream by the throughput recipe, not here. # # Adapted from speedbench/dsv4_fp4_b300_vllm.sh. Differences vs DSV4 (deepseek_v4 diff --git a/benchmarks/speedbench-reference-al.yaml b/benchmarks/speedbench-reference-al.yaml deleted file mode 100644 index b3dbf441d1..0000000000 --- a/benchmarks/speedbench-reference-al.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Acceptance Length (AL) reference values measured with SPEED-Bench. -# dataset: coding | temperature: 1.0 | output_len: 4096 -# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens. -# -# Two modes are reported: -# thinking_on - reasoning enabled; this is the PRODUCTION configuration and -# the golden reference used for synthetic-acceptance modeling. -# thinking_off - reasoning disabled; provided for comparison only. 
-# -# key = num_speculative_tokens (MTP level); value = golden AL -deepseek-v4-pro: - thinking_on: - 1: 1.79 - 2: 2.27 - 3: 2.47 - 4: 2.54 - 5: 2.52 - 6: 2.54 - 7: 2.54 - 8: 2.56 - thinking_off: - 1: 1.92 - 2: 2.60 - 3: 2.97 - 4: 3.04 - 5: 3.13 - 6: 3.08 - 7: 3.13 - 8: 3.12 diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py index 751de1cdd7..b49d424a8e 100644 --- a/utils/evals/test_speedbench_al.py +++ b/utils/evals/test_speedbench_al.py @@ -49,6 +49,21 @@ def test_lookup_reference_uses_model_prefix_alias(tmp_path: Path) -> None: assert value == 2.75 +def test_lookup_reference_uses_committed_dsv4_golden() -> None: + ref = Path(__file__).resolve().parents[2] / "golden_al_distribution/dsv4_mtp.yaml" + + _, mode_key, value = lookup_reference( + load_reference(ref), + model="deepseek-ai/DeepSeek-V4-Pro", + model_prefix="dsv4", + thinking_mode="on", + num_speculative_tokens=3, + ) + + assert mode_key == "thinking_on" + assert value == 2.49 + + def test_build_result_records_threshold_pass(tmp_path: Path) -> None: ref = tmp_path / "speedbench-reference-al.yaml" ref.write_text( diff --git a/utils/evals/write_dynamo_speedbench_al_from_logs.sh b/utils/evals/write_dynamo_speedbench_al_from_logs.sh index b33d0bdcd3..d47d428d48 100644 --- a/utils/evals/write_dynamo_speedbench_al_from_logs.sh +++ b/utils/evals/write_dynamo_speedbench_al_from_logs.sh @@ -38,6 +38,19 @@ if [[ -z "$model_name" ]]; then model_name="${SERVED_MODEL_NAME:-unknown}" fi +reference_yaml="${SPEEDBENCH_REFERENCE_YAML:-}" +if [[ -z "$reference_yaml" ]]; then + case "${MODEL_PREFIX:-}" in + dsv4) + reference_yaml="${workspace}/golden_al_distribution/dsv4_mtp.yaml" + ;; + *) + echo "Dynamo SpeedBench AL: no golden AL file for MODEL_PREFIX=${MODEL_PREFIX:-unknown}" + exit 0 + ;; + esac +fi + output="${workspace}/results_speedbench_al_${mode}_mtp${mtp}.json" metric_source="dynamo-decode-log-counters" if [[ -n "${FRAMEWORK:-}" ]]; then @@ -48,7 +61,7 @@ echo "Dynamo SpeedBench AL: 
parsing decode logs from $logs_dir" python3 "${workspace}/utils/evals/dynamo_speedbench_al_from_logs.py" \ --logs-dir "$logs_dir" \ --output "$output" \ - --reference-yaml "${workspace}/benchmarks/speedbench-reference-al.yaml" \ + --reference-yaml "$reference_yaml" \ --model "$model_name" \ --model-prefix "${MODEL_PREFIX:-}" \ --thinking-mode "$mode" \ From 3935383910b4f4bdc9920f6cd4b62cc7ae2c4f0a Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Mon, 29 Jun 2026 12:22:17 -0700 Subject: [PATCH 24/27] test: remove golden AL regression coverage --- utils/evals/test_speedbench_al.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py index b49d424a8e..751de1cdd7 100644 --- a/utils/evals/test_speedbench_al.py +++ b/utils/evals/test_speedbench_al.py @@ -49,21 +49,6 @@ def test_lookup_reference_uses_model_prefix_alias(tmp_path: Path) -> None: assert value == 2.75 -def test_lookup_reference_uses_committed_dsv4_golden() -> None: - ref = Path(__file__).resolve().parents[2] / "golden_al_distribution/dsv4_mtp.yaml" - - _, mode_key, value = lookup_reference( - load_reference(ref), - model="deepseek-ai/DeepSeek-V4-Pro", - model_prefix="dsv4", - thinking_mode="on", - num_speculative_tokens=3, - ) - - assert mode_key == "thinking_on" - assert value == 2.49 - - def test_build_result_records_threshold_pass(tmp_path: Path) -> None: ref = tmp_path / "speedbench-reference-al.yaml" ref.write_text( From cf98627310fb3c654a2b8581b9db3d554a3d0fa0 Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Mon, 29 Jun 2026 12:27:56 -0700 Subject: [PATCH 25/27] refactor: limit SpeedBench MTP exports to vLLM --- benchmarks/multi_node/amd_utils/server_sglang.sh | 3 --- benchmarks/multi_node/amd_utils/server_vllm.sh | 3 --- benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh | 1 - .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh | 2 -- 
benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh | 1 - .../single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh | 2 -- .../single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh | 1 - 7 files changed, 13 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 7c116a2c35..68a68b650b 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -575,9 +575,6 @@ if [ "$NODE_RANK" -eq 0 ]; then speedbench_decode_metric_urls="http://${NODE0_ADDR}:8000/metrics" fi export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}" - if [[ "${SPEC_DECODING:-none}" == "mtp" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then - export SPEEDBENCH_NUM_SPEC_TOKENS="${SPEEDBENCH_NUM_SPEC_TOKENS:-$DECODE_MTP_SIZE}" - fi if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index b6e30a987d..3d096acaa0 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -355,9 +355,6 @@ if [ "$NODE_RANK" -eq 0 ]; then speedbench_decode_metric_urls="http://${NODE0_ADDR}:${SERVER_PORT}/metrics" fi export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}" - if [[ "${SPEC_DECODING:-none}" == "mtp" ]]; then - export SPEEDBENCH_NUM_SPEC_TOKENS="${SPEEDBENCH_NUM_SPEC_TOKENS:-${DECODE_MTP_SIZE:-${NUM_SPEC_TOKENS:-2}}}" - fi if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh 
b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh index 65ec60aa0c..0c7f323637 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh @@ -75,7 +75,6 @@ if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then else MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}" fi -export SPEEDBENCH_NUM_SPEC_TOKENS="$MTP" MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE" if [[ "$DP_ATTENTION" == "true" ]]; then diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh index b3386a60d8..388194ddd3 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh @@ -85,7 +85,6 @@ if [ "${DP_ATTENTION}" = "true" ]; then --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 ) - export SPEEDBENCH_NUM_SPEC_TOKENS=1 PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention @@ -106,7 +105,6 @@ else --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 ) - export SPEEDBENCH_NUM_SPEC_TOKENS=3 PARALLEL_ARGS=( --moe-runner-backend flashinfer_mxfp4 --disable-flashinfer-autotune diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh index 9f8969d5e8..bb0362c256 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh @@ -83,7 +83,6 @@ if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then else MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}" fi -export SPEEDBENCH_NUM_SPEC_TOKENS="$MTP" MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 )) # Cap CUDA-graph capture at batch 1024. 
TRTLLM_MLA_EXTRA_OVERLAP hands MLA # prologue tensors across streams without record_stream(), so graph warmup at diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh index 13c639c98f..3addce4526 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh @@ -166,7 +166,6 @@ SPEC_FLAGS=( --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 ) -export SPEEDBENCH_NUM_SPEC_TOKENS=3 if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS+=( --dp "$TP" @@ -179,7 +178,6 @@ if [ "${DP_ATTENTION}" = "true" ]; then --speculative-eagle-topk 1 --speculative-num-draft-tokens 3 ) - export SPEEDBENCH_NUM_SPEC_TOKENS=2 fi if [ "${EP_SIZE:-1}" -gt 1 ]; then PARALLEL_ARGS+=(--ep-size "$EP_SIZE") diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh index e8d4ffde79..788eff5b8b 100644 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh @@ -20,7 +20,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi SERVER_LOG="$PWD/server.log" -export SPEEDBENCH_NUM_SPEC_TOKENS=3 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" From 7ae7c56da8263133e7f71a7776aca6eb6dd520da Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Mon, 29 Jun 2026 12:31:06 -0700 Subject: [PATCH 26/27] fix: restore GB200 Dynamo SpeedBench log collection --- runners/launch_gb200-nv.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 91331791d4..2c0f872832 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -551,6 +551,9 @@ fi # Collect eval results if eval was requested if [[ "${RUN_EVAL:-false}" == "true" || 
"${EVAL_ONLY:-false}" == "true" ]]; then EVAL_DIR="$LOGS_DIR/eval_results" + if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then + bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE" + fi if [ -d "$EVAL_DIR" ]; then echo "Extracting eval results from $EVAL_DIR" shopt -s nullglob From f3179ad26cf220b7cc5d0f0186c486292d1e90c3 Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Mon, 29 Jun 2026 12:48:32 -0700 Subject: [PATCH 27/27] refactor: trim SpeedBench AL integration --- .github/workflows/speedbench-al.yml | 25 +- benchmarks/benchmark_lib.sh | 313 ++------------- .../multi_node/amd_utils/server_sglang.sh | 10 - .../multi_node/amd_utils/server_vllm.sh | 9 - .../speedbench/dsr1_fp4_b300_vllm.sh | 2 +- .../speedbench/dsv4_fp4_b300_vllm.sh | 4 +- .../speedbench/glm5_fp4_b300_vllm.sh | 2 +- .../speedbench/kimik2.5_fp4_b300_vllm.sh | 2 +- .../speedbench/minimaxm3_fp4_b300_vllm.sh | 2 +- .../speedbench/qwen3.5_fp4_b300_vllm.sh | 2 +- runners/launch_gb200-nv.sh | 2 +- utils/evals/EVALS.md | 12 +- utils/evals/test_speedbench_al.py | 358 ------------------ 13 files changed, 44 insertions(+), 699 deletions(-) delete mode 100644 utils/evals/test_speedbench_al.py diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml index 781228dee2..84a92e57a4 100644 --- a/.github/workflows/speedbench-al.yml +++ b/.github/workflows/speedbench-al.yml @@ -3,8 +3,8 @@ name: SpeedBench AL Collection # Push-button (workflow_dispatch) collection of a SPEED-Bench acceptance-length # (AL) matrix: thinking_on/off x MTP levels, for the given model (defaults to # DeepSeek-V4-Pro). Produces the golden reference consumed by the -# synthetic-acceptance framework and can optionally open a PR updating the -# model's YAML under golden_al_distribution/. 
+# synthetic-acceptance framework and (optionally) opens a PR updating +# benchmarks/speedbench-reference-al.yaml. on: workflow_dispatch: @@ -60,7 +60,7 @@ on: type: string default: '480' open-pr: - description: "Open a PR updating the model's golden_al_distribution YAML (default off: artifact-only)" + description: "Open a PR updating benchmarks/speedbench-reference-al.yaml (default off: artifact-only, paste values in manually)" required: false type: boolean default: false @@ -179,23 +179,16 @@ jobs: GH_TOKEN: ${{ secrets.REPO_PAT }} run: | set -euo pipefail - case "${{ inputs.model-prefix }}" in - dsv4) GOLDEN_AL_PATH="golden_al_distribution/dsv4_mtp.yaml" ;; - qwen3.5) GOLDEN_AL_PATH="golden_al_distribution/qwen3.5_mtp.yaml" ;; - kimik2.5) GOLDEN_AL_PATH="golden_al_distribution/kimik2.5_eagle3.yaml" ;; - minimaxm3) GOLDEN_AL_PATH="golden_al_distribution/minimaxm3_eagle3.yaml" ;; - *) - echo "No committed golden AL path for model prefix: ${{ inputs.model-prefix }}" >&2 - exit 1 - ;; - esac - cp speedbench-reference-al.yaml "$GOLDEN_AL_PATH" + # NOTE: the reference yaml is keyed by model at the top level. This + # overwrites it with the current model's matrix; when more than one + # model is collected, replace this cp with a per-model-key YAML merge. + cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml BRANCH="speedbench-al/${{ inputs.model-prefix }}-auto-${{ github.run_id }}" git config user.name "github-actions" git config user.email "github-actions@github.com" git checkout -b "$BRANCH" - git add "$GOLDEN_AL_PATH" + git add benchmarks/speedbench-reference-al.yaml if git diff --cached --quiet; then echo "No change in reference yaml; skipping PR." 
exit 0 @@ -229,4 +222,4 @@ jobs: - name: Resource cleanup (post-run) if: always() - run: *resource-cleanup + run: *resource-cleanup \ No newline at end of file diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 340e3a9341..28f5dd50b7 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -742,28 +742,6 @@ _prometheus_metric_sum_url() { ' <<< "$values" } -_prometheus_metric_avg_url() { - local url="$1" - local name="$2" - local values - values=$(_prometheus_metric_values_url "$url" "$name") || return 1 - awk ' - { sum += $1; count += 1 } - END { - if (count == 0) { - exit 1 - } - printf "%.10f\n", sum / count - } - ' <<< "$values" -} - -_prometheus_metric_sum() { - local port="$1" - local name="$2" - _prometheus_metric_sum_url "http://0.0.0.0:${port}/metrics" "$name" -} - _speedbench_normalize_metrics_url() { local endpoint="$1" endpoint="${endpoint%,}" @@ -986,106 +964,6 @@ print( PY } -_speedbench_trtllm_avg_decoded_al() { - local port="$1" - local value - value=$(_speedbench_metric_avg "$port" "trtllm_avg_decoded_tokens_per_iter" 2>/dev/null || true) - [[ -n "$value" ]] || return 1 - awk -v value="$value" ' - BEGIN { - if (value < 1.0) { - exit 1 - } - printf "%.4f\n", value - } - ' -} - -_speedbench_trtllm_json_avg_decoded_al() { - local port="$1" - local urls=() - local url - - while IFS= read -r url; do - [[ -n "$url" ]] && urls+=("$url") - done < <(_speedbench_trtllm_json_metrics_urls "$port") - - [[ "${#urls[@]}" -gt 0 ]] || return 1 - - python3 - "${urls[@]}" <<'PY' -import json -import os -import sys -import urllib.request - - -def number(value, default=0.0): - try: - if value is None: - return default - return float(value) - except (TypeError, ValueError): - return default - - -def stats_from_payload(payload): - if isinstance(payload, list): - return payload - if isinstance(payload, dict): - return [payload] - return [] - - -timeout = float(os.environ.get("SPEEDBENCH_METRICS_CURL_TIMEOUT", "10")) 
-weighted_total = 0.0 -total_requests = 0.0 -unweighted_total = 0.0 -unweighted_count = 0 -used_endpoints = 0 - -for url in sys.argv[1:]: - try: - with urllib.request.urlopen(url, timeout=timeout) as response: - payload = json.load(response) - except Exception as exc: # noqa: BLE001 - diagnostics for CI logs - print(f"SpeedBench AL eval: TRT-LLM JSON metrics fetch failed for {url}: {exc}", file=sys.stderr) - continue - - endpoint_had_avg = False - for stat in stats_from_payload(payload): - if not isinstance(stat, dict): - continue - ifb = stat.get("inflightBatchingStats") - if not isinstance(ifb, dict): - continue - - avg_decoded = number(ifb.get("avgNumDecodedTokensPerIter"), default=-1.0) - if avg_decoded < 1.0: - continue - - gen_requests = number(ifb.get("numGenRequests")) - endpoint_had_avg = True - if gen_requests > 0: - weighted_total += avg_decoded * gen_requests - total_requests += gen_requests - else: - unweighted_total += avg_decoded - unweighted_count += 1 - - if endpoint_had_avg: - used_endpoints += 1 - -if total_requests > 0: - acceptance_length = weighted_total / total_requests -elif unweighted_count > 0: - acceptance_length = unweighted_total / unweighted_count -else: - sys.exit(1) - -print(f"{acceptance_length:.4f}\t{used_endpoints}") -PY -} - _speedbench_trtllm_server_log_metrics() { local mtp="$1" local start_offset="${2:-0}" @@ -1193,9 +1071,6 @@ _speedbench_spec_gauge_metric() { trtllm:acceptance_length) echo "trtllm_spec_decode_acceptance_length" ;; - sglang:acceptance_length) - echo "sglang:spec_accept_length" - ;; sglang:draft_tokens_per_step) echo "sglang:spec_num_draft_tokens" ;; @@ -1323,73 +1198,6 @@ _speedbench_prepare_dataset() { [[ -f "$speedbench_dir/qualitative.jsonl" ]] } -_speedbench_apply_chat_template_kwargs_shim() { - echo "SpeedBench AL eval: patching vLLM benchmark --chat-template-kwargs support if needed" - python3 - <<'PYEOF' -import vllm.benchmarks.serve as S -import vllm.benchmarks.datasets.datasets as D - - -def 
patch(mod, edits, marker): - f = mod.__file__ - with open(f) as handle: - src = handle.read() - if marker in src: - print("already patched:", f) - return - for old, new in edits: - n = src.count(old) - assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..." - src = src.replace(old, new, 1) - with open(f, "w") as handle: - handle.write(src) - print("patched OK ->", f) - - -serve_old = ''' parser.add_argument( - "--extra-body",''' -serve_new = ''' parser.add_argument( - "--chat-template-kwargs", - type=json.loads, - default=None, - help="JSON dict forwarded to apply_chat_template during " - "client-side prompt rendering, e.g. to enable reasoning mode.", - ) - parser.add_argument( - "--extra-body",''' -patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"') - -disp_old = ''' output_len=args.speed_bench_output_len, - enable_multimodal_chat=args.enable_multimodal_chat,''' -disp_new = ''' output_len=args.speed_bench_output_len, - chat_template_kwargs=args.chat_template_kwargs, - enable_multimodal_chat=args.enable_multimodal_chat,''' - -samp_old = ''' # apply template - if not skip_chat_template: - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids)''' -samp_new = ''' # apply template - if not skip_chat_template: - _ctk = kwargs.get("chat_template_kwargs") or {} - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - **_ctk, - ) - - prompt_len = len(tokenizer(prompt).input_ids)''' -patch(D, [(disp_old, disp_new), (samp_old, samp_new)], - marker="chat_template_kwargs=args.chat_template_kwargs") -PYEOF -} - run_speedbench_al_eval() { local port="${PORT:-8888}" while [[ $# -gt 0 ]]; do @@ -1458,26 +1266,6 @@ run_speedbench_al_eval() { fi local thinking_kwargs='{"thinking": true, "reasoning_effort": "high"}' - local 
client="${SPEEDBENCH_CLIENT:-auto}" - local use_vllm_client=0 - if [[ "$client" != "openai" && "$client" != "native" ]] && command -v vllm >/dev/null 2>&1; then - use_vllm_client=1 - fi - if [[ "$metrics_framework" == "sglang" ]]; then - use_vllm_client=0 - fi - - local think_args=() - if [[ "$mode" == "on" ]]; then - if [[ "$use_vllm_client" -eq 1 ]]; then - if ! _speedbench_apply_chat_template_kwargs_shim; then - echo "SpeedBench AL eval: --chat-template-kwargs shim failed" >&2 - _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "--chat-template-kwargs shim failed" - return 0 - fi - think_args=(--chat-template-kwargs "$thinking_kwargs") - fi - fi local accepted_before="" proposed_before="" verify_before="" completion_before="" accepted_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true) @@ -1502,56 +1290,31 @@ run_speedbench_al_eval() { local bench_rc=0 local speedbench_model="${MODEL_NAME:-${MODEL:-}}" echo "SpeedBench AL eval: running mode=${mode} mtp=${mtp}" - if [[ "$use_vllm_client" -eq 1 ]]; then - local raw_result_dir - raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)" - local bench_cmd=( - vllm bench serve - --model "$speedbench_model" - --port "$port" - --dataset-name speed_bench - --dataset-path "$speedbench_dir" - --speed-bench-category coding - --speed-bench-output-len 4096 - --num-prompts -1 - --max-concurrency 1 - --save-result - --result-dir "$raw_result_dir" - --result-filename "speedbench_al_${mode}_mtp${mtp}" - --trust-remote-code - --tokenizer-mode deepseek_v4 - --temperature 1.0 - "${think_args[@]}" - ) - "${bench_cmd[@]}" || bench_rc=$? 
- rm -rf "$raw_result_dir" || true - else - export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" - local native_cmd=( - python3 "$(pwd)/utils/evals/speedbench_client.py" - --model "$speedbench_model" - --base-url "http://0.0.0.0:${port}" - --dataset-path "$speedbench_dir" - --category coding - --output-len 4096 - --temperature 1.0 - --thinking-mode "$mode" - --timeout "${SPEEDBENCH_CLIENT_TIMEOUT:-1800}" - --retries "${SPEEDBENCH_CLIENT_RETRIES:-2}" - ) - if [[ -n "${SPEEDBENCH_CLIENT_ENDPOINT:-}" ]]; then - native_cmd+=(--endpoint "$SPEEDBENCH_CLIENT_ENDPOINT") - elif [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then - native_cmd+=(--endpoint completions) - fi - if [[ "$mode" == "on" ]]; then - native_cmd+=(--thinking-kwargs "$thinking_kwargs") - fi - if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then - native_cmd+=(--dsv4) - fi - "${native_cmd[@]}" || bench_rc=$? + export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}" + local client_cmd=( + python3 "$(pwd)/utils/evals/speedbench_client.py" + --model "$speedbench_model" + --base-url "http://0.0.0.0:${port}" + --dataset-path "$speedbench_dir" + --category coding + --output-len 4096 + --temperature 1.0 + --thinking-mode "$mode" + --timeout "${SPEEDBENCH_CLIENT_TIMEOUT:-1800}" + --retries "${SPEEDBENCH_CLIENT_RETRIES:-2}" + ) + if [[ -n "${SPEEDBENCH_CLIENT_ENDPOINT:-}" ]]; then + client_cmd+=(--endpoint "$SPEEDBENCH_CLIENT_ENDPOINT") + elif [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then + client_cmd+=(--endpoint completions) fi + if [[ "$mode" == "on" ]]; then + client_cmd+=(--thinking-kwargs "$thinking_kwargs") + fi + if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then + client_cmd+=(--dsv4) + fi + "${client_cmd[@]}" || bench_rc=$? 
if [[ "$bench_rc" -ne 0 ]]; then echo "SpeedBench AL eval: client failed with exit code ${bench_rc}" >&2 _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "SpeedBench client failed with exit code ${bench_rc}" @@ -1603,20 +1366,6 @@ run_speedbench_al_eval() { metric_source="trtllm-server-log-generation-tokens-samples${trt_log_samples}" fi fi - if [[ -z "$al" ]]; then - al=$(_speedbench_trtllm_avg_decoded_al "$port" || true) - if [[ -n "$al" ]]; then - metric_source="${metric_source_base}-avg-decoded-tokens-endpoints${metrics_endpoint_count}" - fi - fi - if [[ -z "$al" ]]; then - local trt_json_avg_metrics="" trt_json_avg_endpoints="" - trt_json_avg_metrics=$(_speedbench_trtllm_json_avg_decoded_al "$port" || true) - if [[ -n "$trt_json_avg_metrics" ]]; then - IFS=$'\t' read -r al trt_json_avg_endpoints <<< "$trt_json_avg_metrics" - metric_source="trtllm-json-avg-decoded-tokens-endpoints${trt_json_avg_endpoints}" - fi - fi fi elif [[ "$metrics_framework" == "sglang" ]]; then local draft_depth="" @@ -1631,22 +1380,12 @@ run_speedbench_al_eval() { fi metric_source="${metric_source_base}-generation-counter+verify-counter-endpoints${metrics_endpoint_count}" fi - if [[ -z "$al" ]]; then - al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true) - if [[ -n "$al" ]]; then - metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}" - fi - fi if [[ -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then if [[ -z "$draft_depth" ]]; then draft_depth=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "draft_tokens_per_step" 2>/dev/null || true) fi - if [[ -n "$draft_depth" ]]; then - delta_proposed="${delta_proposed:-$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")}" - fi - if [[ -n "$al" && 
"$metric_source" != *"generation-counter+verify-counter"* ]]; then - delta_acc=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v al="$al" 'BEGIN { value = verify * (al - 1); if (value < 0) value = 0; printf "%.10f\n", value }')") - metric_source="${metric_source:-${metric_source_base}-gauge-endpoints${metrics_endpoint_count}}+derived-token-counters" + if [[ -n "$draft_depth" && -z "$delta_proposed" ]]; then + delta_proposed=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')") fi fi fi diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 68a68b650b..34351b1e43 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -566,16 +566,6 @@ if [ "$NODE_RANK" -eq 0 ]; then export EVAL_MAX_MODEL_LEN="$prefill_context_length" fi - speedbench_decode_metric_urls="" - for i in $(seq 0 $((yD - 1))); do - decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) - speedbench_decode_metric_urls+="${speedbench_decode_metric_urls:+,}http://${IP_ARRAY[$decode_idx]}:8000/metrics" - done - if [[ -z "$speedbench_decode_metric_urls" ]]; then - speedbench_decode_metric_urls="http://${NODE0_ADDR}:8000/metrics" - fi - export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}" - if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" else diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index 3d096acaa0..f19ce8560b 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -347,15 +347,6 @@ if [ "$NODE_RANK" -eq 0 ]; then export EVAL_CONCURRENT_REQUESTS=$(echo 
"$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) fi - speedbench_decode_metric_urls="" - for decode_ip in ${DECODE_ARGS}; do - speedbench_decode_metric_urls+="${speedbench_decode_metric_urls:+,}http://${decode_ip}:${SERVER_PORT}/metrics" - done - if [[ -z "$speedbench_decode_metric_urls" ]]; then - speedbench_decode_metric_urls="http://${NODE0_ADDR}:${SERVER_PORT}/metrics" - fi - export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}" - if [[ "$DRY_RUN" -eq 1 ]]; then echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})" else diff --git a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh index d032164e67..d0357c6b43 100755 --- a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh @@ -5,7 +5,7 @@ # Produces the golden acceptance-length (AL) reference matrix consumed by the # synthetic-acceptance framework: for each MTP level (num_speculative_tokens), # measure the REAL AL on a single SPEED-Bench category (default: coding) and emit -# a YAML matrix identical in shape to the files under golden_al_distribution/. +# a YAML matrix identical in shape to benchmarks/speedbench-reference-al.yaml. # This measures real MTP acceptance; the synthetic value is injected downstream # by the throughput recipe, not here. 
# diff --git a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh index 97421eaa5a..b8550a3502 100755 --- a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh @@ -6,7 +6,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP # level (num_speculative_tokens), measure the AL on a single SPEED-Bench # category (default: coding) and emit a YAML matrix identical in shape to -# golden_al_distribution/dsv4_mtp.yaml. +# benchmarks/speedbench-reference-al.yaml. # # This is the "AL distribution collection" script wired into the # speedbench-al.yml GitHub Action (workflow_dispatch / push-button). @@ -46,7 +46,7 @@ SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}" CONCURRENCY="${CONCURRENCY:-1}" TEMPERATURE="${TEMPERATURE:-1.0}" # thinking-on chat_template_kwargs. MUST match the production/golden config: -# the reference matrix (golden_al_distribution/dsv4_mtp.yaml) was measured +# the reference matrix (benchmarks/speedbench-reference-al.yaml) was measured # with reasoning_effort=high. DEFAULT_CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}' CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-$DEFAULT_CHAT_TEMPLATE_KWARGS_ON}" diff --git a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh index 09577d802f..6265500b94 100755 --- a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh @@ -6,7 +6,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP # level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench # category (default: coding) and emit a YAML matrix identical in shape to -# a model-specific YAML under golden_al_distribution/. This measures real MTP acceptance; +# benchmarks/speedbench-reference-al.yaml. 
This measures real MTP acceptance; # the synthetic value is injected downstream by the throughput recipe, not here. # # Filename *_fp4_* matches both the speedbench-al.yml path convention diff --git a/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh index 137e4415c1..890c059f9d 100755 --- a/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh @@ -7,7 +7,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each # EAGLE3 speculative-token count, measure the REAL AL on a single SPEED-Bench # category (default: coding) and emit a YAML matrix identical in shape to -# golden_al_distribution/kimik2.5_eagle3.yaml. +# benchmarks/speedbench-reference-al.yaml. # # Kimi-K2.5 uses the lightseekorg/kimi-k2.5-eagle3-mla draft head (MLA # variant, recommended by official docs). The draft model is downloaded diff --git a/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh index 71dfed4656..dac39fb538 100755 --- a/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh @@ -7,7 +7,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each # EAGLE3 level (num_speculative_tokens), measure the REAL AL on a single # SPEED-Bench category (default: coding) and emit a YAML matrix identical in -# shape to the files under golden_al_distribution/. This measures real EAGLE3 +# shape to benchmarks/speedbench-reference-al.yaml. This measures real EAGLE3 # acceptance; the synthetic value is injected downstream by the throughput # recipe, not here. 
# diff --git a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh index 64dd01178c..bf2bda7c8d 100755 --- a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh +++ b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh @@ -6,7 +6,7 @@ # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP # level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench # category (default: coding) and emit a YAML matrix identical in shape to -# golden_al_distribution/qwen3.5_mtp.yaml. This measures real MTP acceptance; +# benchmarks/speedbench-reference-al.yaml. This measures real MTP acceptance; # the synthetic value is injected downstream by the throughput recipe, not here. # # Adapted from speedbench/dsv4_fp4_b300_vllm.sh. Differences vs DSV4 (deepseek_v4 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 2c0f872832..ac9842312b 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -79,7 +79,7 @@ NGINX_IMAGE="nginx:1.27.4" uses_watchtower_shared_fs() { case "$MODEL_PREFIX" in - dsv4|minimaxm2.5|minimaxm3|kimik2.5) return 0 ;; + minimaxm2.5|minimaxm3|kimik2.5) return 0 ;; *) return 1 ;; esac } diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md index ba23cad76a..7ff878dcec 100644 --- a/utils/evals/EVALS.md +++ b/utils/evals/EVALS.md @@ -152,19 +152,9 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results | | `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by `compute_eval_context_length`) | | `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval; a space-separated list enables sequential batched evals against one live engine | -| `SPEEDBENCH_DIR` | `$(pwd)/speed_bench_data` | Prepared SpeedBench dataset directory | -| `SPEEDBENCH_NUM_SPEC_TOKENS` | script-provided or `2` | MTP level used to select 
the reference AL row | -| `SPEEDBENCH_METRICS_FRAMEWORK` | `FRAMEWORK` or `vllm` | Speculative metrics parser: `vllm`, `sglang`, `trtllm`, or a Dynamo variant | -| `SPEEDBENCH_DECODE_METRICS_URLS` | unset | Decode-worker Prometheus endpoints for disaggregated runs | -| `SPEEDBENCH_METRICS_URLS` | unset | Generic Prometheus endpoints | -| `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports when full URLs are unavailable | -| `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stat endpoints | -| `SPEEDBENCH_TRTLLM_SERVER_LOG` | `SERVER_LOG` | TRT-LLM iteration log used when spec metrics are unavailable | - -SpeedBench AL uses counter deltas over the eval request window. vLLM uses accepted-token and verify-step counters. SGLang uses generation-token and verify-call counters. TRT-LLM prefers Prometheus or JSON speculative metrics and falls back to iteration logs or average decoded tokens. Dynamo runs collect metrics from decode workers rather than the router. ### Score validation -`utils/evals/validate_scores.py` checks lm-eval results against `utils/evals/thresholds.json` and requires SpeedBench AL to be within 95% to 105% of its golden value. It runs after artifact upload so results are preserved when validation fails. +`utils/evals/validate_scores.py` checks eval results against thresholds in `utils/evals/thresholds.json`. Runs as a separate workflow step after artifact upload so results are preserved even if validation fails. 
### Adding a new eval task diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py deleted file mode 100644 index 751de1cdd7..0000000000 --- a/utils/evals/test_speedbench_al.py +++ /dev/null @@ -1,358 +0,0 @@ -import argparse -import json -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).resolve().parent)) -sys.path.insert(0, str(Path(__file__).resolve().parents[1])) - -from collect_eval_results import ( - build_row, - detect_eval_jsons, - extract_speedbench_al_metrics, - score_cell, -) -from dynamo_speedbench_al_from_logs import aggregate_log_metrics -from speedbench_al import build_result, load_reference, lookup_reference -from speedbench_client import ( - _chat_payload, - _completion_payload, - _load_speedbench_requests, -) -from trtllm_speedbench_al_from_log import parse_trtllm_iteration_log -from validate_scores import validate_speedbench_al - - -def test_lookup_reference_uses_model_prefix_alias(tmp_path: Path) -> None: - ref = tmp_path / "speedbench-reference-al.yaml" - ref.write_text( - """ -deepseek-v4-pro: - thinking_on: - 2: 2.75 - thinking_off: - 2: 2.40 -""" - ) - - data = load_reference(ref) - model_key, mode_key, value = lookup_reference( - data, - model="deepseek-ai/DeepSeek-V4-Pro", - model_prefix="dsv4", - thinking_mode="on", - num_speculative_tokens=2, - ) - - assert model_key == "deepseek-v4-pro" - assert mode_key == "thinking_on" - assert value == 2.75 - - -def test_build_result_records_threshold_pass(tmp_path: Path) -> None: - ref = tmp_path / "speedbench-reference-al.yaml" - ref.write_text( - """ -deepseek-v4-pro: - thinking_on: - 2: 2.50 -""" - ) - args = argparse.Namespace( - reference_yaml=str(ref), - model="deepseek-ai/DeepSeek-V4-Pro", - model_prefix="dsv4", - thinking_mode="on", - num_speculative_tokens=2, - category="coding", - output_len=4096, - temperature=1.0, - threshold_ratio=0.95, - max_threshold_ratio=1.05, - acceptance_length="2.50", - accepted_tokens="13", - 
draft_tokens="10", - verify_steps="10", - proposed_draft_tokens="20", - framework="vllm", - metric_source="vllm-prometheus-counters-endpoints1", - error=None, - ) - - result = build_result(args) - - assert result["reference_acceptance_length"] == 2.50 - assert result["min_acceptance_length"] == 2.375 - assert result["max_acceptance_length"] == 2.625 - assert result["framework"] == "vllm" - assert result["metric_source"] == "vllm-prometheus-counters-endpoints1" - assert result["verify_steps"] == 10 - assert result["proposed_draft_tokens"] == 20 - assert result["passed"] is True - - -def test_validate_speedbench_al_fails_below_minimum() -> None: - ok, checked = validate_speedbench_al( - { - "speedbench_al_eval_version": 1, - "task": "speedbench_al", - "thinking_mode": "thinking_on", - "num_speculative_tokens": 2, - "acceptance_length": 2.0, - "min_acceptance_length": 2.375, - "max_acceptance_length": 2.625, - "passed": False, - }, - "results_speedbench_al.json", - ) - - assert checked == 1 - assert ok is False - - -def test_validate_speedbench_al_fails_above_maximum() -> None: - ok, checked = validate_speedbench_al( - { - "speedbench_al_eval_version": 1, - "task": "speedbench_al", - "thinking_mode": "thinking_on", - "num_speculative_tokens": 2, - "acceptance_length": 2.7, - "min_acceptance_length": 2.375, - "max_acceptance_length": 2.625, - "passed": True, - }, - "results_speedbench_al.json", - ) - - assert checked == 1 - assert ok is False - - -def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None: - result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json" - result_path.write_text( - json.dumps( - { - "speedbench_al_eval_version": 1, - "task": "speedbench_al", - "model": "deepseek-ai/DeepSeek-V4-Pro", - "thinking_mode": "thinking_on", - "num_speculative_tokens": 2, - "acceptance_length": 2.5, - "framework": "sglang", - "metric_source": "sglang-prometheus-gauge-endpoints1+derived-token-counters", - "accepted_tokens": 13, - 
"verify_steps": 10, - "proposed_draft_tokens": 20, - "reference_acceptance_length": 2.5, - "min_acceptance_length": 2.375, - "max_acceptance_length": 2.625, - "threshold_ratio": 0.95, - "max_threshold_ratio": 1.05, - "passed": True, - } - ) - ) - metrics = extract_speedbench_al_metrics(result_path) - row = build_row( - { - "infmax_model_prefix": "dsv4", - "hw": "b300", - "framework": "vllm", - "precision": "fp4", - "spec_decoding": "mtp", - }, - metrics[0], - ) - - assert row["task"] == "speedbench_al/thinking_on/mtp2" - assert row["score_name"] == "acceptance_length" - assert row["speedbench_framework"] == "sglang" - assert row["speedbench_metric_source"] == "sglang-prometheus-gauge-endpoints1+derived-token-counters" - assert row["speedbench_accepted_tokens"] == 13 - assert row["speedbench_verify_steps"] == 10 - assert row["speedbench_proposed_draft_tokens"] == 20 - assert row["speedbench_max_acceptance_length"] == 2.625 - assert score_cell(row) == "2.50 in [2.38, 2.62] (PASS)" - - -def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> None: - result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json" - result_path.write_text( - json.dumps( - { - "speedbench_al_eval_version": 1, - "task": "speedbench_al", - "thinking_mode": "thinking_on", - "num_speculative_tokens": 2, - "acceptance_length": 2.3, - "min_acceptance_length": 2.25, - "max_acceptance_length": 2.75, - "passed": True, - } - ) - ) - - lm_path, speedbench_paths = detect_eval_jsons(tmp_path) - - assert lm_path is None - assert speedbench_paths == [result_path] - - -def test_dynamo_log_parser_aggregates_decode_workers(tmp_path: Path) -> None: - def write_log(name: str, rows: list[tuple[float, int, int]]) -> None: - lines = [] - for al, accepted, drafted in rows: - lines.append( - "INFO metrics.log: SpecDecoding metrics: " - f"Mean acceptance length: {al}, " - "Accepted throughput: 1.0 tokens/s, " - "Drafted throughput: 1.0 tokens/s, " - f"Accepted: {accepted} tokens, 
Drafted: {drafted} tokens, " - "Per-position acceptance rate: 0.9, 0.7, " - "Avg Draft acceptance rate: 80.0%" - ) - (tmp_path / name).write_text("\n".join(lines)) - - write_log("node-a_decode_w0.out", [(2.0, 10, 20)]) - write_log("node-b_decode_w0.out", [(2.5, 15, 20), (2.5, 5, 10)]) - write_log("node-c_decode_w1.out", [(2.0, 10, 20)]) - write_log("node-d_decode_w1.out", []) - - metrics = aggregate_log_metrics(tmp_path, mtp=2) - - assert metrics is not None - assert metrics.workers == 2 - assert metrics.samples == 3 - assert metrics.accepted_tokens == 30 - assert metrics.proposed_draft_tokens == 50 - assert metrics.verify_steps == 25 - assert metrics.acceptance_length == 2.2 - assert [p.name for p in metrics.selected_logs] == [ - "node-b_decode_w0.out", - "node-c_decode_w1.out", - ] - - -def test_dynamo_log_parser_reads_sglang_accept_length_samples(tmp_path: Path) -> None: - (tmp_path / "node-a_decode_w0.out").write_text( - "\n".join( - [ - "Decode batch, #running-req: 1, accept len: 2.10, accept rate: 0.37,", - "Decode batch, #running-req: 1, accept len: 2.30, accept rate: 0.43,", - ] - ) - ) - (tmp_path / "node-b_decode_w1.out").write_text( - "Decode batch, #running-req: 1, accept len: 2.50, accept rate: 0.50," - ) - - metrics = aggregate_log_metrics(tmp_path, mtp=4) - - assert metrics is not None - assert metrics.workers == 2 - assert metrics.samples == 3 - assert round(metrics.acceptance_length, 4) == 2.3 - assert metrics.has_counter_metrics is False - assert metrics.accepted_tokens is None - assert metrics.verify_steps is None - assert metrics.proposed_draft_tokens is None - - -def test_trtllm_log_parser_reads_generation_tokens_after_offset(tmp_path: Path) -> None: - log_path = tmp_path / "server.log" - prefix = "previous eval traffic\n" - body = "\n".join( - [ - "[TRT-LLM] [I] iter = 1, num_scheduled_requests: 1, " - "states = {'num_ctx_requests': 1, 'num_ctx_tokens': 1024, 'num_generation_tokens': 0}", - "[TRT-LLM] [I] iter = 2, num_scheduled_requests: 1, " 
- "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 3}", - "[TRT-LLM] [I] iter = 3, num_scheduled_requests: 1, " - "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 2}", - 'INFO: 127.0.0.1:1 - "GET /prometheus/metrics HTTP/1.1" 200 OK', - "[TRT-LLM] [I] iter = 4, num_scheduled_requests: 32, " - "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 96}", - ] - ) - log_path.write_text(prefix + body) - - metrics = parse_trtllm_iteration_log(log_path, mtp=2, start_offset=len(prefix)) - - assert metrics is not None - assert metrics.samples == 2 - assert metrics.generated_tokens == 5 - assert metrics.accepted_tokens == 3 - assert metrics.verify_steps == 2 - assert metrics.proposed_draft_tokens == 4 - assert metrics.acceptance_length == 2.5 - - -def test_trtllm_log_parser_can_infer_batched_steps(tmp_path: Path) -> None: - log_path = tmp_path / "server.log" - log_path.write_text( - "[TRT-LLM] [I] iter = 10, num_scheduled_requests: 28, " - "states = {'num_ctx_requests': 9, 'num_ctx_tokens': 9345, 'num_generation_tokens': 57}" - ) - - metrics = parse_trtllm_iteration_log( - log_path, - mtp=2, - stop_at_metrics_get=False, - ) - - assert metrics is not None - assert metrics.samples == 1 - assert metrics.verify_steps == 19 - assert metrics.accepted_tokens == 38 - assert metrics.proposed_draft_tokens == 38 - assert metrics.acceptance_length == 3.0 - - -def test_speedbench_client_loads_coding_and_builds_dsv4_payloads(tmp_path: Path) -> None: - dataset = tmp_path / "speed_bench_data" - dataset.mkdir() - (dataset / "qualitative.jsonl").write_text( - "\n".join( - [ - json.dumps( - { - "category": "coding", - "messages": [{"role": "user", "content": "Write fizzbuzz."}], - } - ), - json.dumps( - { - "category": "math", - "messages": [{"role": "user", "content": "Solve 2+2."}], - } - ), - ] - ) - ) - - prompts = _load_speedbench_requests(dataset, "coding", -1) - chat = _chat_payload( - prompts[0], - 
model="deepseek-ai/DeepSeek-V4-Pro", - output_len=4096, - temperature=1.0, - thinking_mode="on", - thinking_kwargs={"thinking": True, "reasoning_effort": "high"}, - ) - completions = _completion_payload( - prompts[0], - model="deepseek-ai/DeepSeek-V4-Pro", - output_len=4096, - temperature=1.0, - thinking_mode="on", - thinking_kwargs={"thinking": True, "reasoning_effort": "high"}, - dsv4=True, - ) - - assert len(prompts) == 1 - assert chat["chat_template_kwargs"]["thinking"] is True - assert chat["reasoning_effort"] == "high" - assert "" in completions["prompt"] - assert completions["max_tokens"] == 4096