From 3a8a68fae01fd8ea397651b2eb2b8e1a7bc3e1a4 Mon Sep 17 00:00:00 2001
From: Albert Cheng <albertching0112@gmail.com>
Date: Tue, 2 Jun 2026 14:15:57 -0700
Subject: [PATCH 01/27] Add GitHub Action to collect SPEED-Bench AL matrix

Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro
SPEED-Bench acceptance-length matrix (thinking on/off x MTP 1-8) on
self-hosted B300 runners, optionally opening a PR that updates
benchmarks/speedbench-reference-al.yaml.

- benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh:
  per (thinking, MTP) cell, serve vLLM, run SPEED-Bench, derive AL from
  /metrics, and emit the YAML matrix. Serves from MODEL_PATH (the local
  pre-staged weights resolved by the launcher), falling back to MODEL for
  a standalone local run. Carries a temporary --chat-template-kwargs shim
  until vllm-project/vllm#44244 lands in the benchmark image (idempotent,
  applied only for thinking-on cells).
- runners/launch_b300-nv.sh: add opt-in BENCH_SCRIPT_OVERRIDE and
  SALLOC_TIME_LIMIT hooks; both default to the prior behavior.
- .github/workflows/speedbench-al.yml: workflow_dispatch entry point;
  MODEL is the HF id so the launcher resolves the staged MODEL_PATH.
---
 .github/workflows/speedbench-al.yml           | 200 +++++++++++
 .../dsv4_fp4_b300_vllm_speedbench_matrix.sh   | 337 ++++++++++++++++++
 runners/launch_b300-nv.sh                     |  10 +-
 3 files changed, 546 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/speedbench-al.yml
 create mode 100755 benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh

diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml
new file mode 100644
index 0000000000..771e53e6c4
--- /dev/null
+++ b/.github/workflows/speedbench-al.yml
@@ -0,0 +1,200 @@
+name: SpeedBench AL Collection
+
+# Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro SPEED-Bench
+# acceptance-length (AL) matrix: thinking_on/off x MTP levels. Produces the
+# golden reference consumed by the synthetic-acceptance framework and (optionally)
+# opens a PR updating benchmarks/speedbench-reference-al.yaml.
+
+on:
+  workflow_dispatch:
+    inputs:
+      runner:
+        description: "Self-hosted GPU runner label (B300)"
+        required: false
+        type: string
+        default: 'b300'
+      image:
+        description: "vLLM container image"
+        required: false
+        type: string
+        default: 'vllm/vllm-openai:v0.21.0'
+      mtp-list:
+        description: "Space-separated MTP levels (num_speculative_tokens)"
+        required: false
+        type: string
+        default: '1 2 3 4 5 6 7 8'
+      thinking-modes:
+        description: "Space-separated thinking modes to collect"
+        required: false
+        type: string
+        default: 'off on'
+      category:
+        description: "SPEED-Bench category"
+        required: false
+        type: string
+        default: 'coding'
+      output-len:
+        description: "Per-request output length"
+        required: false
+        type: string
+        default: '4096'
+      thinking-kwargs:
+        description: "chat_template_kwargs JSON for thinking-on cells (match golden config)"
+        required: false
+        type: string
+        default: '{"thinking": true, "reasoning_effort": "high"}'
+      salloc-time:
+        description: "Slurm allocation minutes (16 server starts ~ several hours)"
+        required: false
+        type: string
+        default: '480'
+      open-pr:
+        description: "Open a PR updating benchmarks/speedbench-reference-al.yaml"
+        required: false
+        type: boolean
+        default: true
+      ref:
+        description: "Git ref (branch/sha) to checkout"
+        required: false
+        type: string
+
+permissions:
+  contents: read
+
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+  HF_HUB_CACHE: '/mnt/hf_hub_cache/'
+  # Drive the dsv4 single-node path in runners/launch_b300-nv.sh. MODEL is the
+  # HF id; its basename (DeepSeek-V4-Pro) is in the launcher's STAGED_MODELS, so
+  # the launcher resolves MODEL_PATH to the pre-staged local weights and mounts
+  # them. The collector serves from MODEL_PATH (see SERVE_MODEL), so no download.
+  MODEL: deepseek-ai/DeepSeek-V4-Pro
+  MODEL_PREFIX: dsv4
+  PRECISION: fp4
+  FRAMEWORK: vllm
+  EXP_NAME: dsv4_speedbench
+  IMAGE: ${{ inputs.image }}
+  TP: '8'
+  EP_SIZE: '1'
+  DP_ATTENTION: 'false'
+  SPEC_DECODING: mtp
+  # Run the AL-matrix collector instead of the auto-selected throughput script.
+  BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+  SALLOC_TIME_LIMIT: ${{ inputs.salloc-time }}
+  # Matrix-collector tunables (propagated into the container via srun --export=ALL).
+  MTP_LIST: ${{ inputs.mtp-list }}
+  THINKING_MODES: ${{ inputs.thinking-modes }}
+  CATEGORY: ${{ inputs.category }}
+  SPEEDBENCH_OUTPUT_LEN: ${{ inputs.output-len }}
+  CHAT_TEMPLATE_KWARGS_ON: ${{ inputs.thinking-kwargs }}
+  OUT_YAML: /workspace/speedbench-reference-al.yaml
+  PYTHONDONTWRITEBYTECODE: '1'
+  PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
+
+jobs:
+  collect-al:
+    runs-on: ${{ inputs.runner }}
+    timeout-minutes: 600
+    name: "SpeedBench AL matrix | ${{ inputs.category }} | mtp=[${{ inputs.mtp-list }}] | thinking=[${{ inputs.thinking-modes }}]"
+    steps:
+      - name: Resource cleanup (pre-run)
+        run: &resource-cleanup |
+          # Cleanup Docker resources
+          if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
+            echo "[Docker] Cleaning up resources ..."
+            docker ps -aq | xargs -r docker rm -f
+            docker network prune -f
+            while [ -n "$(docker ps -aq)" ]; do
+              docker ps -a
+              sleep 5
+            done
+          fi
+
+          # Cleanup SLURM resources
+          if command -v squeue >/dev/null 2>&1; then
+            echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..."
+            scancel --name="${{ runner.name }}" || true
+            while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do
+              squeue --name="${{ runner.name }}"
+              sleep 5
+            done
+          fi
+
+          # Cleanup AL-matrix outputs from a prior job on this runner so a stale
+          # matrix from a previous run is never picked up as this job's output.
+          rm -rf "${{ github.workspace }}/speedbench_results" 2>/dev/null || true
+
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          token: ${{ secrets.REPO_PAT }}
+          fetch-depth: 0
+          ref: ${{ inputs.ref || github.sha }}
+          clean: true
+          submodules: true
+
+      - name: Cleanup stale outputs (pre-run)
+        run: |
+          rm -f speedbench-reference-al.yaml || true
+          rm -f gpu_metrics.csv || true
+          rm -rf speed_bench_data || true
+
+      - name: Collect AL matrix
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: |
+          set -euo pipefail
+          bash ./runners/launch_${RUNNER_NAME%%_*}.sh
+
+          if [ ! -f "speedbench-reference-al.yaml" ]; then
+            echo "AL collection failed: speedbench-reference-al.yaml not produced." >&2
+            exit 1
+          fi
+          echo "### SpeedBench AL matrix" >> "$GITHUB_STEP_SUMMARY"
+          echo '```yaml' >> "$GITHUB_STEP_SUMMARY"
+          cat speedbench-reference-al.yaml >> "$GITHUB_STEP_SUMMARY"
+          echo '```' >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload AL matrix artifact
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: speedbench-reference-al
+          path: speedbench-reference-al.yaml
+          if-no-files-found: warn
+
+      - name: Open PR updating reference yaml
+        if: ${{ inputs.open-pr && success() }}
+        env:
+          GH_TOKEN: ${{ secrets.REPO_PAT }}
+        run: |
+          set -euo pipefail
+          cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml
+
+          BRANCH="speedbench-al/auto-${{ github.run_id }}"
+          git config user.name "github-actions"
+          git config user.email "github-actions@github.com"
+          git checkout -b "$BRANCH"
+          git add benchmarks/speedbench-reference-al.yaml
+          if git diff --cached --quiet; then
+            echo "No change in reference yaml; skipping PR."
+            exit 0
+          fi
+          git commit -m "Update SpeedBench AL reference matrix (auto, run ${{ github.run_id }})"
+          git push -u origin "$BRANCH"
+          gh pr create \
+            --title "Update SpeedBench AL reference matrix (auto)" \
+            --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Category: \`${{ inputs.category }}\`, MTP: \`${{ inputs.mtp-list }}\`, thinking: \`${{ inputs.thinking-modes }}\`, output_len: \`${{ inputs.output-len }}\`. Please review the measured values before merging." \
+            --base main \
+            --head "$BRANCH"
+
+      - name: Upload server logs
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: speedbench_server_logs
+          path: speedbench_results/server_*.log
+          if-no-files-found: ignore
+
+      - name: Resource cleanup (post-run)
+        if: always()
+        run: *resource-cleanup
\ No newline at end of file
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
new file mode 100755
index 0000000000..572801b2c3
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
@@ -0,0 +1,337 @@
+#!/usr/bin/env bash
+
+# DSV4-Pro B300 vLLM SPEED-Bench AL matrix collector.
+#
+# Produces the golden acceptance-length (AL) reference matrix consumed by the
+# synthetic-acceptance framework: for each thinking mode (on/off) and each MTP
+# level (num_speculative_tokens), measure the AL on a single SPEED-Bench
+# category (default: coding) and emit a YAML matrix identical in shape to
+# benchmarks/speedbench-reference-al.yaml.
+#
+# This is the "AL distribution collection" script wired into the
+# speedbench-al.yml GitHub Action (workflow_dispatch / push-button).
+#
+# Usage (inside the vLLM container, on a B300 node):
+#   export MODEL=/data/models/dsv4-pro
+#   bash benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+#
+# Tunables (env):
+#   MTP_LIST          space-separated MTP levels   (default "1 2 3 4 5 6 7 8")
+#   THINKING_MODES    space-separated: off|on       (default "off on")
+#   CATEGORY          SPEED-Bench category          (default coding)
+#   SPEEDBENCH_OUTPUT_LEN  per-request output len   (default 4096)
+#   OUT_YAML          output matrix path            (default $RESULTS_DIR/speedbench-reference-al.yaml)
+
+set -uo pipefail
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsv4-pro)}"
+# Serve from the local weights dir resolved by the launcher (MODEL_PATH points
+# at the pre-staged copy, e.g. /scratch/models/DeepSeek-V4-Pro). Falls back to
+# MODEL for a standalone local run where MODEL is itself a path. A leading "/"
+# makes the download guard below a no-op.
+SERVE_MODEL="${MODEL_PATH:-$MODEL}"
+TP="${TP:-8}"
+DP_ATTENTION="${DP_ATTENTION:-false}"
+EP_SIZE="${EP_SIZE:-1}"
+PORT="${PORT:-8888}"
+
+MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}"
+THINKING_MODES="${THINKING_MODES:-off on}"
+CATEGORY="${CATEGORY:-coding}"
+SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}"
+CONCURRENCY="${CONCURRENCY:-1}"
+TEMPERATURE="${TEMPERATURE:-1.0}"
+# thinking-on chat_template_kwargs. MUST match the golden config: the reference
+# matrix was measured with reasoning_effort=high. Not written as ${VAR:-{...}}:
+# bash ends that expansion at the first "}" and appends a stray "}" when VAR is set.
+[[ -n "${CHAT_TEMPLATE_KWARGS_ON:-}" ]] || CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}'
+
+SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}"
+RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}"
+OUT_YAML="${OUT_YAML:-$RESULTS_DIR/speedbench-reference-al.yaml}"
+
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+mkdir -p "$RESULTS_DIR"
+nvidia-smi
+if [[ "$SERVE_MODEL" != /* ]]; then hf download "$SERVE_MODEL"; fi
+
+# ---- Download SPEED-Bench dataset ----
+echo "=== Downloading SPEED-Bench dataset ==="
+pip install -q datasets tiktoken
+curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \
+  | python3 - --config qualitative --output_dir "$SPEEDBENCH_DIR"
+
+if [[ ! -f "$SPEEDBENCH_DIR/qualitative.jsonl" ]]; then
+    echo "CRITICAL: SPEED-Bench download failed — $SPEEDBENCH_DIR/qualitative.jsonl not found"
+    exit 1
+fi
+
+# ---- Temporary shim: add a real --chat-template-kwargs CLI option ----
+# Upstream gap (until vllm-project/vllm#44244 lands): speed_bench/CustomDataset
+# pre-renders the chat template client-side WITHOUT chat_template_kwargs and
+# posts to /v1/completions, so thinking mode cannot be enabled via --extra-body
+# or --default-chat-template-kwargs. This wires a proper --chat-template-kwargs
+# option through get_samples into CustomDataset.sample's apply_chat_template.
+# TODO: delete this whole block once #44244 is released in the benchmark image;
+# the patch is idempotent (marker check) so it is safe to leave until then.
+apply_chat_template_kwargs_shim() {
+    echo "=== Patching vLLM benchmark to add --chat-template-kwargs (temporary shim) ==="
+    python3 - <<'PYEOF'
+import vllm.benchmarks.serve as S
+import vllm.benchmarks.datasets.datasets as D
+
+def patch(mod, edits, marker):
+    f = mod.__file__
+    src = open(f).read()
+    if marker in src:
+        print("already patched:", f)
+        return
+    for old, new in edits:
+        n = src.count(old)
+        assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..."
+        src = src.replace(old, new, 1)
+    open(f, "w").write(src)
+    print("patched OK ->", f)
+
+# Edit 1: serve.py -- declare the --chat-template-kwargs argument before --extra-body
+serve_old = '''    parser.add_argument(
+        "--extra-body",'''
+serve_new = '''    parser.add_argument(
+        "--chat-template-kwargs",
+        type=json.loads,
+        default=None,
+        help="JSON dict forwarded to apply_chat_template during "
+        "client-side prompt rendering, e.g. to enable reasoning mode.",
+    )
+    parser.add_argument(
+        "--extra-body",'''
+patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"')
+
+# Edit 2: datasets.py -- forward args.chat_template_kwargs into the speed_bench .sample() call
+disp_old = '''                output_len=args.speed_bench_output_len,
+                enable_multimodal_chat=args.enable_multimodal_chat,'''
+disp_new = '''                output_len=args.speed_bench_output_len,
+                chat_template_kwargs=args.chat_template_kwargs,
+                enable_multimodal_chat=args.enable_multimodal_chat,'''
+
+# Edit 3: datasets.py -- forward chat_template_kwargs into CustomDataset.sample's template call
+samp_old = '''                # apply template
+                if not skip_chat_template:
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": prompt}],
+                        add_generation_prompt=True,
+                        tokenize=False,
+                    )
+
+                prompt_len = len(tokenizer(prompt).input_ids)'''
+samp_new = '''                # apply template
+                if not skip_chat_template:
+                    _ctk = kwargs.get("chat_template_kwargs") or {}
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": prompt}],
+                        add_generation_prompt=True,
+                        tokenize=False,
+                        **_ctk,
+                    )
+
+                prompt_len = len(tokenizer(prompt).input_ids)'''
+patch(D, [(disp_old, disp_new), (samp_old, samp_new)],
+      marker="chat_template_kwargs=args.chat_template_kwargs")
+PYEOF
+}
+
+# Apply the shim once if any thinking-on cell is requested.
+if [[ " $THINKING_MODES " == *" on "* ]]; then
+    if ! apply_chat_template_kwargs_shim; then
+        echo "CRITICAL: --chat-template-kwargs shim failed — aborting"
+        exit 1
+    fi
+fi
+
+PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
+fi
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+MOE_ARGS=()
+if [ "${DP_ATTENTION}" = "true" ]; then
+    MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
+fi
+
+# Sum counter $2 on port $1 over all label sets; awk parses exponent values (1.2e+06).
+fetch_metric() {
+    curl -s "http://localhost:$1/metrics" \
+      | awk -v m="$2" 'index($0, m "{") == 1 { s += $NF } END { printf "%.0f\n", s }'
+}
+
+SERVER_PID=""
+# List all descendant PIDs of $1 recursively, matched by PARENT pid. This can
+# never include this script (the script is an ancestor of the server, not a
+# descendant), so it avoids the self-kill a name-based `pkill -f vllm` caused
+# (the script filename contains "vllm").
+_descendants() {
+    local pid="$1" child
+    for child in $(pgrep -P "$pid" 2>/dev/null || true); do
+        echo "$child"
+        _descendants "$child"
+    done
+}
+cleanup_server() {
+    if [[ -n "$SERVER_PID" ]]; then
+        # Snapshot the server's worker/EngineCore subprocesses BEFORE killing the
+        # parent: once the parent dies the children reparent to init and the tree
+        # link is lost. Killing the captured PIDs guarantees no orphaned worker
+        # survives to hold GPU memory and OOM the next server start.
+        local descendants
+        descendants=$(_descendants "$SERVER_PID")
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+        local pid
+        for pid in $descendants; do
+            kill -9 "$pid" 2>/dev/null || true
+        done
+        # Wait for GPU memory to actually free before the next server starts.
+        local waited=0
+        while [[ $waited -lt 120 ]]; do
+            local used
+            used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -rn | head -1)
+            if [[ -z "$used" || "$used" -lt 2000 ]]; then break; fi
+            sleep 3; waited=$((waited + 3))
+        done
+        SERVER_PID=""
+    fi
+}
+trap 'cleanup_server' EXIT
+
+start_gpu_monitor
+
+# Per-cell AL is collected into associative arrays keyed by "mode_mtp".
+declare -A AL_RESULT
+
+run_cell() {
+    # Serve one (thinking mode $1, MTP level $2) cell, benchmark it, and record its AL.
+    local mode="$1" mtp="$2" think_args=()
+    if [[ "$mode" == "on" ]]; then
+        think_args=(--chat-template-kwargs "$CHAT_TEMPLATE_KWARGS_ON")
+    fi
+
+    echo ""
+    echo "=========================================="
+    echo "  Cell: thinking=$mode  MTP=$mtp  category=$CATEGORY"
+    echo "=========================================="
+
+    local serve_args=(
+        --host 0.0.0.0 --port "$PORT"
+        "${PARALLEL_ARGS[@]}"
+        --pipeline-parallel-size 1
+        --kv-cache-dtype fp8
+        --trust-remote-code
+        --block-size 256
+        --no-enable-prefix-caching
+        "${EP_ARGS[@]}"
+        "${MOE_ARGS[@]}"
+        --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+        --attention_config.use_fp4_indexer_cache True
+        --tokenizer-mode deepseek_v4
+        --tool-call-parser deepseek_v4
+        --enable-auto-tool-choice
+        --reasoning-parser deepseek_v4
+        --max-cudagraph-capture-size 2048
+        --max-model-len 16384
+        --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $mtp}"
+    )
+
+    local server_log="$RESULTS_DIR/server_${mode}_mtp${mtp}.log"
+    vllm serve "$SERVE_MODEL" "${serve_args[@]}" > "$server_log" 2>&1 &
+    SERVER_PID=$!
+
+    if ! wait_for_server_ready --port "$PORT" --server-log "$server_log" --server-pid "$SERVER_PID"; then
+        echo "  -> server failed to start (thinking=$mode mtp=$mtp), recording N/A"
+        AL_RESULT["${mode}_${mtp}"]="N/A"
+        cleanup_server
+        return
+    fi
+
+    local acc_before drf_before acc_after drf_after delta_acc delta_drf al bench_rc=0
+    acc_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
+    drf_before=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")
+
+    vllm bench serve \
+        --model "$SERVE_MODEL" \
+        --port "$PORT" \
+        --dataset-name speed_bench \
+        --dataset-path "$SPEEDBENCH_DIR" \
+        --speed-bench-category "$CATEGORY" \
+        --speed-bench-output-len "$SPEEDBENCH_OUTPUT_LEN" \
+        --num-prompts -1 \
+        --max-concurrency "$CONCURRENCY" \
+        --save-result \
+        --result-dir "$RESULTS_DIR" \
+        --result-filename "speedbench_${mode}_mtp${mtp}" \
+        --trust-remote-code \
+        --tokenizer-mode deepseek_v4 \
+        --temperature "$TEMPERATURE" \
+        "${think_args[@]}" || bench_rc=$?
+
+    acc_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_accepted_tokens_total")
+    drf_after=$(fetch_metric "$PORT" "vllm:spec_decode_num_drafts_total")
+
+    # A failed benchmark leaves partial counters: record N/A rather than a bogus AL.
+    delta_acc=$(awk "BEGIN {printf \"%d\", $acc_after - $acc_before}")
+    delta_drf=$(awk "BEGIN {printf \"%d\", $drf_after - $drf_before}")
+    if [[ "$bench_rc" -eq 0 && "$delta_drf" -gt 0 ]]; then
+        al=$(awk "BEGIN {printf \"%.2f\", 1 + ($delta_acc / $delta_drf)}")
+    else
+        al="N/A"
+    fi
+    echo "  -> thinking=$mode MTP=$mtp AL=$al (accepted=$delta_acc drafts=$delta_drf bench_rc=$bench_rc)"
+    AL_RESULT["${mode}_${mtp}"]="$al"
+
+    cleanup_server
+}
+
+for mode in $THINKING_MODES; do
+    for mtp in $MTP_LIST; do
+        run_cell "$mode" "$mtp"
+    done
+done
+
+stop_gpu_monitor
+
+# ---- Emit the YAML matrix ----
+emit_mode_block() {
+    local mode="$1"
+    for mtp in $MTP_LIST; do
+        echo "    $mtp: ${AL_RESULT[${mode}_${mtp}]:-N/A}"
+    done
+}
+
+{
+    echo "# Acceptance Length (AL) reference values measured with SPEED-Bench."
+    echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | output_len: $SPEEDBENCH_OUTPUT_LEN"
+    echo "# thinking_on chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_ON"
+    echo "# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens."
+    echo "# Auto-generated by dsv4_fp4_b300_vllm_speedbench_matrix.sh (speedbench-al.yml)."
+    echo "#"
+    echo "# key = num_speculative_tokens (MTP level); value = golden AL"
+    echo "deepseek-v4-pro:"
+    if [[ " $THINKING_MODES " == *" on "* ]]; then
+        echo "  thinking_on:"
+        emit_mode_block on
+    fi
+    if [[ " $THINKING_MODES " == *" off "* ]]; then
+        echo "  thinking_off:"
+        emit_mode_block off
+    fi
+} > "$OUT_YAML"
+
+echo ""
+echo "=========================================="
+echo "  SPEED-Bench AL matrix written to: $OUT_YAML"
+echo "=========================================="
+cat "$OUT_YAML"
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 67e8b48cce..e6bdf1a0da 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -334,6 +334,12 @@ else
         BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh"
     fi
 
+    # Allow callers (e.g. the speedbench-al.yml AL-collection workflow) to run a
+    # specific script instead of the auto-selected throughput benchmark.
+    if [[ -n "${BENCH_SCRIPT_OVERRIDE:-}" ]]; then
+        BENCH_SCRIPT="$BENCH_SCRIPT_OVERRIDE"
+    fi
+
     LOCK_FILE="${SQUASH_FILE}.lock"
 
     # TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell
@@ -379,7 +385,9 @@ else
         fi
     )
 
-    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
+    # Default 180 min; AL-matrix collection (16 server starts) needs longer and
+    # overrides via SALLOC_TIME_LIMIT.
+    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time="${SALLOC_TIME_LIMIT:-180}" --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
     srun --jobid=$JOB_ID \

From bab431dac7c1813491b42f595644dc8b1cd876a7 Mon Sep 17 00:00:00 2001
From: Albert Cheng <albertching0112@gmail.com>
Date: Wed, 3 Jun 2026 14:05:58 -0700
Subject: [PATCH 02/27] speedbench-al: default open-pr to false (artifact-only
 by default)

Make the workflow artifact-only by default: upload the AL matrix as an
artifact for manual review/paste rather than auto-opening a PR. The
auto-PR path stays available as an opt-in (open-pr: true); keeping it
off by default means the PAT is not used to push a branch or open a PR
unless requested (the checkout step still uses REPO_PAT either way), and
matches the repo's artifact-collection convention.
---
 .github/workflows/speedbench-al.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml
index 771e53e6c4..bbd7a9d7b9 100644
--- a/.github/workflows/speedbench-al.yml
+++ b/.github/workflows/speedbench-al.yml
@@ -49,10 +49,10 @@ on:
         type: string
         default: '480'
       open-pr:
-        description: "Open a PR updating benchmarks/speedbench-reference-al.yaml"
+        description: "Open a PR updating benchmarks/speedbench-reference-al.yaml (default off: artifact-only, paste values in manually)"
         required: false
         type: boolean
-        default: true
+        default: false
       ref:
         description: "Git ref (branch/sha) to checkout"
         required: false

From d595d49ab93175ff1c807a8af4ca61dd85464cc2 Mon Sep 17 00:00:00 2001
From: Albert Cheng <albertching0112@gmail.com>
Date: Thu, 4 Jun 2026 10:17:28 -0700
Subject: [PATCH 03/27] speedbench-al: parameterize model + relocate collector
 script

Address review:
- Model is now a workflow input (model + model-prefix, default
  deepseek-ai/DeepSeek-V4-Pro / dsv4). MODEL, MODEL_PREFIX, EXP_NAME,
  BENCH_SCRIPT_OVERRIDE, artifact names and the Create-PR branch/title/body
  are all derived from those inputs. The emitted YAML top-level key is now
  derived from the model (MODEL_KEY, defaults to the model basename lowercased).
- Move the collector to benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
  and fix its benchmark_lib.sh source path (../ -> ../../) for the deeper dir.
---
 .github/workflows/speedbench-al.yml           | 46 ++++++++++++-------
 .../dsv4_fp4_b300_vllm.sh}                    | 13 ++++--
 2 files changed, 38 insertions(+), 21 deletions(-)
 rename benchmarks/single_node/{dsv4_fp4_b300_vllm_speedbench_matrix.sh => speedbench/dsv4_fp4_b300_vllm.sh} (95%)

diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml
index bbd7a9d7b9..ea4baea5c9 100644
--- a/.github/workflows/speedbench-al.yml
+++ b/.github/workflows/speedbench-al.yml
@@ -1,9 +1,10 @@
 name: SpeedBench AL Collection
 
-# Push-button (workflow_dispatch) collection of the DeepSeek-V4-Pro SPEED-Bench
-# acceptance-length (AL) matrix: thinking_on/off x MTP levels. Produces the
-# golden reference consumed by the synthetic-acceptance framework and (optionally)
-# opens a PR updating benchmarks/speedbench-reference-al.yaml.
+# Push-button (workflow_dispatch) collection of a SPEED-Bench acceptance-length
+# (AL) matrix: thinking_on/off x MTP levels, for the given model (defaults to
+# DeepSeek-V4-Pro). Produces the golden reference consumed by the
+# synthetic-acceptance framework and (optionally) opens a PR updating
+# benchmarks/speedbench-reference-al.yaml.
 
 on:
   workflow_dispatch:
@@ -13,6 +14,16 @@ on:
         required: false
         type: string
         default: 'b300'
+      model:
+        description: "HF model id (basename must be in launcher STAGED_MODELS for pre-staged local weights)"
+        required: false
+        type: string
+        default: 'deepseek-ai/DeepSeek-V4-Pro'
+      model-prefix:
+        description: "Model prefix; drives launcher MODEL_PATH resolution, exp name, collector script, and artifact names"
+        required: false
+        type: string
+        default: 'dsv4'
       image:
         description: "vLLM container image"
         required: false
@@ -64,22 +75,22 @@ permissions:
 env:
   HF_TOKEN: ${{ secrets.HF_TOKEN }}
   HF_HUB_CACHE: '/mnt/hf_hub_cache/'
-  # Drive the dsv4 single-node path in runners/launch_b300-nv.sh. MODEL is the
-  # HF id; its basename (DeepSeek-V4-Pro) is in the launcher's STAGED_MODELS, so
+  # Drive the single-node path in runners/launch_b300-nv.sh. MODEL is the HF id;
+  # its basename (e.g. DeepSeek-V4-Pro) must be in the launcher's STAGED_MODELS so
   # the launcher resolves MODEL_PATH to the pre-staged local weights and mounts
   # them. The collector serves from MODEL_PATH (see SERVE_MODEL), so no download.
-  MODEL: deepseek-ai/DeepSeek-V4-Pro
-  MODEL_PREFIX: dsv4
+  MODEL: ${{ inputs.model }}
+  MODEL_PREFIX: ${{ inputs.model-prefix }}
   PRECISION: fp4
   FRAMEWORK: vllm
-  EXP_NAME: dsv4_speedbench
+  EXP_NAME: ${{ inputs.model-prefix }}_speedbench
   IMAGE: ${{ inputs.image }}
   TP: '8'
   EP_SIZE: '1'
   DP_ATTENTION: 'false'
   SPEC_DECODING: mtp
   # Run the AL-matrix collector instead of the auto-selected throughput script.
-  BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+  BENCH_SCRIPT_OVERRIDE: benchmarks/single_node/speedbench/${{ inputs.model-prefix }}_fp4_b300_vllm.sh
   SALLOC_TIME_LIMIT: ${{ inputs.salloc-time }}
   # Matrix-collector tunables (propagated into the container via srun --export=ALL).
   MTP_LIST: ${{ inputs.mtp-list }}
@@ -158,7 +169,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: speedbench-reference-al
+          name: speedbench-reference-al-${{ inputs.model-prefix }}
           path: speedbench-reference-al.yaml
           if-no-files-found: warn
 
@@ -168,9 +179,12 @@ jobs:
           GH_TOKEN: ${{ secrets.REPO_PAT }}
         run: |
           set -euo pipefail
+          # NOTE: the reference yaml is keyed by model at the top level. This
+          # overwrites it with the current model's matrix; when more than one
+          # model is collected, replace this cp with a per-model-key YAML merge.
           cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml
 
-          BRANCH="speedbench-al/auto-${{ github.run_id }}"
+          BRANCH="speedbench-al/${{ inputs.model-prefix }}-auto-${{ github.run_id }}"
           git config user.name "github-actions"
           git config user.email "github-actions@github.com"
           git checkout -b "$BRANCH"
@@ -179,11 +193,11 @@ jobs:
             echo "No change in reference yaml; skipping PR."
             exit 0
           fi
-          git commit -m "Update SpeedBench AL reference matrix (auto, run ${{ github.run_id }})"
+          git commit -m "Update SpeedBench AL reference matrix for ${{ inputs.model }} (auto, run ${{ github.run_id }})"
           git push -u origin "$BRANCH"
           gh pr create \
-            --title "Update SpeedBench AL reference matrix (auto)" \
-            --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Category: \`${{ inputs.category }}\`, MTP: \`${{ inputs.mtp-list }}\`, thinking: \`${{ inputs.thinking-modes }}\`, output_len: \`${{ inputs.output-len }}\`. Please review the measured values before merging." \
+            --title "Update SpeedBench AL reference matrix for ${{ inputs.model-prefix }} (auto)" \
+            --body "Auto-generated by the SpeedBench AL Collection workflow (run ${{ github.run_id }}). Model: \`${{ inputs.model }}\`, category: \`${{ inputs.category }}\`, MTP: \`${{ inputs.mtp-list }}\`, thinking: \`${{ inputs.thinking-modes }}\`, output_len: \`${{ inputs.output-len }}\`. Please review the measured values before merging." \
             --base main \
             --head "$BRANCH"
 
@@ -191,7 +205,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: speedbench_server_logs
+          name: speedbench_server_logs-${{ inputs.model-prefix }}
           path: speedbench_results/server_*.log
           if-no-files-found: ignore
 
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
similarity index 95%
rename from benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
rename to benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
index 572801b2c3..7e39c32b3c 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+++ b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
@@ -13,7 +13,7 @@
 #
 # Usage (inside the vLLM container, on a B300 node):
 #   export MODEL=/data/models/dsv4-pro
-#   bash benchmarks/single_node/dsv4_fp4_b300_vllm_speedbench_matrix.sh
+#   bash benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
 #
 # Tunables (env):
 #   MTP_LIST          space-separated MTP levels   (default "1 2 3 4 5 6 7 8")
@@ -23,7 +23,7 @@
 #   OUT_YAML          output matrix path            (default $RESULTS_DIR/speedbench-reference-al.yaml)
 
 set -uo pipefail
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 MODEL="${MODEL:?MODEL env var required (e.g. /data/models/dsv4-pro)}"
 # Serve from the local weights dir resolved by the launcher (MODEL_PATH points
@@ -39,6 +39,9 @@ PORT="${PORT:-8888}"
 MTP_LIST="${MTP_LIST:-1 2 3 4 5 6 7 8}"
 THINKING_MODES="${THINKING_MODES:-off on}"
 CATEGORY="${CATEGORY:-coding}"
+# Top-level key in the emitted YAML matrix. Derived from the model by the
+# workflow (e.g. deepseek-v4-pro); falls back to the model basename, lowercased.
+MODEL_KEY="${MODEL_KEY:-$(basename "$SERVE_MODEL" | tr '[:upper:]' '[:lower:]')}"
 SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}"
 CONCURRENCY="${CONCURRENCY:-1}"
 TEMPERATURE="${TEMPERATURE:-1.0}"
@@ -315,11 +318,11 @@ emit_mode_block() {
     echo "# Acceptance Length (AL) reference values measured with SPEED-Bench."
     echo "# dataset: $CATEGORY | temperature: $TEMPERATURE | output_len: $SPEEDBENCH_OUTPUT_LEN"
     echo "# thinking_on chat_template_kwargs: $CHAT_TEMPLATE_KWARGS_ON"
-    echo "# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens."
-    echo "# Auto-generated by dsv4_fp4_b300_vllm_speedbench_matrix.sh (speedbench-al.yml)."
+    echo "# Measured on $MODEL_KEY (B300, vLLM MTP), per num_speculative_tokens."
+    echo "# Auto-generated by benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh (speedbench-al.yml)."
     echo "#"
     echo "# key = num_speculative_tokens (MTP level); value = golden AL"
-    echo "deepseek-v4-pro:"
+    echo "${MODEL_KEY}:"
     if [[ " $THINKING_MODES " == *" on "* ]]; then
         echo "  thinking_on:"
         emit_mode_block on

From b2dd50adb1e542c2c933a0cb90b4e1dbe9032196 Mon Sep 17 00:00:00 2001
From: hjjq <50634613+hjjq@users.noreply.github.com>
Date: Thu, 4 Jun 2026 16:36:10 -0400
Subject: [PATCH 04/27] feat: add SpeedBench AL eval validation

---
 benchmarks/benchmark_lib.sh                   | 214 +++++++++++++
 .../fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh   |   1 +
 .../fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh   |   1 +
 utils/collect_eval_results.py                 |  88 +++++-
 utils/evals/EVALS.md                          |   6 +-
 utils/evals/speedbench_al.py                  | 298 ++++++++++++++++++
 utils/evals/test_speedbench_al.py             | 122 +++++++
 utils/evals/validate_scores.py                |  37 +++
 8 files changed, 752 insertions(+), 15 deletions(-)
 create mode 100644 utils/evals/speedbench_al.py
 create mode 100644 utils/evals/test_speedbench_al.py

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index e3080b4bfa..7b277cd28e 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -694,6 +694,219 @@ setup_eval_context() {
     export EVAL_MAX_MODEL_LEN
 }
 
+# ------------------------------
+# SpeedBench acceptance-length eval helpers
+# ------------------------------
+
+_prometheus_metric_sum() {
+    # Usage: _prometheus_metric_sum PORT METRIC_NAME
+    # Scrapes the server's /metrics endpoint and prints the sum of every sample
+    # named METRIC_NAME (across all label sets) with 10 decimal places.
+    # Returns 1 when the scrape fails or no numeric sample has that exact name.
+    local port="$1" wanted="$2" scrape
+    scrape=$(curl -fsS "http://0.0.0.0:${port}/metrics" 2>/dev/null) || return 1
+    awk -v wanted="$wanted" '
+        /^#/ { next }
+        {
+            # Strip the {label=...} suffix so only the bare metric name remains.
+            sample = $1
+            sub(/\{.*/, "", sample)
+            if (sample != wanted) next
+            if ($NF !~ /^-?([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][-+]?[0-9]+)?$/) next
+            total += $NF
+            seen = 1
+        }
+        END {
+            if (!seen) exit 1
+            printf "%.10f\n", total
+        }
+    ' <<< "$scrape"
+}
+
+_speedbench_write_eval_result() {
+    # Usage: _speedbench_write_eval_result OUTPUT MODE MTP [AL] [ACCEPTED] [DRAFTS] [ERROR]
+    # Records one SpeedBench AL eval cell as JSON via `speedbench_al.py record`.
+    # Empty optional arguments are omitted from the command line. Best-effort:
+    # a failing recorder is reported on stderr but never fails the caller.
+    local output="$1"
+    local mode="$2"
+    local mtp="$3"
+    local al="${4:-}"
+    local accepted="${5:-}"
+    local drafts="${6:-}"
+    local error="${7:-}"
+    local speedbench_model="${MODEL_NAME:-${MODEL:-}}"
+
+    local record_cmd=(
+        python3 "$(pwd)/utils/evals/speedbench_al.py"
+        record
+        --output "$output"
+        --reference-yaml "benchmarks/speedbench-reference-al.yaml"
+        --model "$speedbench_model"
+        --model-prefix "${MODEL_PREFIX:-}"
+        --thinking-mode "$mode"
+        --num-speculative-tokens "$mtp"
+        --category "coding"
+        --output-len "4096"
+        --temperature "1.0"
+        --threshold-ratio "0.90"
+    )
+    # Only forward the measurements/error that were actually supplied.
+    if [[ -n "$al" ]]; then record_cmd+=(--acceptance-length "$al"); fi
+    if [[ -n "$accepted" ]]; then record_cmd+=(--accepted-tokens "$accepted"); fi
+    if [[ -n "$drafts" ]]; then record_cmd+=(--draft-tokens "$drafts"); fi
+    if [[ -n "$error" ]]; then record_cmd+=(--error "$error"); fi
+    # Still never fails the caller (echo returns 0), but a missing result file
+    # is now visible in the job log instead of being silently swallowed.
+    "${record_cmd[@]}" \
+        || echo "SpeedBench AL eval: failed to write result ${output}" >&2
+}
+
+_speedbench_reference_available() {
+    # Succeeds only when the reference YAML exists and the resolver finds an AL
+    # value for this (model, thinking MODE=$1, MTP=$2) cell; its JSON is discarded.
+    local thinking="$1" spec_tokens="$2"
+    local ref_yaml="benchmarks/speedbench-reference-al.yaml"
+    if [[ ! -f "$ref_yaml" ]]; then
+        return 1
+    fi
+    local resolve_cmd=(python3 "$(pwd)/utils/evals/speedbench_al.py" resolve
+        --reference-yaml "$ref_yaml" --model "${MODEL_NAME:-${MODEL:-}}"
+        --model-prefix "${MODEL_PREFIX:-}" --thinking-mode "$thinking"
+        --num-speculative-tokens "$spec_tokens" --threshold-ratio "0.90")
+    "${resolve_cmd[@]}" >/dev/null
+}
+
+_speedbench_prepare_dataset() {
+    # Ensures DIR=$1 holds qualitative.jsonl, generating it on first use.
+    # NOTE(review): prepare.py is fetched from the moving `main` ref and run as-is; consider pinning a commit.
+    local data_dir="$1" marker="$1/qualitative.jsonl"
+    [[ -f "$marker" ]] && return 0
+    mkdir -p "$data_dir"
+    python3 -m pip install -q datasets tiktoken
+    curl -LsSf https://raw.githubusercontent.com/NVIDIA-NeMo/Skills/refs/heads/main/nemo_skills/dataset/speed-bench/prepare.py \
+      | python3 - --config qualitative --output_dir "$data_dir"
+    [[ -f "$marker" ]]
+}
+
+run_speedbench_al_eval() {
+    # Runs SpeedBench (coding) against the live vLLM server for MTP configs and
+    # records the measured acceptance length (AL). Never fails the caller:
+    # every failure path writes an error record and returns 0.
+    local port="${PORT:-8888}"
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            # Tolerate a trailing --port with no value (no set -u abort, no endless loop).
+            --port) port="${2:-$port}"; shift $(( $# > 1 ? 2 : 1 )) ;;
+            *)
+                # Ignore other forwarded options, consuming a value if present.
+                if [[ $# -gt 1 && "$2" != --* ]]; then shift 2; else shift; fi
+                ;;
+        esac
+    done
+
+    local mtp="${SPEEDBENCH_NUM_SPEC_TOKENS:-${NUM_SPEC_TOKENS:-${SPECULATIVE_DRAFT_TOKENS:-2}}}"
+    local mode="off"
+    if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
+        mode="on"
+    fi
+
+    if [[ "${SPEC_DECODING:-none}" != "mtp" ]]; then
+        echo "SpeedBench AL eval: skipping non-MTP config (SPEC_DECODING=${SPEC_DECODING:-none})"
+        return 0
+    fi
+
+    if [[ -z "${EVAL_RESULT_DIR:-}" ]]; then
+        EVAL_RESULT_DIR="$(mktemp -d /tmp/eval_out-XXXXXX)"
+        export EVAL_RESULT_DIR
+    fi
+    # Single result path shared by every success and failure record below.
+    local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json"
+
+    # TODO: Add unified support for SGLang, TRT-LLM, and disagg (Dynamo).
+    if ! command -v vllm >/dev/null 2>&1; then
+        echo "SpeedBench AL eval: vllm CLI is not available for SpeedBench client" >&2
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "vllm CLI is not available for SpeedBench client"
+        return 0
+    fi
+
+    local speedbench_dir="${SPEEDBENCH_DIR:-$(pwd)/speed_bench_data}"
+    if ! _speedbench_prepare_dataset "$speedbench_dir"; then
+        echo "SpeedBench AL eval: SpeedBench dataset download failed" >&2
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "SpeedBench dataset download failed"
+        return 0
+    fi
+
+    if ! _speedbench_reference_available "$mode" "$mtp"; then
+        echo "SpeedBench AL eval: no reference for mode=${mode} mtp=${mtp}" >&2
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "No SpeedBench AL reference for this eval cell"
+        return 0
+    fi
+
+    local think_args=()
+    if [[ "$mode" == "on" ]]; then
+        think_args=(--chat-template-kwargs '{"thinking": true, "reasoning_effort": "high"}')
+    fi
+
+    # Snapshot the counters first so the AL covers only this SpeedBench run.
+    local accepted_before="" draft_before=""
+    accepted_before=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_accepted_tokens_total" 2>/dev/null || true)
+    draft_before=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_drafts_total" 2>/dev/null || true)
+    accepted_before="${accepted_before:-0}"
+    draft_before="${draft_before:-0}"
+
+    local raw_result_dir
+    raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)"
+    local bench_rc=0
+    local speedbench_model="${MODEL_NAME:-${MODEL:-}}"
+    local bench_cmd=(
+        vllm bench serve
+        --model "$speedbench_model"
+        --port "$port"
+        --dataset-name speed_bench
+        --dataset-path "$speedbench_dir"
+        --speed-bench-category coding
+        --speed-bench-output-len 4096
+        --num-prompts -1
+        --max-concurrency 1
+        --save-result
+        --result-dir "$raw_result_dir"
+        --result-filename "speedbench_al_${mode}_mtp${mtp}"
+        --trust-remote-code
+        --tokenizer-mode deepseek_v4
+        --temperature 1.0
+        ${think_args[@]+"${think_args[@]}"}  # empty-array safe under set -u on bash < 4.4
+    )
+
+    echo "SpeedBench AL eval: running mode=${mode} mtp=${mtp}"
+    "${bench_cmd[@]}" || bench_rc=$?
+    if [[ "$bench_rc" -ne 0 ]]; then
+        echo "SpeedBench AL eval: vllm bench serve failed with exit code ${bench_rc}" >&2
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "vllm bench serve failed with exit code ${bench_rc}"
+        rm -rf "$raw_result_dir" || true
+        return 0
+    fi
+
+    local accepted_after="" draft_after="" al="" delta_acc="" delta_draft=""
+    accepted_after=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_accepted_tokens_total" 2>/dev/null || true)
+    draft_after=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_drafts_total" 2>/dev/null || true)
+    if [[ -n "$accepted_after" && -n "$draft_after" ]]; then
+        delta_acc=$(awk "BEGIN {printf \"%d\", ${accepted_after} - ${accepted_before}}")
+        delta_draft=$(awk "BEGIN {printf \"%d\", ${draft_after} - ${draft_before}}")
+        if [[ "$delta_draft" -gt 0 ]]; then
+            al=$(awk "BEGIN {printf \"%.4f\", 1 + (${delta_acc} / ${delta_draft})}")
+        fi
+    fi
+
+    if [[ -z "$al" ]]; then
+        echo "SpeedBench AL eval: could not collect speculative acceptance metrics from server" >&2
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "$delta_acc" "$delta_draft" "Could not collect speculative acceptance metrics from server"
+    else
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "$al" "$delta_acc" "$delta_draft"
+    fi
+    rm -rf "$raw_result_dir" || true
+}
+
 run_lm_eval() {
     local port="${PORT:-8888}"
     local tasks_dir="${EVAL_TASKS_DIR:-utils/evals/gsm8k.yaml}"
@@ -876,6 +1089,7 @@ run_eval() {
     fi
 
     local eval_rc=0
+    run_speedbench_al_eval "${forwarded[@]}" || true
     case "$framework" in
         lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;;
         *)               echo "Unknown framework '${framework}'"; eval_rc=1 ;;
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh
index 6846223e8e..0f4eeb8600 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh
@@ -65,6 +65,7 @@ fi
 
 # use 2 speculative tokens for all configs for now
 NUM_SPEC_TOKENS=2
+export SPEEDBENCH_NUM_SPEC_TOKENS="$NUM_SPEC_TOKENS"
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh
index a5e7dd28cb..c2a3741250 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh
@@ -66,6 +66,7 @@ fi
 
 # use 2 speculative tokens for all configs for now
 NUM_SPEC_TOKENS=2
+export SPEEDBENCH_NUM_SPEC_TOKENS="$NUM_SPEC_TOKENS"
 
 start_gpu_monitor
 
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 18917447ec..f4bca741f6 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -36,8 +36,8 @@ def find_eval_sets(root: Path) -> List[Path]:
     return out
 
 
-def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
-    """Return (lm_eval_json) if present.
+def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], List[Path]]:
+    """Return (lm_eval_json, speedbench_al_jsons) if present.
     
     Checks immediate directory for result JSONs.
     """
@@ -46,7 +46,7 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
     ]
     
     lm_path = None
-    le_path = None
+    speedbench_paths: List[Path] = []
     
     for p in immediate_jsons:
         data = load_json(p)
@@ -57,8 +57,12 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
             # lm-eval harness - pick latest if multiple
             if lm_path is None or p.stat().st_mtime > lm_path.stat().st_mtime:
                 lm_path = p
-                
-    return lm_path, le_path
+
+        if 'speedbench_al_eval_version' in data:
+            speedbench_paths.append(p)
+
+    speedbench_paths.sort()
+    return lm_path, speedbench_paths
 
 
 def extract_lm_metrics(json_path: Path) -> List[Dict[str, Any]]:
@@ -145,6 +149,32 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]:
     return extracted
 
 
+def extract_speedbench_al_metrics(json_path: Path) -> List[Dict[str, Any]]:
+    """Turn one SpeedBench AL result JSON into a single-element metric list.
+
+    Returns [] when load_json yields nothing or the
+    ``speedbench_al_eval_version`` marker is absent.
+    """
+    payload = load_json(json_path) or {}
+    if 'speedbench_al_eval_version' not in payload:
+        return []
+
+    thinking = payload.get('thinking_mode', 'unknown')
+    spec_tokens = payload.get('num_speculative_tokens', 'unknown')
+    metric: Dict[str, Any] = {
+        'metric_type': 'speedbench_al',
+        'task': 'speedbench_al',
+        'task_label': f"speedbench_al/{thinking}/mtp{spec_tokens}",
+    }
+    for field in ('acceptance_length', 'reference_acceptance_length',
+                  'min_acceptance_length', 'threshold_ratio'):
+        metric[field] = payload.get(field)
+    metric.update(thinking_mode=thinking, num_speculative_tokens=spec_tokens)
+    metric.update({name: payload.get(name) for name in ('passed', 'error', 'model')})
+    metric['source'] = str(json_path)
+    return [metric]
+
+
 def pct(x: Any) -> str:
     """Format value as percentage."""
     try:
@@ -222,7 +252,7 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
         'dp_attention': str(dp_attention).lower(),
         'prefill_dp_attention': str(prefill_dp_attention).lower(),
         'decode_dp_attention': str(decode_dp_attention).lower(),
-        'task': m.get('task', 'unknown'),
+        'task': m.get('task_label') or m.get('task', 'unknown'),
         'em_strict': m.get('strict'),
         'em_strict_se': m.get('strict_se'),
         'em_flexible': m.get('flex'),
@@ -232,7 +262,18 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
     }
 
     # Add universal score field (primary metric for unified comparison)
-    if m.get('strict') is not None:
+    if m.get('metric_type') == 'speedbench_al':
+        row['score'] = m.get('acceptance_length')
+        row['score_name'] = 'acceptance_length'
+        row['score_se'] = None
+        row['speedbench_reference_acceptance_length'] = m.get('reference_acceptance_length')
+        row['speedbench_min_acceptance_length'] = m.get('min_acceptance_length')
+        row['speedbench_threshold_ratio'] = m.get('threshold_ratio')
+        row['speedbench_thinking_mode'] = m.get('thinking_mode')
+        row['speedbench_num_speculative_tokens'] = m.get('num_speculative_tokens')
+        row['speedbench_passed'] = m.get('passed')
+        row['speedbench_error'] = m.get('error')
+    elif m.get('strict') is not None:
         row['score'] = m.get('strict')
         row['score_name'] = 'em_strict'
         row['score_se'] = m.get('strict_se')
@@ -248,6 +289,24 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
     return row
 
 
+def score_cell(r: Dict[str, Any]) -> str:
+    """Render the score column: AL rows as 'value >= min (PASS|FAIL)', others via pct()/se()."""
+    if r.get('score_name') != 'acceptance_length':
+        return f"{pct(r['score'])}{se(r['score_se'])}"
+
+    measured = r.get('score')
+    if measured is None:
+        # No measurement was recorded for this cell.
+        return 'FAIL'
+    floor = r.get('speedbench_min_acceptance_length')
+    verdict = 'PASS' if r.get('speedbench_passed') else 'FAIL'
+    try:
+        threshold = '' if floor is None else f" >= {float(floor):.2f}"
+        return f"{float(measured):.2f}{threshold} ({verdict})"
+    except Exception:
+        return str(measured)
+
+
 def main():
     if len(sys.argv) < 3:
         print('Usage: collect_eval_results.py <results_dir> <exp_name> [sort_by: model_prefix|hw]')
@@ -259,13 +318,14 @@ def main():
     rows: List[Dict[str, Any]] = []
     for d in find_eval_sets(root):
         meta = load_json(d / 'meta_env.json') or {}
-        lm_path, le_path = detect_eval_jsons(d)
+        lm_path, speedbench_paths = detect_eval_jsons(d)
 
-        # Extract metrics (prefer lm-eval) - returns list for multi-task support
+        metrics_list: List[Dict[str, Any]] = []
+        # Extract metrics - lm-eval returns one row per task.
         if lm_path:
-            metrics_list = extract_lm_metrics(lm_path)
-        else:
-            continue
+            metrics_list.extend(extract_lm_metrics(lm_path))
+        for speedbench_path in speedbench_paths:
+            metrics_list.extend(extract_speedbench_al_metrics(speedbench_path))
 
         if not metrics_list:
             continue
@@ -332,7 +392,7 @@ def main():
                     r['conc'],
                     r['dp_attention'],
                     r['task'],
-                    f"{pct(r['score'])}{se(r['score_se'])}",
+                    score_cell(r),
                     f"{pct(r['em_strict'])}{se(r['em_strict_se'])}",
                     f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}",
                     r['n_eff'] or '',
@@ -367,7 +427,7 @@ def main():
                     r['decode_num_workers'],
                     r['conc'],
                     r['task'],
-                    f"{pct(r['score'])}{se(r['score_se'])}",
+                    score_cell(r),
                     f"{pct(r['em_strict'])}{se(r['em_strict_se'])}",
                     f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}",
                     r['n_eff'] or '',
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index a59bdb40c3..9de27e765f 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -34,6 +34,7 @@ All benchmark scripts in `benchmarks/` follow one of two flows:
 # 3. run_benchmark_serving (skipped automatically when EVAL_ONLY=true)
 # 4. Run evals:
 if [ "${RUN_EVAL}" = "true" ]; then
+    # MTP evals also run SpeedBench AL validation first when a reference exists.
     run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary  # Writes meta_env.json and moves artifacts
 fi
@@ -51,6 +52,7 @@ Key eval functions in `benchmarks/benchmark_lib.sh`:
 | Function | Description |
 |----------|-------------|
 | `run_eval` | Unified entrypoint - dispatches to framework-specific runner |
+| `run_speedbench_al_eval` | Runs SpeedBench on MTP eval jobs, records measured acceptance length, and defers threshold failure to `validate_scores.py` |
 | `run_lm_eval` | Runs lm-eval harness against the OpenAI-compatible endpoint |
 | `append_lm_eval_summary` | Writes `meta_env.json` and moves eval artifacts to workspace |
 | `_install_lm_eval_deps` | Installs lm-eval dependencies |
@@ -131,9 +133,11 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results |
 | `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by `compute_eval_context_length`) |
 | `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval |
+| `SPEEDBENCH_DIR` | `$(pwd)/speed_bench_data` | Prepared SpeedBench dataset directory; resolves to `/workspace/speed_bench_data` or `/ix/speed_bench_data` through the runner's container workdir |
+| `SPEEDBENCH_NUM_SPEC_TOKENS` | script-provided or `2` | MTP level used to select the reference AL row |
 
 ### Score validation
-`utils/evals/validate_scores.py` checks eval results against thresholds in `utils/evals/thresholds.json`. Runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
+`utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
 
 ### Adding a new eval task
 
diff --git a/utils/evals/speedbench_al.py b/utils/evals/speedbench_al.py
new file mode 100644
index 0000000000..4a5fd5d6d0
--- /dev/null
+++ b/utils/evals/speedbench_al.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+"""SpeedBench acceptance-length reference and result helpers."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Any
+
+
+MODEL_PREFIX_ALIASES = {
+    "dsv4": "deepseek-v4-pro",
+}
+
+
+def _parse_scalar(value: str) -> Any:
+    """Convert a raw YAML scalar to None, an unquoted str, an int, or a float.
+
+    Null spellings and N/A markers map to None; text that is not numeric is
+    returned unchanged.
+    """
+    text = value.strip()
+    if text in {"", "null", "None", "~", "N/A", "NA", "n/a", "na"}:
+        return None
+    for quote in ('"', "'"):
+        if text.startswith(quote) and text.endswith(quote):
+            return text[1:-1]
+    try:
+        return int(text) if re.match(r"^-?\d+$", text) else float(text)
+    except ValueError:
+        return text
+
+
+def _load_simple_reference_yaml(path: Path) -> dict[str, Any]:
+    """Dependency-free reader for the model -> mode -> MTP-level reference file.
+
+    Only 0/2/4-space indented ``key: value`` lines are understood; comments,
+    blank lines and anything else are ignored.
+    """
+    reference: dict[str, Any] = {}
+    model: str | None = None
+    mode: str | None = None
+
+    for raw in path.read_text().splitlines():
+        content = raw.split("#", 1)[0].strip()
+        if not content or ":" not in content:
+            continue
+        depth = len(raw) - len(raw.lstrip(" "))
+        name, _, scalar = content.partition(":")
+        name = name.strip().strip("'\"")
+
+        if depth == 0:
+            model, mode = name, None
+            reference.setdefault(model, {})
+        elif depth == 2 and model is not None:
+            mode = name
+            reference[model].setdefault(mode, {})
+        elif depth == 4 and None not in (model, mode):
+            reference[model][mode][name] = _parse_scalar(scalar)
+
+    return reference
+
+
+def load_reference(path: Path) -> dict[str, Any]:
+    """Load the reference YAML with PyYAML when installed, else with the built-in parser."""
+    try:
+        import yaml  # type: ignore
+    except ImportError:
+        return _load_simple_reference_yaml(path)
+    with path.open() as stream:
+        document = yaml.safe_load(stream) or {}
+    if isinstance(document, dict):
+        return document
+    raise ValueError(f"{path} must contain a mapping at the top level")
+
+
+def normalize_key(value: str) -> str:
+    """Canonical key: last '/' segment, lowercased, with other characters collapsed to '-'."""
+    tail = value.strip().rsplit("/", 1)[-1].lower().replace("_", "-")
+    slug = re.sub(r"[^a-z0-9.+-]+", "-", tail)
+    return slug.strip("-")
+
+
+def model_candidates(model: str, model_prefix: str | None = None) -> list[str]:
+    """List reference keys to try, in priority order.
+
+    The model prefix is tried before the model name; each is normalized and
+    passed through MODEL_PREFIX_ALIASES. Empty results and duplicates are
+    dropped.
+    """
+    ordered: dict[str, None] = {}
+    for raw in (model_prefix, model):
+        if not raw:
+            continue
+        key = normalize_key(raw)
+        # dict keys keep first-insertion order, which de-duplicates for free.
+        ordered.setdefault(MODEL_PREFIX_ALIASES.get(key, key), None)
+    return [name for name in ordered if name]
+
+
+def normalize_mode(thinking_mode: str) -> str:
+    """Map 'on'/'off' (or 'thinking_on'/'thinking-off' style spellings) to the YAML mode key."""
+    mode = thinking_mode.strip().lower().replace("-", "_")
+    # Also accept the canonical key itself, which is what the '-' -> '_' step implies.
+    if mode in {"on", "off", "thinking_on", "thinking_off"}:
+        return mode if mode.startswith("thinking_") else f"thinking_{mode}"
+    raise ValueError("SpeedBench thinking mode must be 'on' or 'off'")
+
+
+def lookup_reference(
+    reference: dict[str, Any],
+    model: str,
+    model_prefix: str | None,
+    thinking_mode: str,
+    num_speculative_tokens: int,
+) -> tuple[str, str, float]:
+    """Return (model_key, mode_key, reference AL); raise KeyError if no candidate matches."""
+    # Top-level YAML keys are normalized the same way as the candidate names.
+    by_model = {normalize_key(str(name)): block for name, block in reference.items()}
+    mode_key = normalize_mode(thinking_mode)
+    names = model_candidates(model, model_prefix)
+
+    for name in names:
+        block = by_model.get(name)
+        cells = block.get(mode_key) if isinstance(block, dict) else None
+        if not isinstance(cells, dict):
+            continue
+        # PyYAML produces int MTP keys; the fallback parser produces str keys.
+        raw = cells.get(num_speculative_tokens, cells.get(str(num_speculative_tokens)))
+        try:
+            if raw is not None:
+                return name, mode_key, float(raw)
+        except (TypeError, ValueError):
+            pass
+
+    listed = ", ".join(names) or "<none>"
+    raise KeyError(
+        "No SpeedBench AL reference for "
+        f"model candidates [{listed}], mode {mode_key}, MTP {num_speculative_tokens}"
+    )
+
+
+def _optional_float(value: str | None) -> float | None:
+    """Parse an optional CLI number as float; None, blank and 'N/A'-style markers give None."""
+    absent = value in {None, "", "None", "null", "N/A"}
+    return None if absent else float(value)
+
+
+def _optional_int(value: str | None) -> int | None:
+    """Parse an optional CLI count, truncating float text such as '13.0'; markers give None."""
+    absent = value in {None, "", "None", "null", "N/A"}
+    return None if absent else int(float(value))
+
+
+def build_result(args: argparse.Namespace) -> dict[str, Any]:
+    """Build one AL result record; it passes only with no error and AL >= reference * ratio."""
+    reference_al: float | None = None
+    min_acceptance_length: float | None = None
+    model_key: str | None = None
+    mode_key = normalize_mode(args.thinking_mode)
+    # Treat an empty --error as "no error" so it cannot fail the cell silently.
+    error: str | None = args.error or None
+
+    reference_path = Path(args.reference_yaml) if args.reference_yaml else None
+    if reference_path is not None and not reference_path.exists():
+        error = error or f"Reference YAML not found: {reference_path}"
+    elif reference_path is not None:
+        try:
+            model_key, mode_key, reference_al = lookup_reference(
+                load_reference(reference_path), args.model, args.model_prefix,
+                args.thinking_mode, args.num_speculative_tokens,
+            )
+            min_acceptance_length = reference_al * args.threshold_ratio
+        except KeyError as exc:  # str(KeyError) would wrap the message in repr quotes
+            error = error or str(exc.args[0] if exc.args else exc)
+        except Exception as exc:  # noqa: BLE001 - recorded for CI artifacts
+            error = error or str(exc)
+
+    acceptance_length = _optional_float(args.acceptance_length)
+    accepted_tokens = _optional_int(args.accepted_tokens)
+    draft_tokens = _optional_int(args.draft_tokens)
+    passed = (
+        error is None
+        and acceptance_length is not None
+        and min_acceptance_length is not None
+        and acceptance_length >= min_acceptance_length
+    )
+
+    result = {
+        "speedbench_al_eval_version": 1,
+        "task": "speedbench_al",
+        "model": args.model,
+        "model_key": model_key,
+        "model_prefix": args.model_prefix,
+        "thinking_mode": mode_key,
+        "num_speculative_tokens": args.num_speculative_tokens,
+        "category": args.category,
+        "output_len": args.output_len,
+        "temperature": args.temperature,
+        "acceptance_length": acceptance_length,
+        "accepted_tokens": accepted_tokens,
+        "draft_tokens": draft_tokens,
+        "reference_acceptance_length": reference_al,
+        "threshold_ratio": args.threshold_ratio,
+        "min_acceptance_length": min_acceptance_length,
+        "passed": passed,
+    }
+    if error:
+        result["error"] = error
+    return result
+
+
+def cmd_resolve(args: argparse.Namespace) -> int:
+    """Print the matched reference cell and its derived minimum AL as one JSON line."""
+    # Load/lookup errors are not handled here; main() reports them and exits 1.
+    reference = load_reference(Path(args.reference_yaml))
+    model_key, mode_key, reference_al = lookup_reference(
+        reference, args.model, args.model_prefix,
+        args.thinking_mode, args.num_speculative_tokens,
+    )
+    cell = dict(
+        model_key=model_key,
+        thinking_mode=mode_key,
+        num_speculative_tokens=args.num_speculative_tokens,
+        reference_acceptance_length=reference_al,
+        threshold_ratio=args.threshold_ratio,
+        min_acceptance_length=reference_al * args.threshold_ratio,
+    )
+    print(json.dumps(cell, sort_keys=True))
+    return 0
+
+
+def cmd_record(args: argparse.Namespace) -> int:
+    """Write the AL result JSON to --output; return 1 only when --exit-status is set and the cell failed."""
+    result = build_result(args)
+    output = Path(args.output)
+    output.parent.mkdir(parents=True, exist_ok=True)  # a missing result dir must not drop the record
+    output.write_text(json.dumps(result, indent=2, sort_keys=True) + "\n")
+    status = "PASS" if result["passed"] else "FAIL"
+    actual = result.get("acceptance_length")
+    minimum = result.get("min_acceptance_length")
+    print(
+        f"{status}: SpeedBench AL {actual} "
+        f"(min {minimum}, mode {result['thinking_mode']}, "
+        f"mtp {result['num_speculative_tokens']})"
+    )
+    return 1 if args.exit_status and not result["passed"] else 0
+
+
+def build_parser() -> argparse.ArgumentParser:  # CLI with `resolve` and `record` subcommands
+    parser = argparse.ArgumentParser(description=__doc__)
+    subparsers = parser.add_subparsers(dest="command", required=True)
+    # resolve: look up one reference cell; every lookup input is required.
+    resolve = subparsers.add_parser("resolve", help="Resolve a reference AL cell")
+    resolve.add_argument("--reference-yaml", required=True)
+    resolve.add_argument("--model", required=True)
+    resolve.add_argument("--model-prefix", default="")
+    resolve.add_argument("--thinking-mode", required=True)
+    resolve.add_argument("--num-speculative-tokens", type=int, required=True)
+    resolve.add_argument("--threshold-ratio", type=float, default=0.90)
+    resolve.set_defaults(func=cmd_resolve)
+    # record: write a result JSON; the reference YAML, measurements and error are optional.
+    record = subparsers.add_parser("record", help="Write a compact AL eval result")
+    record.add_argument("--output", required=True)
+    record.add_argument("--reference-yaml", default="")
+    record.add_argument("--model", required=True)
+    record.add_argument("--model-prefix", default="")
+    record.add_argument("--thinking-mode", required=True)
+    record.add_argument("--num-speculative-tokens", type=int, required=True)
+    record.add_argument("--category", default="coding")
+    record.add_argument("--output-len", type=int, default=4096)
+    record.add_argument("--temperature", type=float, default=1.0)
+    record.add_argument("--threshold-ratio", type=float, default=0.90)
+    record.add_argument("--acceptance-length", default=None)
+    record.add_argument("--accepted-tokens", default=None)
+    record.add_argument("--draft-tokens", default=None)
+    record.add_argument("--error", default=None)
+    record.add_argument("--exit-status", action="store_true")
+    record.set_defaults(func=cmd_record)
+
+    return parser
+
+
+def main() -> int:
+    """Run the selected subcommand; any exception becomes an 'ERROR: ...' stderr line and exit code 1."""
+    args = build_parser().parse_args()
+    try:
+        return args.func(args)
+    except Exception as exc:  # noqa: BLE001 - CLI should print concise failures
+        print(f"ERROR: {exc}", file=sys.stderr)
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
new file mode 100644
index 0000000000..ab480cf782
--- /dev/null
+++ b/utils/evals/test_speedbench_al.py
@@ -0,0 +1,122 @@
+import argparse
+import json
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+
+from collect_eval_results import build_row, extract_speedbench_al_metrics, score_cell
+from speedbench_al import build_result, load_reference, lookup_reference
+from validate_scores import validate_speedbench_al
+
+
+def test_lookup_reference_uses_model_prefix_alias(tmp_path: Path) -> None:
+    ref = tmp_path / "speedbench-reference-al.yaml"  # temporary reference file for this test
+    ref.write_text(
+        """
+deepseek-v4-pro:
+  thinking_on:
+    2: 2.75
+  thinking_off:
+    2: 2.40
+"""
+    )
+
+    data = load_reference(ref)
+    model_key, mode_key, value = lookup_reference(
+        data,
+        model="deepseek-ai/DeepSeek-V4-Pro",  # differs from the key written in the YAML above
+        model_prefix="dsv4",
+        thinking_mode="on",  # short form; the YAML section is named "thinking_on"
+        num_speculative_tokens=2,
+    )
+
+    assert model_key == "deepseek-v4-pro"  # resolved to the key used in the YAML
+    assert mode_key == "thinking_on"
+    assert value == 2.75  # the thinking_on entry for 2 speculative tokens
+
+
+def test_build_result_records_threshold_pass(tmp_path: Path) -> None:
+    ref = tmp_path / "speedbench-reference-al.yaml"  # temporary reference file for this test
+    ref.write_text(
+        """
+deepseek-v4-pro:
+  thinking_on:
+    2: 2.50
+"""
+    )
+    args = argparse.Namespace(
+        reference_yaml=str(ref),
+        model="deepseek-ai/DeepSeek-V4-Pro",
+        model_prefix="dsv4",
+        thinking_mode="on",
+        num_speculative_tokens=2,
+        category="coding",
+        output_len=4096,
+        temperature=1.0,
+        threshold_ratio=0.90,
+        acceptance_length="2.30",  # a string, as the untyped CLI option would supply it
+        accepted_tokens="13",
+        draft_tokens="10",
+        error=None,
+    )
+
+    result = build_result(args)
+
+    assert result["reference_acceptance_length"] == 2.50  # value read from the YAML above
+    assert result["min_acceptance_length"] == 2.25  # consistent with 0.90 * 2.50
+    assert result["passed"] is True  # 2.30 is not below the 2.25 minimum
+
+
+def test_validate_speedbench_al_fails_below_minimum() -> None:
+    ok, checked = validate_speedbench_al(
+        {
+            "speedbench_al_eval_version": 1,  # marker key that identifies a SpeedBench AL result
+            "task": "speedbench_al",
+            "thinking_mode": "thinking_on",
+            "num_speculative_tokens": 2,
+            "acceptance_length": 2.0,  # below min_acceptance_length
+            "min_acceptance_length": 2.25,
+            "passed": False,
+        },
+        "results_speedbench_al.json",  # source label, only used in the failure message
+    )
+
+    assert checked == 1  # the dict was recognized and counted as one checked cell
+    assert ok is False
+
+
+def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None:
+    result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json"  # temporary result file
+    result_path.write_text(
+        json.dumps(
+            {
+                "speedbench_al_eval_version": 1,
+                "task": "speedbench_al",
+                "model": "deepseek-ai/DeepSeek-V4-Pro",
+                "thinking_mode": "thinking_on",
+                "num_speculative_tokens": 2,
+                "acceptance_length": 2.3,
+                "reference_acceptance_length": 2.5,
+                "min_acceptance_length": 2.25,
+                "threshold_ratio": 0.9,
+                "passed": True,
+            }
+        )
+    )
+    metrics = extract_speedbench_al_metrics(result_path)
+    row = build_row(
+        {
+            "infmax_model_prefix": "dsv4",
+            "hw": "b300",
+            "framework": "vllm",
+            "precision": "fp4",
+            "spec_decoding": "mtp",
+        },
+        metrics[0],  # first entry extracted from the result file
+    )
+
+    assert row["task"] == "speedbench_al/thinking_on/mtp2"  # task/mode/mtp joined with slashes
+    assert row["score_name"] == "acceptance_length"
+    assert score_cell(row) == "2.30 >= 2.25 (PASS)"  # actual vs. minimum, two decimals each
diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py
index 85433ec4bf..c85becd06b 100644
--- a/utils/evals/validate_scores.py
+++ b/utils/evals/validate_scores.py
@@ -23,6 +23,35 @@ def load_thresholds(path: str) -> dict[str, float]:
         return json.load(f)
 
 
+def validate_speedbench_al(data: dict, source: str) -> tuple[bool, int]:
+    """Validate a compact SpeedBench AL result; return (ok, number of cells checked)."""
+    if "speedbench_al_eval_version" not in data:
+        return False, 0  # not a SpeedBench AL result: nothing was checked
+
+    actual = data.get("acceptance_length")
+    minimum = data.get("min_acceptance_length")
+    passed = data.get("passed")
+    label = (
+        f"{data.get('task', 'speedbench_al')} "
+        f"{data.get('thinking_mode', 'unknown')} "
+        f"mtp{data.get('num_speculative_tokens', 'unknown')}"
+    )
+
+    if passed is True and actual is not None and minimum is not None:  # float(None) would raise
+        print(f"PASS: {label} AL = {float(actual):.4f} (>= {float(minimum):.4f})")
+        return True, 1
+
+    if isinstance(actual, (int, float)) and isinstance(minimum, (int, float)):
+        print(
+            f"FAIL: {label} AL = {actual:.4f} (< {minimum:.4f})",
+            file=sys.stderr,
+        )
+    else:
+        error = data.get("error") or "missing acceptance length or threshold"  # also covers "error": null
+        print(f"FAIL: {label} in {source}: {error}", file=sys.stderr)
+    return False, 1
+
+
 def main() -> int:
     parser = argparse.ArgumentParser(description="Validate eval scores")
     parser.add_argument(
@@ -63,6 +92,14 @@ def main() -> int:
     for f in sorted(glob.glob(args.results_glob)):
         with open(f) as fh:
             data = json.load(fh)
+
+        speedbench_ok, speedbench_checked = validate_speedbench_al(data, f)
+        if speedbench_checked:
+            checked += speedbench_checked
+            if not speedbench_ok:
+                failed = True
+            continue
+
         for task, metrics in data.get("results", {}).items():
             min_score = thresholds.get(task, args.min_score)
             for name, val in metrics.items():

From 4d72cdb0faa4e849ca7de7b19ab53cd37d8e5941 Mon Sep 17 00:00:00 2001
From: hjjq <50634613+hjjq@users.noreply.github.com>
Date: Fri, 5 Jun 2026 11:43:39 -0700
Subject: [PATCH 05/27] test: add SpeedBench AL reference handling

---
 benchmarks/speedbench-reference-al.yaml | 29 +++++++++++++++++++++++++
 utils/collect_eval_results.py           |  8 ++++---
 utils/evals/test_speedbench_al.py       | 29 ++++++++++++++++++++++++-
 3 files changed, 62 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/speedbench-reference-al.yaml

diff --git a/benchmarks/speedbench-reference-al.yaml b/benchmarks/speedbench-reference-al.yaml
new file mode 100644
index 0000000000..b3dbf441d1
--- /dev/null
+++ b/benchmarks/speedbench-reference-al.yaml
@@ -0,0 +1,29 @@
+# Acceptance Length (AL) reference values measured with SPEED-Bench.
+# dataset: coding | temperature: 1.0 | output_len: 4096
+# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens.
+#
+# Two modes are reported:
+#   thinking_on  - reasoning enabled; this is the PRODUCTION configuration and
+#                  the golden reference used for synthetic-acceptance modeling.
+#   thinking_off - reasoning disabled; provided for comparison only.
+#
+# key = num_speculative_tokens (MTP level); value = golden AL
+deepseek-v4-pro:
+  thinking_on:
+    1: 1.79
+    2: 2.27
+    3: 2.47
+    4: 2.54
+    5: 2.52
+    6: 2.54
+    7: 2.54
+    8: 2.56
+  thinking_off:
+    1: 1.92
+    2: 2.60
+    3: 2.97
+    4: 3.04
+    5: 3.13
+    6: 3.08
+    7: 3.13
+    8: 3.12
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index f4bca741f6..f6a8c2031a 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -41,9 +41,11 @@ def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], List[Path]]:
     
     Checks immediate directory for result JSONs.
     """
-    immediate_jsons = list(d.glob('results*.json')) + [
-        p for p in d.glob('*.json') if p.name != 'meta_env.json'
-    ]
+    immediate_jsons = sorted(
+        set(d.glob('results*.json')).union(
+            p for p in d.glob('*.json') if p.name != 'meta_env.json'
+        )
+    )
     
     lm_path = None
     speedbench_paths: List[Path] = []
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
index ab480cf782..2665609c3e 100644
--- a/utils/evals/test_speedbench_al.py
+++ b/utils/evals/test_speedbench_al.py
@@ -6,7 +6,12 @@
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
 
-from collect_eval_results import build_row, extract_speedbench_al_metrics, score_cell
+from collect_eval_results import (
+    build_row,
+    detect_eval_jsons,
+    extract_speedbench_al_metrics,
+    score_cell,
+)
 from speedbench_al import build_result, load_reference, lookup_reference
 from validate_scores import validate_speedbench_al
 
@@ -120,3 +125,25 @@ def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None:
     assert row["task"] == "speedbench_al/thinking_on/mtp2"
     assert row["score_name"] == "acceptance_length"
     assert score_cell(row) == "2.30 >= 2.25 (PASS)"
+
+
+def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> None:
+    result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json"  # matches "results*.json" and "*.json"
+    result_path.write_text(
+        json.dumps(
+            {
+                "speedbench_al_eval_version": 1,
+                "task": "speedbench_al",
+                "thinking_mode": "thinking_on",
+                "num_speculative_tokens": 2,
+                "acceptance_length": 2.3,
+                "min_acceptance_length": 2.25,
+                "passed": True,
+            }
+        )
+    )
+
+    lm_path, speedbench_paths = detect_eval_jsons(tmp_path)
+
+    assert lm_path is None  # only a SpeedBench result was written to the directory
+    assert speedbench_paths == [result_path]  # listed exactly once, not once per glob

From f40d6f276de716cfdc554f1604e5528629557318 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Fri, 5 Jun 2026 12:48:15 -0700
Subject: [PATCH 06/27] Add multi-framework SpeedBench AL metrics

---
 benchmarks/benchmark_lib.sh                   | 396 ++++++++++++++++--
 .../multi_node/amd_utils/server_sglang.sh     |  13 +
 .../multi_node/amd_utils/server_vllm.sh       |  12 +
 .../single_node/dsv4_fp4_mi355x_sglang_mtp.sh |   2 +
 .../fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh    |   1 +
 .../fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh |   2 +
 .../fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh    |   1 +
 .../fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh |   1 +
 runners/launch_b300-nv.sh                     |   2 +-
 utils/collect_eval_results.py                 |  10 +
 utils/evals/EVALS.md                          |   6 +
 utils/evals/speedbench_al.py                  |  12 +
 utils/evals/test_speedbench_al.py             |  18 +
 13 files changed, 439 insertions(+), 37 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 7b277cd28e..0b917ddb09 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -698,29 +698,286 @@ setup_eval_context() {
 # SpeedBench acceptance-length eval helpers
 # ------------------------------
 
-_prometheus_metric_sum() {
-    local port="$1"
-    local name="$2"
-    local metrics
-    metrics=$(curl -fsS "http://0.0.0.0:${port}/metrics" 2>/dev/null) || return 1
+_prometheus_metric_values_from_text() {
+    local name="$1"
     awk -v name="$name" '
         /^#/ { next }
         {
             metric = $1
             sub(/\{.*/, "", metric)
             if (metric == name && $NF ~ /^-?([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][-+]?[0-9]+)?$/) {
-                sum += $NF
+                print $NF
                 found = 1
             }
         }
         END {
-            if (found) {
-                printf "%.10f\n", sum
-            } else {
+            if (!found) {
+                exit 1
+            }
+        }
+    '
+}
+
+_prometheus_metric_values_url() {
+    # Scrape URL $1 and print each sample value of metric $2, one per line.
+    # Returns 1 when the scrape fails; the curl timeout defaults to 10 seconds.
+    local url="$1" name="$2" body
+    body=$(curl -fsS --max-time "${SPEEDBENCH_METRICS_CURL_TIMEOUT:-10}" "$url" 2>/dev/null) || return 1
+    _prometheus_metric_values_from_text "$name" <<< "$body"
+}
+
+_prometheus_metric_sum_url() {  # print the sum of all samples of metric $2 scraped from URL $1
+    local url="$1"
+    local name="$2"
+    local values
+    values=$(_prometheus_metric_values_url "$url" "$name") || return 1  # propagate a failed scrape
+    awk '
+        { sum += $1; found = 1 }
+        END {
+            if (!found) {
                 exit 1
             }
+            printf "%.10f\n", sum
+        }
+    ' <<< "$values"
+}
+
+_prometheus_metric_avg_url() {
+    # Print the arithmetic mean of all sample values of metric $2 at URL $1.
+    # Returns 1 when the values cannot be fetched.
+    local url="$1" name="$2" samples
+    samples=$(_prometheus_metric_values_url "$url" "$name") || return 1
+    awk '
+        { sum += $1; count += 1 }
+        END {
+            if (count == 0) {
+                exit 1
+            }
+            printf "%.10f\n", sum / count
+        }
+    ' <<< "$samples"
+}
+
+_prometheus_metric_sum() {
+    # Sum metric $2 from the /metrics endpoint on port $1 of address 0.0.0.0.
+    local port="$1" name="$2"
+    _prometheus_metric_sum_url "http://0.0.0.0:${port}/metrics" "$name"
+}
+
+_speedbench_normalize_metrics_url() {  # echo a full http(s) /metrics URL for one endpoint spec ($1)
+    local endpoint="$1"
+    endpoint="${endpoint%,}"  # drop one trailing comma
+    endpoint="${endpoint%/}"  # drop one trailing slash
+    [[ -z "$endpoint" ]] && return 0  # empty spec: print nothing
+
+    if [[ "$endpoint" =~ ^https?:// ]]; then  # already carries a scheme
+        if [[ "$endpoint" == */metrics || "$endpoint" == */metrics\?* ]]; then
+            echo "$endpoint"
+        else
+            echo "${endpoint}/metrics"
+        fi
+    elif [[ "$endpoint" =~ ^[0-9]+$ ]]; then  # bare port number
+        echo "http://0.0.0.0:${endpoint}/metrics"
+    elif [[ "$endpoint" =~ ^:[0-9]+$ ]]; then  # ":port" form
+        echo "http://0.0.0.0${endpoint}/metrics"
+    elif [[ "$endpoint" == */metrics || "$endpoint" == */metrics\?* ]]; then  # scheme-less, path already present
+        echo "http://${endpoint}"
+    else  # anything else: prefix http:// and append /metrics
+        echo "http://${endpoint}/metrics"
+    fi
+}
+
+_speedbench_metric_urls() {
+    # Print one normalized /metrics URL per line for the endpoints to scrape.
+    # Precedence: SPEEDBENCH_DECODE_METRICS_URLS, then SPEEDBENCH_METRICS_URLS,
+    # then SPEEDBENCH_METRICS_PORTS; with none set, fall back to port $1.
+    local port="$1"
+    local raw="${SPEEDBENCH_DECODE_METRICS_URLS:-${SPEEDBENCH_METRICS_URLS:-}}"
+    local endpoint
+    local -a endpoints=()
+
+    [[ -n "$raw" ]] || raw="${SPEEDBENCH_METRICS_PORTS:-}"
+    if [[ -n "$raw" ]]; then
+        # Split on commas/whitespace with read -a instead of an unquoted expansion:
+        # URLs with a "?" query are glob patterns and could be expanded or dropped.
+        IFS=$', \t\n' read -r -d '' -a endpoints <<< "$raw" || true
+        for endpoint in ${endpoints[@]+"${endpoints[@]}"}; do
+            _speedbench_normalize_metrics_url "$endpoint"
+        done
+        return 0
+    fi
+
+    echo "http://0.0.0.0:${port}/metrics"
+}
+
+_speedbench_metric_sum() {  # print metric $2 summed over every endpoint from _speedbench_metric_urls
+    local port="$1"
+    local name="$2"
+    local url value
+    local total="0"
+    local found=0  # becomes 1 once any endpoint yields a value
+
+    while IFS= read -r url; do
+        [[ -z "$url" ]] && continue
+        value=$(_prometheus_metric_sum_url "$url" "$name" 2>/dev/null || true)  # a failed endpoint leaves value empty
+        if [[ -n "$value" ]]; then
+            total=$(awk -v a="$total" -v b="$value" 'BEGIN { printf "%.10f", a + b }')
+            found=1
+        fi
+    done < <(_speedbench_metric_urls "$port")
+
+    [[ "$found" -eq 1 ]] || return 1  # no endpoint reported the metric
+    awk -v total="$total" 'BEGIN { printf "%.10f\n", total }'
+}
+
+_speedbench_metric_avg() {  # print the mean of metric $2 over all samples from all endpoints
+    local port="$1"
+    local name="$2"
+    local url value
+    local total="0"
+    local count=0  # number of samples accumulated so far
+
+    while IFS= read -r url; do
+        [[ -z "$url" ]] && continue
+        while IFS= read -r value; do
+            [[ -z "$value" ]] && continue
+            total=$(awk -v a="$total" -v b="$value" 'BEGIN { printf "%.10f", a + b }')
+            count=$((count + 1))
+        done < <(_prometheus_metric_values_url "$url" "$name" 2>/dev/null || true)  # a failed endpoint adds no samples
+    done < <(_speedbench_metric_urls "$port")
+
+    [[ "$count" -gt 0 ]] || return 1  # nothing was collected
+    awk -v total="$total" -v count="$count" 'BEGIN { printf "%.10f\n", total / count }'
+}
+
+_speedbench_metric_endpoint_count() {
+    # Count the non-empty URLs that _speedbench_metric_urls emits for port $1.
+    local port="$1" line total=0
+    while IFS= read -r line; do
+        if [[ -n "$line" ]]; then total=$((total + 1)); fi
+    done < <(_speedbench_metric_urls "$port")
+    echo "$total"
+}
+
+_speedbench_metric_delta() {  # print $2 - $1 with 10 decimals; a negative result falls back to $2 itself
+    local before="$1"
+    local after="$2"
+    [[ -n "$before" && -n "$after" ]] || return 1  # both readings are required
+    awk -v before="$before" -v after="$after" '
+        BEGIN {
+            delta = after - before
+            if (delta < 0) {
+                delta = after
+            }
+            printf "%.10f\n", delta
         }
-    ' <<< "$metrics"
+    '
+}
+
+_speedbench_round_metric() {  # print $1 formatted with zero decimals (%.0f)
+    local value="$1"
+    [[ -n "$value" ]] || return 1  # an empty input is an error, not zero
+    awk -v value="$value" 'BEGIN { printf "%.0f\n", value }'
+}
+
+_speedbench_metrics_framework() {  # echo the metrics family: vllm, sglang, trtllm, or the lowercased raw name
+    local fw="${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-vllm}}"
+    fw="${fw,,}"  # lowercase
+    if [[ "$fw" == "dynamo" ]]; then
+        local inner="${SPEEDBENCH_DYNAMO_BACKEND_FRAMEWORK:-${DYNAMO_BACKEND_FRAMEWORK:-${DYNAMO_BACKEND:-}}}"
+        [[ -n "$inner" ]] && fw="dynamo-${inner,,}"  # without a backend hint, fw stays "dynamo"
+    fi
+
+    case "$fw" in
+        vllm|dynamo-vllm)
+            echo "vllm"
+            ;;
+        sglang|dynamo-sglang)
+            echo "sglang"
+            ;;
+        trt|trtllm|tensorrt-llm|tensorrt_llm|dynamo-trt|dynamo-trtllm|dynamo-tensorrt-llm|dynamo-tensorrt_llm)
+            echo "trtllm"
+            ;;
+        *)
+            echo "$fw"  # unrecognized names pass through unchanged
+            ;;
+    esac
+}
+
+_speedbench_metric_source_base() {  # echo "<$1>-prometheus", with a "dynamo-" prefix for dynamo setups
+    local framework="$1"
+    local configured="${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-$framework}}"
+    configured="${configured,,}"  # lowercase before the prefix match
+    if [[ "$configured" == dynamo* ]]; then
+        echo "dynamo-${framework}-prometheus"
+    else
+        echo "${framework}-prometheus"
+    fi
+}
+
+_speedbench_spec_counter_metric() {  # echo the counter name for framework $1 and kind $2
+    local framework="$1"
+    local kind="$2"
+    case "${framework}:${kind}" in
+        vllm:accepted)
+            echo "vllm:spec_decode_num_accepted_tokens_total"
+            ;;
+        vllm:proposed)
+            echo "vllm:spec_decode_num_draft_tokens_total"
+            ;;
+        vllm:verify)
+            echo "vllm:spec_decode_num_drafts_total"
+            ;;
+        trtllm:accepted)
+            echo "trtllm_spec_decode_num_accepted_tokens_total"
+            ;;
+        trtllm:proposed)
+            echo "trtllm_spec_decode_num_draft_tokens_total"
+            ;;
+        sglang:verify)  # the only sglang counter mapped here
+            echo "sglang:spec_verify_calls_total"
+            ;;
+        *)
+            return 1  # no counter is mapped for this framework/kind pair
+            ;;
+    esac
+}
+
+_speedbench_spec_gauge_metric() {  # echo the gauge name for framework $1 and kind $2
+    local framework="$1"
+    local kind="$2"
+    case "${framework}:${kind}" in
+        trtllm:acceptance_length)
+            echo "trtllm_spec_decode_acceptance_length"
+            ;;
+        sglang:acceptance_length)
+            echo "sglang:spec_accept_length"
+            ;;
+        sglang:draft_tokens_per_step)
+            echo "sglang:spec_num_draft_tokens"
+            ;;
+        *)
+            return 1  # no gauge is mapped for this framework/kind pair (vllm has none here)
+            ;;
+    esac
+}
+
+_speedbench_spec_counter_sum() {  # print the summed counter for framework $1, port $2, kind $3
+    local framework="$1"
+    local port="$2"
+    local kind="$3"
+    local metric
+    metric=$(_speedbench_spec_counter_metric "$framework" "$kind") || return 1  # unmapped pair
+    _speedbench_metric_sum "$port" "$metric"
+}
+
+_speedbench_spec_gauge_avg() {  # print the averaged gauge for framework $1, port $2, kind $3
+    local framework="$1"
+    local port="$2"
+    local kind="$3"
+    local metric
+    metric=$(_speedbench_spec_gauge_metric "$framework" "$kind") || return 1  # unmapped pair
+    _speedbench_metric_avg "$port" "$metric"
 }
 
 _speedbench_write_eval_result() {
@@ -729,8 +986,11 @@ _speedbench_write_eval_result() {
     local mtp="$3"
     local al="${4:-}"
     local accepted="${5:-}"
-    local drafts="${6:-}"
-    local error="${7:-}"
+    local verify_steps="${6:-}"
+    local proposed_drafts="${7:-}"
+    local framework="${8:-${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-}}}"
+    local metric_source="${9:-}"
+    local error="${10:-}"
     local speedbench_model="${MODEL_NAME:-${MODEL:-}}"
 
     local record_cmd=(
@@ -747,14 +1007,24 @@ _speedbench_write_eval_result() {
         --temperature "1.0"
         --threshold-ratio "0.90"
     )
+    if [[ -n "$framework" ]]; then
+        record_cmd+=(--framework "$framework")
+    fi
+    if [[ -n "$metric_source" ]]; then
+        record_cmd+=(--metric-source "$metric_source")
+    fi
     if [[ -n "$al" ]]; then
         record_cmd+=(--acceptance-length "$al")
     fi
     if [[ -n "$accepted" ]]; then
         record_cmd+=(--accepted-tokens "$accepted")
     fi
-    if [[ -n "$drafts" ]]; then
-        record_cmd+=(--draft-tokens "$drafts")
+    if [[ -n "$verify_steps" ]]; then
+        record_cmd+=(--draft-tokens "$verify_steps")
+        record_cmd+=(--verify-steps "$verify_steps")
+    fi
+    if [[ -n "$proposed_drafts" ]]; then
+        record_cmd+=(--proposed-draft-tokens "$proposed_drafts")
     fi
     if [[ -n "$error" ]]; then
         record_cmd+=(--error "$error")
@@ -821,26 +1091,40 @@ run_speedbench_al_eval() {
         export EVAL_RESULT_DIR
     fi
 
-    # TODO: Add unified support for SGLang, TRT-LLM, and disagg (Dynamo).
+    local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json"
+    local metrics_framework result_framework metric_source_base metrics_endpoint_count
+    metrics_framework=$(_speedbench_metrics_framework)
+    result_framework="${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-$metrics_framework}}"
+    metric_source_base=$(_speedbench_metric_source_base "$metrics_framework")
+    metrics_endpoint_count=$(_speedbench_metric_endpoint_count "$port")
+
+    case "$metrics_framework" in
+        vllm|sglang|trtllm)
+            ;;
+        *)
+            echo "SpeedBench AL eval: unsupported speculative metrics framework=${metrics_framework}" >&2
+            _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "Unsupported speculative metrics framework: ${metrics_framework}"
+            return 0
+            ;;
+    esac
+
+    echo "SpeedBench AL eval: metrics framework=${metrics_framework}, endpoints=${metrics_endpoint_count}"
     if ! command -v vllm >/dev/null 2>&1; then
-        local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json"
         echo "SpeedBench AL eval: vllm CLI is not available for SpeedBench client" >&2
-        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "vllm CLI is not available for SpeedBench client"
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "vllm CLI is not available for SpeedBench client"
         return 0
     fi
 
     local speedbench_dir="${SPEEDBENCH_DIR:-$(pwd)/speed_bench_data}"
     if ! _speedbench_prepare_dataset "$speedbench_dir"; then
-        local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json"
         echo "SpeedBench AL eval: SpeedBench dataset download failed" >&2
-        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "SpeedBench dataset download failed"
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "SpeedBench dataset download failed"
         return 0
     fi
 
-    local output="${EVAL_RESULT_DIR}/results_speedbench_al_${mode}_mtp${mtp}.json"
     if ! _speedbench_reference_available "$mode" "$mtp"; then
         echo "SpeedBench AL eval: no reference for mode=${mode} mtp=${mtp}" >&2
-        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "No SpeedBench AL reference for this eval cell"
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "No SpeedBench AL reference for this eval cell"
         return 0
     fi
 
@@ -849,11 +1133,13 @@ run_speedbench_al_eval() {
         think_args=(--chat-template-kwargs '{"thinking": true, "reasoning_effort": "high"}')
     fi
 
-    local accepted_before="" draft_before=""
-    accepted_before=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_accepted_tokens_total" 2>/dev/null || true)
-    draft_before=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_drafts_total" 2>/dev/null || true)
+    local accepted_before="" proposed_before="" verify_before=""
+    accepted_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true)
+    proposed_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "proposed" 2>/dev/null || true)
+    verify_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "verify" 2>/dev/null || true)
     accepted_before="${accepted_before:-0}"
-    draft_before="${draft_before:-0}"
+    proposed_before="${proposed_before:-0}"
+    verify_before="${verify_before:-0}"
 
     local raw_result_dir
     raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)"
@@ -882,27 +1168,65 @@ run_speedbench_al_eval() {
     "${bench_cmd[@]}" || bench_rc=$?
     if [[ "$bench_rc" -ne 0 ]]; then
         echo "SpeedBench AL eval: vllm bench serve failed with exit code ${bench_rc}" >&2
-        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "vllm bench serve failed with exit code ${bench_rc}"
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "vllm bench serve failed with exit code ${bench_rc}"
         rm -rf "$raw_result_dir" || true
         return 0
     fi
 
-    local accepted_after="" draft_after="" al="" delta_acc="" delta_draft=""
-    accepted_after=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_accepted_tokens_total" 2>/dev/null || true)
-    draft_after=$(_prometheus_metric_sum "$port" "vllm:spec_decode_num_drafts_total" 2>/dev/null || true)
-    if [[ -n "$accepted_after" && -n "$draft_after" ]]; then
-        delta_acc=$(awk "BEGIN {printf \"%d\", ${accepted_after} - ${accepted_before}}")
-        delta_draft=$(awk "BEGIN {printf \"%d\", ${draft_after} - ${draft_before}}")
-        if [[ "$delta_draft" -gt 0 ]]; then
-            al=$(awk "BEGIN {printf \"%.4f\", 1 + (${delta_acc} / ${delta_draft})}")
+    local accepted_after="" proposed_after="" verify_after=""
+    local al="" delta_acc="" delta_proposed="" delta_verify="" metric_source=""
+    accepted_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true)
+    proposed_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "proposed" 2>/dev/null || true)
+    verify_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "verify" 2>/dev/null || true)
+
+    if [[ -n "$accepted_after" ]]; then
+        delta_acc=$(_speedbench_round_metric "$(_speedbench_metric_delta "$accepted_before" "$accepted_after")")
+    fi
+    if [[ -n "$proposed_after" ]]; then
+        delta_proposed=$(_speedbench_round_metric "$(_speedbench_metric_delta "$proposed_before" "$proposed_after")")
+    fi
+    if [[ -n "$verify_after" ]]; then
+        delta_verify=$(_speedbench_round_metric "$(_speedbench_metric_delta "$verify_before" "$verify_after")")
+    fi
+
+    if [[ "$metrics_framework" == "vllm" && -n "$delta_acc" && -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then
+        al=$(awk -v accepted="$delta_acc" -v verify="$delta_verify" 'BEGIN { printf "%.4f", 1 + (accepted / verify) }')
+        metric_source="${metric_source_base}-counters-endpoints${metrics_endpoint_count}"
+    elif [[ "$metrics_framework" == "trtllm" ]]; then
+        al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true)
+        if [[ -n "$al" ]]; then
+            metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}"
+            if [[ -n "$delta_acc" || -n "$delta_proposed" ]]; then
+                metric_source="${metric_source}+token-counters"
+            fi
+        fi
+    elif [[ "$metrics_framework" == "sglang" ]]; then
+        al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true)
+        if [[ -n "$al" ]]; then
+            metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}"
+        fi
+        if [[ -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then
+            local draft_depth=""
+            draft_depth=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "draft_tokens_per_step" 2>/dev/null || true)
+            if [[ -n "$draft_depth" ]]; then
+                delta_proposed=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")
+            fi
+            if [[ -n "$al" ]]; then
+                delta_acc=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v al="$al" 'BEGIN { value = verify * (al - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")
+                metric_source="${metric_source:-${metric_source_base}-gauge-endpoints${metrics_endpoint_count}}+derived-token-counters"
+            fi
         fi
     fi
 
     if [[ -z "$al" ]]; then
         echo "SpeedBench AL eval: could not collect speculative acceptance metrics from server" >&2
-        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "$delta_acc" "$delta_draft" "Could not collect speculative acceptance metrics from server"
+        local metric_error="Could not collect speculative acceptance metrics from server"
+        if [[ "${FRAMEWORK:-}" == dynamo* && -z "${SPEEDBENCH_DECODE_METRICS_URLS:-}${SPEEDBENCH_METRICS_URLS:-}${SPEEDBENCH_METRICS_PORTS:-}" ]]; then
+            metric_error="${metric_error}; for Dynamo/disagg set SPEEDBENCH_DECODE_METRICS_URLS or SPEEDBENCH_METRICS_PORTS to decode-worker /metrics endpoints"
+        fi
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "$delta_acc" "$delta_verify" "$delta_proposed" "$result_framework" "$metric_source_base" "$metric_error"
     else
-        _speedbench_write_eval_result "$output" "$mode" "$mtp" "$al" "$delta_acc" "$delta_draft"
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "$al" "$delta_acc" "$delta_verify" "$delta_proposed" "$result_framework" "$metric_source"
     fi
     rm -rf "$raw_result_dir" || true
 }
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index c28ccab41a..0d307f6e4d 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -565,6 +565,19 @@ if [ "$NODE_RANK" -eq 0 ]; then
                 export EVAL_MAX_MODEL_LEN="$prefill_context_length"
             fi
 
+            speedbench_decode_metric_urls=""
+            for i in $(seq 0 $((yD - 1))); do
+                decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET))
+                speedbench_decode_metric_urls+="${speedbench_decode_metric_urls:+,}http://${IP_ARRAY[$decode_idx]}:8000/metrics"
+            done
+            if [[ -z "$speedbench_decode_metric_urls" ]]; then
+                speedbench_decode_metric_urls="http://${NODE0_ADDR}:8000/metrics"
+            fi
+            export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}"
+            if [[ "${SPEC_DECODING:-none}" == "mtp" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then
+                export SPEEDBENCH_NUM_SPEC_TOKENS="${SPEEDBENCH_NUM_SPEC_TOKENS:-$DECODE_MTP_SIZE}"
+            fi
+
             if [[ "$DRY_RUN" -eq 1 ]]; then
                 echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
             else
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
index d61fe03592..a2bed048d0 100755
--- a/benchmarks/multi_node/amd_utils/server_vllm.sh
+++ b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -344,6 +344,18 @@ if [ "$NODE_RANK" -eq 0 ]; then
                 export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
             fi
 
+            speedbench_decode_metric_urls=""
+            for decode_ip in ${DECODE_ARGS}; do
+                speedbench_decode_metric_urls+="${speedbench_decode_metric_urls:+,}http://${decode_ip}:${SERVER_PORT}/metrics"
+            done
+            if [[ -z "$speedbench_decode_metric_urls" ]]; then
+                speedbench_decode_metric_urls="http://${NODE0_ADDR}:${SERVER_PORT}/metrics"
+            fi
+            export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}"
+            if [[ "${SPEC_DECODING:-none}" == "mtp" ]]; then
+                export SPEEDBENCH_NUM_SPEC_TOKENS="${SPEEDBENCH_NUM_SPEC_TOKENS:-${DECODE_MTP_SIZE:-${NUM_SPEC_TOKENS:-2}}}"
+            fi
+
             if [[ "$DRY_RUN" -eq 1 ]]; then
                 echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
             else
diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_mi355x_sglang_mtp.sh
index 3addce4526..13c639c98f 100755
--- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_mi355x_sglang_mtp.sh
@@ -166,6 +166,7 @@ SPEC_FLAGS=(
     --speculative-eagle-topk 1
     --speculative-num-draft-tokens 4
 )
+export SPEEDBENCH_NUM_SPEC_TOKENS=3
 if [ "${DP_ATTENTION}" = "true" ]; then
     PARALLEL_ARGS+=(
         --dp "$TP"
@@ -178,6 +179,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         --speculative-eagle-topk 1
         --speculative-num-draft-tokens 3
     )
+    export SPEEDBENCH_NUM_SPEC_TOKENS=2
 fi
 if [ "${EP_SIZE:-1}" -gt 1 ]; then
     PARALLEL_ARGS+=(--ep-size "$EP_SIZE")
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
index 9e5c88212b..e51d4043d8 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
@@ -57,6 +57,7 @@ EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"
 
 MOE_BACKEND="TRTLLM"
 MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
+export SPEEDBENCH_NUM_SPEC_TOKENS="$MTP"
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
 KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
index 672d48f4b3..bfb38953b8 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
@@ -88,6 +88,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         --speculative-eagle-topk 1
         --speculative-num-draft-tokens 2
     )
+    export SPEEDBENCH_NUM_SPEC_TOKENS=1
     PARALLEL_ARGS=(
         --dp-size "$TP"
         --enable-dp-attention
@@ -107,6 +108,7 @@ else
         --speculative-eagle-topk 1
         --speculative-num-draft-tokens 4
     )
+    export SPEEDBENCH_NUM_SPEC_TOKENS=3
     PARALLEL_ARGS=(
         --moe-runner-backend flashinfer_mxfp4
         --disable-flashinfer-autotune
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
index 507b96e346..e4664dcd59 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
@@ -65,6 +65,7 @@ EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"
 
 MOE_BACKEND="TRTLLM"
 MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-2}"
+export SPEEDBENCH_NUM_SPEC_TOKENS="$MTP"
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
 KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh
index 788eff5b8b..e8d4ffde79 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh
@@ -20,6 +20,7 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
+export SPEEDBENCH_NUM_SPEC_TOKENS=3
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index fc0ac297f0..5eaf15302a 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -2,7 +2,7 @@
 
 # System-specific configuration for B300 NV Slurm cluster
 SLURM_PARTITION="batch_1"
-SLURM_ACCOUNT="benchmark"
+SLURM_ACCOUNT="restricted"
 
 set -x
 
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index f6a8c2031a..45b464329d 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -170,6 +170,11 @@ def extract_speedbench_al_metrics(json_path: Path) -> List[Dict[str, Any]]:
         'threshold_ratio': data.get('threshold_ratio'),
         'thinking_mode': mode,
         'num_speculative_tokens': mtp,
+        'speedbench_framework': data.get('framework'),
+        'speedbench_metric_source': data.get('metric_source'),
+        'speedbench_accepted_tokens': data.get('accepted_tokens'),
+        'speedbench_verify_steps': data.get('verify_steps', data.get('draft_tokens')),
+        'speedbench_proposed_draft_tokens': data.get('proposed_draft_tokens'),
         'passed': data.get('passed'),
         'error': data.get('error'),
         'model': data.get('model'),
@@ -273,6 +278,11 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
         row['speedbench_threshold_ratio'] = m.get('threshold_ratio')
         row['speedbench_thinking_mode'] = m.get('thinking_mode')
         row['speedbench_num_speculative_tokens'] = m.get('num_speculative_tokens')
+        row['speedbench_framework'] = m.get('speedbench_framework')
+        row['speedbench_metric_source'] = m.get('speedbench_metric_source')
+        row['speedbench_accepted_tokens'] = m.get('speedbench_accepted_tokens')
+        row['speedbench_verify_steps'] = m.get('speedbench_verify_steps')
+        row['speedbench_proposed_draft_tokens'] = m.get('speedbench_proposed_draft_tokens')
         row['speedbench_passed'] = m.get('passed')
         row['speedbench_error'] = m.get('error')
     elif m.get('strict') is not None:
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 9de27e765f..65fca90183 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -135,6 +135,12 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 | `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval |
 | `SPEEDBENCH_DIR` | `$(pwd)/speed_bench_data` | Prepared SpeedBench dataset directory; resolves to `/workspace/speed_bench_data` or `/ix/speed_bench_data` through the runner's container workdir |
 | `SPEEDBENCH_NUM_SPEC_TOKENS` | script-provided or `2` | MTP level used to select the reference AL row |
+| `SPEEDBENCH_METRICS_FRAMEWORK` | `FRAMEWORK` or `vllm` | Override speculative metrics parser. Supports `vllm`, `sglang`, `trtllm`/`trt`, and `dynamo-*` variants |
+| `SPEEDBENCH_DECODE_METRICS_URLS` | unset | Comma/space-separated decode worker Prometheus `/metrics` URLs for disaggregated runs |
+| `SPEEDBENCH_METRICS_URLS` | unset | Generic comma/space-separated Prometheus endpoints when decode-specific naming is not applicable |
+| `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied |
+
+SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM records its acceptance-length gauge and token counters because it does not expose verify steps through Prometheus. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values.
 
 ### Score validation
 `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
diff --git a/utils/evals/speedbench_al.py b/utils/evals/speedbench_al.py
index 4a5fd5d6d0..a9167e6565 100644
--- a/utils/evals/speedbench_al.py
+++ b/utils/evals/speedbench_al.py
@@ -182,6 +182,10 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]:
     acceptance_length = _optional_float(args.acceptance_length)
     accepted_tokens = _optional_int(args.accepted_tokens)
     draft_tokens = _optional_int(args.draft_tokens)
+    verify_steps = _optional_int(getattr(args, "verify_steps", None))
+    proposed_draft_tokens = _optional_int(getattr(args, "proposed_draft_tokens", None))
+    if verify_steps is None:
+        verify_steps = draft_tokens
     passed = (
         error is None
         and acceptance_length is not None
@@ -200,8 +204,12 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]:
         "category": args.category,
         "output_len": args.output_len,
         "temperature": args.temperature,
+        "framework": getattr(args, "framework", ""),
+        "metric_source": getattr(args, "metric_source", ""),
         "acceptance_length": acceptance_length,
         "accepted_tokens": accepted_tokens,
+        "verify_steps": verify_steps,
+        "proposed_draft_tokens": proposed_draft_tokens,
         "draft_tokens": draft_tokens,
         "reference_acceptance_length": reference_al,
         "threshold_ratio": args.threshold_ratio,
@@ -274,9 +282,13 @@ def build_parser() -> argparse.ArgumentParser:
     record.add_argument("--output-len", type=int, default=4096)
     record.add_argument("--temperature", type=float, default=1.0)
     record.add_argument("--threshold-ratio", type=float, default=0.90)
+    record.add_argument("--framework", default="")
+    record.add_argument("--metric-source", default="")
     record.add_argument("--acceptance-length", default=None)
     record.add_argument("--accepted-tokens", default=None)
     record.add_argument("--draft-tokens", default=None)
+    record.add_argument("--verify-steps", default=None)
+    record.add_argument("--proposed-draft-tokens", default=None)
     record.add_argument("--error", default=None)
     record.add_argument("--exit-status", action="store_true")
     record.set_defaults(func=cmd_record)
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
index 2665609c3e..8663058c30 100644
--- a/utils/evals/test_speedbench_al.py
+++ b/utils/evals/test_speedbench_al.py
@@ -64,6 +64,10 @@ def test_build_result_records_threshold_pass(tmp_path: Path) -> None:
         acceptance_length="2.30",
         accepted_tokens="13",
         draft_tokens="10",
+        verify_steps="10",
+        proposed_draft_tokens="20",
+        framework="vllm",
+        metric_source="vllm-prometheus-counters-endpoints1",
         error=None,
     )
 
@@ -71,6 +75,10 @@ def test_build_result_records_threshold_pass(tmp_path: Path) -> None:
 
     assert result["reference_acceptance_length"] == 2.50
     assert result["min_acceptance_length"] == 2.25
+    assert result["framework"] == "vllm"
+    assert result["metric_source"] == "vllm-prometheus-counters-endpoints1"
+    assert result["verify_steps"] == 10
+    assert result["proposed_draft_tokens"] == 20
     assert result["passed"] is True
 
 
@@ -103,6 +111,11 @@ def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None:
                 "thinking_mode": "thinking_on",
                 "num_speculative_tokens": 2,
                 "acceptance_length": 2.3,
+                "framework": "sglang",
+                "metric_source": "sglang-prometheus-gauge-endpoints1+derived-token-counters",
+                "accepted_tokens": 13,
+                "verify_steps": 10,
+                "proposed_draft_tokens": 20,
                 "reference_acceptance_length": 2.5,
                 "min_acceptance_length": 2.25,
                 "threshold_ratio": 0.9,
@@ -124,6 +137,11 @@ def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None:
 
     assert row["task"] == "speedbench_al/thinking_on/mtp2"
     assert row["score_name"] == "acceptance_length"
+    assert row["speedbench_framework"] == "sglang"
+    assert row["speedbench_metric_source"] == "sglang-prometheus-gauge-endpoints1+derived-token-counters"
+    assert row["speedbench_accepted_tokens"] == 13
+    assert row["speedbench_verify_steps"] == 10
+    assert row["speedbench_proposed_draft_tokens"] == 20
     assert score_cell(row) == "2.30 >= 2.25 (PASS)"
 
 

From f2aba4c32a1b62810b9972cc791c3a2d24c3c155 Mon Sep 17 00:00:00 2001
From: "Albert Cheng (Engrg-Hardware 1)"
 <albecheng@login-bia02.bia.clusters.nvidia.com>
Date: Mon, 8 Jun 2026 11:12:06 -0700
Subject: [PATCH 07/27] speedbench-al: fix --chat-template-kwargs default
 quoting so thinking-on cells run

---
 benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
index 7e39c32b3c..2a77dcb361 100755
--- a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
@@ -48,7 +48,8 @@ TEMPERATURE="${TEMPERATURE:-1.0}"
 # thinking-on chat_template_kwargs. MUST match the production/golden config:
 # the reference matrix (benchmarks/speedbench-reference-al.yaml) was measured
 # with reasoning_effort=high.
-CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-{\"thinking\": true, \"reasoning_effort\": \"high\"}}"
+DEFAULT_CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}'
+CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-$DEFAULT_CHAT_TEMPLATE_KWARGS_ON}"
 
 SPEEDBENCH_DIR="${SPEEDBENCH_DIR:-/workspace/speed_bench_data}"
 RESULTS_DIR="${RESULTS_DIR:-/workspace/speedbench_results}"

From c12acba250c7882bea72882857ff58304edefbc3 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Tue, 9 Jun 2026 13:25:02 -0700
Subject: [PATCH 08/27] Apply SpeedBench chat-template shim to eval helper

---
 benchmarks/benchmark_lib.sh | 72 +++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 0b917ddb09..e54ca6d235 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1059,6 +1059,73 @@ _speedbench_prepare_dataset() {
     [[ -f "$speedbench_dir/qualitative.jsonl" ]]
 }
 
+_speedbench_apply_chat_template_kwargs_shim() {  # edits the installed vllm.benchmarks sources in place; re-runs are no-ops
+    echo "SpeedBench AL eval: patching vLLM benchmark --chat-template-kwargs support if needed"
+    python3 - <<'PYEOF'  # last command: a failed import/assert makes this function return non-zero
+import vllm.benchmarks.serve as S
+import vllm.benchmarks.datasets.datasets as D
+
+
+def patch(mod, edits, marker):
+    f = mod.__file__
+    with open(f) as handle:
+        src = handle.read()
+    if marker in src:
+        print("already patched:", f)
+        return
+    for old, new in edits:
+        n = src.count(old)
+        assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..."
+        src = src.replace(old, new, 1)
+    with open(f, "w") as handle:
+        handle.write(src)
+    print("patched OK ->", f)
+
+
+serve_old = '''    parser.add_argument(
+        "--extra-body",'''
+serve_new = '''    parser.add_argument(
+        "--chat-template-kwargs",
+        type=json.loads,
+        default=None,
+        help="JSON dict forwarded to apply_chat_template during "
+        "client-side prompt rendering, e.g. to enable reasoning mode.",
+    )
+    parser.add_argument(
+        "--extra-body",'''
+patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"')
+
+disp_old = '''                output_len=args.speed_bench_output_len,
+                enable_multimodal_chat=args.enable_multimodal_chat,'''
+disp_new = '''                output_len=args.speed_bench_output_len,
+                chat_template_kwargs=args.chat_template_kwargs,
+                enable_multimodal_chat=args.enable_multimodal_chat,'''
+
+samp_old = '''                # apply template
+                if not skip_chat_template:
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": prompt}],
+                        add_generation_prompt=True,
+                        tokenize=False,
+                    )
+
+                prompt_len = len(tokenizer(prompt).input_ids)'''
+samp_new = '''                # apply template
+                if not skip_chat_template:
+                    _ctk = kwargs.get("chat_template_kwargs") or {}
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": prompt}],
+                        add_generation_prompt=True,
+                        tokenize=False,
+                        **_ctk,
+                    )
+
+                prompt_len = len(tokenizer(prompt).input_ids)'''
+patch(D, [(disp_old, disp_new), (samp_old, samp_new)],
+      marker="chat_template_kwargs=args.chat_template_kwargs")
+PYEOF
+}
+
 run_speedbench_al_eval() {
     local port="${PORT:-8888}"
     while [[ $# -gt 0 ]]; do
@@ -1130,6 +1197,11 @@ run_speedbench_al_eval() {
 
     local think_args=()
     if [[ "$mode" == "on" ]]; then
+        if ! _speedbench_apply_chat_template_kwargs_shim; then
+            echo "SpeedBench AL eval: --chat-template-kwargs shim failed" >&2
+            _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "--chat-template-kwargs shim failed"
+            return 0
+        fi
         think_args=(--chat-template-kwargs '{"thinking": true, "reasoning_effort": "high"}')
     fi
 

From fa83900b0e9e2407222dedd7f3095bcfaf98464d Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 08:17:04 -0700
Subject: [PATCH 09/27] Add native SpeedBench client fallback

---
 benchmarks/benchmark_lib.sh       | 102 ++++++++-----
 utils/evals/speedbench_client.py  | 242 ++++++++++++++++++++++++++++++
 utils/evals/test_speedbench_al.py |  53 +++++++
 3 files changed, 361 insertions(+), 36 deletions(-)
 create mode 100644 utils/evals/speedbench_client.py

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index e54ca6d235..54c2867b50 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1176,11 +1176,6 @@ run_speedbench_al_eval() {
     esac
 
     echo "SpeedBench AL eval: metrics framework=${metrics_framework}, endpoints=${metrics_endpoint_count}"
-    if ! command -v vllm >/dev/null 2>&1; then
-        echo "SpeedBench AL eval: vllm CLI is not available for SpeedBench client" >&2
-        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "vllm CLI is not available for SpeedBench client"
-        return 0
-    fi
 
     local speedbench_dir="${SPEEDBENCH_DIR:-$(pwd)/speed_bench_data}"
     if ! _speedbench_prepare_dataset "$speedbench_dir"; then
@@ -1195,14 +1190,23 @@ run_speedbench_al_eval() {
         return 0
     fi
 
+    local thinking_kwargs='{"thinking": true, "reasoning_effort": "high"}'
+    local client="${SPEEDBENCH_CLIENT:-auto}"
+    local use_vllm_client=0
+    if [[ "$client" != "openai" && "$client" != "native" ]] && command -v vllm >/dev/null 2>&1; then
+        use_vllm_client=1
+    fi
+
     local think_args=()
     if [[ "$mode" == "on" ]]; then
-        if ! _speedbench_apply_chat_template_kwargs_shim; then
-            echo "SpeedBench AL eval: --chat-template-kwargs shim failed" >&2
-            _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "--chat-template-kwargs shim failed"
-            return 0
+        if [[ "$use_vllm_client" -eq 1 ]]; then
+            if ! _speedbench_apply_chat_template_kwargs_shim; then
+                echo "SpeedBench AL eval: --chat-template-kwargs shim failed" >&2
+                _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "--chat-template-kwargs shim failed"
+                return 0
+            fi
+            think_args=(--chat-template-kwargs "$thinking_kwargs")
         fi
-        think_args=(--chat-template-kwargs '{"thinking": true, "reasoning_effort": "high"}')
     fi
 
     local accepted_before="" proposed_before="" verify_before=""
@@ -1213,35 +1217,62 @@ run_speedbench_al_eval() {
     proposed_before="${proposed_before:-0}"
     verify_before="${verify_before:-0}"
 
-    local raw_result_dir
-    raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)"
     local bench_rc=0
     local speedbench_model="${MODEL_NAME:-${MODEL:-}}"
-    local bench_cmd=(
-        vllm bench serve
-        --model "$speedbench_model"
-        --port "$port"
-        --dataset-name speed_bench
-        --dataset-path "$speedbench_dir"
-        --speed-bench-category coding
-        --speed-bench-output-len 4096
-        --num-prompts -1
-        --max-concurrency 1
-        --save-result
-        --result-dir "$raw_result_dir"
-        --result-filename "speedbench_al_${mode}_mtp${mtp}"
-        --trust-remote-code
-        --tokenizer-mode deepseek_v4
-        --temperature 1.0
-        "${think_args[@]}"
-    )
-
     echo "SpeedBench AL eval: running mode=${mode} mtp=${mtp}"
-    "${bench_cmd[@]}" || bench_rc=$?
-    if [[ "$bench_rc" -ne 0 ]]; then
-        echo "SpeedBench AL eval: vllm bench serve failed with exit code ${bench_rc}" >&2
-        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "vllm bench serve failed with exit code ${bench_rc}"
+    if [[ "$use_vllm_client" -eq 1 ]]; then
+        local raw_result_dir
+        raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)"
+        local bench_cmd=(
+            vllm bench serve
+            --model "$speedbench_model"
+            --port "$port"
+            --dataset-name speed_bench
+            --dataset-path "$speedbench_dir"
+            --speed-bench-category coding
+            --speed-bench-output-len 4096
+            --num-prompts -1
+            --max-concurrency 1
+            --save-result
+            --result-dir "$raw_result_dir"
+            --result-filename "speedbench_al_${mode}_mtp${mtp}"
+            --trust-remote-code
+            --tokenizer-mode deepseek_v4
+            --temperature 1.0
+            "${think_args[@]}"
+        )
+        "${bench_cmd[@]}" || bench_rc=$?
         rm -rf "$raw_result_dir" || true
+    else
+        export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}"
+        local native_cmd=(
+            python3 "$(pwd)/utils/evals/speedbench_client.py"
+            --model "$speedbench_model"
+            --base-url "http://0.0.0.0:${port}"
+            --dataset-path "$speedbench_dir"
+            --category coding
+            --output-len 4096
+            --temperature 1.0
+            --thinking-mode "$mode"
+            --timeout "${SPEEDBENCH_CLIENT_TIMEOUT:-1800}"
+            --retries "${SPEEDBENCH_CLIENT_RETRIES:-2}"
+        )
+        if [[ -n "${SPEEDBENCH_CLIENT_ENDPOINT:-}" ]]; then
+            native_cmd+=(--endpoint "$SPEEDBENCH_CLIENT_ENDPOINT")
+        elif [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
+            native_cmd+=(--endpoint completions)
+        fi
+        if [[ "$mode" == "on" ]]; then
+            native_cmd+=(--thinking-kwargs "$thinking_kwargs")
+        fi
+        if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
+            native_cmd+=(--dsv4)
+        fi
+        "${native_cmd[@]}" || bench_rc=$?
+    fi
+    if [[ "$bench_rc" -ne 0 ]]; then
+        echo "SpeedBench AL eval: client failed with exit code ${bench_rc}" >&2
+        _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "SpeedBench client failed with exit code ${bench_rc}"
         return 0
     fi
 
@@ -1300,7 +1331,6 @@ run_speedbench_al_eval() {
     else
         _speedbench_write_eval_result "$output" "$mode" "$mtp" "$al" "$delta_acc" "$delta_verify" "$delta_proposed" "$result_framework" "$metric_source"
     fi
-    rm -rf "$raw_result_dir" || true
 }
 
 run_lm_eval() {
diff --git a/utils/evals/speedbench_client.py b/utils/evals/speedbench_client.py
new file mode 100644
index 0000000000..5ad869dd90
--- /dev/null
+++ b/utils/evals/speedbench_client.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""Small OpenAI-compatible client for SpeedBench AL eval load.
+
+This intentionally avoids importing vLLM benchmark code so the eval can run in
+TensorRT-LLM and SGLang runtime images.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+import time
+from pathlib import Path
+from typing import Any
+from urllib.error import HTTPError, URLError
+from urllib.request import Request, urlopen
+
+
+def _load_dsv4_encoder():  # returns encode_messages from the sibling bench_serving dir
+    bench_serving_dir = str(Path(__file__).resolve().parents[1] / "bench_serving")
+    if bench_serving_dir not in sys.path:  # invoked per request; keep sys.path bounded
+        sys.path.insert(0, bench_serving_dir)
+    from encoding_dsv4 import encode_messages  # type: ignore
+    return encode_messages
+
+
+def _load_speedbench_requests(
+    dataset_path: Path,
+    category: str,
+    num_prompts: int,
+) -> list[list[dict[str, Any]]]:
+    """Return the ``messages`` lists read from ``dataset_path/qualitative.jsonl``."""
+    source = dataset_path / "qualitative.jsonl"
+    if not source.is_file():
+        raise FileNotFoundError(f"missing SpeedBench JSONL: {source}")
+
+    collected: list[list[dict[str, Any]]] = []
+    # Empty category: no filtering. Non-positive num_prompts: no cap.
+    capped = num_prompts > 0
+    with source.open(encoding="utf-8") as handle:
+        for raw_line in handle:
+            if raw_line.strip():  # blank lines carry no record
+                row = json.loads(raw_line)
+                wrong_category = bool(category) and row.get("category") != category
+                turns = None if wrong_category else row.get("messages")
+                if isinstance(turns, list) and turns:
+                    collected.append(turns)
+                    if capped and len(collected) >= num_prompts:
+                        break
+
+    if not collected:
+        raise ValueError(f"no SpeedBench prompts found for category={category!r}")
+    return collected
+
+
+def _json_post(
+    url: str,
+    payload: dict[str, Any],
+    timeout: int,
+    retries: int,
+) -> dict[str, Any]:
+    """POST payload as JSON; retry 5xx/transport errors, then raise the last error."""
+    body = json.dumps(payload).encode("utf-8")
+    headers = {"Content-Type": "application/json"}
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    last_error: Exception | None = None
+    for attempt in range(max(retries, 0) + 1):  # always make at least one attempt
+        request = Request(url, data=body, headers=headers, method="POST")
+        try:
+            with urlopen(request, timeout=timeout) as response:
+                raw = response.read().decode("utf-8")
+                return json.loads(raw) if raw else {}
+        except HTTPError as exc:
+            detail = exc.read().decode("utf-8", errors="replace")
+            message = f"HTTP {exc.code} from {url}: {detail[:1000]}"
+            last_error = RuntimeError(message)
+            if exc.code < 500:  # client errors will not succeed on retry
+                break
+        except (URLError, OSError) as exc:
+            # Also retries socket.timeout (pre-3.10) and mid-response resets.
+            last_error = exc
+
+        if attempt < retries:
+            time.sleep(min(2**attempt, 10))
+
+    assert last_error is not None
+    raise last_error
+
+
+def _chat_payload(
+    messages: list[dict[str, Any]],
+    model: str,
+    output_len: int,
+    temperature: float,
+    thinking_mode: str,
+    thinking_kwargs: dict[str, Any],
+) -> dict[str, Any]:  # request body that run() posts to /v1/chat/completions
+    payload: dict[str, Any] = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": output_len,
+        "temperature": temperature,
+        "stream": False,
+    }
+    if thinking_mode == "on" and thinking_kwargs:  # thinking off: plain payload
+        payload["chat_template_kwargs"] = thinking_kwargs
+        if "reasoning_effort" in thinking_kwargs:  # mirrored as a top-level field too
+            payload["reasoning_effort"] = thinking_kwargs["reasoning_effort"]
+    return payload
+
+
+def _completion_payload(
+    messages: list[dict[str, Any]],
+    model: str,
+    output_len: int,
+    temperature: float,
+    thinking_mode: str,
+    thinking_kwargs: dict[str, Any],
+    dsv4: bool,
+) -> dict[str, Any]:
+    """Build the non-streaming completions request body for one conversation."""
+    if not dsv4:
+        # Without the DSv4 encoder only the first message's content is sent.
+        head = messages[0]
+        text = head.get("content", "") if isinstance(head, dict) else str(head)
+    else:
+        render = _load_dsv4_encoder()
+        mode_name = "thinking" if thinking_mode == "on" else "chat"
+        effort = thinking_kwargs.get("reasoning_effort")
+        text = render(messages, thinking_mode=mode_name, reasoning_effort=effort)
+
+    return {
+        "model": model,
+        "prompt": text,
+        "max_tokens": output_len,
+        "temperature": temperature,
+        "stream": False,
+    }
+
+
+def run(args: argparse.Namespace) -> int:  # returns 1 once failures exceed args.max_failures, else 0
+    dataset_path = Path(args.dataset_path)
+    prompts = _load_speedbench_requests(dataset_path, args.category, args.num_prompts)
+    base_url = args.base_url.rstrip("/")
+    chat_url = f"{base_url}/v1/chat/completions"
+    completions_url = f"{base_url}/v1/completions"
+    thinking_kwargs = json.loads(args.thinking_kwargs) if args.thinking_kwargs else {}
+
+    failures = 0
+    resolved_endpoint = args.endpoint  # stays "auto" until a request succeeds
+    for index, messages in enumerate(prompts, start=1):
+        endpoint_attempts = ["chat", "completions"] if resolved_endpoint == "auto" else [resolved_endpoint]
+        last_error: Exception | None = None
+        success = False
+        for endpoint in endpoint_attempts:
+            if endpoint == "completions":
+                payload = _completion_payload(
+                    messages,
+                    args.model,
+                    args.output_len,
+                    args.temperature,
+                    args.thinking_mode,
+                    thinking_kwargs,
+                    args.dsv4,
+                )
+                url = completions_url
+            else:
+                payload = _chat_payload(
+                    messages,
+                    args.model,
+                    args.output_len,
+                    args.temperature,
+                    args.thinking_mode,
+                    thinking_kwargs,
+                )
+                url = chat_url
+
+            try:
+                _json_post(url, payload, timeout=args.timeout, retries=args.retries)  # response body is discarded
+            except Exception as exc:
+                last_error = exc
+                if resolved_endpoint == "auto" and endpoint == "chat":  # only auto mode falls back
+                    print(
+                        "SpeedBench client chat endpoint failed; trying completions "
+                        f"fallback: {exc}",
+                        file=sys.stderr,
+                    )
+                    continue
+                break
+            else:
+                if resolved_endpoint == "auto":
+                    resolved_endpoint = endpoint  # pin the endpoint that worked for later requests
+                print(
+                    f"SpeedBench client request {index}/{len(prompts)} "
+                    f"completed via {endpoint}",
+                    flush=True,
+                )
+                success = True
+                break
+
+        if success:
+            continue
+
+        if last_error is None:
+            last_error = RuntimeError("no SpeedBench endpoint attempts were made")
+        failures += 1
+        print(
+            f"SpeedBench client request {index}/{len(prompts)} failed: {last_error}",
+            file=sys.stderr,
+        )
+        if failures > args.max_failures:  # stop early; remaining prompts are not sent
+            return 1
+
+    return 0
+
+
+def main() -> int:  # CLI entry point; returns run()'s exit status
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--model", required=True)
+    parser.add_argument("--base-url", required=True)  # server root; run() appends the /v1/... paths
+    parser.add_argument("--dataset-path", required=True)  # directory holding qualitative.jsonl
+    parser.add_argument("--category", default="coding")
+    parser.add_argument("--output-len", type=int, default=4096)
+    parser.add_argument("--temperature", type=float, default=1.0)
+    parser.add_argument("--thinking-mode", choices=["on", "off"], default="off")
+    parser.add_argument("--thinking-kwargs", default="")  # JSON text; empty string means {}
+    parser.add_argument("--endpoint", choices=["auto", "chat", "completions"], default="auto")
+    parser.add_argument("--num-prompts", type=int, default=-1)  # non-positive = no cap
+    parser.add_argument("--timeout", type=int, default=1800)
+    parser.add_argument("--retries", type=int, default=2)
+    parser.add_argument("--max-failures", type=int, default=0)  # 0 = exit 1 on the first failed request
+    parser.add_argument("--dsv4", action="store_true")
+    return run(parser.parse_args())
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
index 8663058c30..bd7db0def2 100644
--- a/utils/evals/test_speedbench_al.py
+++ b/utils/evals/test_speedbench_al.py
@@ -13,6 +13,11 @@
     score_cell,
 )
 from speedbench_al import build_result, load_reference, lookup_reference
+from speedbench_client import (
+    _chat_payload,
+    _completion_payload,
+    _load_speedbench_requests,
+)
 from validate_scores import validate_speedbench_al
 
 
@@ -165,3 +170,51 @@ def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> Non
 
     assert lm_path is None
     assert speedbench_paths == [result_path]
+
+
+def test_speedbench_client_loads_coding_and_builds_dsv4_payloads(tmp_path: Path) -> None:
+    """Load a two-row SpeedBench file and build thinking-on DSV4 payloads."""
+    # One coding row plus one math row, so the "coding" filter has work to do.
+    rows = [
+        {
+            "category": "coding",
+            "messages": [{"role": "user", "content": "Write fizzbuzz."}],
+        },
+        {
+            "category": "math",
+            "messages": [{"role": "user", "content": "Solve 2+2."}],
+        },
+    ]
+    data_dir = tmp_path / "speed_bench_data"
+    data_dir.mkdir()
+    jsonl_text = "\n".join(json.dumps(row) for row in rows)
+    (data_dir / "qualitative.jsonl").write_text(jsonl_text)
+
+    prompts = _load_speedbench_requests(data_dir, "coding", -1)
+    first_prompt = prompts[0]
+
+    # Chat payload: the assertions below read the thinking settings back as fields.
+    chat = _chat_payload(
+        first_prompt,
+        model="deepseek-ai/DeepSeek-V4-Pro",
+        output_len=4096,
+        temperature=1.0,
+        thinking_mode="on",
+        thinking_kwargs={"thinking": True, "reasoning_effort": "high"},
+    )
+    # Completions payload with dsv4=True: checked below for a "<think>" marker.
+    completions = _completion_payload(
+        first_prompt,
+        model="deepseek-ai/DeepSeek-V4-Pro",
+        output_len=4096,
+        temperature=1.0,
+        thinking_mode="on",
+        thinking_kwargs={"thinking": True, "reasoning_effort": "high"},
+        dsv4=True,
+    )
+
+    assert len(prompts) == 1
+    assert chat["chat_template_kwargs"]["thinking"] is True
+    assert chat["reasoning_effort"] == "high"
+    assert "<think>" in completions["prompt"]
+    assert completions["max_tokens"] == 4096

From 60c19dd56db1c5bde1552b42ac1a4f2a230feffa Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 08:35:04 -0700
Subject: [PATCH 10/27] Use shared GB200 workspace for DSV4 Dynamo

---
 runners/launch_gb200-nv.sh | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 45ef3a952a..4c5ca8601e 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -4,6 +4,11 @@
 
 set -x
 
+USE_SHARED_GB200_WORKSPACE=false
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "dsv4" ]]; then
+    USE_SHARED_GB200_WORKSPACE=true
+fi
+
 # MODEL_PATH: Override with pre-downloaded paths on GB200 runner
 # The yaml files specify HuggingFace model IDs for portability, but we use
 # local paths to avoid repeated downloading on the shared GB200 cluster.
@@ -74,15 +79,15 @@ export SLURM_ACCOUNT="benchmark"
 
 NGINX_IMAGE="nginx:1.27.4"
 
-# === Cluster diagnostic probe (minimax only) ===
+# === Cluster diagnostic probe for watchtower-hosted GB200 jobs ===
 # The gb200-nv_* runners may be hosted on different physical clusters
 # (e.g., the legacy NVIDIA Lustre cluster vs Oracle Cloud "watchtower").
 # Print enough info to identify the layout, then pick a writable
 # squash dir on a path that's also visible to compute nodes. Falls
 # back to the legacy sa-shared path so other configs are untouched.
 SQUASH_DIR="/mnt/lustre01/users-public/sa-shared"
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
-    echo "=== cluster diagnostic (minimax sweep) ==="
+if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" ]]; then
+    echo "=== cluster diagnostic (shared GB200 workspace) ==="
     echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)"
     echo "HOME=$HOME"
     echo "HOSTNAME=$(hostname -f 2>/dev/null || hostname)"
@@ -202,7 +207,7 @@ SRT_REPO_DIR="srt-slurm"
 # cross-mounted to compute nodes. Put the srt-slurm workspace and staged
 # InferenceX checkout on a writable shared-FS path that compute can see.
 # Per-run-unique paths avoid races between parallel sweep jobs.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" ]]; then
     SHARED_BASE=""
     for cand in \
         /mnt/lustre01/users-public/sa-shared/gha-runs \
@@ -295,7 +300,7 @@ source $HOME/.local/bin/env
 # under a head-node-only path, .venv/bin/python3 becomes a broken
 # symlink on compute. Pin the venv to /usr/bin/python3 — a system
 # path that exists at the same location on both head and compute.
-if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then
+if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" && -x /usr/bin/python3 ]]; then
     uv venv --seed --python /usr/bin/python3
 else
     uv venv --seed
@@ -312,10 +317,10 @@ echo "Configs available at: $SRT_REPO_DIR/"
 
 # Create srtslurm.yaml for srtctl (used by both frameworks)
 SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
-# Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path
+# Watchtower-hosted jobs: SRT_REPO_DIR was moved to a shared-FS path
 # above so srtctl's outputs/ directory (which lives under
 # SRTCTL_ROOT) is visible to compute nodes.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" ]]; then
     SRTCTL_ROOT="$SRT_REPO_DIR"
 fi
 echo "Creating srtslurm.yaml configuration..."
@@ -357,7 +362,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
 # can't see. Stage the relevant subset to shared FS and repoint
 # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already
 # on shared FS) and .git (not needed in container) for speed.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ "$USE_SHARED_GB200_WORKSPACE" == "true" ]]; then
     SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}"
     mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1
     rsync -a --delete \

From 081cbca98b36c49b87172eee2e411ab5fd04637e Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 08:58:15 -0700
Subject: [PATCH 11/27] Enable metrics for DSV4 SGLang MTP

---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
index bfb38953b8..7fabb03f47 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
@@ -131,6 +131,7 @@ PYTHONNOUSERSITE=1 sglang serve \
     --model-path $MODEL_PATH --served-model-name $MODEL \
     --host 0.0.0.0 \
     --port $PORT \
+    --enable-metrics \
     --trust-remote-code \
     --tp $TP \
     --ep-size $EP_SIZE \

From 4cf5bbf91e74483372d9bafa6c87764af582b125 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 11:18:57 -0700
Subject: [PATCH 12/27] Enable TRT-LLM spec metrics for DSV4 MTP

---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh | 2 ++
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
index e51d4043d8..781bfe9337 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
@@ -77,6 +77,8 @@ cuda_graph_config:
     max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE
 enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG
 print_iter_log: true
+return_perf_metrics: true
+enable_iter_perf_stats: true
 kv_cache_config:
     tokens_per_block: 128
     dtype: fp8
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
index e4664dcd59..50c5908750 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
@@ -85,6 +85,8 @@ cuda_graph_config:
     max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE
 enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG
 print_iter_log: true
+return_perf_metrics: true
+enable_iter_perf_stats: true
 kv_cache_config:
     tokens_per_block: 128
     dtype: fp8

From 63bf3ebaa218955bb5cfc89bc5399d1101802c71 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 11:46:56 -0700
Subject: [PATCH 13/27] Use TRT-LLM Prometheus metrics endpoint for SpeedBench

---
 benchmarks/benchmark_lib.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 54c2867b50..9a5d0636c3 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1163,6 +1163,9 @@ run_speedbench_al_eval() {
     metrics_framework=$(_speedbench_metrics_framework)
     result_framework="${SPEEDBENCH_METRICS_FRAMEWORK:-${FRAMEWORK:-$metrics_framework}}"
     metric_source_base=$(_speedbench_metric_source_base "$metrics_framework")
+    if [[ "$metrics_framework" == "trtllm" && -z "${SPEEDBENCH_DECODE_METRICS_URLS:-}${SPEEDBENCH_METRICS_URLS:-}${SPEEDBENCH_METRICS_PORTS:-}" ]]; then
+        export SPEEDBENCH_METRICS_URLS="http://0.0.0.0:${port}/prometheus/metrics"
+    fi
     metrics_endpoint_count=$(_speedbench_metric_endpoint_count "$port")
 
     case "$metrics_framework" in

From 4a4fbf13e40c753c65798f7817c816d8cbcda5b5 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 12:42:24 -0700
Subject: [PATCH 14/27] Use TRT-LLM JSON stats for SpeedBench fallback

---
 benchmarks/benchmark_lib.sh | 134 ++++++++++++++++++++++++++++++++++++
 utils/evals/EVALS.md        |   3 +-
 2 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 9a5d0636c3..1beb29a008 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -859,6 +859,133 @@ _speedbench_metric_endpoint_count() {
     echo "$count"
 }
 
+_speedbench_trtllm_json_metrics_urls() {  # print one TRT-LLM JSON stats URL per line for port $1
+    local port="$1"
+    local raw="${SPEEDBENCH_TRTLLM_JSON_METRICS_URLS:-}"
+    local endpoint url
+
+    if [[ -n "$raw" ]]; then  # explicit override wins; entries may be comma- or space-separated
+        for endpoint in ${raw//,/ }; do  # left unquoted so the list word-splits into entries
+            _speedbench_normalize_metrics_url "$endpoint"
+        done
+        return 0
+    fi
+
+    while IFS= read -r url; do  # no override: rewrite each URL printed by _speedbench_metric_urls
+        [[ -z "$url" ]] && continue
+        echo "$url" | sed -E 's#/prometheus/metrics([?].*)?$#/metrics#'  # trailing /prometheus/metrics (and any ?query) -> /metrics
+    done < <(_speedbench_metric_urls "$port")
+}
+
+_speedbench_trtllm_json_spec_metrics() {  # print "AL<TAB>accepted<TAB>verify_steps<TAB>drafted<TAB>endpoints", or fail
+    local port="$1"
+    local mtp="$2"
+    local urls=()
+    local url
+
+    while IFS= read -r url; do
+        [[ -n "$url" ]] && urls+=("$url")
+    done < <(_speedbench_trtllm_json_metrics_urls "$port")
+
+    [[ "${#urls[@]}" -gt 0 ]] || return 1  # no endpoints to query
+
+    python3 - "$mtp" "${urls[@]}" <<'PY'
+import json
+import os
+import sys
+import urllib.request
+
+
+def number(value, default=0.0):  # float(value), or default when it is None or unparseable
+    try:
+        if value is None:
+            return default
+        return float(value)
+    except (TypeError, ValueError):
+        return default
+
+
+def stats_from_payload(payload):  # accept a list of stat objects or a single object
+    if isinstance(payload, list):
+        return payload
+    if isinstance(payload, dict):
+        return [payload]
+    return []
+
+
+try:
+    mtp = float(sys.argv[1])
+except (IndexError, ValueError):
+    mtp = 0.0  # leaves the mtp-based estimates below disabled
+
+timeout = float(os.environ.get("SPEEDBENCH_METRICS_CURL_TIMEOUT", "10"))  # passed to urlopen per fetch
+total_draft = 0.0
+total_accepted = 0.0
+total_requests = 0.0
+weighted_acceptance_length = 0.0
+unweighted_acceptance_length = 0.0
+unweighted_count = 0
+used_endpoints = 0
+
+for url in sys.argv[2:]:
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            payload = json.load(response)
+    except Exception as exc:  # noqa: BLE001 - diagnostics for CI logs
+        print(f"SpeedBench AL eval: TRT-LLM JSON metrics fetch failed for {url}: {exc}", file=sys.stderr)
+        continue  # a failing endpoint is skipped rather than aborting the whole scrape
+
+    endpoint_had_spec = False
+    for stat in stats_from_payload(payload):
+        if not isinstance(stat, dict):
+            continue
+        spec = stat.get("specDecodingStats")
+        if not isinstance(spec, dict):
+            continue
+
+        draft = number(spec.get("numDraftTokens"))
+        if draft <= 0:  # entries that drafted nothing contribute nothing
+            continue
+
+        accepted = number(spec.get("numAcceptedTokens"))
+        requests = number(spec.get("numRequestsWithDraftTokens"))
+        acceptance_length = number(spec.get("acceptanceLength"), default=-1.0)  # -1.0 marks "not reported"
+
+        total_draft += draft
+        total_accepted += accepted
+        endpoint_had_spec = True
+
+        if acceptance_length > 0:
+            if requests > 0:  # weight the reported AL by its request count
+                total_requests += requests
+                weighted_acceptance_length += acceptance_length * requests
+            else:  # no request count: keep it for a plain average instead
+                unweighted_acceptance_length += acceptance_length
+                unweighted_count += 1
+
+    if endpoint_had_spec:
+        used_endpoints += 1
+
+if total_requests > 0:  # first choice: request-weighted mean of the reported AL
+    acceptance_length = weighted_acceptance_length / total_requests
+elif unweighted_count > 0:  # second choice: plain mean of the reported AL
+    acceptance_length = unweighted_acceptance_length / unweighted_count
+elif total_draft > 0 and mtp > 0:  # last resort: derive AL from the token totals
+    acceptance_length = 1.0 + (total_accepted / (total_draft / mtp))
+else:
+    sys.exit(1)  # nothing usable; exits non-zero without printing a result line
+
+verify_steps = round(total_draft / mtp) if total_draft > 0 and mtp > 0 else 0  # estimated as drafted / mtp
+print(
+    f"{acceptance_length:.4f}\t"
+    f"{int(round(total_accepted))}\t"
+    f"{int(verify_steps)}\t"
+    f"{int(round(total_draft))}\t"
+    f"{used_endpoints}"
+)
+PY
+}
+
 _speedbench_metric_delta() {
     local before="$1"
     local after="$2"
@@ -1305,6 +1432,13 @@ run_speedbench_al_eval() {
             if [[ -n "$delta_acc" || -n "$delta_proposed" ]]; then
                 metric_source="${metric_source}+token-counters"
             fi
+        else
+            local trt_json_metrics="" trt_json_endpoints=""
+            trt_json_metrics=$(_speedbench_trtllm_json_spec_metrics "$port" "$mtp" 2>/dev/null || true)
+            if [[ -n "$trt_json_metrics" ]]; then
+                IFS=$'\t' read -r al delta_acc delta_verify delta_proposed trt_json_endpoints <<< "$trt_json_metrics"
+                metric_source="trtllm-json-iteration-stats-endpoints${trt_json_endpoints}"
+            fi
         fi
     elif [[ "$metrics_framework" == "sglang" ]]; then
         al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true)
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 65fca90183..4177bb2fc8 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -139,8 +139,9 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 | `SPEEDBENCH_DECODE_METRICS_URLS` | unset | Comma/space-separated decode worker Prometheus `/metrics` URLs for disaggregated runs |
 | `SPEEDBENCH_METRICS_URLS` | unset | Generic comma/space-separated Prometheus endpoints when decode-specific naming is not applicable |
 | `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied |
+| `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stats `/metrics` endpoints used when Prometheus spec metrics are unavailable |
 
-SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM records its acceptance-length gauge and token counters because it does not expose verify steps through Prometheus. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values.
+SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON iteration stats from `/metrics` when the Prometheus spec series are unavailable. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values.
 
 ### Score validation
 `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails.

From de360bf04ffd190387c430bcaafcb5230c225eff Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 13:27:45 -0700
Subject: [PATCH 15/27] Use TRT-LLM decoded-token metric as AL fallback

---
 benchmarks/benchmark_lib.sh | 116 +++++++++++++++++++++++++++++++++++-
 utils/evals/EVALS.md        |   2 +-
 2 files changed, 116 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 1beb29a008..ef6c8604d5 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -986,6 +986,106 @@ print(
 PY
 }
 
+_speedbench_trtllm_avg_decoded_al() {  # print the averaged decoded-tokens-per-iter metric as "%.4f", or return 1
+    local port="$1"
+    local value
+    value=$(_speedbench_metric_avg "$port" "trtllm_avg_decoded_tokens_per_iter" 2>/dev/null || true)
+    [[ -n "$value" ]] || return 1  # metric not available
+    awk -v value="$value" '
+        BEGIN {
+            if (value + 0 < 1.0) {  # +0 forces a numeric test, so non-numeric text is rejected too
+                exit 1
+            }
+            printf "%.4f\n", value
+        }
+    '
+}
+
+_speedbench_trtllm_json_avg_decoded_al() {  # print "AL<TAB>endpoints" from JSON inflightBatchingStats, or fail
+    local port="$1"
+    local urls=()
+    local url
+
+    while IFS= read -r url; do
+        [[ -n "$url" ]] && urls+=("$url")
+    done < <(_speedbench_trtllm_json_metrics_urls "$port")
+
+    [[ "${#urls[@]}" -gt 0 ]] || return 1  # no endpoints to query
+
+    python3 - "${urls[@]}" <<'PY'
+import json
+import os
+import sys
+import urllib.request
+
+
+def number(value, default=0.0):  # float(value), or default when it is None or unparseable
+    try:
+        if value is None:
+            return default
+        return float(value)
+    except (TypeError, ValueError):
+        return default
+
+
+def stats_from_payload(payload):  # accept a list of stat objects or a single object
+    if isinstance(payload, list):
+        return payload
+    if isinstance(payload, dict):
+        return [payload]
+    return []
+
+
+timeout = float(os.environ.get("SPEEDBENCH_METRICS_CURL_TIMEOUT", "10"))  # passed to urlopen per fetch
+weighted_total = 0.0
+total_requests = 0.0
+unweighted_total = 0.0
+unweighted_count = 0
+used_endpoints = 0
+
+for url in sys.argv[1:]:
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            payload = json.load(response)
+    except Exception as exc:  # noqa: BLE001 - diagnostics for CI logs
+        print(f"SpeedBench AL eval: TRT-LLM JSON metrics fetch failed for {url}: {exc}", file=sys.stderr)
+        continue  # a failing endpoint is skipped rather than aborting the whole scrape
+
+    endpoint_had_avg = False
+    for stat in stats_from_payload(payload):
+        if not isinstance(stat, dict):
+            continue
+        ifb = stat.get("inflightBatchingStats")
+        if not isinstance(ifb, dict):
+            continue
+
+        avg_decoded = number(ifb.get("avgNumDecodedTokensPerIter"), default=-1.0)
+        if avg_decoded < 1.0:  # also drops the -1.0 "missing" default
+            continue
+
+        gen_requests = number(ifb.get("numGenRequests"))
+        endpoint_had_avg = True
+        if gen_requests > 0:  # weight each sample by its generation-request count
+            weighted_total += avg_decoded * gen_requests
+            total_requests += gen_requests
+        else:  # no request count: keep it for a plain average instead
+            unweighted_total += avg_decoded
+            unweighted_count += 1
+
+    if endpoint_had_avg:
+        used_endpoints += 1
+
+if total_requests > 0:  # first choice: request-weighted mean
+    acceptance_length = weighted_total / total_requests
+elif unweighted_count > 0:  # second choice: plain mean
+    acceptance_length = unweighted_total / unweighted_count
+else:
+    sys.exit(1)  # nothing usable; exits non-zero without printing a result line
+
+print(f"{acceptance_length:.4f}\t{used_endpoints}")
+PY
+}
+
 _speedbench_metric_delta() {
     local before="$1"
     local after="$2"
@@ -1434,11 +1534,25 @@ run_speedbench_al_eval() {
             fi
         else
             local trt_json_metrics="" trt_json_endpoints=""
-            trt_json_metrics=$(_speedbench_trtllm_json_spec_metrics "$port" "$mtp" 2>/dev/null || true)
+            trt_json_metrics=$(_speedbench_trtllm_json_spec_metrics "$port" "$mtp" || true)
             if [[ -n "$trt_json_metrics" ]]; then
                 IFS=$'\t' read -r al delta_acc delta_verify delta_proposed trt_json_endpoints <<< "$trt_json_metrics"
                 metric_source="trtllm-json-iteration-stats-endpoints${trt_json_endpoints}"
             fi
+            if [[ -z "$al" ]]; then
+                al=$(_speedbench_trtllm_avg_decoded_al "$port" || true)
+                if [[ -n "$al" ]]; then
+                    metric_source="${metric_source_base}-avg-decoded-tokens-endpoints${metrics_endpoint_count}"
+                fi
+            fi
+            if [[ -z "$al" ]]; then
+                local trt_json_avg_metrics="" trt_json_avg_endpoints=""
+                trt_json_avg_metrics=$(_speedbench_trtllm_json_avg_decoded_al "$port" || true)
+                if [[ -n "$trt_json_avg_metrics" ]]; then
+                    IFS=$'\t' read -r al trt_json_avg_endpoints <<< "$trt_json_avg_metrics"
+                    metric_source="trtllm-json-avg-decoded-tokens-endpoints${trt_json_avg_endpoints}"
+                fi
+            fi
         fi
     elif [[ "$metrics_framework" == "sglang" ]]; then
         al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true)
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 4177bb2fc8..6541800b32 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -141,7 +141,7 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 | `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied |
 | `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stats `/metrics` endpoints used when Prometheus spec metrics are unavailable |
 
-SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON iteration stats from `/metrics` when the Prometheus spec series are unavailable. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values.
+SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations expose `avgNumDecodedTokensPerIter` without `specDecodingStats`; for those, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values.
 
 ### Score validation
 `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails.

From cc4c7e3d7ae0757f09a12fff17b3433e5d302aa2 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 13:42:38 -0700
Subject: [PATCH 16/27] Collect Dynamo SpeedBench AL from decode logs

---
 runners/launch_b200-dgxc.sh                   |   3 +
 runners/launch_gb200-nv.sh                    |   3 +
 utils/evals/EVALS.md                          |   2 +-
 utils/evals/dynamo_speedbench_al_from_logs.py | 215 ++++++++++++++++++
 utils/evals/test_speedbench_al.py             |  36 +++
 .../write_dynamo_speedbench_al_from_logs.sh   |  57 +++++
 6 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 utils/evals/dynamo_speedbench_al_from_logs.py
 create mode 100644 utils/evals/write_dynamo_speedbench_al_from_logs.sh

diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index 9eeed2af6e..1d4b716a69 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -359,6 +359,9 @@ EOF
     # Collect eval results if eval was requested
     if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
         EVAL_DIR="$LOGS_DIR/eval_results"
+        if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then
+            bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE"
+        fi
         if [ -d "$EVAL_DIR" ]; then
             echo "Extracting eval results from $EVAL_DIR"
             shopt -s nullglob
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 4c5ca8601e..ee149e7c41 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -503,6 +503,9 @@ fi
 # Collect eval results if eval was requested
 if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
     EVAL_DIR="$LOGS_DIR/eval_results"
+    if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then
+        bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE"
+    fi
     if [ -d "$EVAL_DIR" ]; then
         echo "Extracting eval results from $EVAL_DIR"
         shopt -s nullglob
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 6541800b32..63a96cb29a 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -141,7 +141,7 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 | `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied |
 | `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stats `/metrics` endpoints used when Prometheus spec metrics are unavailable |
 
-SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations expose `avgNumDecodedTokensPerIter` without `specDecodingStats`; for those, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints, summing counters and averaging gauge-only AL values.
+SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations expose `avgNumDecodedTokensPerIter` without `specDecodingStats`; for those, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints when available, summing counters and averaging gauge-only AL values. The NVIDIA srt-slurm Dynamo eval path also writes a SpeedBench AL artifact from decode-worker `SpecDecoding metrics` log counters when the router eval path does not expose decode-worker metrics endpoints to the benchmarker.
 
 ### Score validation
 `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
diff --git a/utils/evals/dynamo_speedbench_al_from_logs.py b/utils/evals/dynamo_speedbench_al_from_logs.py
new file mode 100644
index 0000000000..0cf71ac4d8
--- /dev/null
+++ b/utils/evals/dynamo_speedbench_al_from_logs.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""Build a SpeedBench AL result from Dynamo decode-worker spec logs."""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable
+
+from speedbench_al import build_result, cmd_record
+
+
+SPEC_LINE_RE = re.compile(  # matches one "SpecDecoding metrics" log line
+    r"SpecDecoding metrics:\s*"
+    r"Mean acceptance length:\s*(?P<al>[0-9]+(?:\.[0-9]+)?)"  # "al" is captured; parse_decode_log reads only the counters
+    r".*?"  # skip whatever sits between the mean and the token counters
+    r"Accepted:\s*(?P<accepted>[0-9]+)\s*tokens,\s*"
+    r"Drafted:\s*(?P<drafted>[0-9]+)\s*tokens"
+)
+WORKER_RE = re.compile(r"_decode_w(?P<worker>[0-9]+)\.out$")  # worker id from the log filename suffix
+
+
+@dataclass(frozen=True)
+class LogMetrics:  # SpecDecoding counters summed over one decode-worker log file
+    path: Path  # log file the counters were read from
+    worker: str  # digits captured from the "_decode_w<N>.out" filename suffix
+    samples: int  # number of lines that matched SPEC_LINE_RE
+    accepted_tokens: int  # sum of the "Accepted: N tokens" values
+    proposed_draft_tokens: int  # sum of the "Drafted: N tokens" values
+
+
+@dataclass(frozen=True)
+class AggregatedMetrics:  # totals across the one selected log per decode worker
+    workers: int  # how many worker logs were selected
+    samples: int  # matched log lines summed over those logs
+    accepted_tokens: int  # accepted-token total
+    proposed_draft_tokens: int  # drafted-token total
+    verify_steps: int  # round(proposed_draft_tokens / mtp)
+    acceptance_length: float  # 1.0 + accepted_tokens / (proposed_draft_tokens / mtp)
+    selected_logs: tuple[Path, ...]  # paths of the selected logs, in worker-id order
+
+
+def _decode_log_files(logs_dir: Path) -> Iterable[Path]:
+    """Sorted recursive matches of "*_decode_w*.out"; empty when logs_dir is not a directory."""
+    matches = logs_dir.rglob("*_decode_w*.out") if logs_dir.is_dir() else ()
+    return sorted(matches)
+
+
+def parse_decode_log(path: Path) -> LogMetrics | None:
+    """Sum the SpecDecoding counters logged in one decode-worker file.
+
+    Returns None when the filename carries no worker id, the file cannot be
+    read, or no line contributes a positive drafted-token total.
+    """
+    worker_match = WORKER_RE.search(path.name)
+    if worker_match is None:
+        return None
+
+    try:
+        text = path.read_text(errors="ignore")
+    except OSError:
+        return None
+
+    # Every matching line counts as one sample; its counters are added up.
+    hits = [m for m in map(SPEC_LINE_RE.search, text.splitlines()) if m]
+    accepted_total = sum(int(m.group("accepted")) for m in hits)
+    drafted_total = sum(int(m.group("drafted")) for m in hits)
+
+    # A log with no matches, or with nothing drafted, is treated as unusable.
+    if not hits or drafted_total <= 0:
+        return None
+
+    return LogMetrics(
+        path=path,
+        worker=worker_match.group("worker"),
+        samples=len(hits),
+        accepted_tokens=accepted_total,
+        proposed_draft_tokens=drafted_total,
+    )
+
+
+def select_decode_worker_logs(logs_dir: Path) -> list[LogMetrics]:
+    """Keep one parsed log per worker id, ordered by numeric worker id."""
+
+    def richness(entry: LogMetrics) -> tuple[int, int]:
+        return (entry.samples, entry.proposed_draft_tokens)
+
+    # Duplicate worker ids: more samples wins, drafted tokens break ties,
+    # and an exact tie keeps whichever file was seen first.
+    best: dict[str, LogMetrics] = {}
+    for candidate in map(parse_decode_log, _decode_log_files(logs_dir)):
+        if candidate is None:
+            continue
+        holder = best.get(candidate.worker)
+        if holder is None or richness(candidate) > richness(holder):
+            best[candidate.worker] = candidate
+    return [best[worker_id] for worker_id in sorted(best, key=int)]
+
+
+def aggregate_log_metrics(logs_dir: Path, mtp: int) -> AggregatedMetrics | None:
+    """Combine per-worker log counters into one acceptance-length summary.
+
+    Raises ValueError for a non-positive mtp; returns None when no worker log
+    yields usable counters.
+    """
+    if mtp <= 0:
+        raise ValueError("mtp must be positive")
+
+    chosen = select_decode_worker_logs(logs_dir)
+    total_drafted = sum(entry.proposed_draft_tokens for entry in chosen)
+    if not chosen or total_drafted <= 0:
+        return None
+    total_accepted = sum(entry.accepted_tokens for entry in chosen)
+
+    # Drafted tokens divided by mtp stands in for the verify-step count.
+    steps_exact = total_drafted / mtp
+    return AggregatedMetrics(
+        workers=len(chosen),
+        samples=sum(entry.samples for entry in chosen),
+        accepted_tokens=total_accepted,
+        proposed_draft_tokens=total_drafted,
+        verify_steps=round(steps_exact),
+        acceptance_length=1.0 + (total_accepted / steps_exact),
+        selected_logs=tuple(entry.path for entry in chosen),
+    )
+
+
+def _record_args(args: argparse.Namespace, metrics: AggregatedMetrics | None) -> argparse.Namespace:
+    record = argparse.Namespace(  # CLI values copied over, plus metric fields filled in below
+        output=args.output,
+        reference_yaml=args.reference_yaml,
+        model=args.model,
+        model_prefix=args.model_prefix,
+        thinking_mode=args.thinking_mode,
+        num_speculative_tokens=args.num_speculative_tokens,
+        category=args.category,
+        output_len=args.output_len,
+        temperature=args.temperature,
+        threshold_ratio=args.threshold_ratio,
+        framework=args.framework,
+        metric_source=args.metric_source,
+        acceptance_length=None,  # metric fields stay None unless metrics were parsed
+        accepted_tokens=None,
+        draft_tokens=None,
+        verify_steps=None,
+        proposed_draft_tokens=None,
+        error=None,
+        exit_status=False,
+    )
+    if metrics is None:  # nothing parsed: return the record carrying only an error message
+        record.error = (
+            "Could not parse Dynamo speculative acceptance metrics from decode-worker logs"
+        )
+        return record
+
+    record.metric_source = (
+        f"{args.metric_source}-workers{metrics.workers}-samples{metrics.samples}"
+    )
+    record.acceptance_length = f"{metrics.acceptance_length:.4f}"
+    record.accepted_tokens = str(metrics.accepted_tokens)
+    record.verify_steps = str(metrics.verify_steps)
+    record.draft_tokens = str(metrics.verify_steps)
+    record.proposed_draft_tokens = str(metrics.proposed_draft_tokens)
+    return record
+
+
+def cmd_from_logs(args: argparse.Namespace) -> int:
+    """Aggregate decode-log metrics, hand the record to cmd_record, and print the logs that were used.
+    Returns cmd_record's status only when the built result passed; otherwise always 0."""
+    aggregated = aggregate_log_metrics(Path(args.logs_dir), args.num_speculative_tokens)
+    record_ns = _record_args(args, aggregated)
+    outcome = build_result(record_ns)
+    status = cmd_record(record_ns)
+
+    if aggregated is not None:
+        print("Dynamo SpeedBench AL log aggregation:")
+        for log_path in aggregated.selected_logs:
+            print(f"  selected {log_path}")
+    return status if outcome.get("passed") else 0
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the CLI parser (same option order as before); ``func`` defaults to ``cmd_from_logs``."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    for required_flag in ("--logs-dir", "--output", "--reference-yaml", "--model"):
+        cli.add_argument(required_flag, required=True)
+    cli.add_argument("--model-prefix", default="")
+    cli.add_argument("--thinking-mode", required=True)
+    cli.add_argument("--num-speculative-tokens", type=int, required=True)
+    cli.add_argument("--category", default="coding")
+    cli.add_argument("--output-len", type=int, default=4096)
+    cli.add_argument("--temperature", type=float, default=1.0)
+    cli.add_argument("--threshold-ratio", type=float, default=0.90)
+    cli.add_argument("--framework", default="dynamo")
+    cli.add_argument("--metric-source", default="dynamo-decode-log-counters")
+
+    cli.set_defaults(func=cmd_from_logs)
+    return cli
+
+
+def main() -> int:
+    """CLI entry point: run the selected command; on any exception print it to stderr and return 1."""
+    cli_args = build_parser().parse_args()
+    try:
+        return cli_args.func(cli_args)
+    except Exception as err:  # noqa: BLE001 - CLI should record a concise failure
+        print(f"ERROR: {err}", file=sys.stderr)
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
index bd7db0def2..81175ac20b 100644
--- a/utils/evals/test_speedbench_al.py
+++ b/utils/evals/test_speedbench_al.py
@@ -12,6 +12,7 @@
     extract_speedbench_al_metrics,
     score_cell,
 )
+from dynamo_speedbench_al_from_logs import aggregate_log_metrics
 from speedbench_al import build_result, load_reference, lookup_reference
 from speedbench_client import (
     _chat_payload,
@@ -172,6 +173,41 @@ def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> Non
     assert speedbench_paths == [result_path]
 
 
+def test_dynamo_log_parser_aggregates_decode_workers(tmp_path: Path) -> None:
+    def write_log(name: str, rows: list[tuple[float, int, int]]) -> None:
+        lines = []  # one "SpecDecoding metrics" log line per (AL, accepted, drafted) row
+        for al, accepted, drafted in rows:
+            lines.append(
+                "INFO metrics.log: SpecDecoding metrics: "
+                f"Mean acceptance length: {al}, "
+                "Accepted throughput: 1.0 tokens/s, "
+                "Drafted throughput: 1.0 tokens/s, "
+                f"Accepted: {accepted} tokens, Drafted: {drafted} tokens, "
+                "Per-position acceptance rate: 0.9, 0.7, "
+                "Avg Draft acceptance rate: 80.0%"
+            )
+        (tmp_path / name).write_text("\n".join(lines))
+
+    write_log("node-a_decode_w0.out", [(2.0, 10, 20)])  # w0: fewer samples than node-b, not selected
+    write_log("node-b_decode_w0.out", [(2.5, 15, 20), (2.5, 5, 10)])
+    write_log("node-c_decode_w1.out", [(2.0, 10, 20)])
+    write_log("node-d_decode_w1.out", [])  # w1: no metric lines, so this log is ignored
+
+    metrics = aggregate_log_metrics(tmp_path, mtp=2)
+
+    assert metrics is not None
+    assert metrics.workers == 2
+    assert metrics.samples == 3
+    assert metrics.accepted_tokens == 30  # node-b (15 + 5) + node-c (10)
+    assert metrics.proposed_draft_tokens == 50
+    assert metrics.verify_steps == 25  # 50 proposed / mtp
+    assert round(metrics.acceptance_length, 4) == 2.2  # 1 + 30 / 25; rounded, not exact float equality
+    assert [p.name for p in metrics.selected_logs] == [
+        "node-b_decode_w0.out",
+        "node-c_decode_w1.out",
+    ]
+
+
 def test_speedbench_client_loads_coding_and_builds_dsv4_payloads(tmp_path: Path) -> None:
     dataset = tmp_path / "speed_bench_data"
     dataset.mkdir()
diff --git a/utils/evals/write_dynamo_speedbench_al_from_logs.sh b/utils/evals/write_dynamo_speedbench_al_from_logs.sh
new file mode 100644
index 0000000000..b33d0bdcd3
--- /dev/null
+++ b/utils/evals/write_dynamo_speedbench_al_from_logs.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# Best-effort: write a SpeedBench AL artifact from Dynamo decode-worker logs.
+# Usage: write_dynamo_speedbench_al_from_logs.sh <logs_dir> [workspace]
+# Every path below ends with status 0 (the Python step is guarded by `|| true`).
+set -u
+
+logs_dir="${1:-}"
+workspace="${2:-${GITHUB_WORKSPACE:-$(pwd)}}"
+
+if [[ -z "$logs_dir" ]]; then
+    echo "Dynamo SpeedBench AL: missing logs directory argument" >&2
+    exit 0
+fi
+
+if [[ "${FRAMEWORK:-}" != dynamo* || "${SPEC_DECODING:-none}" != "mtp" ]]; then
+    echo "Dynamo SpeedBench AL: skipping FRAMEWORK=${FRAMEWORK:-unknown} SPEC_DECODING=${SPEC_DECODING:-none}"
+    exit 0
+fi
+
+# MTP depth: explicit env vars first, then the first num_speculative_tokens in CONFIG_FILE.
+mtp="${SPEEDBENCH_NUM_SPEC_TOKENS:-${NUM_SPEC_TOKENS:-${SPECULATIVE_DRAFT_TOKENS:-}}}"
+if [[ -z "$mtp" && -n "${CONFIG_FILE:-}" ]]; then
+    config_path="${CONFIG_FILE%%:*}"
+    if [[ -f "$config_path" ]]; then
+        mtp="$(sed -n 's/.*num_speculative_tokens[^0-9]*\([0-9][0-9]*\).*/\1/p' "$config_path" | head -1)"
+    fi
+fi
+if [[ -z "$mtp" ]]; then
+    # The parser divides proposed draft tokens by this depth, so a guessed value skews AL: say so.
+    echo "Dynamo SpeedBench AL: WARNING: MTP depth not found in env or CONFIG_FILE; assuming 2" >&2
+    mtp=2
+fi
+
+mode="${SPEEDBENCH_THINKING_MODE:-}"
+if [[ -z "$mode" ]]; then
+    if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
+        mode="on"
+    else
+        mode="off"
+    fi
+fi
+
+model_name="${MODEL_NAME:-${MODEL:-${SERVED_MODEL_NAME:-unknown}}}"
+output="${workspace}/results_speedbench_al_${mode}_mtp${mtp}.json"
+metric_source="${FRAMEWORK:-dynamo}-decode-log-counters"
+
+echo "Dynamo SpeedBench AL: parsing decode logs from $logs_dir"
+python3 "${workspace}/utils/evals/dynamo_speedbench_al_from_logs.py" \
+    --logs-dir "$logs_dir" \
+    --output "$output" \
+    --reference-yaml "${workspace}/benchmarks/speedbench-reference-al.yaml" \
+    --model "$model_name" \
+    --model-prefix "${MODEL_PREFIX:-}" \
+    --thinking-mode "$mode" \
+    --num-speculative-tokens "$mtp" \
+    --framework "${FRAMEWORK:-dynamo}" \
+    --metric-source "$metric_source" || true

From 2aef667c4432bbc36c1a551f2aa4f9e2fc39aee7 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 19:03:39 -0700
Subject: [PATCH 17/27] Use TRT-LLM server logs for SpeedBench AL fallback

---
 benchmarks/benchmark_lib.sh                  |  31 +++++
 utils/evals/EVALS.md                         |   3 +-
 utils/evals/test_speedbench_al.py            |  51 +++++++
 utils/evals/trtllm_speedbench_al_from_log.py | 138 +++++++++++++++++++
 4 files changed, 222 insertions(+), 1 deletion(-)
 create mode 100644 utils/evals/trtllm_speedbench_al_from_log.py

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index ef6c8604d5..affc88aea7 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1086,6 +1086,19 @@ print(f"{acceptance_length:.4f}\t{used_endpoints}")
 PY
 }
 
+_speedbench_trtllm_server_log_metrics() {
+    # $1 = MTP depth, $2 = log byte offset (default 0). Prints the parser's TSV row; returns 1 without a log file.
+    local mtp="$1" start_offset="${2:-0}"
+    local log_path="${SPEEDBENCH_TRTLLM_SERVER_LOG:-${SERVER_LOG:-}}"
+    if [[ -z "$log_path" || ! -f "$log_path" ]]; then
+        return 1
+    fi
+    python3 "$(pwd)/utils/evals/trtllm_speedbench_al_from_log.py" \
+        --log "$log_path" \
+        --num-speculative-tokens "$mtp" \
+        --start-offset "$start_offset"
+}
+
 _speedbench_metric_delta() {
     local before="$1"
     local after="$2"
@@ -1447,6 +1460,16 @@ run_speedbench_al_eval() {
     proposed_before="${proposed_before:-0}"
     verify_before="${verify_before:-0}"
 
+    local trt_server_log_offset="0"
+    if [[ "$metrics_framework" == "trtllm" ]]; then
+        local trt_server_log="${SPEEDBENCH_TRTLLM_SERVER_LOG:-${SERVER_LOG:-}}"
+        if [[ -n "$trt_server_log" && -f "$trt_server_log" ]]; then
+            trt_server_log_offset=$(wc -c < "$trt_server_log" 2>/dev/null || true)
+            trt_server_log_offset="${trt_server_log_offset//[!0-9]/}"
+            trt_server_log_offset="${trt_server_log_offset:-0}"
+        fi
+    fi
+
     local bench_rc=0
     local speedbench_model="${MODEL_NAME:-${MODEL:-}}"
     echo "SpeedBench AL eval: running mode=${mode} mtp=${mtp}"
@@ -1539,6 +1562,14 @@ run_speedbench_al_eval() {
                 IFS=$'\t' read -r al delta_acc delta_verify delta_proposed trt_json_endpoints <<< "$trt_json_metrics"
                 metric_source="trtllm-json-iteration-stats-endpoints${trt_json_endpoints}"
             fi
+            if [[ -z "$al" ]]; then
+                local trt_log_metrics="" trt_log_samples=""
+                trt_log_metrics=$(_speedbench_trtllm_server_log_metrics "$mtp" "$trt_server_log_offset" || true)
+                if [[ -n "$trt_log_metrics" ]]; then
+                    IFS=$'\t' read -r al delta_acc delta_verify delta_proposed trt_log_samples <<< "$trt_log_metrics"
+                    metric_source="trtllm-server-log-generation-tokens-samples${trt_log_samples}"
+                fi
+            fi
             if [[ -z "$al" ]]; then
                 al=$(_speedbench_trtllm_avg_decoded_al "$port" || true)
                 if [[ -n "$al" ]]; then
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 63a96cb29a..6aeeb585d5 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -140,8 +140,9 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 | `SPEEDBENCH_METRICS_URLS` | unset | Generic comma/space-separated Prometheus endpoints when decode-specific naming is not applicable |
 | `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports to scrape when full URLs are not supplied |
 | `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stats `/metrics` endpoints used when Prometheus spec metrics are unavailable |
+| `SPEEDBENCH_TRTLLM_SERVER_LOG` | `SERVER_LOG` | Optional TRT-LLM `print_iter_log` file used to derive SpeedBench AL from generation-token iteration logs when spec metrics are unavailable |
 
-SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations expose `avgNumDecodedTokensPerIter` without `specDecodingStats`; for those, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints when available, summing counters and averaging gauge-only AL values. The NVIDIA srt-slurm Dynamo eval path also writes a SpeedBench AL artifact from decode-worker `SpecDecoding metrics` log counters when the router eval path does not expose decode-worker metrics endpoints to the benchmarker.
+SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations enable `print_iter_log` but do not expose `specDecodingStats`; for those, SpeedBench records the server-log byte offset before running SpeedBench and derives accepted/proposed/verify counters from the new `num_generation_tokens` iteration lines. If neither exact spec stats nor server logs are available, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints when available, summing counters and averaging gauge-only AL values. The NVIDIA srt-slurm Dynamo eval path also writes a SpeedBench AL artifact from decode-worker `SpecDecoding metrics` log counters when the router eval path does not expose decode-worker metrics endpoints to the benchmarker.
 
 ### Score validation
 `utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
index 81175ac20b..0975b9af97 100644
--- a/utils/evals/test_speedbench_al.py
+++ b/utils/evals/test_speedbench_al.py
@@ -19,6 +19,7 @@
     _completion_payload,
     _load_speedbench_requests,
 )
+from trtllm_speedbench_al_from_log import parse_trtllm_iteration_log
 from validate_scores import validate_speedbench_al
 
 
@@ -208,6 +209,56 @@ def write_log(name: str, rows: list[tuple[float, int, int]]) -> None:
     ]
 
 
+def test_trtllm_log_parser_reads_generation_tokens_after_offset(tmp_path: Path) -> None:
+    log_path = tmp_path / "server.log"
+    prefix = "previous eval traffic\n"
+    body = "\n".join(
+        [
+            "[TRT-LLM] [I] iter = 1, num_scheduled_requests: 1, "
+            "states = {'num_ctx_requests': 1, 'num_ctx_tokens': 1024, 'num_generation_tokens': 0}",  # zero generation tokens: skipped
+            "[TRT-LLM] [I] iter = 2, num_scheduled_requests: 1, "
+            "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 3}",
+            "[TRT-LLM] [I] iter = 3, num_scheduled_requests: 1, "
+            "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 2}",
+            'INFO:     127.0.0.1:1 - "GET /prometheus/metrics HTTP/1.1" 200 OK',  # parsing stops here: samples already seen
+            "[TRT-LLM] [I] iter = 4, num_scheduled_requests: 32, "
+            "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 96}",
+        ]
+    )
+    log_path.write_text(prefix + body)
+
+    metrics = parse_trtllm_iteration_log(log_path, mtp=2, start_offset=len(prefix))  # skip the prefix bytes
+
+    assert metrics is not None
+    assert metrics.samples == 2  # iters 2 and 3 only
+    assert metrics.generated_tokens == 5
+    assert metrics.accepted_tokens == 3  # (3 - 1) + (2 - 1)
+    assert metrics.verify_steps == 2
+    assert metrics.proposed_draft_tokens == 4  # 2 steps * mtp
+    assert metrics.acceptance_length == 2.5  # 5 generated / 2 steps
+
+
+def test_trtllm_log_parser_can_infer_batched_steps(tmp_path: Path) -> None:
+    log_path = tmp_path / "server.log"
+    log_path.write_text(
+        "[TRT-LLM] [I] iter = 10, num_scheduled_requests: 28, "
+        "states = {'num_ctx_requests': 9, 'num_ctx_tokens': 9345, 'num_generation_tokens': 57}"
+    )
+
+    metrics = parse_trtllm_iteration_log(
+        log_path,
+        mtp=2,
+        stop_at_metrics_get=False,
+    )
+
+    assert metrics is not None
+    assert metrics.samples == 1
+    assert metrics.verify_steps == 19  # ceil(57 / (mtp + 1))
+    assert metrics.accepted_tokens == 38  # 57 - 19
+    assert metrics.proposed_draft_tokens == 38  # 19 * mtp
+    assert metrics.acceptance_length == 3.0  # 57 / 19
+
+
 def test_speedbench_client_loads_coding_and_builds_dsv4_payloads(tmp_path: Path) -> None:
     dataset = tmp_path / "speed_bench_data"
     dataset.mkdir()
diff --git a/utils/evals/trtllm_speedbench_al_from_log.py b/utils/evals/trtllm_speedbench_al_from_log.py
new file mode 100644
index 0000000000..c63a933915
--- /dev/null
+++ b/utils/evals/trtllm_speedbench_al_from_log.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""Parse TRT-LLM iteration logs into SpeedBench AL counters."""
+
+from __future__ import annotations
+
+import argparse
+import math
+import re
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+GEN_TOKENS_RE = re.compile(r"'num_generation_tokens':\s*(?P<tokens>[0-9]+)")  # generation-token count on a line
+ITER_LINE_RE = re.compile(r"\biter\s*=\s*[0-9]+,.*\bstates\s*=")  # an "iter = N, ... states = ..." log line
+METRICS_GET_RE = re.compile(r'GET\s+/(?:prometheus/)?metrics\b')  # a GET of /metrics or /prometheus/metrics
+
+
+@dataclass(frozen=True)
+class TrtLogMetrics:
+    samples: int  # as filled by parse_trtllm_iteration_log: iteration lines with generation tokens > 0
+    accepted_tokens: int  # as filled by the parser: generated tokens minus one per verify step
+    proposed_draft_tokens: int  # as filled by the parser: verify_steps * mtp
+    verify_steps: int
+    generated_tokens: int  # as filled by the parser: sum of num_generation_tokens
+
+    @property
+    def acceptance_length(self) -> float:
+        return self.generated_tokens / self.verify_steps  # mean generated tokens per verify step
+
+
+def _read_log_suffix(path: Path, start_offset: int) -> list[str]:
+    """Return the log's lines from byte ``start_offset`` onward (non-positive = whole file; bad bytes dropped)."""
+    with path.open("rb") as handle:
+        handle.seek(start_offset if start_offset > 0 else 0)
+        return handle.read().decode(errors="ignore").splitlines()
+
+
+def parse_trtllm_iteration_log(
+    path: Path,
+    mtp: int,
+    start_offset: int = 0,
+    stop_at_metrics_get: bool = True,
+) -> TrtLogMetrics | None:
+    """Derive SpeedBench AL counters from TRT-LLM iteration log lines.
+
+    Reads ``path`` from byte ``start_offset`` and counts every iteration line
+    whose ``num_generation_tokens`` is positive. With ``stop_at_metrics_get``,
+    parsing ends at the first ``/metrics`` GET seen after a sample was counted.
+
+    Returns ``None`` when ``path`` is not a file or no sample was found.
+    Raises ``ValueError`` when ``mtp`` is not positive.
+    """
+    if mtp <= 0:
+        raise ValueError("mtp must be positive")
+    if not path.is_file():
+        return None
+
+    step_token_cap = mtp + 1
+    token_counts: list[int] = []
+    for raw_line in _read_log_suffix(path, start_offset):
+        if token_counts and stop_at_metrics_get and METRICS_GET_RE.search(raw_line):
+            break
+        found = GEN_TOKENS_RE.search(raw_line) if ITER_LINE_RE.search(raw_line) else None
+        tokens = int(found.group("tokens")) if found else 0
+        if tokens > 0:
+            token_counts.append(tokens)
+
+    if not token_counts:
+        return None
+
+    # SpeedBench AL is issued at max-concurrency=1 today, so a generation
+    # iteration is normally one verification step. For batched postmortem logs,
+    # infer the step count by assuming no request emits more than mtp + 1
+    # tokens per step.
+    step_counts = [
+        max(1, math.ceil(tokens / step_token_cap)) for tokens in token_counts
+    ]
+    verify_steps = sum(step_counts)
+    accepted = sum(
+        max(tokens - steps, 0) for tokens, steps in zip(token_counts, step_counts)
+    )
+
+    return TrtLogMetrics(
+        samples=len(token_counts),
+        accepted_tokens=accepted,
+        proposed_draft_tokens=verify_steps * mtp,
+        verify_steps=verify_steps,
+        generated_tokens=sum(token_counts),
+    )
+
+
+def cmd_tsv(args: argparse.Namespace) -> int:
+    """Print one TSV row (AL, accepted, verify steps, proposed, samples); return 1 if nothing was parsed."""
+    parsed = parse_trtllm_iteration_log(
+        Path(args.log), args.num_speculative_tokens,
+        start_offset=max(args.start_offset, 0),
+        stop_at_metrics_get=not args.no_stop_at_metrics_get,
+    )
+    if parsed is None:
+        return 1
+    columns = [
+        f"{parsed.acceptance_length:.4f}",
+        str(parsed.accepted_tokens),
+        str(parsed.verify_steps),
+        str(parsed.proposed_draft_tokens),
+        str(parsed.samples),
+    ]
+    print("\t".join(columns))
+    return 0
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the CLI parser; ``func`` defaults to ``cmd_tsv``."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--log", required=True)
+    cli.add_argument("--num-speculative-tokens", type=int, required=True)
+    cli.add_argument("--start-offset", type=int, default=0)
+    cli.add_argument(
+        "--no-stop-at-metrics-get", action="store_true",
+        help="Do not stop parsing at the next /metrics request after samples appear.",
+    )
+    cli.set_defaults(func=cmd_tsv)
+    return cli
+
+
+def main() -> int:
+    """CLI entry point: run the selected command; on any exception print it to stderr and return 1."""
+    cli_args = build_parser().parse_args()
+    try:
+        return cli_args.func(cli_args)
+    except Exception as err:  # noqa: BLE001 - CLI should return concise diagnostics
+        print(f"ERROR: {err}", file=sys.stderr)
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 816dd1a84bbccbd914449fb0792aefabae324ddf Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 20:50:39 -0700
Subject: [PATCH 18/27] Capture GB200 srt-slurm bootstrap logs on early failure

---
 runners/launch_gb200-nv.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index ee149e7c41..67329f1cc2 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -415,6 +415,18 @@ while ! ls "$LOG_FILE" &>/dev/null; do
     if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
         echo "ERROR: Job $JOB_ID failed before creating log file"
         scontrol show job "$JOB_ID"
+        BOOTSTRAP_LOG="outputs/sweep_${JOB_ID}.bootstrap.log"
+        if [ -f "$BOOTSTRAP_LOG" ]; then
+            echo "Bootstrap log from $BOOTSTRAP_LOG:"
+            cat "$BOOTSTRAP_LOG"
+            if [ -n "${GITHUB_WORKSPACE:-}" ]; then
+                mkdir -p "$GITHUB_WORKSPACE/LOGS"
+                cp "$BOOTSTRAP_LOG" "$GITHUB_WORKSPACE/LOGS/" || true
+                tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$GITHUB_WORKSPACE/LOGS" . || true
+            fi
+        else
+            echo "Bootstrap log not found at $BOOTSTRAP_LOG"
+        fi
         exit 1
     fi
     echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..."

From 9cbb11cd0ee2f43840d37bd50390af0f51c144f3 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 20:55:46 -0700
Subject: [PATCH 19/27] Parse Dynamo SGLang SpeedBench AL from decode logs

---
 runners/launch_gb300-cw.sh                    |   3 +
 utils/evals/dynamo_speedbench_al_from_logs.py | 102 +++++++++++++-----
 utils/evals/test_speedbench_al.py             |  25 +++++
 3 files changed, 105 insertions(+), 25 deletions(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 6a5c50e381..a92d4bc388 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -394,6 +394,9 @@ fi
 
 if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
     EVAL_DIR="$LOGS_DIR/eval_results"
+    if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then
+        bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE"
+    fi
     if [ -d "$EVAL_DIR" ]; then
         echo "Extracting eval results from $EVAL_DIR"
         shopt -s nullglob
diff --git a/utils/evals/dynamo_speedbench_al_from_logs.py b/utils/evals/dynamo_speedbench_al_from_logs.py
index 0cf71ac4d8..f70992537c 100644
--- a/utils/evals/dynamo_speedbench_al_from_logs.py
+++ b/utils/evals/dynamo_speedbench_al_from_logs.py
@@ -20,6 +20,10 @@
     r"Accepted:\s*(?P<accepted>[0-9]+)\s*tokens,\s*"
     r"Drafted:\s*(?P<drafted>[0-9]+)\s*tokens"
 )
+SGLANG_ACCEPT_LINE_RE = re.compile(
+    r"\baccept len:\s*(?P<al>[0-9]+(?:\.[0-9]+)?)"
+    r"\s*,\s*accept rate:\s*(?P<rate>[0-9]+(?:\.[0-9]+)?)"
+)
 WORKER_RE = re.compile(r"_decode_w(?P<worker>[0-9]+)\.out$")
 
 
@@ -28,18 +32,25 @@ class LogMetrics:
     path: Path
     worker: str
     samples: int
+    acceptance_length_samples: int
+    acceptance_length_total: float
     accepted_tokens: int
     proposed_draft_tokens: int
 
+    @property
+    def has_counter_metrics(self) -> bool:
+        return self.proposed_draft_tokens > 0
+
 
 @dataclass(frozen=True)
 class AggregatedMetrics:
     workers: int
     samples: int
-    accepted_tokens: int
-    proposed_draft_tokens: int
-    verify_steps: int
+    accepted_tokens: int | None
+    proposed_draft_tokens: int | None
+    verify_steps: int | None
     acceptance_length: float
+    has_counter_metrics: bool
     selected_logs: tuple[Path, ...]
 
 
@@ -55,6 +66,8 @@ def parse_decode_log(path: Path) -> LogMetrics | None:
         return None
 
     samples = 0
+    acceptance_length_samples = 0
+    acceptance_length_total = 0.0
     accepted = 0
     drafted = 0
     try:
@@ -65,18 +78,26 @@ def parse_decode_log(path: Path) -> LogMetrics | None:
     for line in lines:
         parsed = SPEC_LINE_RE.search(line)
         if not parsed:
+            sglang_parsed = SGLANG_ACCEPT_LINE_RE.search(line)
+            if not sglang_parsed:
+                continue
+            samples += 1
+            acceptance_length_samples += 1
+            acceptance_length_total += float(sglang_parsed.group("al"))
             continue
         samples += 1
         accepted += int(parsed.group("accepted"))
         drafted += int(parsed.group("drafted"))
 
-    if samples == 0 or drafted <= 0:
+    if samples == 0 or (drafted <= 0 and acceptance_length_samples == 0):
         return None
 
     return LogMetrics(
         path=path,
         worker=match.group("worker"),
         samples=samples,
+        acceptance_length_samples=acceptance_length_samples,
+        acceptance_length_total=acceptance_length_total,
         accepted_tokens=accepted,
         proposed_draft_tokens=drafted,
     )
@@ -92,9 +113,16 @@ def select_decode_worker_logs(logs_dir: Path) -> list[LogMetrics]:
         if current is None:
             by_worker[metrics.worker] = metrics
             continue
-        if (metrics.samples, metrics.proposed_draft_tokens) > (
+        if (
+            metrics.has_counter_metrics,
+            metrics.samples,
+            metrics.proposed_draft_tokens,
+            metrics.acceptance_length_samples,
+        ) > (
+            current.has_counter_metrics,
             current.samples,
             current.proposed_draft_tokens,
+            current.acceptance_length_samples,
         ):
             by_worker[metrics.worker] = metrics
     return [by_worker[k] for k in sorted(by_worker, key=int)]
@@ -108,23 +136,42 @@ def aggregate_log_metrics(logs_dir: Path, mtp: int) -> AggregatedMetrics | None:
     if not selected:
         return None
 
-    accepted = sum(item.accepted_tokens for item in selected)
-    proposed = sum(item.proposed_draft_tokens for item in selected)
-    samples = sum(item.samples for item in selected)
-    if proposed <= 0:
-        return None
+    counter_logs = [item for item in selected if item.has_counter_metrics]
+    if counter_logs:
+        accepted = sum(item.accepted_tokens for item in counter_logs)
+        proposed = sum(item.proposed_draft_tokens for item in counter_logs)
+        samples = sum(item.samples for item in counter_logs)
+        verify_steps = round(proposed / mtp)
+        acceptance_length = 1.0 + (accepted / (proposed / mtp))
+
+        return AggregatedMetrics(
+            workers=len(counter_logs),
+            samples=samples,
+            accepted_tokens=accepted,
+            proposed_draft_tokens=proposed,
+            verify_steps=verify_steps,
+            acceptance_length=acceptance_length,
+            has_counter_metrics=True,
+            selected_logs=tuple(item.path for item in counter_logs),
+        )
 
-    verify_steps = round(proposed / mtp)
-    acceptance_length = 1.0 + (accepted / (proposed / mtp))
+    al_logs = [item for item in selected if item.acceptance_length_samples > 0]
+    al_samples = sum(item.acceptance_length_samples for item in al_logs)
+    if al_samples <= 0:
+        return None
+    acceptance_length = (
+        sum(item.acceptance_length_total for item in al_logs) / al_samples
+    )
 
     return AggregatedMetrics(
-        workers=len(selected),
-        samples=samples,
-        accepted_tokens=accepted,
-        proposed_draft_tokens=proposed,
-        verify_steps=verify_steps,
+        workers=len(al_logs),
+        samples=al_samples,
+        accepted_tokens=None,
+        proposed_draft_tokens=None,
+        verify_steps=None,
         acceptance_length=acceptance_length,
-        selected_logs=tuple(item.path for item in selected),
+        has_counter_metrics=False,
+        selected_logs=tuple(item.path for item in al_logs),
     )
 
 
@@ -156,14 +203,19 @@ def _record_args(args: argparse.Namespace, metrics: AggregatedMetrics | None) ->
         )
         return record
 
-    record.metric_source = (
-        f"{args.metric_source}-workers{metrics.workers}-samples{metrics.samples}"
-    )
+    metric_source = args.metric_source
+    if (
+        not metrics.has_counter_metrics
+        and metric_source.endswith("-decode-log-counters")
+    ):
+        metric_source = metric_source[: -len("counters")] + "accept-length"
+    record.metric_source = f"{metric_source}-workers{metrics.workers}-samples{metrics.samples}"
     record.acceptance_length = f"{metrics.acceptance_length:.4f}"
-    record.accepted_tokens = str(metrics.accepted_tokens)
-    record.verify_steps = str(metrics.verify_steps)
-    record.draft_tokens = str(metrics.verify_steps)
-    record.proposed_draft_tokens = str(metrics.proposed_draft_tokens)
+    if metrics.has_counter_metrics:
+        record.accepted_tokens = str(metrics.accepted_tokens)
+        record.verify_steps = str(metrics.verify_steps)
+        record.draft_tokens = str(metrics.verify_steps)
+        record.proposed_draft_tokens = str(metrics.proposed_draft_tokens)
     return record
 
 
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
index 0975b9af97..b02cb5bd98 100644
--- a/utils/evals/test_speedbench_al.py
+++ b/utils/evals/test_speedbench_al.py
@@ -209,6 +209,31 @@ def write_log(name: str, rows: list[tuple[float, int, int]]) -> None:
     ]
 
 
+def test_dynamo_log_parser_reads_sglang_accept_length_samples(tmp_path: Path) -> None:
+    (tmp_path / "node-a_decode_w0.out").write_text(
+        "\n".join(
+            [
+                "Decode batch, #running-req: 1, accept len: 2.10, accept rate: 0.37,",
+                "Decode batch, #running-req: 1, accept len: 2.30, accept rate: 0.43,",
+            ]
+        )
+    )
+    (tmp_path / "node-b_decode_w1.out").write_text(
+        "Decode batch, #running-req: 1, accept len: 2.50, accept rate: 0.50,"
+    )
+
+    metrics = aggregate_log_metrics(tmp_path, mtp=4)
+
+    assert metrics is not None
+    assert metrics.workers == 2
+    assert metrics.samples == 3  # "accept len" lines across both workers
+    assert round(metrics.acceptance_length, 4) == 2.3  # (2.10 + 2.30 + 2.50) / 3
+    assert metrics.has_counter_metrics is False  # these logs carry no Accepted/Drafted counter lines
+    assert metrics.accepted_tokens is None
+    assert metrics.verify_steps is None
+    assert metrics.proposed_draft_tokens is None
+
+
 def test_trtllm_log_parser_reads_generation_tokens_after_offset(tmp_path: Path) -> None:
     log_path = tmp_path / "server.log"
     prefix = "previous eval traffic\n"

From 046f3042e791edaa56c3a27fbe7a7b29d43de44e Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 10 Jun 2026 20:57:43 -0700
Subject: [PATCH 20/27] Read GB200 bootstrap log from Slurm stderr path

---
 runners/launch_gb200-nv.sh | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 67329f1cc2..7ebb2ac7e3 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -414,8 +414,24 @@ LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log"
 while ! ls "$LOG_FILE" &>/dev/null; do
     if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
         echo "ERROR: Job $JOB_ID failed before creating log file"
-        scontrol show job "$JOB_ID"
-        BOOTSTRAP_LOG="outputs/sweep_${JOB_ID}.bootstrap.log"
+        JOB_INFO=$(scontrol show job "$JOB_ID" 2>&1 || true)
+        echo "$JOB_INFO"
+        BOOTSTRAP_LOG=""
+        BOOTSTRAP_CANDIDATES=("outputs/sweep_${JOB_ID}.bootstrap.log")
+        SCONTROL_STDERR=$(printf '%s\n' "$JOB_INFO" | awk '{ for (i = 1; i <= NF; i++) if ($i ~ /^StdErr=/) { sub(/^StdErr=/, "", $i); print $i; exit } }')
+        if [ -n "$SCONTROL_STDERR" ]; then
+            BOOTSTRAP_CANDIDATES+=("$SCONTROL_STDERR")
+        fi
+        for candidate in "${BOOTSTRAP_CANDIDATES[@]}"; do
+            [ -n "$candidate" ] || continue
+            for _ in 1 2 3; do
+                if [ -f "$candidate" ]; then
+                    BOOTSTRAP_LOG="$candidate"
+                    break 2
+                fi
+                sleep 2
+            done
+        done
         if [ -f "$BOOTSTRAP_LOG" ]; then
             echo "Bootstrap log from $BOOTSTRAP_LOG:"
             cat "$BOOTSTRAP_LOG"
@@ -425,7 +441,7 @@ while ! ls "$LOG_FILE" &>/dev/null; do
                 tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$GITHUB_WORKSPACE/LOGS" . || true
             fi
         else
-            echo "Bootstrap log not found at $BOOTSTRAP_LOG"
+            echo "Bootstrap log not found. Tried: ${BOOTSTRAP_CANDIDATES[*]}"
         fi
         exit 1
     fi

From 16e7aa572da9b1ae300eaaebda67f600908f68a5 Mon Sep 17 00:00:00 2001
From: hjjq <50634613+hjjq@users.noreply.github.com>
Date: Fri, 26 Jun 2026 08:09:19 -0700
Subject: [PATCH 21/27] update SGL metrics gathering method

---
 benchmarks/benchmark_lib.sh | 45 +++++++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index affc88aea7..31cdc013b6 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1177,6 +1177,9 @@ _speedbench_spec_counter_metric() {
         sglang:verify)
             echo "sglang:spec_verify_calls_total"
             ;;
+        sglang:completion)
+            echo "sglang:generation_tokens_total"
+            ;;
         *)
             return 1
             ;;
@@ -1439,6 +1442,9 @@ run_speedbench_al_eval() {
     if [[ "$client" != "openai" && "$client" != "native" ]] && command -v vllm >/dev/null 2>&1; then
         use_vllm_client=1
     fi
+    if [[ "$metrics_framework" == "sglang" ]]; then
+        use_vllm_client=0
+    fi
 
     local think_args=()
     if [[ "$mode" == "on" ]]; then
@@ -1452,13 +1458,15 @@ run_speedbench_al_eval() {
         fi
     fi
 
-    local accepted_before="" proposed_before="" verify_before=""
+    local accepted_before="" proposed_before="" verify_before="" completion_before=""
     accepted_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true)
     proposed_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "proposed" 2>/dev/null || true)
     verify_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "verify" 2>/dev/null || true)
+    completion_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "completion" 2>/dev/null || true)
     accepted_before="${accepted_before:-0}"
     proposed_before="${proposed_before:-0}"
     verify_before="${verify_before:-0}"
+    completion_before="${completion_before:-0}"
 
     local trt_server_log_offset="0"
     if [[ "$metrics_framework" == "trtllm" ]]; then
@@ -1529,11 +1537,12 @@ run_speedbench_al_eval() {
         return 0
     fi
 
-    local accepted_after="" proposed_after="" verify_after=""
-    local al="" delta_acc="" delta_proposed="" delta_verify="" metric_source=""
+    local accepted_after="" proposed_after="" verify_after="" completion_after=""
+    local al="" delta_acc="" delta_proposed="" delta_verify="" delta_completion="" metric_source=""
     accepted_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true)
     proposed_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "proposed" 2>/dev/null || true)
     verify_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "verify" 2>/dev/null || true)
+    completion_after=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "completion" 2>/dev/null || true)
 
     if [[ -n "$accepted_after" ]]; then
         delta_acc=$(_speedbench_round_metric "$(_speedbench_metric_delta "$accepted_before" "$accepted_after")")
@@ -1544,6 +1553,9 @@ run_speedbench_al_eval() {
     if [[ -n "$verify_after" ]]; then
         delta_verify=$(_speedbench_round_metric "$(_speedbench_metric_delta "$verify_before" "$verify_after")")
     fi
+    if [[ -n "$completion_after" ]]; then
+        delta_completion=$(_speedbench_round_metric "$(_speedbench_metric_delta "$completion_before" "$completion_after")")
+    fi
 
     if [[ "$metrics_framework" == "vllm" && -n "$delta_acc" && -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then
         al=$(awk -v accepted="$delta_acc" -v verify="$delta_verify" 'BEGIN { printf "%.4f", 1 + (accepted / verify) }')
@@ -1586,17 +1598,32 @@ run_speedbench_al_eval() {
             fi
         fi
     elif [[ "$metrics_framework" == "sglang" ]]; then
-        al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true)
-        if [[ -n "$al" ]]; then
-            metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}"
-        fi
-        if [[ -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then
-            local draft_depth=""
+        local draft_depth=""
+        if [[ -n "$delta_completion" && "$delta_completion" -gt 0 && -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then
+            al=$(awk -v completion="$delta_completion" -v verify="$delta_verify" 'BEGIN { printf "%.4f", completion / verify }')
+            delta_acc=$(_speedbench_round_metric "$(awk -v completion="$delta_completion" -v verify="$delta_verify" 'BEGIN { value = completion - verify; if (value < 0) value = 0; printf "%.10f\n", value }')")
             draft_depth=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "draft_tokens_per_step" 2>/dev/null || true)
             if [[ -n "$draft_depth" ]]; then
                 delta_proposed=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")
+            elif [[ -n "$mtp" ]]; then
+                delta_proposed=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v mtp="$mtp" 'BEGIN { value = verify * mtp; if (value < 0) value = 0; printf "%.10f\n", value }')")
             fi
+            metric_source="${metric_source_base}-generation-counter+verify-counter-endpoints${metrics_endpoint_count}"
+        fi
+        if [[ -z "$al" ]]; then
+            al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true)
             if [[ -n "$al" ]]; then
+                metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}"
+            fi
+        fi
+        if [[ -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then
+            if [[ -z "$draft_depth" ]]; then
+                draft_depth=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "draft_tokens_per_step" 2>/dev/null || true)
+            fi
+            if [[ -n "$draft_depth" ]]; then
+                delta_proposed="${delta_proposed:-$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")}"
+            fi
+            if [[ -n "$al" && "$metric_source" != *"generation-counter+verify-counter"* ]]; then
                 delta_acc=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v al="$al" 'BEGIN { value = verify * (al - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")
                 metric_source="${metric_source:-${metric_source_base}-gauge-endpoints${metrics_endpoint_count}}+derived-token-counters"
             fi

From a48fc820a08e677f528d308334047639c88412e8 Mon Sep 17 00:00:00 2001
From: hjjq <50634613+hjjq@users.noreply.github.com>
Date: Mon, 29 Jun 2026 11:42:40 -0700
Subject: [PATCH 22/27] fix: validate SpeedBench AL within golden tolerance

---
 benchmarks/benchmark_lib.sh                   |  6 ++-
 utils/collect_eval_results.py                 | 12 +++++-
 utils/evals/EVALS.md                          |  2 +-
 utils/evals/dynamo_speedbench_al_from_logs.py |  4 +-
 utils/evals/speedbench_al.py                  | 43 ++++++++++++++++---
 utils/evals/test_speedbench_al.py             | 42 ++++++++++++++----
 utils/evals/validate_scores.py                | 26 ++++++++---
 7 files changed, 111 insertions(+), 24 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 31cdc013b6..a9db11e62d 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1248,7 +1248,8 @@ _speedbench_write_eval_result() {
         --category "coding"
         --output-len "4096"
         --temperature "1.0"
-        --threshold-ratio "0.90"
+        --threshold-ratio "0.95"
+        --max-threshold-ratio "1.05"
     )
     if [[ -n "$framework" ]]; then
         record_cmd+=(--framework "$framework")
@@ -1287,7 +1288,8 @@ _speedbench_reference_available() {
         --model-prefix "${MODEL_PREFIX:-}" \
         --thinking-mode "$mode" \
         --num-speculative-tokens "$mtp" \
-        --threshold-ratio "0.90" >/dev/null
+        --threshold-ratio "0.95" \
+        --max-threshold-ratio "1.05" >/dev/null
 }
 
 _speedbench_prepare_dataset() {
diff --git a/utils/collect_eval_results.py b/utils/collect_eval_results.py
index 45b464329d..cca611c6f9 100644
--- a/utils/collect_eval_results.py
+++ b/utils/collect_eval_results.py
@@ -167,7 +167,9 @@ def extract_speedbench_al_metrics(json_path: Path) -> List[Dict[str, Any]]:
         'acceptance_length': data.get('acceptance_length'),
         'reference_acceptance_length': data.get('reference_acceptance_length'),
         'min_acceptance_length': data.get('min_acceptance_length'),
+        'max_acceptance_length': data.get('max_acceptance_length'),
         'threshold_ratio': data.get('threshold_ratio'),
+        'max_threshold_ratio': data.get('max_threshold_ratio'),
         'thinking_mode': mode,
         'num_speculative_tokens': mtp,
         'speedbench_framework': data.get('framework'),
@@ -275,7 +277,9 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
         row['score_se'] = None
         row['speedbench_reference_acceptance_length'] = m.get('reference_acceptance_length')
         row['speedbench_min_acceptance_length'] = m.get('min_acceptance_length')
+        row['speedbench_max_acceptance_length'] = m.get('max_acceptance_length')
         row['speedbench_threshold_ratio'] = m.get('threshold_ratio')
+        row['speedbench_max_threshold_ratio'] = m.get('max_threshold_ratio')
         row['speedbench_thinking_mode'] = m.get('thinking_mode')
         row['speedbench_num_speculative_tokens'] = m.get('num_speculative_tokens')
         row['speedbench_framework'] = m.get('speedbench_framework')
@@ -306,14 +310,18 @@ def score_cell(r: Dict[str, Any]) -> str:
     if r.get('score_name') == 'acceptance_length':
         score = r.get('score')
         minimum = r.get('speedbench_min_acceptance_length')
+        maximum = r.get('speedbench_max_acceptance_length')
         passed = r.get('speedbench_passed')
         if score is None:
             return 'FAIL'
         try:
             status = 'PASS' if passed else 'FAIL'
-            if minimum is None:
+            if minimum is None or maximum is None:
                 return f"{float(score):.2f} ({status})"
-            return f"{float(score):.2f} >= {float(minimum):.2f} ({status})"
+            return (
+                f"{float(score):.2f} in "
+                f"[{float(minimum):.2f}, {float(maximum):.2f}] ({status})"
+            )
         except Exception:
             return str(score)
     return f"{pct(r['score'])}{se(r['score_se'])}"
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index 6aeeb585d5..f42ccbdb9b 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -145,7 +145,7 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 SpeedBench AL computes vLLM acceptance length from raw accepted-token and verify-step counters. TRT-LLM prefers its Prometheus acceptance-length gauge and token counters, then falls back to JSON `specDecodingStats` from `/metrics` when the Prometheus spec series are unavailable. Some TRT-LLM MTP configurations enable `print_iter_log` but do not expose `specDecodingStats`; for those, SpeedBench records the server-log byte offset before running SpeedBench and derives accepted/proposed/verify counters from the new `num_generation_tokens` iteration lines. If neither exact spec stats nor server logs are available, SpeedBench records acceptance length from `trtllm_avg_decoded_tokens_per_iter` or JSON `inflightBatchingStats.avgNumDecodedTokensPerIter` and leaves token counters empty. SGLang records its acceptance-length gauge, verify-call counter when present, and derived token counts. Dynamo/disaggregated runs scrape all configured decode endpoints when available, summing counters and averaging gauge-only AL values. The NVIDIA srt-slurm Dynamo eval path also writes a SpeedBench AL artifact from decode-worker `SpecDecoding metrics` log counters when the router eval path does not expose decode-worker metrics endpoints to the benchmarker.
 
 ### Score validation
-`utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the embedded minimum AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
+`utils/evals/validate_scores.py` checks lm-eval results against thresholds in `utils/evals/thresholds.json` and checks `results_speedbench_al_*.json` against the inclusive range from 95% to 105% of the golden AL. It runs as a separate workflow step after artifact upload so results are preserved even if validation fails.
 
 ### Adding a new eval task
 
diff --git a/utils/evals/dynamo_speedbench_al_from_logs.py b/utils/evals/dynamo_speedbench_al_from_logs.py
index f70992537c..e2135953e9 100644
--- a/utils/evals/dynamo_speedbench_al_from_logs.py
+++ b/utils/evals/dynamo_speedbench_al_from_logs.py
@@ -187,6 +187,7 @@ def _record_args(args: argparse.Namespace, metrics: AggregatedMetrics | None) ->
         output_len=args.output_len,
         temperature=args.temperature,
         threshold_ratio=args.threshold_ratio,
+        max_threshold_ratio=args.max_threshold_ratio,
         framework=args.framework,
         metric_source=args.metric_source,
         acceptance_length=None,
@@ -246,7 +247,8 @@ def build_parser() -> argparse.ArgumentParser:
     parser.add_argument("--category", default="coding")
     parser.add_argument("--output-len", type=int, default=4096)
     parser.add_argument("--temperature", type=float, default=1.0)
-    parser.add_argument("--threshold-ratio", type=float, default=0.90)
+    parser.add_argument("--threshold-ratio", type=float, default=0.95)
+    parser.add_argument("--max-threshold-ratio", type=float, default=1.05)
     parser.add_argument("--framework", default="dynamo")
     parser.add_argument("--metric-source", default="dynamo-decode-log-counters")
     parser.set_defaults(func=cmd_from_logs)
diff --git a/utils/evals/speedbench_al.py b/utils/evals/speedbench_al.py
index a9167e6565..94a17c6132 100644
--- a/utils/evals/speedbench_al.py
+++ b/utils/evals/speedbench_al.py
@@ -15,6 +15,14 @@
     "dsv4": "deepseek-v4-pro",
 }
 
+DEFAULT_MIN_THRESHOLD_RATIO = 0.95
+DEFAULT_MAX_THRESHOLD_RATIO = 1.05
+
+
+def scaled_threshold(reference: float, ratio: float) -> float:
+    """Return ``reference * ratio`` rounded to 10 decimals to drop binary float noise."""
+    return round(reference * ratio, ndigits=10)
+
 
 def _parse_scalar(value: str) -> Any:
     value = value.strip()
@@ -158,6 +166,7 @@ def _optional_int(value: str | None) -> int | None:
 def build_result(args: argparse.Namespace) -> dict[str, Any]:
     reference_al: float | None = None
     min_acceptance_length: float | None = None
+    max_acceptance_length: float | None = None
     model_key: str | None = None
     mode_key = normalize_mode(args.thinking_mode)
     error: str | None = args.error
@@ -173,7 +182,12 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]:
                     args.thinking_mode,
                     args.num_speculative_tokens,
                 )
-                min_acceptance_length = reference_al * args.threshold_ratio
+                min_acceptance_length = scaled_threshold(
+                    reference_al, args.threshold_ratio
+                )
+                max_acceptance_length = scaled_threshold(
+                    reference_al, args.max_threshold_ratio
+                )
             except Exception as exc:  # noqa: BLE001 - recorded for CI artifacts
                 error = error or str(exc)
         else:
@@ -190,7 +204,9 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]:
         error is None
         and acceptance_length is not None
         and min_acceptance_length is not None
+        and max_acceptance_length is not None
         and acceptance_length >= min_acceptance_length
+        and acceptance_length <= max_acceptance_length
     )
 
     result = {
@@ -213,7 +229,9 @@ def build_result(args: argparse.Namespace) -> dict[str, Any]:
         "draft_tokens": draft_tokens,
         "reference_acceptance_length": reference_al,
         "threshold_ratio": args.threshold_ratio,
+        "max_threshold_ratio": args.max_threshold_ratio,
         "min_acceptance_length": min_acceptance_length,
+        "max_acceptance_length": max_acceptance_length,
         "passed": passed,
     }
     if error:
@@ -235,7 +253,11 @@ def cmd_resolve(args: argparse.Namespace) -> int:
         "num_speculative_tokens": args.num_speculative_tokens,
         "reference_acceptance_length": reference_al,
         "threshold_ratio": args.threshold_ratio,
-        "min_acceptance_length": reference_al * args.threshold_ratio,
+        "max_threshold_ratio": args.max_threshold_ratio,
+        "min_acceptance_length": scaled_threshold(reference_al, args.threshold_ratio),
+        "max_acceptance_length": scaled_threshold(
+            reference_al, args.max_threshold_ratio
+        ),
     }
     print(json.dumps(payload, sort_keys=True))
     return 0
@@ -248,9 +270,10 @@ def cmd_record(args: argparse.Namespace) -> int:
     status = "PASS" if result["passed"] else "FAIL"
     actual = result.get("acceptance_length")
     minimum = result.get("min_acceptance_length")
+    maximum = result.get("max_acceptance_length")
     print(
         f"{status}: SpeedBench AL {actual} "
-        f"(min {minimum}, mode {result['thinking_mode']}, "
+        f"(range [{minimum}, {maximum}], mode {result['thinking_mode']}, "
         f"mtp {result['num_speculative_tokens']})"
     )
     if args.exit_status and not result["passed"]:
@@ -268,7 +291,12 @@ def build_parser() -> argparse.ArgumentParser:
     resolve.add_argument("--model-prefix", default="")
     resolve.add_argument("--thinking-mode", required=True)
     resolve.add_argument("--num-speculative-tokens", type=int, required=True)
-    resolve.add_argument("--threshold-ratio", type=float, default=0.90)
+    resolve.add_argument(
+        "--threshold-ratio", type=float, default=DEFAULT_MIN_THRESHOLD_RATIO
+    )
+    resolve.add_argument(
+        "--max-threshold-ratio", type=float, default=DEFAULT_MAX_THRESHOLD_RATIO
+    )
     resolve.set_defaults(func=cmd_resolve)
 
     record = subparsers.add_parser("record", help="Write a compact AL eval result")
@@ -281,7 +309,12 @@ def build_parser() -> argparse.ArgumentParser:
     record.add_argument("--category", default="coding")
     record.add_argument("--output-len", type=int, default=4096)
     record.add_argument("--temperature", type=float, default=1.0)
-    record.add_argument("--threshold-ratio", type=float, default=0.90)
+    record.add_argument(
+        "--threshold-ratio", type=float, default=DEFAULT_MIN_THRESHOLD_RATIO
+    )
+    record.add_argument(
+        "--max-threshold-ratio", type=float, default=DEFAULT_MAX_THRESHOLD_RATIO
+    )
     record.add_argument("--framework", default="")
     record.add_argument("--metric-source", default="")
     record.add_argument("--acceptance-length", default=None)
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
index b02cb5bd98..751de1cdd7 100644
--- a/utils/evals/test_speedbench_al.py
+++ b/utils/evals/test_speedbench_al.py
@@ -67,8 +67,9 @@ def test_build_result_records_threshold_pass(tmp_path: Path) -> None:
         category="coding",
         output_len=4096,
         temperature=1.0,
-        threshold_ratio=0.90,
-        acceptance_length="2.30",
+        threshold_ratio=0.95,
+        max_threshold_ratio=1.05,
+        acceptance_length="2.50",
         accepted_tokens="13",
         draft_tokens="10",
         verify_steps="10",
@@ -81,7 +82,8 @@ def test_build_result_records_threshold_pass(tmp_path: Path) -> None:
     result = build_result(args)
 
     assert result["reference_acceptance_length"] == 2.50
-    assert result["min_acceptance_length"] == 2.25
+    assert result["min_acceptance_length"] == 2.375
+    assert result["max_acceptance_length"] == 2.625
     assert result["framework"] == "vllm"
     assert result["metric_source"] == "vllm-prometheus-counters-endpoints1"
     assert result["verify_steps"] == 10
@@ -97,7 +99,8 @@ def test_validate_speedbench_al_fails_below_minimum() -> None:
             "thinking_mode": "thinking_on",
             "num_speculative_tokens": 2,
             "acceptance_length": 2.0,
-            "min_acceptance_length": 2.25,
+            "min_acceptance_length": 2.375,
+            "max_acceptance_length": 2.625,
             "passed": False,
         },
         "results_speedbench_al.json",
@@ -107,6 +110,25 @@ def test_validate_speedbench_al_fails_below_minimum() -> None:
     assert ok is False
 
 
+def test_validate_speedbench_al_fails_above_maximum() -> None:
+    """An AL above ``max_acceptance_length`` must fail even if marked passed."""
+    over_limit_record = {
+        "speedbench_al_eval_version": 1,
+        "task": "speedbench_al",
+        "thinking_mode": "thinking_on",
+        "num_speculative_tokens": 2,
+        "acceptance_length": 2.7,
+        "min_acceptance_length": 2.375,
+        "max_acceptance_length": 2.625,
+        "passed": True,
+    }
+    source_name = "results_speedbench_al.json"
+    ok, checked = validate_speedbench_al(over_limit_record, source_name)
+
+    assert ok is False
+    assert checked == 1
+
+
 def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None:
     result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json"
     result_path.write_text(
@@ -117,15 +139,17 @@ def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None:
                 "model": "deepseek-ai/DeepSeek-V4-Pro",
                 "thinking_mode": "thinking_on",
                 "num_speculative_tokens": 2,
-                "acceptance_length": 2.3,
+                "acceptance_length": 2.5,
                 "framework": "sglang",
                 "metric_source": "sglang-prometheus-gauge-endpoints1+derived-token-counters",
                 "accepted_tokens": 13,
                 "verify_steps": 10,
                 "proposed_draft_tokens": 20,
                 "reference_acceptance_length": 2.5,
-                "min_acceptance_length": 2.25,
-                "threshold_ratio": 0.9,
+                "min_acceptance_length": 2.375,
+                "max_acceptance_length": 2.625,
+                "threshold_ratio": 0.95,
+                "max_threshold_ratio": 1.05,
                 "passed": True,
             }
         )
@@ -149,7 +173,8 @@ def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None:
     assert row["speedbench_accepted_tokens"] == 13
     assert row["speedbench_verify_steps"] == 10
     assert row["speedbench_proposed_draft_tokens"] == 20
-    assert score_cell(row) == "2.30 >= 2.25 (PASS)"
+    assert row["speedbench_max_acceptance_length"] == 2.625
+    assert score_cell(row) == "2.50 in [2.38, 2.62] (PASS)"
 
 
 def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> None:
@@ -163,6 +188,7 @@ def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> Non
                 "num_speculative_tokens": 2,
                 "acceptance_length": 2.3,
                 "min_acceptance_length": 2.25,
+                "max_acceptance_length": 2.75,
                 "passed": True,
             }
         )
diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py
index c85becd06b..4486f07839 100644
--- a/utils/evals/validate_scores.py
+++ b/utils/evals/validate_scores.py
@@ -30,6 +30,7 @@ def validate_speedbench_al(data: dict, source: str) -> tuple[bool, int]:
 
     actual = data.get("acceptance_length")
     minimum = data.get("min_acceptance_length")
+    maximum = data.get("max_acceptance_length")
     passed = data.get("passed")
     label = (
         f"{data.get('task', 'speedbench_al')} "
@@ -37,17 +38,32 @@ def validate_speedbench_al(data: dict, source: str) -> tuple[bool, int]:
         f"mtp{data.get('num_speculative_tokens', 'unknown')}"
     )
 
-    if passed is True:
-        print(f"PASS: {label} AL = {float(actual):.4f} (>= {float(minimum):.4f})")
+    values_are_numeric = all(
+        isinstance(value, (int, float)) for value in (actual, minimum, maximum)
+    )
+    within_range = values_are_numeric and minimum <= actual <= maximum
+
+    if passed is True and within_range:
+        print(
+            f"PASS: {label} AL = {float(actual):.4f} "
+            f"(range [{float(minimum):.4f}, {float(maximum):.4f}])"
+        )
         return True, 1
 
-    if isinstance(actual, (int, float)) and isinstance(minimum, (int, float)):
+    if values_are_numeric:
+        if actual < minimum:
+            comparison = "below"
+        elif actual > maximum:
+            comparison = "above"
+        else:
+            comparison = "marked failed"
         print(
-            f"FAIL: {label} AL = {actual:.4f} (< {minimum:.4f})",
+            f"FAIL: {label} AL = {actual:.4f} ({comparison}; "
+            f"expected [{minimum:.4f}, {maximum:.4f}])",
             file=sys.stderr,
         )
     else:
-        error = data.get("error", "missing acceptance length or threshold")
+        error = data.get("error", "missing acceptance length or validation bounds")
         print(f"FAIL: {label} in {source}: {error}", file=sys.stderr)
     return False, 1
 

From ba7fa14b5f80258d219b6d02561958671843294e Mon Sep 17 00:00:00 2001
From: hjjq <50634613+hjjq@users.noreply.github.com>
Date: Mon, 29 Jun 2026 12:20:58 -0700
Subject: [PATCH 23/27] fix: use canonical golden AL distribution

---
 .github/workflows/speedbench-al.yml           | 25 ++++++++++------
 benchmarks/benchmark_lib.sh                   | 23 +++++++++++++--
 .../speedbench/dsr1_fp4_b300_vllm.sh          |  2 +-
 .../speedbench/dsv4_fp4_b300_vllm.sh          |  4 +--
 .../speedbench/glm5_fp4_b300_vllm.sh          |  2 +-
 .../speedbench/kimik2.5_fp4_b300_vllm.sh      |  2 +-
 .../speedbench/minimaxm3_fp4_b300_vllm.sh     |  2 +-
 .../speedbench/qwen3.5_fp4_b300_vllm.sh       |  2 +-
 benchmarks/speedbench-reference-al.yaml       | 29 -------------------
 utils/evals/test_speedbench_al.py             | 15 ++++++++++
 .../write_dynamo_speedbench_al_from_logs.sh   | 15 +++++++++-
 11 files changed, 73 insertions(+), 48 deletions(-)
 delete mode 100644 benchmarks/speedbench-reference-al.yaml

diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml
index 84a92e57a4..781228dee2 100644
--- a/.github/workflows/speedbench-al.yml
+++ b/.github/workflows/speedbench-al.yml
@@ -3,8 +3,8 @@ name: SpeedBench AL Collection
 # Push-button (workflow_dispatch) collection of a SPEED-Bench acceptance-length
 # (AL) matrix: thinking_on/off x MTP levels, for the given model (defaults to
 # DeepSeek-V4-Pro). Produces the golden reference consumed by the
-# synthetic-acceptance framework and (optionally) opens a PR updating
-# benchmarks/speedbench-reference-al.yaml.
+# synthetic-acceptance framework and can optionally open a PR updating the
+# model's YAML under golden_al_distribution/.
 
 on:
   workflow_dispatch:
@@ -60,7 +60,7 @@ on:
         type: string
         default: '480'
       open-pr:
-        description: "Open a PR updating benchmarks/speedbench-reference-al.yaml (default off: artifact-only, paste values in manually)"
+        description: "Open a PR updating the model's golden_al_distribution YAML (default off: artifact-only)"
         required: false
         type: boolean
         default: false
@@ -179,16 +179,23 @@ jobs:
           GH_TOKEN: ${{ secrets.REPO_PAT }}
         run: |
           set -euo pipefail
-          # NOTE: the reference yaml is keyed by model at the top level. This
-          # overwrites it with the current model's matrix; when more than one
-          # model is collected, replace this cp with a per-model-key YAML merge.
-          cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml
+          case "${{ inputs.model-prefix }}" in
+            dsv4) GOLDEN_AL_PATH="golden_al_distribution/dsv4_mtp.yaml" ;;
+            qwen3.5) GOLDEN_AL_PATH="golden_al_distribution/qwen3.5_mtp.yaml" ;;
+            kimik2.5) GOLDEN_AL_PATH="golden_al_distribution/kimik2.5_eagle3.yaml" ;;
+            minimaxm3) GOLDEN_AL_PATH="golden_al_distribution/minimaxm3_eagle3.yaml" ;;
+            *)
+              echo "No committed golden AL path for model prefix: ${{ inputs.model-prefix }}" >&2
+              exit 1
+              ;;
+          esac
+          cp speedbench-reference-al.yaml "$GOLDEN_AL_PATH"
 
           BRANCH="speedbench-al/${{ inputs.model-prefix }}-auto-${{ github.run_id }}"
           git config user.name "github-actions"
           git config user.email "github-actions@github.com"
           git checkout -b "$BRANCH"
-          git add benchmarks/speedbench-reference-al.yaml
+          git add "$GOLDEN_AL_PATH"
           if git diff --cached --quiet; then
             echo "No change in reference yaml; skipping PR."
             exit 0
@@ -222,4 +229,4 @@ jobs:
 
       - name: Resource cleanup (post-run)
         if: always()
-        run: *resource-cleanup
\ No newline at end of file
+        run: *resource-cleanup
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 52eea300ed..340e3a9341 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -1223,6 +1223,22 @@ _speedbench_spec_gauge_avg() {
     _speedbench_metric_avg "$port" "$metric"
 }
 
+_speedbench_reference_yaml() {
+    # Print the path of the golden AL reference YAML for the current model.
+    # SPEEDBENCH_REFERENCE_YAML, when non-empty, wins over the MODEL_PREFIX
+    # default; returns 1 (printing nothing) for prefixes with no default.
+    local override="${SPEEDBENCH_REFERENCE_YAML:-}"
+    if [[ -n "$override" ]]; then
+        echo "$override"
+        return 0
+    fi
+
+    case "${MODEL_PREFIX:-}" in
+        dsv4) echo "golden_al_distribution/dsv4_mtp.yaml" ;;
+        *) return 1 ;;
+    esac
+}
+
 _speedbench_write_eval_result() {
     local output="$1"
     local mode="$2"
@@ -1235,12 +1251,14 @@ _speedbench_write_eval_result() {
     local metric_source="${9:-}"
     local error="${10:-}"
     local speedbench_model="${MODEL_NAME:-${MODEL:-}}"
+    local reference
+    reference=$(_speedbench_reference_yaml 2>/dev/null || true)
 
     local record_cmd=(
         python3 "$(pwd)/utils/evals/speedbench_al.py"
         record
         --output "$output"
-        --reference-yaml "benchmarks/speedbench-reference-al.yaml"
+        --reference-yaml "$reference"
         --model "$speedbench_model"
         --model-prefix "${MODEL_PREFIX:-}"
         --thinking-mode "$mode"
@@ -1279,8 +1297,9 @@ _speedbench_write_eval_result() {
 _speedbench_reference_available() {
     local mode="$1"
     local mtp="$2"
-    local reference="benchmarks/speedbench-reference-al.yaml"
+    local reference
     local speedbench_model="${MODEL_NAME:-${MODEL:-}}"
+    reference=$(_speedbench_reference_yaml) || return 1
     [[ -f "$reference" ]] || return 1
     python3 "$(pwd)/utils/evals/speedbench_al.py" resolve \
         --reference-yaml "$reference" \
diff --git a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
index d0357c6b43..d032164e67 100755
--- a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
@@ -5,7 +5,7 @@
 # Produces the golden acceptance-length (AL) reference matrix consumed by the
 # synthetic-acceptance framework: for each MTP level (num_speculative_tokens),
 # measure the REAL AL on a single SPEED-Bench category (default: coding) and emit
-# a YAML matrix identical in shape to benchmarks/speedbench-reference-al.yaml.
+# a YAML matrix identical in shape to the files under golden_al_distribution/.
 # This measures real MTP acceptance; the synthetic value is injected downstream
 # by the throughput recipe, not here.
 #
diff --git a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
index b8550a3502..97421eaa5a 100755
--- a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
@@ -6,7 +6,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP
 # level (num_speculative_tokens), measure the AL on a single SPEED-Bench
 # category (default: coding) and emit a YAML matrix identical in shape to
-# benchmarks/speedbench-reference-al.yaml.
+# golden_al_distribution/dsv4_mtp.yaml.
 #
 # This is the "AL distribution collection" script wired into the
 # speedbench-al.yml GitHub Action (workflow_dispatch / push-button).
@@ -46,7 +46,7 @@ SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}"
 CONCURRENCY="${CONCURRENCY:-1}"
 TEMPERATURE="${TEMPERATURE:-1.0}"
 # thinking-on chat_template_kwargs. MUST match the production/golden config:
-# the reference matrix (benchmarks/speedbench-reference-al.yaml) was measured
+# the reference matrix (golden_al_distribution/dsv4_mtp.yaml) was measured
 # with reasoning_effort=high.
 DEFAULT_CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}'
 CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-$DEFAULT_CHAT_TEMPLATE_KWARGS_ON}"
diff --git a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh
index 6265500b94..09577d802f 100755
--- a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh
@@ -6,7 +6,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP
 # level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench
 # category (default: coding) and emit a YAML matrix identical in shape to
-# benchmarks/speedbench-reference-al.yaml. This measures real MTP acceptance;
+# a model-specific YAML under golden_al_distribution/. This measures real MTP acceptance;
 # the synthetic value is injected downstream by the throughput recipe, not here.
 #
 # Filename *_fp4_* matches both the speedbench-al.yml path convention
diff --git a/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh
index 890c059f9d..137e4415c1 100755
--- a/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh
@@ -7,7 +7,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each
 # EAGLE3 speculative-token count, measure the REAL AL on a single SPEED-Bench
 # category (default: coding) and emit a YAML matrix identical in shape to
-# benchmarks/speedbench-reference-al.yaml.
+# golden_al_distribution/kimik2.5_eagle3.yaml.
 #
 # Kimi-K2.5 uses the lightseekorg/kimi-k2.5-eagle3-mla draft head (MLA
 # variant, recommended by official docs). The draft model is downloaded
diff --git a/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh
index dac39fb538..71dfed4656 100755
--- a/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh
@@ -7,7 +7,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each
 # EAGLE3 level (num_speculative_tokens), measure the REAL AL on a single
 # SPEED-Bench category (default: coding) and emit a YAML matrix identical in
-# shape to benchmarks/speedbench-reference-al.yaml. This measures real EAGLE3
+# shape to the files under golden_al_distribution/. This measures real EAGLE3
 # acceptance; the synthetic value is injected downstream by the throughput
 # recipe, not here.
 #
diff --git a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh
index bf2bda7c8d..64dd01178c 100755
--- a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh
@@ -6,7 +6,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP
 # level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench
 # category (default: coding) and emit a YAML matrix identical in shape to
-# benchmarks/speedbench-reference-al.yaml. This measures real MTP acceptance;
+# golden_al_distribution/qwen3.5_mtp.yaml. This measures real MTP acceptance;
 # the synthetic value is injected downstream by the throughput recipe, not here.
 #
 # Adapted from speedbench/dsv4_fp4_b300_vllm.sh. Differences vs DSV4 (deepseek_v4
diff --git a/benchmarks/speedbench-reference-al.yaml b/benchmarks/speedbench-reference-al.yaml
deleted file mode 100644
index b3dbf441d1..0000000000
--- a/benchmarks/speedbench-reference-al.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Acceptance Length (AL) reference values measured with SPEED-Bench.
-# dataset: coding | temperature: 1.0 | output_len: 4096
-# Measured on DeepSeek-V4-Pro (B300, vLLM MTP), per num_speculative_tokens.
-#
-# Two modes are reported:
-#   thinking_on  - reasoning enabled; this is the PRODUCTION configuration and
-#                  the golden reference used for synthetic-acceptance modeling.
-#   thinking_off - reasoning disabled; provided for comparison only.
-#
-# key = num_speculative_tokens (MTP level); value = golden AL
-deepseek-v4-pro:
-  thinking_on:
-    1: 1.79
-    2: 2.27
-    3: 2.47
-    4: 2.54
-    5: 2.52
-    6: 2.54
-    7: 2.54
-    8: 2.56
-  thinking_off:
-    1: 1.92
-    2: 2.60
-    3: 2.97
-    4: 3.04
-    5: 3.13
-    6: 3.08
-    7: 3.13
-    8: 3.12
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
index 751de1cdd7..b49d424a8e 100644
--- a/utils/evals/test_speedbench_al.py
+++ b/utils/evals/test_speedbench_al.py
@@ -49,6 +49,21 @@ def test_lookup_reference_uses_model_prefix_alias(tmp_path: Path) -> None:
     assert value == 2.75
 
 
+def test_lookup_reference_uses_committed_dsv4_golden() -> None:
+    ref = Path(__file__).resolve().parents[2] / "golden_al_distribution/dsv4_mtp.yaml"
+
+    _, mode_key, value = lookup_reference(
+        load_reference(ref),
+        model="deepseek-ai/DeepSeek-V4-Pro",
+        model_prefix="dsv4",
+        thinking_mode="on",
+        num_speculative_tokens=3,
+    )
+
+    assert mode_key == "thinking_on"
+    assert value == 2.49
+
+
 def test_build_result_records_threshold_pass(tmp_path: Path) -> None:
     ref = tmp_path / "speedbench-reference-al.yaml"
     ref.write_text(
diff --git a/utils/evals/write_dynamo_speedbench_al_from_logs.sh b/utils/evals/write_dynamo_speedbench_al_from_logs.sh
index b33d0bdcd3..d47d428d48 100644
--- a/utils/evals/write_dynamo_speedbench_al_from_logs.sh
+++ b/utils/evals/write_dynamo_speedbench_al_from_logs.sh
@@ -38,6 +38,19 @@ if [[ -z "$model_name" ]]; then
     model_name="${SERVED_MODEL_NAME:-unknown}"
 fi
 
+reference_yaml="${SPEEDBENCH_REFERENCE_YAML:-}"
+if [[ -z "$reference_yaml" ]]; then
+    case "${MODEL_PREFIX:-}" in
+        dsv4)
+            reference_yaml="${workspace}/golden_al_distribution/dsv4_mtp.yaml"
+            ;;
+        *)
+            echo "Dynamo SpeedBench AL: no golden AL file for MODEL_PREFIX=${MODEL_PREFIX:-unknown}"
+            exit 0
+            ;;
+    esac
+fi
+
 output="${workspace}/results_speedbench_al_${mode}_mtp${mtp}.json"
 metric_source="dynamo-decode-log-counters"
 if [[ -n "${FRAMEWORK:-}" ]]; then
@@ -48,7 +61,7 @@ echo "Dynamo SpeedBench AL: parsing decode logs from $logs_dir"
 python3 "${workspace}/utils/evals/dynamo_speedbench_al_from_logs.py" \
     --logs-dir "$logs_dir" \
     --output "$output" \
-    --reference-yaml "${workspace}/benchmarks/speedbench-reference-al.yaml" \
+    --reference-yaml "$reference_yaml" \
     --model "$model_name" \
     --model-prefix "${MODEL_PREFIX:-}" \
     --thinking-mode "$mode" \

From 3935383910b4f4bdc9920f6cd4b62cc7ae2c4f0a Mon Sep 17 00:00:00 2001
From: hjjq <50634613+hjjq@users.noreply.github.com>
Date: Mon, 29 Jun 2026 12:22:17 -0700
Subject: [PATCH 24/27] test: remove golden AL regression coverage

---
 utils/evals/test_speedbench_al.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
index b49d424a8e..751de1cdd7 100644
--- a/utils/evals/test_speedbench_al.py
+++ b/utils/evals/test_speedbench_al.py
@@ -49,21 +49,6 @@ def test_lookup_reference_uses_model_prefix_alias(tmp_path: Path) -> None:
     assert value == 2.75
 
 
-def test_lookup_reference_uses_committed_dsv4_golden() -> None:
-    ref = Path(__file__).resolve().parents[2] / "golden_al_distribution/dsv4_mtp.yaml"
-
-    _, mode_key, value = lookup_reference(
-        load_reference(ref),
-        model="deepseek-ai/DeepSeek-V4-Pro",
-        model_prefix="dsv4",
-        thinking_mode="on",
-        num_speculative_tokens=3,
-    )
-
-    assert mode_key == "thinking_on"
-    assert value == 2.49
-
-
 def test_build_result_records_threshold_pass(tmp_path: Path) -> None:
     ref = tmp_path / "speedbench-reference-al.yaml"
     ref.write_text(

From cf98627310fb3c654a2b8581b9db3d554a3d0fa0 Mon Sep 17 00:00:00 2001
From: hjjq <50634613+hjjq@users.noreply.github.com>
Date: Mon, 29 Jun 2026 12:27:56 -0700
Subject: [PATCH 25/27] refactor: limit SpeedBench MTP exports to vLLM

---
 benchmarks/multi_node/amd_utils/server_sglang.sh               | 3 ---
 benchmarks/multi_node/amd_utils/server_vllm.sh                 | 3 ---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh  | 1 -
 .../single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh      | 2 --
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh  | 1 -
 .../single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh    | 2 --
 .../single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh      | 1 -
 7 files changed, 13 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index 7c116a2c35..68a68b650b 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -575,9 +575,6 @@ if [ "$NODE_RANK" -eq 0 ]; then
                 speedbench_decode_metric_urls="http://${NODE0_ADDR}:8000/metrics"
             fi
             export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}"
-            if [[ "${SPEC_DECODING:-none}" == "mtp" && "${DECODE_MTP_SIZE:-0}" -gt 0 ]]; then
-                export SPEEDBENCH_NUM_SPEC_TOKENS="${SPEEDBENCH_NUM_SPEC_TOKENS:-$DECODE_MTP_SIZE}"
-            fi
 
             if [[ "$DRY_RUN" -eq 1 ]]; then
                 echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
index b6e30a987d..3d096acaa0 100755
--- a/benchmarks/multi_node/amd_utils/server_vllm.sh
+++ b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -355,9 +355,6 @@ if [ "$NODE_RANK" -eq 0 ]; then
                 speedbench_decode_metric_urls="http://${NODE0_ADDR}:${SERVER_PORT}/metrics"
             fi
             export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}"
-            if [[ "${SPEC_DECODING:-none}" == "mtp" ]]; then
-                export SPEEDBENCH_NUM_SPEC_TOKENS="${SPEEDBENCH_NUM_SPEC_TOKENS:-${DECODE_MTP_SIZE:-${NUM_SPEC_TOKENS:-2}}}"
-            fi
 
             if [[ "$DRY_RUN" -eq 1 ]]; then
                 echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
index 65ec60aa0c..0c7f323637 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
@@ -75,7 +75,6 @@ if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then
 else
     MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}"
 fi
-export SPEEDBENCH_NUM_SPEC_TOKENS="$MTP"
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
 if [[ "$DP_ATTENTION" == "true" ]]; then
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
index b3386a60d8..388194ddd3 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
@@ -85,7 +85,6 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         --speculative-eagle-topk 1
         --speculative-num-draft-tokens 4
     )
-    export SPEEDBENCH_NUM_SPEC_TOKENS=1
     PARALLEL_ARGS=(
         --dp-size "$TP"
         --enable-dp-attention
@@ -106,7 +105,6 @@ else
         --speculative-eagle-topk 1
         --speculative-num-draft-tokens 4
     )
-    export SPEEDBENCH_NUM_SPEC_TOKENS=3
     PARALLEL_ARGS=(
         --moe-runner-backend flashinfer_mxfp4
         --disable-flashinfer-autotune
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
index 9f8969d5e8..bb0362c256 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
@@ -83,7 +83,6 @@ if [[ "$ISL" -ge 4096 && "$CONC" -ge 128 ]]; then
 else
     MTP="${TRTLLM_DSV4_MTP_NUM_NEXTN_LAYERS:-3}"
 fi
-export SPEEDBENCH_NUM_SPEC_TOKENS="$MTP"
 MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
 # Cap CUDA-graph capture at batch 1024. TRTLLM_MLA_EXTRA_OVERLAP hands MLA
 # prologue tensors across streams without record_stream(), so graph warmup at
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh
index 13c639c98f..3addce4526 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang_mtp.sh
@@ -166,7 +166,6 @@ SPEC_FLAGS=(
     --speculative-eagle-topk 1
     --speculative-num-draft-tokens 4
 )
-export SPEEDBENCH_NUM_SPEC_TOKENS=3
 if [ "${DP_ATTENTION}" = "true" ]; then
     PARALLEL_ARGS+=(
         --dp "$TP"
@@ -179,7 +178,6 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         --speculative-eagle-topk 1
         --speculative-num-draft-tokens 3
     )
-    export SPEEDBENCH_NUM_SPEC_TOKENS=2
 fi
 if [ "${EP_SIZE:-1}" -gt 1 ]; then
     PARALLEL_ARGS+=(--ep-size "$EP_SIZE")
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh
index e8d4ffde79..788eff5b8b 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh
@@ -20,7 +20,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
-export SPEEDBENCH_NUM_SPEC_TOKENS=3
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 

From 7ae7c56da8263133e7f71a7776aca6eb6dd520da Mon Sep 17 00:00:00 2001
From: hjjq <50634613+hjjq@users.noreply.github.com>
Date: Mon, 29 Jun 2026 12:31:06 -0700
Subject: [PATCH 26/27] fix: restore GB200 Dynamo SpeedBench log collection

---
 runners/launch_gb200-nv.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 91331791d4..2c0f872832 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -551,6 +551,9 @@ fi
 # Collect eval results if eval was requested
 if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
     EVAL_DIR="$LOGS_DIR/eval_results"
+    if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then
+        bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE"
+    fi
     if [ -d "$EVAL_DIR" ]; then
         echo "Extracting eval results from $EVAL_DIR"
         shopt -s nullglob

From f3179ad26cf220b7cc5d0f0186c486292d1e90c3 Mon Sep 17 00:00:00 2001
From: hjjq <50634613+hjjq@users.noreply.github.com>
Date: Mon, 29 Jun 2026 12:48:32 -0700
Subject: [PATCH 27/27] refactor: trim SpeedBench AL integration

---
 .github/workflows/speedbench-al.yml           |  25 +-
 benchmarks/benchmark_lib.sh                   | 313 ++-------------
 .../multi_node/amd_utils/server_sglang.sh     |  10 -
 .../multi_node/amd_utils/server_vllm.sh       |   9 -
 .../speedbench/dsr1_fp4_b300_vllm.sh          |   2 +-
 .../speedbench/dsv4_fp4_b300_vllm.sh          |   4 +-
 .../speedbench/glm5_fp4_b300_vllm.sh          |   2 +-
 .../speedbench/kimik2.5_fp4_b300_vllm.sh      |   2 +-
 .../speedbench/minimaxm3_fp4_b300_vllm.sh     |   2 +-
 .../speedbench/qwen3.5_fp4_b300_vllm.sh       |   2 +-
 runners/launch_gb200-nv.sh                    |   2 +-
 utils/evals/EVALS.md                          |  12 +-
 utils/evals/test_speedbench_al.py             | 358 ------------------
 13 files changed, 44 insertions(+), 699 deletions(-)
 delete mode 100644 utils/evals/test_speedbench_al.py

diff --git a/.github/workflows/speedbench-al.yml b/.github/workflows/speedbench-al.yml
index 781228dee2..84a92e57a4 100644
--- a/.github/workflows/speedbench-al.yml
+++ b/.github/workflows/speedbench-al.yml
@@ -3,8 +3,8 @@ name: SpeedBench AL Collection
 # Push-button (workflow_dispatch) collection of a SPEED-Bench acceptance-length
 # (AL) matrix: thinking_on/off x MTP levels, for the given model (defaults to
 # DeepSeek-V4-Pro). Produces the golden reference consumed by the
-# synthetic-acceptance framework and can optionally open a PR updating the
-# model's YAML under golden_al_distribution/.
+# synthetic-acceptance framework and (optionally) opens a PR updating
+# benchmarks/speedbench-reference-al.yaml.
 
 on:
   workflow_dispatch:
@@ -60,7 +60,7 @@ on:
         type: string
         default: '480'
       open-pr:
-        description: "Open a PR updating the model's golden_al_distribution YAML (default off: artifact-only)"
+        description: "Open a PR updating benchmarks/speedbench-reference-al.yaml (default off: artifact-only, paste values in manually)"
         required: false
         type: boolean
         default: false
@@ -179,23 +179,16 @@ jobs:
           GH_TOKEN: ${{ secrets.REPO_PAT }}
         run: |
           set -euo pipefail
-          case "${{ inputs.model-prefix }}" in
-            dsv4) GOLDEN_AL_PATH="golden_al_distribution/dsv4_mtp.yaml" ;;
-            qwen3.5) GOLDEN_AL_PATH="golden_al_distribution/qwen3.5_mtp.yaml" ;;
-            kimik2.5) GOLDEN_AL_PATH="golden_al_distribution/kimik2.5_eagle3.yaml" ;;
-            minimaxm3) GOLDEN_AL_PATH="golden_al_distribution/minimaxm3_eagle3.yaml" ;;
-            *)
-              echo "No committed golden AL path for model prefix: ${{ inputs.model-prefix }}" >&2
-              exit 1
-              ;;
-          esac
-          cp speedbench-reference-al.yaml "$GOLDEN_AL_PATH"
+          # NOTE: the reference yaml is keyed by model at the top level. This
+          # overwrites it with the current model's matrix; when more than one
+          # model is collected, replace this cp with a per-model-key YAML merge.
+          cp speedbench-reference-al.yaml benchmarks/speedbench-reference-al.yaml
 
           BRANCH="speedbench-al/${{ inputs.model-prefix }}-auto-${{ github.run_id }}"
           git config user.name "github-actions"
           git config user.email "github-actions@github.com"
           git checkout -b "$BRANCH"
-          git add "$GOLDEN_AL_PATH"
+          git add benchmarks/speedbench-reference-al.yaml
           if git diff --cached --quiet; then
             echo "No change in reference yaml; skipping PR."
             exit 0
@@ -229,4 +222,4 @@ jobs:
 
       - name: Resource cleanup (post-run)
         if: always()
-        run: *resource-cleanup
+        run: *resource-cleanup
\ No newline at end of file
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 340e3a9341..28f5dd50b7 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -742,28 +742,6 @@ _prometheus_metric_sum_url() {
     ' <<< "$values"
 }
 
-_prometheus_metric_avg_url() {
-    local url="$1"
-    local name="$2"
-    local values
-    values=$(_prometheus_metric_values_url "$url" "$name") || return 1
-    awk '
-        { sum += $1; count += 1 }
-        END {
-            if (count == 0) {
-                exit 1
-            }
-            printf "%.10f\n", sum / count
-        }
-    ' <<< "$values"
-}
-
-_prometheus_metric_sum() {
-    local port="$1"
-    local name="$2"
-    _prometheus_metric_sum_url "http://0.0.0.0:${port}/metrics" "$name"
-}
-
 _speedbench_normalize_metrics_url() {
     local endpoint="$1"
     endpoint="${endpoint%,}"
@@ -986,106 +964,6 @@ print(
 PY
 }
 
-_speedbench_trtllm_avg_decoded_al() {
-    local port="$1"
-    local value
-    value=$(_speedbench_metric_avg "$port" "trtllm_avg_decoded_tokens_per_iter" 2>/dev/null || true)
-    [[ -n "$value" ]] || return 1
-    awk -v value="$value" '
-        BEGIN {
-            if (value < 1.0) {
-                exit 1
-            }
-            printf "%.4f\n", value
-        }
-    '
-}
-
-_speedbench_trtllm_json_avg_decoded_al() {
-    local port="$1"
-    local urls=()
-    local url
-
-    while IFS= read -r url; do
-        [[ -n "$url" ]] && urls+=("$url")
-    done < <(_speedbench_trtllm_json_metrics_urls "$port")
-
-    [[ "${#urls[@]}" -gt 0 ]] || return 1
-
-    python3 - "${urls[@]}" <<'PY'
-import json
-import os
-import sys
-import urllib.request
-
-
-def number(value, default=0.0):
-    try:
-        if value is None:
-            return default
-        return float(value)
-    except (TypeError, ValueError):
-        return default
-
-
-def stats_from_payload(payload):
-    if isinstance(payload, list):
-        return payload
-    if isinstance(payload, dict):
-        return [payload]
-    return []
-
-
-timeout = float(os.environ.get("SPEEDBENCH_METRICS_CURL_TIMEOUT", "10"))
-weighted_total = 0.0
-total_requests = 0.0
-unweighted_total = 0.0
-unweighted_count = 0
-used_endpoints = 0
-
-for url in sys.argv[1:]:
-    try:
-        with urllib.request.urlopen(url, timeout=timeout) as response:
-            payload = json.load(response)
-    except Exception as exc:  # noqa: BLE001 - diagnostics for CI logs
-        print(f"SpeedBench AL eval: TRT-LLM JSON metrics fetch failed for {url}: {exc}", file=sys.stderr)
-        continue
-
-    endpoint_had_avg = False
-    for stat in stats_from_payload(payload):
-        if not isinstance(stat, dict):
-            continue
-        ifb = stat.get("inflightBatchingStats")
-        if not isinstance(ifb, dict):
-            continue
-
-        avg_decoded = number(ifb.get("avgNumDecodedTokensPerIter"), default=-1.0)
-        if avg_decoded < 1.0:
-            continue
-
-        gen_requests = number(ifb.get("numGenRequests"))
-        endpoint_had_avg = True
-        if gen_requests > 0:
-            weighted_total += avg_decoded * gen_requests
-            total_requests += gen_requests
-        else:
-            unweighted_total += avg_decoded
-            unweighted_count += 1
-
-    if endpoint_had_avg:
-        used_endpoints += 1
-
-if total_requests > 0:
-    acceptance_length = weighted_total / total_requests
-elif unweighted_count > 0:
-    acceptance_length = unweighted_total / unweighted_count
-else:
-    sys.exit(1)
-
-print(f"{acceptance_length:.4f}\t{used_endpoints}")
-PY
-}
-
 _speedbench_trtllm_server_log_metrics() {
     local mtp="$1"
     local start_offset="${2:-0}"
@@ -1193,9 +1071,6 @@ _speedbench_spec_gauge_metric() {
         trtllm:acceptance_length)
             echo "trtllm_spec_decode_acceptance_length"
             ;;
-        sglang:acceptance_length)
-            echo "sglang:spec_accept_length"
-            ;;
         sglang:draft_tokens_per_step)
             echo "sglang:spec_num_draft_tokens"
             ;;
@@ -1323,73 +1198,6 @@ _speedbench_prepare_dataset() {
     [[ -f "$speedbench_dir/qualitative.jsonl" ]]
 }
 
-_speedbench_apply_chat_template_kwargs_shim() {
-    echo "SpeedBench AL eval: patching vLLM benchmark --chat-template-kwargs support if needed"
-    python3 - <<'PYEOF'
-import vllm.benchmarks.serve as S
-import vllm.benchmarks.datasets.datasets as D
-
-
-def patch(mod, edits, marker):
-    f = mod.__file__
-    with open(f) as handle:
-        src = handle.read()
-    if marker in src:
-        print("already patched:", f)
-        return
-    for old, new in edits:
-        n = src.count(old)
-        assert n == 1, f"anchor matched {n} times in {f}, aborting:\n{old[:80]}..."
-        src = src.replace(old, new, 1)
-    with open(f, "w") as handle:
-        handle.write(src)
-    print("patched OK ->", f)
-
-
-serve_old = '''    parser.add_argument(
-        "--extra-body",'''
-serve_new = '''    parser.add_argument(
-        "--chat-template-kwargs",
-        type=json.loads,
-        default=None,
-        help="JSON dict forwarded to apply_chat_template during "
-        "client-side prompt rendering, e.g. to enable reasoning mode.",
-    )
-    parser.add_argument(
-        "--extra-body",'''
-patch(S, [(serve_old, serve_new)], marker='"--chat-template-kwargs"')
-
-disp_old = '''                output_len=args.speed_bench_output_len,
-                enable_multimodal_chat=args.enable_multimodal_chat,'''
-disp_new = '''                output_len=args.speed_bench_output_len,
-                chat_template_kwargs=args.chat_template_kwargs,
-                enable_multimodal_chat=args.enable_multimodal_chat,'''
-
-samp_old = '''                # apply template
-                if not skip_chat_template:
-                    prompt = tokenizer.apply_chat_template(
-                        [{"role": "user", "content": prompt}],
-                        add_generation_prompt=True,
-                        tokenize=False,
-                    )
-
-                prompt_len = len(tokenizer(prompt).input_ids)'''
-samp_new = '''                # apply template
-                if not skip_chat_template:
-                    _ctk = kwargs.get("chat_template_kwargs") or {}
-                    prompt = tokenizer.apply_chat_template(
-                        [{"role": "user", "content": prompt}],
-                        add_generation_prompt=True,
-                        tokenize=False,
-                        **_ctk,
-                    )
-
-                prompt_len = len(tokenizer(prompt).input_ids)'''
-patch(D, [(disp_old, disp_new), (samp_old, samp_new)],
-      marker="chat_template_kwargs=args.chat_template_kwargs")
-PYEOF
-}
-
 run_speedbench_al_eval() {
     local port="${PORT:-8888}"
     while [[ $# -gt 0 ]]; do
@@ -1458,26 +1266,6 @@ run_speedbench_al_eval() {
     fi
 
     local thinking_kwargs='{"thinking": true, "reasoning_effort": "high"}'
-    local client="${SPEEDBENCH_CLIENT:-auto}"
-    local use_vllm_client=0
-    if [[ "$client" != "openai" && "$client" != "native" ]] && command -v vllm >/dev/null 2>&1; then
-        use_vllm_client=1
-    fi
-    if [[ "$metrics_framework" == "sglang" ]]; then
-        use_vllm_client=0
-    fi
-
-    local think_args=()
-    if [[ "$mode" == "on" ]]; then
-        if [[ "$use_vllm_client" -eq 1 ]]; then
-            if ! _speedbench_apply_chat_template_kwargs_shim; then
-                echo "SpeedBench AL eval: --chat-template-kwargs shim failed" >&2
-                _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "--chat-template-kwargs shim failed"
-                return 0
-            fi
-            think_args=(--chat-template-kwargs "$thinking_kwargs")
-        fi
-    fi
 
     local accepted_before="" proposed_before="" verify_before="" completion_before=""
     accepted_before=$(_speedbench_spec_counter_sum "$metrics_framework" "$port" "accepted" 2>/dev/null || true)
@@ -1502,56 +1290,31 @@ run_speedbench_al_eval() {
     local bench_rc=0
     local speedbench_model="${MODEL_NAME:-${MODEL:-}}"
     echo "SpeedBench AL eval: running mode=${mode} mtp=${mtp}"
-    if [[ "$use_vllm_client" -eq 1 ]]; then
-        local raw_result_dir
-        raw_result_dir="$(mktemp -d /tmp/speedbench_al_raw-XXXXXX)"
-        local bench_cmd=(
-            vllm bench serve
-            --model "$speedbench_model"
-            --port "$port"
-            --dataset-name speed_bench
-            --dataset-path "$speedbench_dir"
-            --speed-bench-category coding
-            --speed-bench-output-len 4096
-            --num-prompts -1
-            --max-concurrency 1
-            --save-result
-            --result-dir "$raw_result_dir"
-            --result-filename "speedbench_al_${mode}_mtp${mtp}"
-            --trust-remote-code
-            --tokenizer-mode deepseek_v4
-            --temperature 1.0
-            "${think_args[@]}"
-        )
-        "${bench_cmd[@]}" || bench_rc=$?
-        rm -rf "$raw_result_dir" || true
-    else
-        export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}"
-        local native_cmd=(
-            python3 "$(pwd)/utils/evals/speedbench_client.py"
-            --model "$speedbench_model"
-            --base-url "http://0.0.0.0:${port}"
-            --dataset-path "$speedbench_dir"
-            --category coding
-            --output-len 4096
-            --temperature 1.0
-            --thinking-mode "$mode"
-            --timeout "${SPEEDBENCH_CLIENT_TIMEOUT:-1800}"
-            --retries "${SPEEDBENCH_CLIENT_RETRIES:-2}"
-        )
-        if [[ -n "${SPEEDBENCH_CLIENT_ENDPOINT:-}" ]]; then
-            native_cmd+=(--endpoint "$SPEEDBENCH_CLIENT_ENDPOINT")
-        elif [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
-            native_cmd+=(--endpoint completions)
-        fi
-        if [[ "$mode" == "on" ]]; then
-            native_cmd+=(--thinking-kwargs "$thinking_kwargs")
-        fi
-        if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
-            native_cmd+=(--dsv4)
-        fi
-        "${native_cmd[@]}" || bench_rc=$?
+    export OPENAI_API_KEY="${OPENAI_API_KEY:-EMPTY}"
+    local client_cmd=(
+        python3 "$(pwd)/utils/evals/speedbench_client.py"
+        --model "$speedbench_model"
+        --base-url "http://0.0.0.0:${port}"
+        --dataset-path "$speedbench_dir"
+        --category coding
+        --output-len 4096
+        --temperature 1.0
+        --thinking-mode "$mode"
+        --timeout "${SPEEDBENCH_CLIENT_TIMEOUT:-1800}"
+        --retries "${SPEEDBENCH_CLIENT_RETRIES:-2}"
+    )
+    if [[ -n "${SPEEDBENCH_CLIENT_ENDPOINT:-}" ]]; then
+        client_cmd+=(--endpoint "$SPEEDBENCH_CLIENT_ENDPOINT")
+    elif [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
+        client_cmd+=(--endpoint completions)
     fi
+    if [[ "$mode" == "on" ]]; then
+        client_cmd+=(--thinking-kwargs "$thinking_kwargs")
+    fi
+    if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
+        client_cmd+=(--dsv4)
+    fi
+    "${client_cmd[@]}" || bench_rc=$?
     if [[ "$bench_rc" -ne 0 ]]; then
         echo "SpeedBench AL eval: client failed with exit code ${bench_rc}" >&2
         _speedbench_write_eval_result "$output" "$mode" "$mtp" "" "" "" "" "$result_framework" "$metric_source_base" "SpeedBench client failed with exit code ${bench_rc}"
@@ -1603,20 +1366,6 @@ run_speedbench_al_eval() {
                     metric_source="trtllm-server-log-generation-tokens-samples${trt_log_samples}"
                 fi
             fi
-            if [[ -z "$al" ]]; then
-                al=$(_speedbench_trtllm_avg_decoded_al "$port" || true)
-                if [[ -n "$al" ]]; then
-                    metric_source="${metric_source_base}-avg-decoded-tokens-endpoints${metrics_endpoint_count}"
-                fi
-            fi
-            if [[ -z "$al" ]]; then
-                local trt_json_avg_metrics="" trt_json_avg_endpoints=""
-                trt_json_avg_metrics=$(_speedbench_trtllm_json_avg_decoded_al "$port" || true)
-                if [[ -n "$trt_json_avg_metrics" ]]; then
-                    IFS=$'\t' read -r al trt_json_avg_endpoints <<< "$trt_json_avg_metrics"
-                    metric_source="trtllm-json-avg-decoded-tokens-endpoints${trt_json_avg_endpoints}"
-                fi
-            fi
         fi
     elif [[ "$metrics_framework" == "sglang" ]]; then
         local draft_depth=""
@@ -1631,22 +1380,12 @@ run_speedbench_al_eval() {
             fi
             metric_source="${metric_source_base}-generation-counter+verify-counter-endpoints${metrics_endpoint_count}"
         fi
-        if [[ -z "$al" ]]; then
-            al=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "acceptance_length" 2>/dev/null | awk '{ printf "%.4f", $1 }' || true)
-            if [[ -n "$al" ]]; then
-                metric_source="${metric_source_base}-gauge-endpoints${metrics_endpoint_count}"
-            fi
-        fi
         if [[ -n "$delta_verify" && "$delta_verify" -gt 0 ]]; then
             if [[ -z "$draft_depth" ]]; then
                 draft_depth=$(_speedbench_spec_gauge_avg "$metrics_framework" "$port" "draft_tokens_per_step" 2>/dev/null || true)
             fi
-            if [[ -n "$draft_depth" ]]; then
-                delta_proposed="${delta_proposed:-$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")}"
-            fi
-            if [[ -n "$al" && "$metric_source" != *"generation-counter+verify-counter"* ]]; then
-                delta_acc=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v al="$al" 'BEGIN { value = verify * (al - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")
-                metric_source="${metric_source:-${metric_source_base}-gauge-endpoints${metrics_endpoint_count}}+derived-token-counters"
+            if [[ -n "$draft_depth" && -z "$delta_proposed" ]]; then
+                delta_proposed=$(_speedbench_round_metric "$(awk -v verify="$delta_verify" -v depth="$draft_depth" 'BEGIN { value = verify * (depth - 1); if (value < 0) value = 0; printf "%.10f\n", value }')")
             fi
         fi
     fi
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index 68a68b650b..34351b1e43 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -566,16 +566,6 @@ if [ "$NODE_RANK" -eq 0 ]; then
                 export EVAL_MAX_MODEL_LEN="$prefill_context_length"
             fi
 
-            speedbench_decode_metric_urls=""
-            for i in $(seq 0 $((yD - 1))); do
-                decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET))
-                speedbench_decode_metric_urls+="${speedbench_decode_metric_urls:+,}http://${IP_ARRAY[$decode_idx]}:8000/metrics"
-            done
-            if [[ -z "$speedbench_decode_metric_urls" ]]; then
-                speedbench_decode_metric_urls="http://${NODE0_ADDR}:8000/metrics"
-            fi
-            export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}"
-
             if [[ "$DRY_RUN" -eq 1 ]]; then
                 echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
             else
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
index 3d096acaa0..f19ce8560b 100755
--- a/benchmarks/multi_node/amd_utils/server_vllm.sh
+++ b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -347,15 +347,6 @@ if [ "$NODE_RANK" -eq 0 ]; then
                 export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
             fi
 
-            speedbench_decode_metric_urls=""
-            for decode_ip in ${DECODE_ARGS}; do
-                speedbench_decode_metric_urls+="${speedbench_decode_metric_urls:+,}http://${decode_ip}:${SERVER_PORT}/metrics"
-            done
-            if [[ -z "$speedbench_decode_metric_urls" ]]; then
-                speedbench_decode_metric_urls="http://${NODE0_ADDR}:${SERVER_PORT}/metrics"
-            fi
-            export SPEEDBENCH_DECODE_METRICS_URLS="${SPEEDBENCH_DECODE_METRICS_URLS:-$speedbench_decode_metric_urls}"
-
             if [[ "$DRY_RUN" -eq 1 ]]; then
                 echo "DRY RUN: run_eval --framework lm-eval --port $ROUTER_PORT (conc=${EVAL_CONCURRENT_REQUESTS}, ctx=${EVAL_MAX_MODEL_LEN:-auto})"
             else
diff --git a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
index d032164e67..d0357c6b43 100755
--- a/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/dsr1_fp4_b300_vllm.sh
@@ -5,7 +5,7 @@
 # Produces the golden acceptance-length (AL) reference matrix consumed by the
 # synthetic-acceptance framework: for each MTP level (num_speculative_tokens),
 # measure the REAL AL on a single SPEED-Bench category (default: coding) and emit
-# a YAML matrix identical in shape to the files under golden_al_distribution/.
+# a YAML matrix identical in shape to benchmarks/speedbench-reference-al.yaml.
 # This measures real MTP acceptance; the synthetic value is injected downstream
 # by the throughput recipe, not here.
 #
diff --git a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
index 97421eaa5a..b8550a3502 100755
--- a/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/dsv4_fp4_b300_vllm.sh
@@ -6,7 +6,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP
 # level (num_speculative_tokens), measure the AL on a single SPEED-Bench
 # category (default: coding) and emit a YAML matrix identical in shape to
-# golden_al_distribution/dsv4_mtp.yaml.
+# benchmarks/speedbench-reference-al.yaml.
 #
 # This is the "AL distribution collection" script wired into the
 # speedbench-al.yml GitHub Action (workflow_dispatch / push-button).
@@ -46,7 +46,7 @@ SPEEDBENCH_OUTPUT_LEN="${SPEEDBENCH_OUTPUT_LEN:-4096}"
 CONCURRENCY="${CONCURRENCY:-1}"
 TEMPERATURE="${TEMPERATURE:-1.0}"
 # thinking-on chat_template_kwargs. MUST match the production/golden config:
-# the reference matrix (golden_al_distribution/dsv4_mtp.yaml) was measured
+# the reference matrix (benchmarks/speedbench-reference-al.yaml) was measured
 # with reasoning_effort=high.
 DEFAULT_CHAT_TEMPLATE_KWARGS_ON='{"thinking": true, "reasoning_effort": "high"}'
 CHAT_TEMPLATE_KWARGS_ON="${CHAT_TEMPLATE_KWARGS_ON:-$DEFAULT_CHAT_TEMPLATE_KWARGS_ON}"
diff --git a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh
index 09577d802f..6265500b94 100755
--- a/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/glm5_fp4_b300_vllm.sh
@@ -6,7 +6,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP
 # level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench
 # category (default: coding) and emit a YAML matrix identical in shape to
-# a model-specific YAML under golden_al_distribution/. This measures real MTP acceptance;
+# benchmarks/speedbench-reference-al.yaml. This measures real MTP acceptance;
 # the synthetic value is injected downstream by the throughput recipe, not here.
 #
 # Filename *_fp4_* matches both the speedbench-al.yml path convention
diff --git a/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh
index 137e4415c1..890c059f9d 100755
--- a/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/kimik2.5_fp4_b300_vllm.sh
@@ -7,7 +7,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each
 # EAGLE3 speculative-token count, measure the REAL AL on a single SPEED-Bench
 # category (default: coding) and emit a YAML matrix identical in shape to
-# golden_al_distribution/kimik2.5_eagle3.yaml.
+# benchmarks/speedbench-reference-al.yaml.
 #
 # Kimi-K2.5 uses the lightseekorg/kimi-k2.5-eagle3-mla draft head (MLA
 # variant, recommended by official docs). The draft model is downloaded
diff --git a/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh
index 71dfed4656..dac39fb538 100755
--- a/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/minimaxm3_fp4_b300_vllm.sh
@@ -7,7 +7,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each
 # EAGLE3 level (num_speculative_tokens), measure the REAL AL on a single
 # SPEED-Bench category (default: coding) and emit a YAML matrix identical in
-# shape to the files under golden_al_distribution/. This measures real EAGLE3
+# shape to benchmarks/speedbench-reference-al.yaml. This measures real EAGLE3
 # acceptance; the synthetic value is injected downstream by the throughput
 # recipe, not here.
 #
diff --git a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh
index 64dd01178c..bf2bda7c8d 100755
--- a/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/speedbench/qwen3.5_fp4_b300_vllm.sh
@@ -6,7 +6,7 @@
 # synthetic-acceptance framework: for each thinking mode (on/off) and each MTP
 # level (num_speculative_tokens), measure the REAL AL on a single SPEED-Bench
 # category (default: coding) and emit a YAML matrix identical in shape to
-# golden_al_distribution/qwen3.5_mtp.yaml. This measures real MTP acceptance;
+# benchmarks/speedbench-reference-al.yaml. This measures real MTP acceptance;
 # the synthetic value is injected downstream by the throughput recipe, not here.
 #
 # Adapted from speedbench/dsv4_fp4_b300_vllm.sh. Differences vs DSV4 (deepseek_v4
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 2c0f872832..ac9842312b 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -79,7 +79,7 @@ NGINX_IMAGE="nginx:1.27.4"
 
 uses_watchtower_shared_fs() {
     case "$MODEL_PREFIX" in
-        dsv4|minimaxm2.5|minimaxm3|kimik2.5) return 0 ;;
+        minimaxm2.5|minimaxm3|kimik2.5) return 0 ;;
         *) return 1 ;;
     esac
 }
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index ba23cad76a..7ff878dcec 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -152,19 +152,9 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]'
 | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results |
 | `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by `compute_eval_context_length`) |
 | `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval; a space-separated list enables sequential batched evals against one live engine |
-| `SPEEDBENCH_DIR` | `$(pwd)/speed_bench_data` | Prepared SpeedBench dataset directory |
-| `SPEEDBENCH_NUM_SPEC_TOKENS` | script-provided or `2` | MTP level used to select the reference AL row |
-| `SPEEDBENCH_METRICS_FRAMEWORK` | `FRAMEWORK` or `vllm` | Speculative metrics parser: `vllm`, `sglang`, `trtllm`, or a Dynamo variant |
-| `SPEEDBENCH_DECODE_METRICS_URLS` | unset | Decode-worker Prometheus endpoints for disaggregated runs |
-| `SPEEDBENCH_METRICS_URLS` | unset | Generic Prometheus endpoints |
-| `SPEEDBENCH_METRICS_PORTS` | unset | Localhost Prometheus ports when full URLs are unavailable |
-| `SPEEDBENCH_TRTLLM_JSON_METRICS_URLS` | unset | Optional TRT-LLM JSON iteration-stat endpoints |
-| `SPEEDBENCH_TRTLLM_SERVER_LOG` | `SERVER_LOG` | TRT-LLM iteration log used when spec metrics are unavailable |
-
-SpeedBench AL uses counter deltas over the eval request window. vLLM uses accepted-token and verify-step counters. SGLang uses generation-token and verify-call counters. TRT-LLM prefers Prometheus or JSON speculative metrics and falls back to iteration logs or average decoded tokens. Dynamo runs collect metrics from decode workers rather than the router.
 
 ### Score validation
-`utils/evals/validate_scores.py` checks lm-eval results against `utils/evals/thresholds.json` and requires SpeedBench AL to be within 95% to 105% of its golden value. It runs after artifact upload so results are preserved when validation fails.
+`utils/evals/validate_scores.py` checks eval results against the thresholds in `utils/evals/thresholds.json`. It runs as a separate workflow step after artifact upload, so results are preserved even if validation fails.
 
 ### Adding a new eval task
 
diff --git a/utils/evals/test_speedbench_al.py b/utils/evals/test_speedbench_al.py
deleted file mode 100644
index 751de1cdd7..0000000000
--- a/utils/evals/test_speedbench_al.py
+++ /dev/null
@@ -1,358 +0,0 @@
-import argparse
-import json
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).resolve().parent))
-sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
-
-from collect_eval_results import (
-    build_row,
-    detect_eval_jsons,
-    extract_speedbench_al_metrics,
-    score_cell,
-)
-from dynamo_speedbench_al_from_logs import aggregate_log_metrics
-from speedbench_al import build_result, load_reference, lookup_reference
-from speedbench_client import (
-    _chat_payload,
-    _completion_payload,
-    _load_speedbench_requests,
-)
-from trtllm_speedbench_al_from_log import parse_trtllm_iteration_log
-from validate_scores import validate_speedbench_al
-
-
-def test_lookup_reference_uses_model_prefix_alias(tmp_path: Path) -> None:
-    ref = tmp_path / "speedbench-reference-al.yaml"
-    ref.write_text(
-        """
-deepseek-v4-pro:
-  thinking_on:
-    2: 2.75
-  thinking_off:
-    2: 2.40
-"""
-    )
-
-    data = load_reference(ref)
-    model_key, mode_key, value = lookup_reference(
-        data,
-        model="deepseek-ai/DeepSeek-V4-Pro",
-        model_prefix="dsv4",
-        thinking_mode="on",
-        num_speculative_tokens=2,
-    )
-
-    assert model_key == "deepseek-v4-pro"
-    assert mode_key == "thinking_on"
-    assert value == 2.75
-
-
-def test_build_result_records_threshold_pass(tmp_path: Path) -> None:
-    ref = tmp_path / "speedbench-reference-al.yaml"
-    ref.write_text(
-        """
-deepseek-v4-pro:
-  thinking_on:
-    2: 2.50
-"""
-    )
-    args = argparse.Namespace(
-        reference_yaml=str(ref),
-        model="deepseek-ai/DeepSeek-V4-Pro",
-        model_prefix="dsv4",
-        thinking_mode="on",
-        num_speculative_tokens=2,
-        category="coding",
-        output_len=4096,
-        temperature=1.0,
-        threshold_ratio=0.95,
-        max_threshold_ratio=1.05,
-        acceptance_length="2.50",
-        accepted_tokens="13",
-        draft_tokens="10",
-        verify_steps="10",
-        proposed_draft_tokens="20",
-        framework="vllm",
-        metric_source="vllm-prometheus-counters-endpoints1",
-        error=None,
-    )
-
-    result = build_result(args)
-
-    assert result["reference_acceptance_length"] == 2.50
-    assert result["min_acceptance_length"] == 2.375
-    assert result["max_acceptance_length"] == 2.625
-    assert result["framework"] == "vllm"
-    assert result["metric_source"] == "vllm-prometheus-counters-endpoints1"
-    assert result["verify_steps"] == 10
-    assert result["proposed_draft_tokens"] == 20
-    assert result["passed"] is True
-
-
-def test_validate_speedbench_al_fails_below_minimum() -> None:
-    ok, checked = validate_speedbench_al(
-        {
-            "speedbench_al_eval_version": 1,
-            "task": "speedbench_al",
-            "thinking_mode": "thinking_on",
-            "num_speculative_tokens": 2,
-            "acceptance_length": 2.0,
-            "min_acceptance_length": 2.375,
-            "max_acceptance_length": 2.625,
-            "passed": False,
-        },
-        "results_speedbench_al.json",
-    )
-
-    assert checked == 1
-    assert ok is False
-
-
-def test_validate_speedbench_al_fails_above_maximum() -> None:
-    ok, checked = validate_speedbench_al(
-        {
-            "speedbench_al_eval_version": 1,
-            "task": "speedbench_al",
-            "thinking_mode": "thinking_on",
-            "num_speculative_tokens": 2,
-            "acceptance_length": 2.7,
-            "min_acceptance_length": 2.375,
-            "max_acceptance_length": 2.625,
-            "passed": True,
-        },
-        "results_speedbench_al.json",
-    )
-
-    assert checked == 1
-    assert ok is False
-
-
-def test_collect_eval_results_formats_speedbench_row(tmp_path: Path) -> None:
-    result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json"
-    result_path.write_text(
-        json.dumps(
-            {
-                "speedbench_al_eval_version": 1,
-                "task": "speedbench_al",
-                "model": "deepseek-ai/DeepSeek-V4-Pro",
-                "thinking_mode": "thinking_on",
-                "num_speculative_tokens": 2,
-                "acceptance_length": 2.5,
-                "framework": "sglang",
-                "metric_source": "sglang-prometheus-gauge-endpoints1+derived-token-counters",
-                "accepted_tokens": 13,
-                "verify_steps": 10,
-                "proposed_draft_tokens": 20,
-                "reference_acceptance_length": 2.5,
-                "min_acceptance_length": 2.375,
-                "max_acceptance_length": 2.625,
-                "threshold_ratio": 0.95,
-                "max_threshold_ratio": 1.05,
-                "passed": True,
-            }
-        )
-    )
-    metrics = extract_speedbench_al_metrics(result_path)
-    row = build_row(
-        {
-            "infmax_model_prefix": "dsv4",
-            "hw": "b300",
-            "framework": "vllm",
-            "precision": "fp4",
-            "spec_decoding": "mtp",
-        },
-        metrics[0],
-    )
-
-    assert row["task"] == "speedbench_al/thinking_on/mtp2"
-    assert row["score_name"] == "acceptance_length"
-    assert row["speedbench_framework"] == "sglang"
-    assert row["speedbench_metric_source"] == "sglang-prometheus-gauge-endpoints1+derived-token-counters"
-    assert row["speedbench_accepted_tokens"] == 13
-    assert row["speedbench_verify_steps"] == 10
-    assert row["speedbench_proposed_draft_tokens"] == 20
-    assert row["speedbench_max_acceptance_length"] == 2.625
-    assert score_cell(row) == "2.50 in [2.38, 2.62] (PASS)"
-
-
-def test_detect_eval_jsons_dedupes_flat_speedbench_result(tmp_path: Path) -> None:
-    result_path = tmp_path / "results_speedbench_al_thinking_on_mtp2.json"
-    result_path.write_text(
-        json.dumps(
-            {
-                "speedbench_al_eval_version": 1,
-                "task": "speedbench_al",
-                "thinking_mode": "thinking_on",
-                "num_speculative_tokens": 2,
-                "acceptance_length": 2.3,
-                "min_acceptance_length": 2.25,
-                "max_acceptance_length": 2.75,
-                "passed": True,
-            }
-        )
-    )
-
-    lm_path, speedbench_paths = detect_eval_jsons(tmp_path)
-
-    assert lm_path is None
-    assert speedbench_paths == [result_path]
-
-
-def test_dynamo_log_parser_aggregates_decode_workers(tmp_path: Path) -> None:
-    def write_log(name: str, rows: list[tuple[float, int, int]]) -> None:
-        lines = []
-        for al, accepted, drafted in rows:
-            lines.append(
-                "INFO metrics.log: SpecDecoding metrics: "
-                f"Mean acceptance length: {al}, "
-                "Accepted throughput: 1.0 tokens/s, "
-                "Drafted throughput: 1.0 tokens/s, "
-                f"Accepted: {accepted} tokens, Drafted: {drafted} tokens, "
-                "Per-position acceptance rate: 0.9, 0.7, "
-                "Avg Draft acceptance rate: 80.0%"
-            )
-        (tmp_path / name).write_text("\n".join(lines))
-
-    write_log("node-a_decode_w0.out", [(2.0, 10, 20)])
-    write_log("node-b_decode_w0.out", [(2.5, 15, 20), (2.5, 5, 10)])
-    write_log("node-c_decode_w1.out", [(2.0, 10, 20)])
-    write_log("node-d_decode_w1.out", [])
-
-    metrics = aggregate_log_metrics(tmp_path, mtp=2)
-
-    assert metrics is not None
-    assert metrics.workers == 2
-    assert metrics.samples == 3
-    assert metrics.accepted_tokens == 30
-    assert metrics.proposed_draft_tokens == 50
-    assert metrics.verify_steps == 25
-    assert metrics.acceptance_length == 2.2
-    assert [p.name for p in metrics.selected_logs] == [
-        "node-b_decode_w0.out",
-        "node-c_decode_w1.out",
-    ]
-
-
-def test_dynamo_log_parser_reads_sglang_accept_length_samples(tmp_path: Path) -> None:
-    (tmp_path / "node-a_decode_w0.out").write_text(
-        "\n".join(
-            [
-                "Decode batch, #running-req: 1, accept len: 2.10, accept rate: 0.37,",
-                "Decode batch, #running-req: 1, accept len: 2.30, accept rate: 0.43,",
-            ]
-        )
-    )
-    (tmp_path / "node-b_decode_w1.out").write_text(
-        "Decode batch, #running-req: 1, accept len: 2.50, accept rate: 0.50,"
-    )
-
-    metrics = aggregate_log_metrics(tmp_path, mtp=4)
-
-    assert metrics is not None
-    assert metrics.workers == 2
-    assert metrics.samples == 3
-    assert round(metrics.acceptance_length, 4) == 2.3
-    assert metrics.has_counter_metrics is False
-    assert metrics.accepted_tokens is None
-    assert metrics.verify_steps is None
-    assert metrics.proposed_draft_tokens is None
-
-
-def test_trtllm_log_parser_reads_generation_tokens_after_offset(tmp_path: Path) -> None:
-    log_path = tmp_path / "server.log"
-    prefix = "previous eval traffic\n"
-    body = "\n".join(
-        [
-            "[TRT-LLM] [I] iter = 1, num_scheduled_requests: 1, "
-            "states = {'num_ctx_requests': 1, 'num_ctx_tokens': 1024, 'num_generation_tokens': 0}",
-            "[TRT-LLM] [I] iter = 2, num_scheduled_requests: 1, "
-            "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 3}",
-            "[TRT-LLM] [I] iter = 3, num_scheduled_requests: 1, "
-            "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 2}",
-            'INFO:     127.0.0.1:1 - "GET /prometheus/metrics HTTP/1.1" 200 OK',
-            "[TRT-LLM] [I] iter = 4, num_scheduled_requests: 32, "
-            "states = {'num_ctx_requests': 0, 'num_ctx_tokens': 0, 'num_generation_tokens': 96}",
-        ]
-    )
-    log_path.write_text(prefix + body)
-
-    metrics = parse_trtllm_iteration_log(log_path, mtp=2, start_offset=len(prefix))
-
-    assert metrics is not None
-    assert metrics.samples == 2
-    assert metrics.generated_tokens == 5
-    assert metrics.accepted_tokens == 3
-    assert metrics.verify_steps == 2
-    assert metrics.proposed_draft_tokens == 4
-    assert metrics.acceptance_length == 2.5
-
-
-def test_trtllm_log_parser_can_infer_batched_steps(tmp_path: Path) -> None:
-    log_path = tmp_path / "server.log"
-    log_path.write_text(
-        "[TRT-LLM] [I] iter = 10, num_scheduled_requests: 28, "
-        "states = {'num_ctx_requests': 9, 'num_ctx_tokens': 9345, 'num_generation_tokens': 57}"
-    )
-
-    metrics = parse_trtllm_iteration_log(
-        log_path,
-        mtp=2,
-        stop_at_metrics_get=False,
-    )
-
-    assert metrics is not None
-    assert metrics.samples == 1
-    assert metrics.verify_steps == 19
-    assert metrics.accepted_tokens == 38
-    assert metrics.proposed_draft_tokens == 38
-    assert metrics.acceptance_length == 3.0
-
-
-def test_speedbench_client_loads_coding_and_builds_dsv4_payloads(tmp_path: Path) -> None:
-    dataset = tmp_path / "speed_bench_data"
-    dataset.mkdir()
-    (dataset / "qualitative.jsonl").write_text(
-        "\n".join(
-            [
-                json.dumps(
-                    {
-                        "category": "coding",
-                        "messages": [{"role": "user", "content": "Write fizzbuzz."}],
-                    }
-                ),
-                json.dumps(
-                    {
-                        "category": "math",
-                        "messages": [{"role": "user", "content": "Solve 2+2."}],
-                    }
-                ),
-            ]
-        )
-    )
-
-    prompts = _load_speedbench_requests(dataset, "coding", -1)
-    chat = _chat_payload(
-        prompts[0],
-        model="deepseek-ai/DeepSeek-V4-Pro",
-        output_len=4096,
-        temperature=1.0,
-        thinking_mode="on",
-        thinking_kwargs={"thinking": True, "reasoning_effort": "high"},
-    )
-    completions = _completion_payload(
-        prompts[0],
-        model="deepseek-ai/DeepSeek-V4-Pro",
-        output_len=4096,
-        temperature=1.0,
-        thinking_mode="on",
-        thinking_kwargs={"thinking": True, "reasoning_effort": "high"},
-        dsv4=True,
-    )
-
-    assert len(prompts) == 1
-    assert chat["chat_template_kwargs"]["thinking"] is True
-    assert chat["reasoning_effort"] == "high"
-    assert "<think>" in completions["prompt"]
-    assert completions["max_tokens"] == 4096