Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
3a8a68f
Add GitHub Action to collect SPEED-Bench AL matrix
qiching Jun 2, 2026
bab431d
speedbench-al: default open-pr to false (artifact-only by default)
qiching Jun 3, 2026
d595d49
speedbench-al: parameterize model + relocate collector script
qiching Jun 4, 2026
b2dd50a
feat: add SpeedBench AL eval validation
hjjq Jun 4, 2026
f13ad72
Merge remote-tracking branch 'origin/main' into codex/m1-speedbench-e…
hjjq Jun 4, 2026
d2ce037
Merge remote-tracking branch 'qiching/albecheng/speedbench-al-action'…
hjjq Jun 4, 2026
4d72cdb
test: add SpeedBench AL reference handling
hjjq Jun 5, 2026
7f00621
Merge remote-tracking branch 'origin/main' into codex/m1-speedbench-e…
hjjq Jun 5, 2026
f40d6f2
Add multi-framework SpeedBench AL metrics
jasonlizhengjian Jun 5, 2026
f2aba4c
speedbench-al: fix --chat-template-kwargs default quoting so thinking…
Jun 8, 2026
c12acba
Apply SpeedBench chat-template shim to eval helper
jasonlizhengjian Jun 9, 2026
fa83900
Add native SpeedBench client fallback
jasonlizhengjian Jun 10, 2026
60c19dd
Use shared GB200 workspace for DSV4 Dynamo
jasonlizhengjian Jun 10, 2026
081cbca
Enable metrics for DSV4 SGLang MTP
jasonlizhengjian Jun 10, 2026
4cf5bbf
Enable TRT-LLM spec metrics for DSV4 MTP
jasonlizhengjian Jun 10, 2026
63bf3eb
Use TRT-LLM Prometheus metrics endpoint for SpeedBench
jasonlizhengjian Jun 10, 2026
4a4fbf1
Use TRT-LLM JSON stats for SpeedBench fallback
jasonlizhengjian Jun 10, 2026
de360bf
Use TRT-LLM decoded-token metric as AL fallback
jasonlizhengjian Jun 10, 2026
cc4c7e3
Collect Dynamo SpeedBench AL from decode logs
jasonlizhengjian Jun 10, 2026
2aef667
Use TRT-LLM server logs for SpeedBench AL fallback
jasonlizhengjian Jun 11, 2026
816dd1a
Capture GB200 srt-slurm bootstrap logs on early failure
jasonlizhengjian Jun 11, 2026
9cbb11c
Parse Dynamo SGLang SpeedBench AL from decode logs
jasonlizhengjian Jun 11, 2026
046f304
Read GB200 bootstrap log from Slurm stderr path
jasonlizhengjian Jun 11, 2026
16e7aa5
update SGL metrics gathering method
hjjq Jun 26, 2026
a48fc82
fix: validate SpeedBench AL within golden tolerance
hjjq Jun 29, 2026
4968a31
Merge remote-tracking branch 'upstream/main' into lijas/codex/m1-spee…
hjjq Jun 29, 2026
ba7fa14
fix: use canonical golden AL distribution
hjjq Jun 29, 2026
3935383
test: remove golden AL regression coverage
hjjq Jun 29, 2026
cf98627
refactor: limit SpeedBench MTP exports to vLLM
hjjq Jun 29, 2026
7ae7c56
fix: restore GB200 Dynamo SpeedBench log collection
hjjq Jun 29, 2026
f3179ad
refactor: trim SpeedBench AL integration
hjjq Jun 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
714 changes: 714 additions & 0 deletions benchmarks/benchmark_lib.sh

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ fi

# use 2 speculative tokens for all configs for now
NUM_SPEC_TOKENS=2
export SPEEDBENCH_NUM_SPEC_TOKENS="$NUM_SPEC_TOKENS"

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ PYTHONNOUSERSITE=1 sglang serve \
--model-path $MODEL_PATH --served-model-name $MODEL \
--host 0.0.0.0 \
--port $PORT \
--enable-metrics \
--trust-remote-code \
--tp $TP \
--ep-size $EP_SIZE \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ fi

# use 2 speculative tokens for all configs for now
NUM_SPEC_TOKENS=2
export SPEEDBENCH_NUM_SPEC_TOKENS="$NUM_SPEC_TOKENS"

start_gpu_monitor

Expand Down
3 changes: 3 additions & 0 deletions runners/launch_b200-dgxc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,9 @@ EOF
# Collect eval results if eval was requested
if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
EVAL_DIR="$LOGS_DIR/eval_results"
if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then
bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE"
fi
if [ -d "$EVAL_DIR" ]; then
echo "Extracting eval results from $EVAL_DIR"
shopt -s nullglob
Expand Down
3 changes: 3 additions & 0 deletions runners/launch_gb200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,9 @@ fi
# Collect eval results if eval was requested
if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
EVAL_DIR="$LOGS_DIR/eval_results"
if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then
bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE"
fi
if [ -d "$EVAL_DIR" ]; then
echo "Extracting eval results from $EVAL_DIR"
shopt -s nullglob
Expand Down
3 changes: 3 additions & 0 deletions runners/launch_gb300-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,9 @@ fi
# Collect eval results if eval was requested
if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
EVAL_DIR="$LOGS_DIR/eval_results"
if [[ "${FRAMEWORK:-}" == dynamo* && "${SPEC_DECODING:-none}" == "mtp" ]]; then
bash "$GITHUB_WORKSPACE/utils/evals/write_dynamo_speedbench_al_from_logs.sh" "$LOGS_DIR" "$GITHUB_WORKSPACE"
fi
if [ -d "$EVAL_DIR" ]; then
echo "Extracting eval results from $EVAL_DIR"
shopt -s nullglob
Expand Down
100 changes: 93 additions & 7 deletions utils/collect_eval_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,20 @@ def detect_lm_eval_jsons(d: Path, batched: bool = False) -> List[Path]:
return [latest_by_conc[conc] for conc in sorted(latest_by_conc)]


def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], Optional[Path]]:
"""Return the latest legacy lm-eval JSON and deprecated second slot."""
def detect_speedbench_jsons(d: Path) -> List[Path]:
    """Find the compact SpeedBench AL result files directly inside ``d``.

    A file qualifies when its name matches ``results*.json`` and the value
    returned by ``load_json`` for it is a dict that carries the
    ``speedbench_al_eval_version`` key.

    Args:
        d: Artifact directory to scan (non-recursive glob).

    Returns:
        The qualifying paths in sorted order.
    """
    def _is_speedbench_result(candidate: Path) -> bool:
        # The version marker is what distinguishes a SpeedBench AL result
        # from any other ``results*.json`` file in the same directory.
        payload = load_json(candidate)
        return isinstance(payload, dict) and 'speedbench_al_eval_version' in payload

    return sorted(filter(_is_speedbench_result, d.glob('results*.json')))


def detect_eval_jsons(d: Path) -> Tuple[Optional[Path], List[Path]]:
    """Return the latest legacy lm-eval JSON and all SpeedBench AL JSONs.

    Args:
        d: Artifact directory to inspect.

    Returns:
        A 2-tuple. The first element is the first path reported by
        ``detect_lm_eval_jsons(d)``, or ``None`` when it reports nothing.
        The second element is the list produced by
        ``detect_speedbench_jsons(d)`` (possibly empty, never ``None``).
    """
    lm_paths = detect_lm_eval_jsons(d)
    latest_lm = lm_paths[0] if lm_paths else None
    # Single return: the second slot is always a list, matching the annotation.
    return latest_lm, detect_speedbench_jsons(d)


def extract_lm_metrics(json_path: Path) -> List[Dict[str, Any]]:
Expand Down Expand Up @@ -171,6 +181,38 @@ def get_val_se(filter_name: str) -> Tuple[Optional[float], Optional[float]]:
return extracted


def extract_speedbench_al_metrics(json_path: Path) -> List[Dict[str, Any]]:
    """Extract a compact SpeedBench AL result as an eval metric row.

    Args:
        json_path: Path to a candidate SpeedBench AL result JSON.

    Returns:
        A single-element list holding the metric dict, or an empty list when
        the loaded payload is not a dict or lacks the
        ``speedbench_al_eval_version`` marker.
    """
    data = load_json(json_path)
    # Only dict payloads carrying the version marker are SpeedBench results;
    # anything else (missing file, list, scalar, string) yields no rows rather
    # than failing on the ``.get`` calls below.
    if not isinstance(data, dict) or 'speedbench_al_eval_version' not in data:
        return []

    mode = data.get('thinking_mode', 'unknown')
    mtp = data.get('num_speculative_tokens', 'unknown')
    return [{
        'metric_type': 'speedbench_al',
        'task': 'speedbench_al',
        # Label used to tell apart rows by thinking mode and MTP depth.
        'task_label': f"speedbench_al/{mode}/mtp{mtp}",
        'acceptance_length': data.get('acceptance_length'),
        'reference_acceptance_length': data.get('reference_acceptance_length'),
        'min_acceptance_length': data.get('min_acceptance_length'),
        'max_acceptance_length': data.get('max_acceptance_length'),
        'threshold_ratio': data.get('threshold_ratio'),
        'max_threshold_ratio': data.get('max_threshold_ratio'),
        'thinking_mode': mode,
        'num_speculative_tokens': mtp,
        'speedbench_framework': data.get('framework'),
        'speedbench_metric_source': data.get('metric_source'),
        'speedbench_accepted_tokens': data.get('accepted_tokens'),
        # Falls back to the 'draft_tokens' key when 'verify_steps' is absent.
        'speedbench_verify_steps': data.get('verify_steps', data.get('draft_tokens')),
        'speedbench_proposed_draft_tokens': data.get('proposed_draft_tokens'),
        'passed': data.get('passed'),
        'error': data.get('error'),
        'model': data.get('model'),
        'source': str(json_path),
    }]


def pct(x: Any) -> str:
"""Format value as percentage."""
try:
Expand Down Expand Up @@ -250,7 +292,7 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
'dp_attention': str(dp_attention).lower(),
'prefill_dp_attention': str(prefill_dp_attention).lower(),
'decode_dp_attention': str(decode_dp_attention).lower(),
'task': m.get('task', 'unknown'),
'task': m.get('task_label') or m.get('task', 'unknown'),
'em_strict': m.get('strict'),
'em_strict_se': m.get('strict_se'),
'em_flexible': m.get('flex'),
Expand All @@ -260,7 +302,25 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
}

# Add universal score field (primary metric for unified comparison)
if m.get('strict') is not None:
if m.get('metric_type') == 'speedbench_al':
row['score'] = m.get('acceptance_length')
row['score_name'] = 'acceptance_length'
row['score_se'] = None
row['speedbench_reference_acceptance_length'] = m.get('reference_acceptance_length')
row['speedbench_min_acceptance_length'] = m.get('min_acceptance_length')
row['speedbench_max_acceptance_length'] = m.get('max_acceptance_length')
row['speedbench_threshold_ratio'] = m.get('threshold_ratio')
row['speedbench_max_threshold_ratio'] = m.get('max_threshold_ratio')
row['speedbench_thinking_mode'] = m.get('thinking_mode')
row['speedbench_num_speculative_tokens'] = m.get('num_speculative_tokens')
row['speedbench_framework'] = m.get('speedbench_framework')
row['speedbench_metric_source'] = m.get('speedbench_metric_source')
row['speedbench_accepted_tokens'] = m.get('speedbench_accepted_tokens')
row['speedbench_verify_steps'] = m.get('speedbench_verify_steps')
row['speedbench_proposed_draft_tokens'] = m.get('speedbench_proposed_draft_tokens')
row['speedbench_passed'] = m.get('passed')
row['speedbench_error'] = m.get('error')
elif m.get('strict') is not None:
row['score'] = m.get('strict')
row['score_name'] = 'em_strict'
row['score_se'] = m.get('strict_se')
Expand All @@ -276,6 +336,28 @@ def build_row(meta: Dict[str, Any], m: Dict[str, Any]) -> Dict[str, Any]:
return row


def score_cell(row: Dict[str, Any]) -> str:
    """Format the primary score for lm-eval and non-percentage eval rows.

    Rows whose ``score_name`` is ``'acceptance_length'`` are rendered as a
    plain two-decimal number with a PASS/FAIL status, plus the allowed
    ``[min, max]`` range when both bounds are present.  Every other row is
    rendered as a percentage followed by its standard error.

    Args:
        row: Row dict; reads ``score_name``, ``score``, ``score_se`` and the
            ``speedbench_*`` fields.

    Returns:
        The formatted table cell.  ``'FAIL'`` when an acceptance-length row
        has no score; ``str(score)`` when a value cannot be converted to
        float.
    """
    if row.get('score_name') != 'acceptance_length':
        return f"{pct(row['score'])}{se(row['score_se'])}"

    score = row.get('score')
    if score is None:
        # No measurement at all is reported as a failure.
        return 'FAIL'

    status = 'PASS' if row.get('speedbench_passed') else 'FAIL'
    minimum = row.get('speedbench_min_acceptance_length')
    maximum = row.get('speedbench_max_acceptance_length')
    try:
        if minimum is None or maximum is None:
            return f"{float(score):.2f} ({status})"
        return (
            f"{float(score):.2f} in "
            f"[{float(minimum):.2f}, {float(maximum):.2f}] ({status})"
        )
    except (TypeError, ValueError, OverflowError):
        # Only float() conversion can fail here; show the raw score instead.
        return str(score)


def collect_eval_rows(root: Path) -> List[Dict[str, Any]]:
"""Collect logical eval rows, expanding batched artifacts by concurrency."""
rows: List[Dict[str, Any]] = []
Expand All @@ -302,6 +384,10 @@ def collect_eval_rows(root: Path) -> List[Dict[str, Any]]:
metrics_list = extract_lm_metrics(lm_path)
for metrics in metrics_list:
rows.append(build_row(row_meta, metrics))

for speedbench_path in detect_speedbench_jsons(d):
for metrics in extract_speedbench_al_metrics(speedbench_path):
rows.append(build_row(meta, metrics))
return rows


Expand Down Expand Up @@ -376,7 +462,7 @@ def main():
r['conc'],
r['dp_attention'],
r['task'],
f"{pct(r['score'])}{se(r['score_se'])}",
score_cell(r),
f"{pct(r['em_strict'])}{se(r['em_strict_se'])}",
f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}",
r['n_eff'] or '',
Expand Down Expand Up @@ -414,7 +500,7 @@ def main():
r['decode_num_workers'],
r['conc'],
r['task'],
f"{pct(r['score'])}{se(r['score_se'])}",
score_cell(r),
f"{pct(r['em_strict'])}{se(r['em_strict_se'])}",
f"{pct(r['em_flexible'])}{se(r['em_flexible_se'])}",
r['n_eff'] or '',
Expand Down
Loading