Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Changelog

**New Features**

- ONNX autotune: warm restart now resets error records (e.g. from connection loss or TensorRT build timeout) and retries the affected schemes automatically. Previously, failed schemes were permanently marked as profiled and skipped on resume.
- Add the ``day0-release`` agent skill (``.agents/skills/day0-release/``), a deterministic end-to-end driver that chains the PTQ → evaluation → comparison skills (the evaluation stage deploys the checkpoint itself) with an enforced gate after each stage and returns a publish decision (ACCEPT / REGRESSION / ANOMALOUS / INFEASIBLE). Ships three GPU-free, unit-tested gate scripts (``gate_ptq.py``, ``gate_run.py``, ``gate_compare.py``) that validate checkpoint coverage, evaluation-run completeness, and baseline-vs-candidate accuracy threshold. v1 reports and stops on regression; the recipe-search loop is deferred.
- Add **streaming** speculative-decoding training (EAGLE3 / DFlash): the draft trains on base-model hidden states produced on the fly by a co-located ``vllm serve`` (no disk dump), moved trainer-side over NIXL RDMA, scaling to multi-node (dedicated serve replicas + DDP trainers). New launcher examples for NVFP4 Kimi-K2.5 / K2.6 on GB200/aarch64 under ``tools/launcher/examples/moonshotai/``.

Expand Down
3 changes: 3 additions & 0 deletions docs/source/guides/9_autotune.rst
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,8 @@ A long run can be interrupted (Ctrl+C, preemption, or crash) and resumed later:

When rerun with the same ``--output_dir``, the autotuner detects ``autotuner_state.yaml``, restores progress, and continues from the next unprofiled region.

Schemes whose previous profiling attempt was recorded as an error (typically a transient failure such as connection loss or a TensorRT build timeout) are automatically reset on warm restart and retried. This means you do not need to manually clean the state file after infrastructure failures.

Custom TensorRT Plugins
-----------------------

Expand Down Expand Up @@ -648,6 +650,7 @@ The model may not benefit from quantization:
* Use the same ``--output_dir`` (and ``--onnx_path``) as the original run
* Confirm ``autotuner_state.yaml`` exists in that directory
* If the state file is corrupted, remove it and start over
* Schemes that failed due to connection loss or other transient errors are automatically retried on restart

Debugging
---------
Expand Down
43 changes: 35 additions & 8 deletions modelopt/onnx/quantization/autotune/autotuner_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,16 +257,32 @@ def set_profile_region(self, region: Region | None, commit: bool = True) -> None
logger.debug(f"Pattern signature: {region_pattern.signature}")
return

pattern_schemes, num_seeded = self._seed_from_cache(region_pattern)
if pattern_schemes is None:
pattern_schemes = PatternSchemes()
pattern_schemes.pattern = region_pattern
logger.debug("Initialized with empty scheme collection")
# Check if there's a partially-profiled entry in profiled_patterns
# (e.g. from warm restart with reset error schemes that need retry).
existing_pattern_schemes = None
for idx, p in enumerate(self.profiled_patterns):
if (
p.pattern is not None
and p.pattern.matches(region, self.graph)
and any(not s.is_profiled for s in p.schemes)
):
existing_pattern_schemes = self.profiled_patterns.pop(idx)
break

if existing_pattern_schemes is not None:
pattern_schemes = existing_pattern_schemes
num_unprofiled = sum(1 for s in pattern_schemes.schemes if not s.is_profiled)
mode_info = f"resuming with {num_unprofiled} schemes to retry"
else:
pattern_schemes, num_seeded = self._seed_from_cache(region_pattern)
if pattern_schemes is None:
pattern_schemes = PatternSchemes()
pattern_schemes.pattern = region_pattern
logger.debug("Initialized with empty scheme collection")
mode_info = f"seeded with {num_seeded} schemes" if num_seeded > 0 else "starting fresh"

self.current_profile_region = region
self.current_profile_pattern_schemes = pattern_schemes

mode_info = f"seeded with {num_seeded} schemes" if num_seeded > 0 else "starting fresh"
logger.info(
f"Profiling region {region.id} [level {region.level}, size"
f"{region.get_size_of_region_and_descendants()}, {mode_info}]"
Expand Down Expand Up @@ -784,11 +800,22 @@ def load_state(self, input_path: str) -> None:
if "patterns" in state:
num_loaded_patterns = 0
num_loaded_schemes = 0
num_reset_errors = 0

for pattern_data in state["patterns"]:
try:
pattern_schemes = PatternSchemes.from_dict(pattern_data)

# Reset error schemes so they can be retried on warm restart.
# Errors are often transient (e.g. connection loss) and should
# not permanently block a scheme from being re-profiled.
for scheme in pattern_schemes.schemes:
if scheme.error:
scheme.error = False
scheme.latency_ms = float("inf")
scheme.profile_timestamp = None
num_reset_errors += 1

if pattern_schemes.schemes:
self.profiled_patterns.append(pattern_schemes)
num_loaded_patterns += 1
Expand All @@ -804,7 +831,7 @@ def load_state(self, input_path: str) -> None:

logger.info(
f"Loaded state from {input_path} ({num_loaded_patterns} patterns, "
f"{num_loaded_schemes} schemes)"
f"{num_loaded_schemes} schemes, {num_reset_errors} error records reset for retry)"
)

base_path, ext = os.path.splitext(input_path)
Expand Down