diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 49c58586674..856363fc9c1 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ Changelog **New Features** +- ONNX autotune: warm restart now resets error records (e.g. from connection loss or TensorRT build timeout) so the affected schemes are retried automatically. Previously, failed schemes were permanently marked as profiled and skipped on resume. - Add the ``day0-release`` agent skill (``.agents/skills/day0-release/``), a deterministic end-to-end driver that chains the PTQ → evaluation → comparison skills (the evaluation stage deploys the checkpoint itself) with an enforced gate after each stage and returns a publish decision (ACCEPT / REGRESSION / ANOMALOUS / INFEASIBLE). Ships three GPU-free, unit-tested gate scripts (``gate_ptq.py``, ``gate_run.py``, ``gate_compare.py``) that validate checkpoint coverage, evaluation-run completeness, and baseline-vs-candidate accuracy threshold. v1 reports and stops on regression; the recipe-search loop is deferred. - Add **streaming** speculative-decoding training (EAGLE3 / DFlash): the draft trains on base-model hidden states produced on the fly by a co-located ``vllm serve`` (no disk dump), moved trainer-side over NIXL RDMA, scaling to multi-node (dedicated serve replicas + DDP trainers). New launcher examples for NVFP4 Kimi-K2.5 / K2.6 on GB200/aarch64 under ``tools/launcher/examples/moonshotai/``. diff --git a/docs/source/guides/9_autotune.rst b/docs/source/guides/9_autotune.rst index 583dfcb6ee8..fce378e8d98 100644 --- a/docs/source/guides/9_autotune.rst +++ b/docs/source/guides/9_autotune.rst @@ -209,6 +209,8 @@ A long run can be interrupted (Ctrl+C, preemption, or crash) and resumed later: When rerun with the same ``--output_dir``, the autotuner detects ``autotuner_state.yaml``, restores progress, and continues from the next unprofiled region. +Schemes that previously failed due to transient errors (e.g. 
connection loss, TensorRT build timeout) are automatically reset on warm restart and retried. This means you do not need to manually clean the state file after infrastructure failures. + Custom TensorRT Plugins ----------------------- @@ -648,6 +650,7 @@ The model may not benefit from quantization: * Use the same ``--output_dir`` (and ``--onnx_path``) as the original run * Confirm ``autotuner_state.yaml`` exists in that directory * If the state file is corrupted, remove it and start over +* Schemes recorded with an error (e.g. from connection loss or other transient failures) are automatically reset and retried on restart Debugging --------- diff --git a/modelopt/onnx/quantization/autotune/autotuner_base.py b/modelopt/onnx/quantization/autotune/autotuner_base.py index 6df297e9541..5865a6f3b1c 100644 --- a/modelopt/onnx/quantization/autotune/autotuner_base.py +++ b/modelopt/onnx/quantization/autotune/autotuner_base.py @@ -257,16 +257,32 @@ def set_profile_region(self, region: Region | None, commit: bool = True) -> None logger.debug(f"Pattern signature: {region_pattern.signature}") return - pattern_schemes, num_seeded = self._seed_from_cache(region_pattern) - if pattern_schemes is None: - pattern_schemes = PatternSchemes() - pattern_schemes.pattern = region_pattern - logger.debug("Initialized with empty scheme collection") + # Check if there's a partially-profiled entry in profiled_patterns + # (e.g. from warm restart with reset error schemes that need retry). 
+ existing_pattern_schemes = None + for idx, p in enumerate(self.profiled_patterns): + if ( + p.pattern is not None + and p.pattern.matches(region, self.graph) + and any(not s.is_profiled for s in p.schemes) + ): + existing_pattern_schemes = self.profiled_patterns.pop(idx) + break + + if existing_pattern_schemes is not None: + pattern_schemes = existing_pattern_schemes + num_unprofiled = sum(1 for s in pattern_schemes.schemes if not s.is_profiled) + mode_info = f"resuming with {num_unprofiled} schemes to retry" + else: + pattern_schemes, num_seeded = self._seed_from_cache(region_pattern) + if pattern_schemes is None: + pattern_schemes = PatternSchemes() + pattern_schemes.pattern = region_pattern + logger.debug("Initialized with empty scheme collection") + mode_info = f"seeded with {num_seeded} schemes" if num_seeded > 0 else "starting fresh" self.current_profile_region = region self.current_profile_pattern_schemes = pattern_schemes - - mode_info = f"seeded with {num_seeded} schemes" if num_seeded > 0 else "starting fresh" logger.info( f"Profiling region {region.id} [level {region.level}, size" f"{region.get_size_of_region_and_descendants()}, {mode_info}]" @@ -784,11 +800,22 @@ def load_state(self, input_path: str) -> None: if "patterns" in state: num_loaded_patterns = 0 num_loaded_schemes = 0 + num_reset_errors = 0 for pattern_data in state["patterns"]: try: pattern_schemes = PatternSchemes.from_dict(pattern_data) + # Reset error schemes so they can be retried on warm restart. + # Errors are often transient (e.g. connection loss) and should + # not permanently block a scheme from being re-profiled. 
+ for scheme in pattern_schemes.schemes: + if scheme.error: + scheme.error = False + scheme.latency_ms = float("inf") + scheme.profile_timestamp = None + num_reset_errors += 1 + if pattern_schemes.schemes: self.profiled_patterns.append(pattern_schemes) num_loaded_patterns += 1 @@ -804,7 +831,7 @@ def load_state(self, input_path: str) -> None: logger.info( f"Loaded state from {input_path} ({num_loaded_patterns} patterns, " - f"{num_loaded_schemes} schemes)" + f"{num_loaded_schemes} schemes, {num_reset_errors} error records reset for retry)" ) base_path, ext = os.path.splitext(input_path)