From 66568f069250a669e9703b4a3e82b4975c4cbd06 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Thu, 30 Apr 2026 18:48:54 +0000 Subject: [PATCH 1/2] Rename checkpoint trigger file to be product-agnostic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The on-demand checkpoint sentinel file in /dev/shm was named "instructlab_checkpoint_requested" — rename it to the generic "checkpoint_requested" so the mechanism is not tied to a specific product name. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/on_demand_checkpointing.md | 2 +- src/instructlab/training/on_demand_checkpoint.py | 2 +- tests/unit/test_on_demand_checkpoint.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/on_demand_checkpointing.md b/docs/on_demand_checkpointing.md index b21d71d5..3e48365e 100644 --- a/docs/on_demand_checkpointing.md +++ b/docs/on_demand_checkpointing.md @@ -135,7 +135,7 @@ The trigger file is always at a fixed path. To trigger a checkpoint (e.g. via `kubectl exec` into the training pod): ```bash -touch /dev/shm/instructlab_checkpoint_requested +touch /dev/shm/checkpoint_requested ``` Workers check for the trigger file at each synchronization point in the diff --git a/src/instructlab/training/on_demand_checkpoint.py b/src/instructlab/training/on_demand_checkpoint.py index 2bf0361a..096caf92 100644 --- a/src/instructlab/training/on_demand_checkpoint.py +++ b/src/instructlab/training/on_demand_checkpoint.py @@ -89,7 +89,7 @@ # 2. Shared between all containers in the same Kubernetes pod. # 3. Automatically cleaned up when the pod is destroyed. 
_TRIGGER_DIR = Path("/dev/shm") -_TRIGGER_FILENAME = "instructlab_checkpoint_requested" +_TRIGGER_FILENAME = "checkpoint_requested" def _get_trigger_path() -> Path: diff --git a/tests/unit/test_on_demand_checkpoint.py b/tests/unit/test_on_demand_checkpoint.py index f9db657e..cda709ca 100644 --- a/tests/unit/test_on_demand_checkpoint.py +++ b/tests/unit/test_on_demand_checkpoint.py @@ -29,7 +29,7 @@ class TestGetTriggerPath: def test_returns_correct_name(self): path = _get_trigger_path() - assert path.name == "instructlab_checkpoint_requested" + assert path.name == "checkpoint_requested" assert str(path.parent) == "/dev/shm" @@ -43,7 +43,7 @@ def test_creates_file(self, tmp_path): def test_returns_correct_path(self, tmp_path): with patch("instructlab.training.on_demand_checkpoint._TRIGGER_DIR", tmp_path): path = write_trigger_file() - assert path == tmp_path / "instructlab_checkpoint_requested" + assert path == tmp_path / "checkpoint_requested" class TestTriggerFileExists: From c6a74803e781972b8972a0d90179dd2ba3050a83 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Thu, 30 Apr 2026 18:57:23 +0000 Subject: [PATCH 2/2] Allow checkpoint trigger filename to be overridden via env var Read CHECKPOINT_TRIGGER_FILENAME from os.environ at call time in _get_trigger_path() instead of using a module-level constant. This lets users customize the trigger filename for environments where multiple training jobs share the same /dev/shm. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/on_demand_checkpointing.md | 4 ++++ src/instructlab/training/on_demand_checkpoint.py | 4 ++-- tests/unit/test_on_demand_checkpoint.py | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/on_demand_checkpointing.md b/docs/on_demand_checkpointing.md index 3e48365e..b4576164 100644 --- a/docs/on_demand_checkpointing.md +++ b/docs/on_demand_checkpointing.md @@ -138,6 +138,10 @@ The trigger file is always at a fixed path. 
To trigger a checkpoint touch /dev/shm/checkpoint_requested ``` +The default filename is `checkpoint_requested`. To use a custom filename, +set the `CHECKPOINT_TRIGGER_FILENAME` environment variable before starting +training. + Workers check for the trigger file at each synchronization point in the training loop (multiple times per step). Once any rank on any node detects it, all ranks coordinate via `all_reduce` to save a checkpoint and exit. diff --git a/src/instructlab/training/on_demand_checkpoint.py b/src/instructlab/training/on_demand_checkpoint.py index 096caf92..3598c267 100644 --- a/src/instructlab/training/on_demand_checkpoint.py +++ b/src/instructlab/training/on_demand_checkpoint.py @@ -89,12 +89,12 @@ # 2. Shared between all containers in the same Kubernetes pod. # 3. Automatically cleaned up when the pod is destroyed. _TRIGGER_DIR = Path("/dev/shm") -_TRIGGER_FILENAME = "checkpoint_requested" def _get_trigger_path() -> Path: """Return the path to the checkpoint trigger file.""" - return _TRIGGER_DIR / _TRIGGER_FILENAME + filename = os.environ.get("CHECKPOINT_TRIGGER_FILENAME") or "checkpoint_requested" + return _TRIGGER_DIR / filename def write_trigger_file() -> Path: diff --git a/tests/unit/test_on_demand_checkpoint.py b/tests/unit/test_on_demand_checkpoint.py index cda709ca..584d547f 100644 --- a/tests/unit/test_on_demand_checkpoint.py +++ b/tests/unit/test_on_demand_checkpoint.py @@ -32,6 +32,12 @@ def test_returns_correct_name(self): assert path.name == "checkpoint_requested" assert str(path.parent) == "/dev/shm" + def test_respects_env_override(self, monkeypatch): + monkeypatch.setenv("CHECKPOINT_TRIGGER_FILENAME", "my_custom_trigger") + path = _get_trigger_path() + assert path.name == "my_custom_trigger" + assert str(path.parent) == "/dev/shm" + class TestWriteTriggerFile: def test_creates_file(self, tmp_path):