fix(checkpoint): honor train_time_interval under manual_optimization

wietzesuijker · wietzesuijker · commit 832130c9672d · 2026-05-04T07:52:24.000-04:00
ModelCheckpoint silently ignored train_time_interval when the LightningModule used manual optimization. The manual-opt branch in on_train_batch_end only checked every_n_train_steps, so a callback configured with `train_time_interval=timedelta(minutes=15)` and no step trigger never fired mid-run. last.ckpt still appeared at fit completion via on_train_end, which hid the bug from most tests but broke any workflow that relies on mid-run saves: chained SLURM segments resumed from epoch 0 every time, spot/preemptible training lost all in-flight progress, and so on.

The fix mirrors the auto-opt branch's skip_batch + skip_time logic, so a save fires when either trigger is satisfied. The new regression test uses a spy callback to observe _last_global_step_saved during fit, because checking for last.ckpt at the end of the run cannot detect the bug: on_train_end writes that file regardless.
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
@@ -26,6 +26,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed `SIGTERMException` producing a zero exit code instead of 143 (128 + SIGTERM) ([#21623](https://github.com/Lightning-AI/pytorch-lightning/issues/21623))
 
+- Fixed `ModelCheckpoint(train_time_interval=...)` silently being ignored under `automatic_optimization=False`; the manual-optimization branch in `on_train_batch_end` now mirrors the automatic-optimization branch and saves when either `every_n_train_steps` or `train_time_interval` is satisfied
+
 ---
 
 ## [2.6.2] - 2026-03-19
diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py
@@ -348,9 +348,27 @@ def on_train_batch_end(
         """Save checkpoint on train batch end if we meet the criteria for `every_n_train_steps`"""
         # For manual optimization, we need to handle saving differently
         if not pl_module.automatic_optimization:
-            # Skip if we don't need to save at this step
-            if self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0):
+            # Mirror the auto-opt branch: a save fires when EITHER `every_n_train_steps`
+            # OR `train_time_interval` is satisfied. Without this, `train_time_interval`
+            # silently no-ops under manual optimization and `last.ckpt` is never written
+            # mid-run when `every_n_train_steps` is not also configured.
+            skip_batch = self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0)
+
+            train_time_interval = self._train_time_interval
+            skip_time = True
+            now = time.monotonic()
+            # Important: allow zero timedelta as a valid interval
+            if train_time_interval is not None:
+                prev_time_check = self._last_time_checked
+                skip_time = prev_time_check is None or (now - prev_time_check) < train_time_interval.total_seconds()
+                # in case we have time differences across ranks
+                # broadcast the decision on whether to checkpoint from rank 0 to avoid possible hangs
+                skip_time = trainer.strategy.broadcast(skip_time)
+
+            if skip_batch and skip_time:
                 return
+            if not skip_time:
+                self._last_time_checked = now
 
             # Check if we should skip due to trainer/callback state
             if self._should_skip_saving_checkpoint(trainer):
diff --git a/tests/tests_pytorch/callbacks/test_model_checkpoint_manual_opt.py b/tests/tests_pytorch/callbacks/test_model_checkpoint_manual_opt.py
@@ -3,12 +3,13 @@
 import warnings
 from contextlib import contextmanager
 from copy import deepcopy
+from datetime import timedelta
 from pathlib import Path
 
 import torch
 from torch.utils.data import DataLoader, Dataset
 
-from lightning.pytorch import LightningModule, Trainer
+from lightning.pytorch import Callback, LightningModule, Trainer
 from lightning.pytorch.callbacks import ModelCheckpoint
 
 
@@ -180,3 +181,54 @@ def training_step(self, batch, batch_idx):
         # Verify our warning was raised
         assert len(manual_opt_warnings) > 0, "Expected warning about manual optimization not found"
         assert "The checkpoint will contain the model state AFTER optimization" in manual_opt_warnings[0]
+
+
+def test_model_checkpoint_manual_opt_train_time_interval():
+    """Regression: ``train_time_interval`` must fire mid-run under manual optimization.
+
+    Before the fix, the manual-optimization branch in ``on_train_batch_end`` only
+    inspected ``every_n_train_steps`` and silently no-op'd when ``train_time_interval``
+    was the only configured trigger. ``last.ckpt`` was still written by ``on_train_end``,
+    so end-of-run state checks miss the bug -- this test asserts the mid-run save by
+    observing ``_last_global_step_saved`` from a spy callback queued after the
+    ``ModelCheckpoint``.
+    """
+    saved_steps_during_training = []
+
+    class _Spy(Callback):
+        def __init__(self, ckpt: ModelCheckpoint) -> None:
+            self.ckpt = ckpt
+
+        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+            saved_steps_during_training.append(self.ckpt._last_global_step_saved)
+
+    with cleanup_after_test(), tempfile.TemporaryDirectory() as tmpdir:
+        dataset = FakeDataset()
+        train_dataloader = DataLoader(dataset, batch_size=1)
+        model = SimpleModule()
+        ckpt = ModelCheckpoint(
+            dirpath=tmpdir,
+            save_top_k=0,
+            save_last=True,
+            train_time_interval=timedelta(seconds=0),
+            save_weights_only=True,
+        )
+        trainer = Trainer(
+            max_epochs=1,
+            callbacks=[ckpt, _Spy(ckpt)],
+            log_every_n_steps=1,
+            num_sanity_val_steps=0,
+            logger=False,
+        )
+        try:
+            trainer.fit(model, train_dataloader)
+        finally:
+            trainer._teardown()
+
+        # With ``train_time_interval=0``, the callback must fire on every batch.
+        # Pre-fix the value stayed at 0 until ``on_train_end`` saved once.
+        assert any(step > 0 for step in saved_steps_during_training), (
+            "ModelCheckpoint(train_time_interval=...) silently no-op'd mid-run under manual_optimization; "
+            f"observed _last_global_step_saved values during training: {saved_steps_during_training}"
+        )
+        assert (Path(tmpdir) / "last.ckpt").exists()