Merge branch 'master' into feat/device-stats-monitor-non-expert-mode

deependujha · web-flow · commit af8259a436ef · 2026-06-01T17:49:41.000+05:30
diff --git a/.github/workflows/_legacy-checkpoints.yml b/.github/workflows/_legacy-checkpoints.yml
@@ -60,7 +60,7 @@ jobs:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
       - name: Install uv and set Python version
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
         with:
           python-version: "3.10"
           # TODO: Avoid activating environment like this
diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml
@@ -74,7 +74,7 @@ jobs:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
       - name: Install uv and set Python version
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
         with:
           python-version: ${{ matrix.config.python-version || '3.10' }}
           # TODO: Avoid activating environment like this
diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml
@@ -79,7 +79,7 @@ jobs:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
       - name: Install uv and set Python version
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
         with:
           python-version: ${{ matrix.config.python-version || '3.10' }}
           # TODO: Avoid activating environment like this
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
@@ -34,7 +34,7 @@ jobs:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
       - name: Install uv and set Python version
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
         with:
           python-version: "3.11"
           # TODO: Avoid activating environment like this
diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml
@@ -74,7 +74,7 @@ jobs:
           lfs: ${{ matrix.pkg-name == 'pytorch' }}
 
       - name: Install uv and set Python version
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
         with:
           python-version: "3.10"
           # TODO: Avoid activating environment like this
diff --git a/.github/workflows/release-pkg.yml b/.github/workflows/release-pkg.yml
@@ -154,7 +154,7 @@ jobs:
 
       - name: Publish distribution 📦 to PyPI
         # pypa/gh-action-pypi-publish v1.13.0
-        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e
+        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b
         with:
           packages_dir: dist/${{ steps.folder.outputs.pkg }}
           verbose: true
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
@@ -24,8 +24,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed non-zero process exits in `CombinedLoader.reset()` with large tensors and persistent spawned workers by avoiding explicit `_shutdown_workers()` calls and relying on iterator cleanup via `del` ([#21708](https://github.com/Lightning-AI/pytorch-lightning/issues/21708))
+
 - Fixed `SIGTERMException` producing a zero exit code instead of 143 (128 + SIGTERM) ([#21623](https://github.com/Lightning-AI/pytorch-lightning/issues/21623))
 
+- Fixed `LightningModule.toggle_optimizer` / `untoggle_optimizer` breaking under `torch.compile` by disabling Dynamo tracing on these bookkeeping helpers ([#21513](https://github.com/Lightning-AI/pytorch-lightning/issues/21513))
+
 ---
 
 ## [2.6.4] - 2026-05-20
diff --git a/src/lightning/pytorch/callbacks/device_stats_monitor.py b/src/lightning/pytorch/callbacks/device_stats_monitor.py
@@ -27,6 +27,7 @@
 from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE
 from lightning.pytorch.callbacks.callback import Callback
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
+from lightning.pytorch.utilities.rank_zero import rank_zero_warn
 from lightning.pytorch.utilities.types import STEP_OUTPUT
 
 _CORE_DEVICE_STATS_KEYS = frozenset([
@@ -120,6 +121,12 @@ class DeviceStatsMonitor(Callback):
         verbose: if ``True``, logs all available device stats returned by the accelerator.
             If ``False``, logs only a core set of metrics (memory usage, CPU utilization)
             that are most relevant for monitoring training health. Defaults to ``True``.
+        filter_keys: if ``None``, no key-based filtering is applied on top of ``verbose``.
+            If a ``set`` of strings is provided, only the keys present in the set will be logged.
+            Keys are matched against the base metric names before prefixing (e.g.,
+            ``"cpu_percent"`` not ``"DeviceStatsMonitor.on_train_batch_end/cpu_percent"``).
+            The filter runs after the ``verbose`` selection, so with ``verbose=False`` only core metrics can be kept.
+            A ``rank_zero_warn`` is emitted in ``setup`` for keys not found in the device stats, to catch typos early.
 
     Raises:
         MisconfigurationException:
@@ -131,13 +138,29 @@ class DeviceStatsMonitor(Callback):
 
         from lightning import Trainer
         from lightning.pytorch.callbacks import DeviceStatsMonitor
+
+        # log all stats (default behaviour)
         device_stats = DeviceStatsMonitor()
         trainer = Trainer(callbacks=[device_stats])
 
+        # log only peak and current allocated GPU memory
+        device_stats = DeviceStatsMonitor(
+            filter_keys={"allocated_bytes.all.current", "allocated_bytes.all.peak"}
+        )
+        trainer = Trainer(callbacks=[device_stats])
+
+        # log CPU stats alongside a subset of GPU memory stats
+        device_stats = DeviceStatsMonitor(
+            cpu_stats=True,
+            filter_keys={"cpu_percent", "allocated_bytes.all.current"},
+        )
+        trainer = Trainer(callbacks=[device_stats])
+
     """
 
-    def __init__(self, cpu_stats: Optional[bool] = None, verbose: bool = False) -> None:
+    def __init__(self, cpu_stats: Optional[bool] = None, verbose: bool = False, filter_keys: Optional[set[str]] = None) -> None:
         self._cpu_stats = cpu_stats
+        self._filter_keys = filter_keys
         self._verbose = verbose
 
     @override
@@ -160,6 +183,21 @@ def setup(
                 f"`DeviceStatsMonitor` cannot log CPU stats as `psutil` is not installed. {str(_PSUTIL_AVAILABLE)} "
             )
 
+        if self._filter_keys is not None:
+            device_stats = trainer.accelerator.get_device_stats(device)
+            if self._cpu_stats and device.type != "cpu":
+                from lightning.pytorch.accelerators.cpu import get_cpu_stats
+
+                device_stats.update(get_cpu_stats())
+
+            unrecognized = self._filter_keys - device_stats.keys()
+            if unrecognized:
+                rank_zero_warn(
+                    f"`DeviceStatsMonitor` filter_keys contains keys not found in device stats and will be ignored:"
+                    f" {unrecognized}"
+                )
+
+
     @staticmethod
     def _filter_core_device_stats(stats: dict[str, float]) -> dict[str, float]:
         return {
@@ -187,6 +225,8 @@ def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None:
 
         if not self._verbose:
             device_stats = self._filter_core_device_stats(device_stats)
+        if self._filter_keys is not None:
+            device_stats = {k: v for k, v in device_stats.items() if k in self._filter_keys}
 
         for logger in trainer.loggers:
             separator = logger.group_separator
diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py
@@ -1136,12 +1136,24 @@ def backward(self, loss):
         else:
             loss.backward(*args, **kwargs)
 
+    @torch.compiler.disable
     def toggle_optimizer(self, optimizer: Union[Optimizer, LightningOptimizer]) -> None:
         """Makes sure only the gradients of the current optimizer's parameters are calculated in the training step to
         prevent dangling gradients in multiple-optimizer setup.
 
         It works with :meth:`untoggle_optimizer` to make sure ``param_requires_grad_state`` is properly reset.
 
+        .. note::
+            This method is decorated with :func:`torch.compiler.disable` so that it is executed as regular
+            Python when the ``LightningModule`` is wrapped with :func:`torch.compile`. Mutating
+            ``requires_grad`` on parameters is not supported by Dynamo/AOTAutograd (it can change a
+            tensor's leaf-ness mid-graph), so tracing this bookkeeping helper would either fail with
+            ``Unsupported: setattr() on Tensor.requires_grad`` or produce a ``KeyError`` on the
+            internal ``param_requires_grad_state`` mapping when the traced parameter references diverge
+            from those held by ``trainer.optimizers``. Disabling the compiler on this method keeps the
+            behavior identical for eager users while making it safe to call from a compiled
+            ``training_step``.
+
         Args:
             optimizer: The optimizer to toggle.
 
@@ -1165,9 +1177,13 @@ def toggle_optimizer(self, optimizer: Union[Optimizer, LightningOptimizer]) -> N
                 param.requires_grad = param_requires_grad_state[param]
         self._param_requires_grad_state = param_requires_grad_state
 
+    @torch.compiler.disable
     def untoggle_optimizer(self, optimizer: Union[Optimizer, LightningOptimizer]) -> None:
         """Resets the state of required gradients that were toggled with :meth:`toggle_optimizer`.
 
+        See :meth:`toggle_optimizer` for details on why this method is decorated with
+        :func:`torch.compiler.disable`.
+
         Args:
             optimizer: The optimizer to untoggle.
 
diff --git a/src/lightning/pytorch/utilities/combined_loader.py b/src/lightning/pytorch/utilities/combined_loader.py
@@ -397,7 +397,7 @@ def _load_state_dicts(self, states: list[dict[str, Any]]) -> None:
 def _shutdown_workers_and_reset_iterator(dataloader: object) -> None:
     if hasattr(dataloader, "_iterator"):
         if isinstance(dataloader._iterator, _MultiProcessingDataLoaderIter):
-            dataloader._iterator._shutdown_workers()
+            del dataloader._iterator
         dataloader._iterator = None
 
 
diff --git a/tests/tests_pytorch/callbacks/test_device_stats_monitor.py b/tests/tests_pytorch/callbacks/test_device_stats_monitor.py
@@ -249,6 +249,49 @@ def log_metrics(self, metrics: dict[str, float], step: Optional[int] = None) ->
     trainer.fit(model)
 
 
+@RunIf(psutil=True)
+@pytest.mark.parametrize(
+    ("filter_keys", "expected_present", "expected_absent"),
+    [
+        (
+            {_CPU_VM_PERCENT, _CPU_PERCENT},
+            [_CPU_VM_PERCENT, _CPU_PERCENT],
+            [_CPU_SWAP_PERCENT],
+        ),
+        (
+            {_CPU_PERCENT},
+            [_CPU_PERCENT],
+            [_CPU_VM_PERCENT, _CPU_SWAP_PERCENT],
+        ),
+    ],
+)
+def test_device_stats_monitor_filter_keys(tmp_path, filter_keys, expected_present, expected_absent):
+    """Test that filter_keys logs only the specified keys and omits the rest."""
+    model = BoringModel()
+
+    class AssertFilterLogger(CSVLogger):
+        def log_metrics(self, metrics: dict[str, float], step: Optional[int] = None) -> None:
+            for key in expected_present:
+                assert any(key in k for k in metrics), f"Expected key {key!r} not found in metrics"
+            for key in expected_absent:
+                assert not any(key in k for k in metrics), f"Unexpected key {key!r} found in metrics"
+
+    device_stats = DeviceStatsMonitor(cpu_stats=True, filter_keys=filter_keys)
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=1,
+        limit_train_batches=2,
+        limit_val_batches=0,
+        log_every_n_steps=1,
+        callbacks=device_stats,
+        logger=AssertFilterLogger(tmp_path),
+        enable_checkpointing=False,
+        enable_progress_bar=False,
+        accelerator="cpu",
+    )
+    trainer.fit(model)
+
+
 @RunIf(min_cuda_gpus=1)
 @pytest.mark.parametrize("verbose", [True, False])
 def test_device_stats_monitor_verbose_gpu(tmp_path, verbose):
@@ -279,3 +322,24 @@ def log_metrics(self, metrics: dict[str, float], step: Optional[int] = None) ->
         enable_progress_bar=False,
     )
     trainer.fit(model)
+
+
+@RunIf(psutil=True)
+def test_device_stats_monitor_filter_keys_unrecognized_warns(tmp_path):
+    """Test that filter_keys emits a warning for keys not present in device stats."""
+    model = BoringModel()
+    device_stats = DeviceStatsMonitor(cpu_stats=True, filter_keys={"nonexistent_key_xyz"})
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        log_every_n_steps=1,
+        callbacks=device_stats,
+        logger=CSVLogger(tmp_path),
+        enable_checkpointing=False,
+        enable_progress_bar=False,
+        accelerator="cpu",
+    )
+    with pytest.warns(UserWarning, match="filter_keys contains keys not found"):
+        trainer.fit(model)
diff --git a/tests/tests_pytorch/core/test_lightning_module.py b/tests/tests_pytorch/core/test_lightning_module.py
@@ -298,6 +298,27 @@ def configure_optimizers(self):
     trainer.fit(model)
 
 
+@RunIf(dynamo=True)
+def test_toggle_untoggle_optimizer_are_compiler_disabled():
+    """Regression test for https://github.com/Lightning-AI/pytorch-lightning/issues/21513.
+
+    ``toggle_optimizer`` / ``untoggle_optimizer`` mutate ``requires_grad`` on Parameters, which
+    Dynamo/AOTAutograd does not support because it can change a tensor's leaf-ness mid-graph.
+    Tracing these helpers either graph-breaks with ``Unsupported: setattr() on Tensor.requires_grad``
+    or raises a ``KeyError`` on the internal ``param_requires_grad_state`` mapping when the traced
+    parameter references diverge from those held by ``trainer.optimizers``. Both methods are
+    decorated with ``@torch.compiler.disable`` so that Dynamo never enters them. This test verifies
+    the decorator is attached via the ``_torchdynamo_disable`` attribute the decorator installs
+    (the same assertion pattern used by ``tests/utilities/test_compile.py::test_compile_uncompile``).
+    """
+
+    def is_compiler_disabled(fn):
+        return any(el.startswith("_torchdynamo_disable") for el in dir(fn))
+
+    assert is_compiler_disabled(LightningModule.toggle_optimizer)
+    assert is_compiler_disabled(LightningModule.untoggle_optimizer)
+
+
 @pytest.mark.parametrize(
     ("accelerator", "device"),
     [
diff --git a/tests/tests_pytorch/loops/test_loops.py b/tests/tests_pytorch/loops/test_loops.py
@@ -1111,9 +1111,9 @@ def __init__(self, *args, dataloader, **kwargs):
             super().__init__(*args, **kwargs)
             self.dataloader = dataloader
 
-        def _shutdown_workers(self):
+        def __del__(self):
             self.dataloader.shutdown_workers_epochs.append(trainer.current_epoch)
-            super()._shutdown_workers()
+            super().__del__()
 
     class TestDataLoader(DataLoader):
         def __init__(self, *args, **kwargs):
@@ -1137,8 +1137,8 @@ def _get_iterator(self):
         trainer.fit(model, train_dataloader, val_dataloader)
 
     if persistent_workers:
-        # workers get created and persist until the teardown in the final epoch
-        expected = [trainer.current_epoch, trainer.current_epoch]  # once epoch end, once on teardown
+        # workers persist across epochs and are shut down exactly once via __del__.
+        expected = [trainer.current_epoch]
     elif should_fail:
         expected = [
             # <-- iter() on epoch 0, workers get created
@@ -1155,8 +1155,7 @@ def _get_iterator(self):
     assert train_dataloader.shutdown_workers_epochs == expected
 
     if persistent_workers:
-        # workers get created and persist until the teardown in the final epoch
-        expected = [trainer.current_epoch, trainer.current_epoch]  # once epoch end, once on teardown
+        expected = [trainer.current_epoch]
     elif should_fail:
         expected = [
             # <-- iter() on sanity check, workers get created
diff --git a/tests/tests_pytorch/utilities/test_combined_loader.py b/tests/tests_pytorch/utilities/test_combined_loader.py
@@ -657,3 +657,27 @@ def test_load_state_dicts():
     cl._load_state_dicts([state1, state2])
     stateful1.load_state_dict.assert_called_with(state1)
     stateful2.load_state_dict.assert_called_with(state2)
+
+
+def test_combined_loader_reset_uses_del_not_shutdown_workers():
+    """Test that `combined_loader.reset()` uses `del` to reset the dataloader iterator instead of calling
+    `_shutdown_workers()` explicitly.
+
+    This is a regression test for https://github.com/Lightning-AI/pytorch-lightning/issues/21703
+
+    """
+    from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter
+
+    dataloader = DataLoader(range(10), num_workers=2, persistent_workers=True, multiprocessing_context="spawn")
+    combined_loader = CombinedLoader([dataloader])
+
+    mock_iterator = Mock(spec=_MultiProcessingDataLoaderIter)
+    mock_iterator._shutdown_workers = Mock()
+    dataloader._iterator = mock_iterator
+
+    iterator_ref = dataloader._iterator
+
+    combined_loader.reset()
+
+    iterator_ref._shutdown_workers.assert_not_called()
+    assert dataloader._iterator is None