Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion src/lightning/pytorch/callbacks/device_stats_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,24 @@
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.types import STEP_OUTPUT

_CORE_DEVICE_STATS_KEYS = frozenset([
# CPU
"cpu_percent",
"cpu_vm_percent",
# CUDA
"allocated_bytes.all.current",
"allocated_bytes.all.peak",
"reserved_bytes.all.current",
"reserved_bytes.all.peak",
"num_ooms",
])

_CORE_TPU_STATS_PREFIXES = frozenset([
"memory.free.",
"memory.used.",
"memory.percent.",
])


class DeviceStatsMonitor(Callback):
r"""Automatically monitors and logs device stats during training, validation and testing stage.
Expand Down Expand Up @@ -99,6 +117,9 @@ class DeviceStatsMonitor(Callback):
cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU.
If ``True``, it will log CPU stats regardless of the accelerator.
If ``False``, it will not log CPU stats regardless of the accelerator.
verbose: if ``True``, logs all available device stats returned by the accelerator.
If ``False``, logs only a core set of metrics (memory usage, CPU utilization)
that are most relevant for monitoring training health. Defaults to ``True``.

Raises:
MisconfigurationException:
Expand All @@ -115,8 +136,9 @@ class DeviceStatsMonitor(Callback):

"""

def __init__(self, cpu_stats: Optional[bool] = None) -> None:
def __init__(self, cpu_stats: Optional[bool] = None, verbose: bool = True) -> None:
self._cpu_stats = cpu_stats
self._verbose = verbose

@override
def setup(
Expand All @@ -138,6 +160,14 @@ def setup(
f"`DeviceStatsMonitor` cannot log CPU stats as `psutil` is not installed. {str(_PSUTIL_AVAILABLE)} "
)

@staticmethod
def _filter_core_device_stats(stats: dict[str, float]) -> dict[str, float]:
    """Return only the core device-stat entries from ``stats``.

    A key is kept when it is an exact member of ``_CORE_DEVICE_STATS_KEYS``
    (CPU/CUDA stats) or starts with one of ``_CORE_TPU_STATS_PREFIXES``
    (per-device TPU memory stats).
    """

    def _is_core(name: str) -> bool:
        if name in _CORE_DEVICE_STATS_KEYS:
            return True
        return any(name.startswith(prefix) for prefix in _CORE_TPU_STATS_PREFIXES)

    return {name: value for name, value in stats.items() if _is_core(name)}
Comment thread
deependujha marked this conversation as resolved.

def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None:
if not trainer._logger_connector.should_update_logs:
return
Expand All @@ -155,6 +185,9 @@ def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None:

device_stats.update(get_cpu_stats())

if not self._verbose:
device_stats = self._filter_core_device_stats(device_stats)

for logger in trainer.loggers:
separator = logger.group_separator
prefixed_device_stats = _prefix_metric_keys(device_stats, f"{self.__class__.__qualname__}.{key}", separator)
Expand Down
62 changes: 62 additions & 0 deletions tests/tests_pytorch/callbacks/test_device_stats_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,65 @@ def test_device_stats_monitor_logs_for_different_stages(tmp_path):
test = any(test_stage_results)

assert test, "testing stage logs not found"


@RunIf(psutil=True)
@pytest.mark.parametrize("verbose", [True, False])
def test_device_stats_monitor_verbose_cpu(tmp_path, verbose):
    """Test that verbose=False logs only core CPU stats, verbose=True logs all."""
    model = BoringModel()

    class AssertVerboseLogger(CSVLogger):
        def log_metrics(self, metrics: dict[str, float], step: Optional[int] = None) -> None:
            # Swap usage is a non-core stat, so it must appear only in verbose mode.
            swap_logged = any("cpu_swap_percent" in name for name in metrics)
            if verbose:
                assert swap_logged
            else:
                assert not swap_logged
            # Core CPU stats are expected regardless of verbosity.
            assert any("cpu_percent" in name for name in metrics)
            assert any("cpu_vm_percent" in name for name in metrics)

    monitor = DeviceStatsMonitor(verbose=verbose)
    trainer = Trainer(
        accelerator="cpu",
        default_root_dir=tmp_path,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=0,
        log_every_n_steps=1,
        callbacks=[monitor],
        logger=AssertVerboseLogger(tmp_path),
        enable_checkpointing=False,
        enable_progress_bar=False,
    )
    trainer.fit(model)


@RunIf(min_cuda_gpus=1)
@pytest.mark.parametrize("verbose", [True, False])
def test_device_stats_monitor_verbose_gpu(tmp_path, verbose):
    """Test that verbose=False logs only core CUDA stats, verbose=True logs all."""
    model = BoringModel()

    class AssertVerboseLogger(CSVLogger):
        @rank_zero_only
        def log_metrics(self, metrics: dict[str, float], step: Optional[int] = None) -> None:
            # "freed" counters are non-core, so they must appear only in verbose mode.
            non_core_logged = any("allocated_bytes.all.freed" in name for name in metrics)
            if verbose:
                assert non_core_logged
            else:
                assert not non_core_logged
            # Core CUDA memory stats are expected regardless of verbosity.
            assert any("allocated_bytes.all.current" in name for name in metrics)
            assert any("reserved_bytes.all.current" in name for name in metrics)
            assert any("num_ooms" in name for name in metrics)

    monitor = DeviceStatsMonitor(verbose=verbose)
    trainer = Trainer(
        accelerator="gpu",
        devices=1,
        default_root_dir=tmp_path,
        max_epochs=1,
        limit_train_batches=2,
        log_every_n_steps=1,
        callbacks=[monitor],
        logger=AssertVerboseLogger(tmp_path),
        enable_checkpointing=False,
        enable_progress_bar=False,
    )
    trainer.fit(model)
Loading