Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion src/lightning/pytorch/callbacks/device_stats_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,24 @@
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.types import STEP_OUTPUT

_CORE_DEVICE_STATS_KEYS = frozenset([
# CPU
"cpu_percent",
"cpu_vm_percent",
# CUDA
"allocated_bytes.all.current",
"allocated_bytes.all.peak",
"reserved_bytes.all.current",
"reserved_bytes.all.peak",
"num_ooms",
])

_CORE_TPU_STATS_PREFIXES = frozenset([
"memory.free.",
"memory.used.",
"memory.percent.",
])


class DeviceStatsMonitor(Callback):
r"""Automatically monitors and logs device stats during training, validation and testing stage.
Expand Down Expand Up @@ -99,6 +117,9 @@ class DeviceStatsMonitor(Callback):
cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU.
If ``True``, it will log CPU stats regardless of the accelerator.
If ``False``, it will not log CPU stats regardless of the accelerator.
verbose: if ``True``, logs all available device stats returned by the accelerator.
If ``False``, logs only a core set of metrics (memory usage, CPU utilization)
that are most relevant for monitoring training health. Defaults to ``True``.

Raises:
MisconfigurationException:
Expand All @@ -115,8 +136,9 @@ class DeviceStatsMonitor(Callback):

"""

def __init__(self, cpu_stats: Optional[bool] = None) -> None:
def __init__(self, cpu_stats: Optional[bool] = None, verbose: bool = True) -> None:
self._cpu_stats = cpu_stats
self._verbose = verbose

@override
def setup(
Expand All @@ -138,6 +160,14 @@ def setup(
f"`DeviceStatsMonitor` cannot log CPU stats as `psutil` is not installed. {str(_PSUTIL_AVAILABLE)} "
)

@staticmethod
def _filter_core_device_stats(stats: dict[str, float]) -> dict[str, float]:
    """Return only the core device-stat entries from ``stats``.

    A key is kept when it is an exact member of ``_CORE_DEVICE_STATS_KEYS``
    (CPU/CUDA stats) or starts with one of ``_CORE_TPU_STATS_PREFIXES``
    (per-device TPU memory stats).
    """

    def _is_core(name: str) -> bool:
        if name in _CORE_DEVICE_STATS_KEYS:
            return True
        return any(name.startswith(prefix) for prefix in _CORE_TPU_STATS_PREFIXES)

    return {name: value for name, value in stats.items() if _is_core(name)}
Comment thread
deependujha marked this conversation as resolved.

def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None:
if not trainer._logger_connector.should_update_logs:
return
Expand All @@ -155,6 +185,9 @@ def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None:

device_stats.update(get_cpu_stats())

if not self._verbose:
device_stats = self._filter_core_device_stats(device_stats)

for logger in trainer.loggers:
separator = logger.group_separator
prefixed_device_stats = _prefix_metric_keys(device_stats, f"{self.__class__.__qualname__}.{key}", separator)
Expand Down
62 changes: 62 additions & 0 deletions tests/tests_pytorch/callbacks/test_device_stats_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,65 @@ def test_device_stats_monitor_logs_for_different_stages(tmp_path):
test = any(test_stage_results)

assert test, "testing stage logs not found"


@RunIf(psutil=True)
@pytest.mark.parametrize("verbose", [True, False])
def test_device_stats_monitor_verbose_cpu(tmp_path, verbose):
    """Test that verbose=False logs only core CPU stats, verbose=True logs all."""
    model = BoringModel()

    class AssertVerboseLogger(CSVLogger):
        def log_metrics(self, metrics: dict[str, float], step: Optional[int] = None) -> None:
            # Swap usage is a non-core stat, so it must appear only in verbose mode.
            swap_logged = any("cpu_swap_percent" in name for name in metrics)
            if verbose:
                assert swap_logged
            else:
                assert not swap_logged
            # Core CPU stats are expected regardless of verbosity.
            assert any("cpu_percent" in name for name in metrics)
            assert any("cpu_vm_percent" in name for name in metrics)

    monitor = DeviceStatsMonitor(verbose=verbose)
    trainer = Trainer(
        accelerator="cpu",
        default_root_dir=tmp_path,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=0,
        log_every_n_steps=1,
        callbacks=[monitor],
        logger=AssertVerboseLogger(tmp_path),
        enable_checkpointing=False,
        enable_progress_bar=False,
    )
    trainer.fit(model)


@RunIf(min_cuda_gpus=1)
@pytest.mark.parametrize("verbose", [True, False])
def test_device_stats_monitor_verbose_gpu(tmp_path, verbose):
    """Test that verbose=False logs only core CUDA stats, verbose=True logs all."""
    model = BoringModel()

    class AssertVerboseLogger(CSVLogger):
        @rank_zero_only
        def log_metrics(self, metrics: dict[str, float], step: Optional[int] = None) -> None:
            # "freed" counters are non-core, so they must appear only in verbose mode.
            non_core_logged = any("allocated_bytes.all.freed" in name for name in metrics)
            if verbose:
                assert non_core_logged
            else:
                assert not non_core_logged
            # Core CUDA memory stats are expected regardless of verbosity.
            assert any("allocated_bytes.all.current" in name for name in metrics)
            assert any("reserved_bytes.all.current" in name for name in metrics)
            assert any("num_ooms" in name for name in metrics)

    monitor = DeviceStatsMonitor(verbose=verbose)
    trainer = Trainer(
        accelerator="gpu",
        devices=1,
        default_root_dir=tmp_path,
        max_epochs=1,
        limit_train_batches=2,
        log_every_n_steps=1,
        callbacks=[monitor],
        logger=AssertVerboseLogger(tmp_path),
        enable_checkpointing=False,
        enable_progress_bar=False,
    )
    trainer.fit(model)
Loading