# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Device Stats Monitor
====================
Monitors and logs device stats during training.
"""
from typing import Any, Optional
from typing_extensions import override
import lightning.pytorch as pl
from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE
from lightning.pytorch.callbacks.callback import Callback
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.types import STEP_OUTPUT
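# Core subset of device statistics kept when ``DeviceStatsMonitor(verbose=False)`` is used:
# exact key matches for CPU and CUDA memory/utilization stats, plus prefix matches for the
# per-device TPU memory keys (e.g. "memory.used.xla:0").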
_CORE_DEVICE_STATS_KEYS = frozenset([
    # CPU
    "cpu_percent",
    "cpu_vm_percent",
    # CUDA
    "allocated_bytes.all.current",
    "allocated_bytes.all.peak",
    "reserved_bytes.all.current",
    "reserved_bytes.all.peak",
    "num_ooms",
])

_CORE_TPU_STATS_PREFIXES = frozenset([
    "memory.free.",
    "memory.used.",
    "memory.percent.",
])

class DeviceStatsMonitor(Callback):
    r"""Automatically monitors and logs device stats during the training, validation and testing stages.

    ``DeviceStatsMonitor`` is a special callback: it requires a ``logger`` to be passed as an argument to the
    ``Trainer``.

    **Logged Metrics**

    Device statistics are logged with keys prefixed as ``DeviceStatsMonitor.{hook_name}/{base_metric_name}``.
    The actual metrics depend on the active accelerator and the ``cpu_stats`` flag. Below is an overview of the
    metrics that may be available and their meaning.

    - CPU (via ``psutil``)

      - ``cpu_percent`` — System-wide CPU utilization (%)
      - ``cpu_vm_percent`` — System-wide virtual memory (RAM) utilization (%)
      - ``cpu_swap_percent`` — System-wide swap memory utilization (%)

    - CUDA GPU (via ``torch.cuda.memory_stats``)

      Logs memory statistics from the PyTorch caching allocator (all in bytes).
      GPU compute utilization is not logged by default.

      - General Memory Usage:

        - ``allocated_bytes.all.current`` — Current allocated GPU memory
        - ``allocated_bytes.all.peak`` — Peak allocated GPU memory
        - ``reserved_bytes.all.current`` — Current reserved GPU memory (allocated + cached)
        - ``reserved_bytes.all.peak`` — Peak reserved GPU memory
        - ``active_bytes.all.current`` — Current GPU memory in active use
        - ``active_bytes.all.peak`` — Peak GPU memory in active use
        - ``inactive_split_bytes.all.current`` — Memory in inactive, splittable blocks

      - *Allocator Pool Statistics* (for ``small_pool`` and ``large_pool``):

        - ``allocated_bytes.{pool_type}.current`` / ``allocated_bytes.{pool_type}.peak``
        - ``reserved_bytes.{pool_type}.current`` / ``reserved_bytes.{pool_type}.peak``
        - ``active_bytes.{pool_type}.current`` / ``active_bytes.{pool_type}.peak``

      - Allocator Events:

        - ``num_ooms`` — Cumulative out-of-memory errors
        - ``num_alloc_retries`` — Number of allocation retries
        - ``num_device_alloc`` — Number of device allocations
        - ``num_device_free`` — Number of device deallocations

      For a full list of CUDA memory stats, see the
      `PyTorch documentation <https://docs.pytorch.org/docs/stable//generated/torch.cuda.device_memory_used.html>`_.

    - TPU (via ``torch_xla``)

      - *Memory Metrics* (per device, e.g., ``xla:0``):

        - ``memory.free.xla:0`` — Free HBM memory (MB)
        - ``memory.used.xla:0`` — Used HBM memory (MB)
        - ``memory.percent.xla:0`` — Percentage of HBM memory used (%)

      - *XLA Operation Counters*:

        - ``CachedCompile.xla``
        - ``CreateXlaTensor.xla``
        - ``DeviceDataCacheMiss.xla``
        - ``UncachedCompile.xla``
        - ``xla::add.xla``, ``xla::addmm.xla``, etc.

        These counters can be retrieved using ``torch_xla.debug.metrics.counter_names()``.

    Args:
        cpu_stats: if ``None``, it will log CPU stats only if the accelerator is CPU.
            If ``True``, it will log CPU stats regardless of the accelerator.
            If ``False``, it will not log CPU stats regardless of the accelerator.
        verbose: if ``True``, logs all available device stats returned by the accelerator.
            If ``False``, logs only a core set of metrics (memory usage, CPU utilization)
            that are most relevant for monitoring training health. Defaults to ``True``.

    Raises:
        MisconfigurationException:
            If ``Trainer`` has no logger.
        ModuleNotFoundError:
            If ``psutil`` is not installed and CPU stats are monitored.

    Example::

        from lightning import Trainer
        from lightning.pytorch.callbacks import DeviceStatsMonitor

        device_stats = DeviceStatsMonitor()
        trainer = Trainer(callbacks=[device_stats])

    """

    def __init__(self, cpu_stats: Optional[bool] = None, verbose: bool = True) -> None:
        self._cpu_stats = cpu_stats
        self._verbose = verbose

    @override
    def setup(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        stage: str,
    ) -> None:
        if stage != "fit":
            return

        if not trainer.loggers:
            raise MisconfigurationException("Cannot use `DeviceStatsMonitor` callback with `Trainer(logger=False)`.")

        # warn in setup to warn once
        device = trainer.strategy.root_device
        if self._cpu_stats is None and device.type == "cpu" and not _PSUTIL_AVAILABLE:
            raise ModuleNotFoundError(
                f"`DeviceStatsMonitor` cannot log CPU stats as `psutil` is not installed. {str(_PSUTIL_AVAILABLE)} "
            )

    @staticmethod
    def _filter_core_device_stats(stats: dict[str, float]) -> dict[str, float]:
        return {
            k: v
            for k, v in stats.items()
            if k in _CORE_DEVICE_STATS_KEYS or any(k.startswith(prefix) for prefix in _CORE_TPU_STATS_PREFIXES)
        }
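
    # Illustrative example of the filtering above (assumed values): with ``verbose=False``, a stats
    # dict such as {"allocated_bytes.all.current": 1024.0, "num_alloc_retries": 0.0} is reduced to
    # {"allocated_bytes.all.current": 1024.0}, since only keys in _CORE_DEVICE_STATS_KEYS or keys
    # matching a _CORE_TPU_STATS_PREFIXES prefix are kept.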
    def _get_and_log_device_stats(self, trainer: "pl.Trainer", key: str) -> None:
        if not trainer._logger_connector.should_update_logs:
            return

        device = trainer.strategy.root_device
        if self._cpu_stats is False and device.type == "cpu":
            # cpu stats are disabled
            return

        device_stats = trainer.accelerator.get_device_stats(device)

        if self._cpu_stats and device.type != "cpu":
            # Don't query CPU stats twice if CPU is accelerator
            from lightning.pytorch.accelerators.cpu import get_cpu_stats

            device_stats.update(get_cpu_stats())

        if not self._verbose:
            device_stats = self._filter_core_device_stats(device_stats)

        for logger in trainer.loggers:
            separator = logger.group_separator
            prefixed_device_stats = _prefix_metric_keys(device_stats, f"{self.__class__.__qualname__}.{key}", separator)
            logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped)
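
    # Resulting metric keys take the form "DeviceStatsMonitor.{hook_name}{separator}{stat_name}",
    # e.g. "DeviceStatsMonitor.on_train_batch_start/cpu_percent" with the common "/" group separator.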
    @override
    def on_train_batch_start(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int
    ) -> None:
        self._get_and_log_device_stats(trainer, "on_train_batch_start")

    @override
    def on_train_batch_end(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", outputs: STEP_OUTPUT, batch: Any, batch_idx: int
    ) -> None:
        self._get_and_log_device_stats(trainer, "on_train_batch_end")

    @override
    def on_validation_batch_start(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        self._get_and_log_device_stats(trainer, "on_validation_batch_start")

    @override
    def on_validation_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: STEP_OUTPUT,
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        self._get_and_log_device_stats(trainer, "on_validation_batch_end")

    @override
    def on_test_batch_start(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        self._get_and_log_device_stats(trainer, "on_test_batch_start")

    @override
    def on_test_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: STEP_OUTPUT,
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        self._get_and_log_device_stats(trainer, "on_test_batch_end")

def _prefix_metric_keys(metrics_dict: dict[str, float], prefix: str, separator: str) -> dict[str, float]:
    return {prefix + separator + k: v for k, v in metrics_dict.items()}
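

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the library API): attach the callback to a
    # Trainer together with a logger. ``CSVLogger`` and the settings below are arbitrary example
    # choices; any Trainer with a logger works the same way.
    from lightning.pytorch import Trainer
    from lightning.pytorch.loggers import CSVLogger

    trainer = Trainer(
        logger=CSVLogger("example_logs"),
        callbacks=[DeviceStatsMonitor(cpu_stats=True, verbose=False)],
        max_epochs=1,
    )
    # trainer.fit(model) would then log device stats at each train/validation/test batch hook,
    # where ``model`` is any LightningModule.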