@@ -22,18 +22,18 @@ def get_memory_mb():
 except BaseException:
     get_memory_mb = None
 
-try:
-    import torch
+import torch
 
+try:
     if torch.cuda.is_available():
         from utils.conf import get_alloc_memory_all_devices
 
-        def get_memory_gpu_mb():
+        def get_memory_gpu_mb(avail_devices=None):
             """
-            Get the memory usage of all GPUs in MB.
+            Get the memory usage of the selected GPUs in MB.
             """
 
-            return [d / 1024 / 1024 for d in get_alloc_memory_all_devices()]
+            return [d / 1024 / 1024 for d in get_alloc_memory_all_devices(avail_devices=avail_devices)]
     else:
         get_memory_gpu_mb = None
 except BaseException:
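As an illustration (outside the patch), the updated helper can now be asked for a subset of devices. This sketch assumes `get_alloc_memory_all_devices` reports allocated bytes per requested device, as suggested by the call above:

import torch

if torch.cuda.is_available():
    all_mb = get_memory_gpu_mb()                   # every visible device
    only_0 = get_memory_gpu_mb(avail_devices=[0])  # just cuda:0
    print(f"GPU 0 allocation: {only_0[0]:.2f} MB")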
@@ -43,6 +43,54 @@ def get_memory_gpu_mb():
 from utils.loggers import Logger
 
 
+def _parse_device_ids(device):
+    """
+    Normalize a device specification to a list of CUDA ids.
+    """
+    if device is None:
+        return None
+
+    if isinstance(device, torch.device):
+        if device.type != 'cuda':
+            return None
+        if device.index is None:
+            return list(range(torch.cuda.device_count()))
+        if 0 <= device.index < torch.cuda.device_count():
+            return [device.index]
+        logging.warning(f"Requested device index {device.index} is out of range.")
+        return None
+
+    if isinstance(device, str):
+        if 'cuda' not in device:
+            return None
+        parts = [p for p in device.split(',') if p.strip() != '']
+        if len(parts) == 0:
+            return list(range(torch.cuda.device_count()))
+        ids = []
+        for p in parts:
+            try:
+                ids.append(int(p.split(':')[-1]))
+            except ValueError:
+                logging.warning(f"Could not parse device id from `{p}`, skipping.")
+        ids = [i for i in ids if 0 <= i < torch.cuda.device_count()]
+        if len(ids) == 0:
+            logging.warning("No valid CUDA device ids parsed, falling back to all visible devices.")
+            return list(range(torch.cuda.device_count()))
+        return ids
+
+    if isinstance(device, (list, tuple)):
+        ids = []
+        for d in device:
+            if isinstance(d, int):
+                ids.append(d)
+            elif isinstance(d, torch.device) and d.type == 'cuda' and d.index is not None:
+                ids.append(d.index)
+        ids = [i for i in ids if 0 <= i < torch.cuda.device_count()]
+        return ids or None
+
+    return None
+
+
 class track_system_stats:
     """
     A context manager that tracks the memory usage of the system.
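To make the normalization rules concrete, here is a hedged sketch of the values `_parse_device_ids` would return; the exact output depends on `torch.cuda.device_count()` (two visible GPUs are assumed here):

_parse_device_ids(None)                         # None -> defer to defaults
_parse_device_ids('cpu')                        # None -> nothing to monitor
_parse_device_ids('cuda')                       # all visible ids (after a parse warning)
_parse_device_ids('cuda:1')                     # [1]
_parse_device_ids('cuda:0,cuda:1')              # [0, 1]
_parse_device_ids(torch.device('cuda', 0))      # [0]
_parse_device_ids([0, torch.device('cuda:1')])  # [0, 1]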
@@ -59,9 +107,10 @@ class track_system_stats:
 
         cpu_res, gpu_res = t.cpu_res, t.gpu_res
 
-    Args:
-        logger (Logger): external logger.
-        disabled (bool): If True, the context manager will not track the memory usage.
+    Args:
+        logger (Logger): external logger.
+        device: Device (or list of devices) to monitor. Defaults to all visible CUDA devices.
+        disabled (bool): If True, the context manager will not track the memory usage.
     """
 
     def get_stats(self):
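A usage sketch of the new argument, mirroring the docstring example above (the sampled values are hypothetical):

with track_system_stats(device='cuda:0') as t:
    for i in range(100):
        ...   # training step
        t()   # sample memory usage

cpu_res, gpu_res = t.cpu_res, t.gpu_res  # gpu_res is now a dict, e.g. {0: 812.5}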
@@ -77,14 +126,16 @@ def get_stats(self):
 
         gpu_res = None
         if get_memory_gpu_mb is not None:
-            gpu_res = get_memory_gpu_mb()
+            gpu_res = get_memory_gpu_mb(self.gpu_ids)
+            gpu_res = self._zip_gpu_res(gpu_res)
 
         return cpu_res, gpu_res
 
-    def __init__(self, logger: Logger = None, disabled=False):
+    def __init__(self, logger: Logger = None, device=None, disabled=False):
         self.logger = logger
         self.disabled = disabled
         self._it = 0
+        self.gpu_ids = _parse_device_ids(device) if torch.cuda.is_available() else None
 
     def __enter__(self):
         if self.disabled:
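For reference, a hypothetical return shape of `get_stats` after this change, assuming two monitored GPUs:

cpu_res, gpu_res = tracker.get_stats()
# cpu_res -> 1523.42             (process memory in MB, or None if unavailable)
# gpu_res -> {0: 812.5, 1: 64.0} (MB keyed by device id, or None without CUDA)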
@@ -93,9 +144,6 @@ def __enter__(self):
         if self.initial_cpu_res is None and self.initial_gpu_res is None:
             self.disabled = True
         else:
-            if self.initial_gpu_res is not None:
-                self.initial_gpu_res = {g: g_res for g, g_res in enumerate(self.initial_gpu_res)}
-
             self.avg_gpu_res = self.initial_gpu_res
             self.avg_cpu_res = self.initial_cpu_res
 
@@ -130,7 +178,7 @@ def update_stats(self, cpu_res, gpu_res):
 
         Args:
             cpu_res (float): The memory usage of the CPU.
-            gpu_res (list): The memory usage of the GPUs.
+            gpu_res (dict): The memory usage of the GPUs, keyed by device id.
         """
         if self.disabled:
             return
@@ -143,9 +191,8 @@ def update_stats(self, cpu_res, gpu_res):
         self.max_cpu_res = max(self.max_cpu_res, cpu_res)
 
         if self.initial_gpu_res is not None:
-            self.avg_gpu_res = {g: (g_res + alpha * (g_res - self.avg_gpu_res[g])) for g, g_res in enumerate(gpu_res)}
-            self.max_gpu_res = {g: max(self.max_gpu_res[g], g_res) for g, g_res in enumerate(gpu_res)}
-            gpu_res = {g: g_res for g, g_res in enumerate(gpu_res)}
+            self.avg_gpu_res = {g: self.avg_gpu_res[g] + alpha * (g_res - self.avg_gpu_res[g]) for g, g_res in gpu_res.items()}
+            self.max_gpu_res = {g: max(self.max_gpu_res[g], g_res) for g, g_res in gpu_res.items()}
 
         if self.logger is not None:
             self.logger.log_system_stats(cpu_res, gpu_res)
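The update above is the standard incremental mean, avg_t = avg_{t-1} + alpha * (x_t - avg_{t-1}). A tiny sketch, assuming alpha is a 1/t step size (the step size itself is set outside this hunk):

samples = [100.0, 200.0, 300.0]
avg = samples[0]
for t, x in enumerate(samples[1:], start=2):
    alpha = 1 / t  # assumed 1/t step size
    avg = avg + alpha * (x - avg)
print(avg)  # 200.0 == mean(samples)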
@@ -166,8 +213,21 @@ def print_stats(self):
         logging.info(f"\tMax CPU memory usage: {self.max_cpu_res:.2f} MB")
 
         if gpu_res is not None:
-            for gpu_id, g_res in enumerate(gpu_res):
+            for gpu_id, g_res in gpu_res.items():
                 logging.info(f"\tInitial GPU {gpu_id} memory usage: {self.initial_gpu_res[gpu_id]:.2f} MB")
                 logging.info(f"\tAverage GPU {gpu_id} memory usage: {self.avg_gpu_res[gpu_id]:.2f} MB")
                 logging.info(f"\tFinal GPU {gpu_id} memory usage: {g_res:.2f} MB")
                 logging.info(f"\tMax GPU {gpu_id} memory usage: {self.max_gpu_res[gpu_id]:.2f} MB")
+
+    def _zip_gpu_res(self, gpu_res):
+        """
+        Zip a list of GPU stats to a dict keyed by the selected GPU ids.
+        """
+        if gpu_res is None:
+            return None
+
+        keys = self.gpu_ids if self.gpu_ids is not None else list(range(len(gpu_res)))
+        if len(keys) != len(gpu_res):
+            logging.warning("Mismatch between provided GPU ids and measured GPUs. Falling back to enumeration.")
+            keys = list(range(len(gpu_res)))
+        return {g: g_res for g, g_res in zip(keys, gpu_res)}
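Finally, a hedged illustration of `_zip_gpu_res`, assuming the measured list is ordered like the requested ids:

# With self.gpu_ids == [0, 2] and measurements [512.0, 1024.0] MB:
#   self._zip_gpu_res([512.0, 1024.0]) -> {0: 512.0, 2: 1024.0}
# On a length mismatch it warns and enumerates instead:
#   self.gpu_ids == [0, 2], gpu_res == [512.0] -> {0: 512.0}
# And None passes through:
#   self._zip_gpu_res(None) -> None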