Skip to content

Commit 231b80f

Browse files
authored
Merge pull request #134 from JCasaraconn/fix-resume-with-dataparallel
fix resume from checkpoint when using multiple GPUs
2 parents 71bf82b + a3c5197 commit 231b80f

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

pytorch3dunet/unet3d/trainer.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,6 @@ def create_trainer(config: dict) -> "UNetTrainer":
3636
device = config.get("device", None)
3737
assert device, "Device not specified in the config file and could not be inferred automatically"
3838
logger.info(f"Using device: {device}")
39-
40-
# use DataParallel if more than 1 GPU available
41-
if device == TorchDevice.CUDA and torch.cuda.device_count() > 1:
42-
model = nn.DataParallel(model)
43-
logger.info(f"Using {torch.cuda.device_count()} GPUs for training")
4439
model.to(device)
4540

4641
# Log the number of learnable parameters
@@ -204,6 +199,11 @@ def __init__(
204199
if not self.checkpoint_dir:
205200
self.checkpoint_dir = os.path.split(pre_trained)[0]
206201

202+
# use DataParallel if more than 1 GPU available
203+
if device == TorchDevice.CUDA and torch.cuda.device_count() > 1:
204+
self.model = nn.DataParallel(self.model)
205+
logger.info(f"Using {torch.cuda.device_count()} GPUs for training")
206+
207207
def fit(self):
208208
for _ in range(self.num_epochs, self.max_num_epochs):
209209
# train for one epoch

0 commit comments

Comments (0)