16 changes: 14 additions & 2 deletions rslearn/train/data_module.py
@@ -79,7 +79,9 @@ def __init__(
task: the task to train on
path: the dataset path
path_options: additional options for path to pass to fsspec.
batch_size: the batch size
batch_size: the total batch size across all GPUs. In multi-GPU
training, this is divided by world_size to get the per-GPU
batch size.

Collaborator review comment:
Currently we often set batch_size based on available GPU memory. I don't think the existing option should be changed in behavior; if desired you could deprecate the existing one and add per_gpu_batch_size and global_batch_size options to replace it, and then it should raise an error if neither is set or if both are set.
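For illustration, a minimal sketch of the option handling this comment suggests, assuming hypothetical per_gpu_batch_size and global_batch_size arguments (the names come from the comment; the validation logic below is an assumption, not code from this PR):

def resolve_per_gpu_batch_size(
    per_gpu_batch_size: int | None,
    global_batch_size: int | None,
    world_size: int,
) -> int:
    """Return the per-GPU batch size, requiring exactly one option to be set."""
    if (per_gpu_batch_size is None) == (global_batch_size is None):
        # Raise if neither or both options are set, as the comment suggests.
        raise ValueError(
            "exactly one of per_gpu_batch_size and global_batch_size must be set"
        )
    if per_gpu_batch_size is not None:
        return per_gpu_batch_size
    if global_batch_size % world_size != 0:
        raise ValueError(
            f"global_batch_size ({global_batch_size}) must be divisible by "
            f"world_size ({world_size})"
        )
    return global_batch_size // world_size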
num_workers: number of data loader worker processes, or 0 to use main
process only
init_workers: number of workers used to initialize the dataset, e.g. for
@@ -215,9 +217,19 @@ def _get_dataloader(
):
num_workers = min(num_workers, len(dataset.get_dataset_examples()))

# Compute per-GPU batch size from total batch size.
per_gpu_batch_size = self.batch_size
if self.trainer is not None and self.trainer.world_size > 1:
if self.batch_size % self.trainer.world_size != 0:
raise ValueError(
f"batch_size ({self.batch_size}) must be divisible by "
f"world_size ({self.trainer.world_size})"
)
per_gpu_batch_size = self.batch_size // self.trainer.world_size

kwargs: dict[str, Any] = dict(
dataset=dataset,
batch_size=self.batch_size,
batch_size=per_gpu_batch_size,
num_workers=num_workers,
collate_fn=collate_fn,
persistent_workers=persistent_workers,
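To illustrate the new behavior, a standalone sketch that mirrors the divisibility check above (not the module's code; shown only for the arithmetic):

def split_batch_size(batch_size: int, world_size: int) -> int:
    # Mirrors the check added in _get_dataloader: the total batch size must
    # divide evenly across processes.
    if world_size > 1 and batch_size % world_size != 0:
        raise ValueError(
            f"batch_size ({batch_size}) must be divisible by world_size ({world_size})"
        )
    return batch_size // world_size if world_size > 1 else batch_size

assert split_batch_size(32, 1) == 32  # single process: unchanged
assert split_batch_size(32, 4) == 8   # 4 GPUs: 8 examples per GPU, 32 total per step
# split_batch_size(32, 5) would raise ValueError, since 32 is not divisible by 5.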
7 changes: 5 additions & 2 deletions rslearn/train/lightning_module.py
@@ -209,11 +209,14 @@ def configure_optimizers(self) -> OptimizerLRSchedulerConfig:
return d

def on_train_epoch_start(self) -> None:
"""If we are in a multi-dataset distributed strategy, set the epoch."""
"""Set the epoch on the distributed sampler so shuffling varies each epoch."""
try:
self.trainer.train_dataloader.batch_sampler.set_epoch(self.current_epoch)
except AttributeError:
# Fail silently for single-dataset case, which is okay
pass
try:
self.trainer.train_dataloader.sampler.set_epoch(self.current_epoch)
except AttributeError:
pass

def _log_non_scalar_metric(self, name: str, value: NonScalarMetricOutput) -> None:
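For context on why set_epoch is now attempted on both the batch sampler and the sampler: PyTorch's DistributedSampler reuses the same shuffled order every epoch unless set_epoch is called before iterating. A minimal sketch of that standard pattern (not code from this PR):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(100))
# rank/num_replicas are passed explicitly so this runs without init_process_group.
sampler = DistributedSampler(dataset, num_replicas=4, rank=0, shuffle=True)
loader = DataLoader(dataset, batch_size=8, sampler=sampler)

for epoch in range(3):
    # Without this call, every epoch sees the same shuffled order.
    sampler.set_epoch(epoch)
    for (batch,) in loader:
        pass  # training step would go here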
24 changes: 24 additions & 0 deletions rslearn/train/tasks/segmentation.py
@@ -313,6 +313,7 @@ def __init__(
weights: list[float] | None = None,
dice_loss: bool = False,
temperature: float = 1.0,
smooth_sigma: float = 0.0,
):
"""Initialize a new SegmentationTask.

@@ -321,6 +322,9 @@ def __init__(
dice_loss: whether to add dice loss to cross entropy
temperature: temperature scaling for softmax, does not affect the loss,
only the predictor outputs
smooth_sigma: if > 0, apply a fixed Gaussian blur to logits before
computing loss and outputs. The filter is non-learned but
differentiable, so gradients flow through it to the model.
"""
super().__init__()
if weights is not None:
@@ -329,6 +333,22 @@ def __init__(
self.weights = None
self.dice_loss = dice_loss
self.temperature = temperature
self.smooth_sigma = smooth_sigma

def _gaussian_smooth(self, logits: torch.Tensor) -> torch.Tensor:
"""Apply depthwise Gaussian blur to logits. Differentiable, no learned params."""
sigma = self.smooth_sigma
radius = int(3 * sigma + 0.5)
size = 2 * radius + 1
x = torch.arange(size, device=logits.device, dtype=logits.dtype) - radius
g1d = torch.exp(-(x**2) / (2 * sigma**2))
g2d = g1d[:, None] * g1d[None, :]
g2d = g2d / g2d.sum()
channels = logits.shape[1]
kernel = g2d.unsqueeze(0).unsqueeze(0).expand(channels, 1, size, size)
# Use replicate padding to avoid border artifacts from zero padding
padded = torch.nn.functional.pad(logits, [radius] * 4, mode="replicate")
return torch.nn.functional.conv2d(padded, kernel, groups=channels)

def forward(
self,
@@ -357,6 +377,10 @@ def forward(
)

logits = intermediates.feature_maps[0]

if self.smooth_sigma > 0:
logits = self._gaussian_smooth(logits)

outputs = torch.nn.functional.softmax(logits / self.temperature, dim=1)

losses = {}
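A self-contained sketch of the same depthwise Gaussian smoothing idea, handy for checking shapes and differentiability outside the task class (it mirrors _gaussian_smooth but is not the module's code; the [batch, classes, H, W] logits shape is assumed):

import torch
import torch.nn.functional as F

def gaussian_smooth(logits: torch.Tensor, sigma: float) -> torch.Tensor:
    """Depthwise Gaussian blur over [B, C, H, W] logits; differentiable, no learned params."""
    radius = int(3 * sigma + 0.5)
    size = 2 * radius + 1
    x = torch.arange(size, device=logits.device, dtype=logits.dtype) - radius
    g1d = torch.exp(-(x**2) / (2 * sigma**2))
    g2d = g1d[:, None] * g1d[None, :]
    g2d = g2d / g2d.sum()  # normalize so the kernel sums to 1
    channels = logits.shape[1]
    kernel = g2d.unsqueeze(0).unsqueeze(0).expand(channels, 1, size, size)
    # Replicate padding, as in the PR, avoids pulling border pixels toward zero.
    padded = F.pad(logits, [radius] * 4, mode="replicate")
    return F.conv2d(padded, kernel, groups=channels)

logits = torch.randn(2, 5, 64, 64, requires_grad=True)
smoothed = gaussian_smooth(logits, sigma=1.5)
assert smoothed.shape == logits.shape  # padding keeps H and W unchanged
smoothed.sum().backward()              # gradients flow back through the fixed filter
assert logits.grad is not None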