Fix distributed geometric instance norm: normalize local statistics before Welford reduction, add test

azrael417 · azrael417 · commit 5320369a36f0 · 2025-10-14T02:43:36.000-07:00
diff --git a/makani/models/common/layer_norm.py b/makani/models/common/layer_norm.py
@@ -58,17 +58,23 @@ def __init__(
 
         # we only need the weights
         self.quadrature = GridQuadrature(
-            quadrature_rule, img_shape=img_shape, crop_shape=crop_shape, crop_offset=crop_offset, normalize=True, pole_mask=pole_mask, distributed=False
+            quadrature_rule, 
+            img_shape=img_shape, 
+            crop_shape=crop_shape, 
+            crop_offset=crop_offset, 
+            normalize=True, 
+            pole_mask=pole_mask, 
+            distributed=False
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         # extract shapes
         B, C, H, W = x.shape
 
+        xtype = x.dtype
         with amp.autocast(device_type="cuda", enabled=False):
-            dtype = x.dtype
-            x = x.float()
+            x = x.to(torch.float32)
 
             # compute var and mean
             mean = self.quadrature(x)
@@ -79,9 +85,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         mean = mean.reshape(B, C, 1, 1)
 
         # convert types
-        x = x.to(dtype)
-        mean = mean.to(dtype)
-        var = var.to(dtype)
+        x = x.to(xtype)
+        mean = mean.to(xtype)
+        var = var.to(xtype)
 
         # apply the normalization
         if self.affine:
diff --git a/makani/mpu/layer_norm.py b/makani/mpu/layer_norm.py
@@ -66,11 +66,8 @@ def _welford_kernel(vars: torch.Tensor, means: torch.Tensor, counts: torch.Tenso
     # use Welford's algorithm to accumulate them into a single mean and variance
     for i in range(1, means.shape[0]):
         delta = means[i, ...] - mean
+        mean = mean + delta * counts[i, ...] / (count + counts[i, ...])
         m2 = m2 + m2s[i, ...] + delta**2 * count * counts[i, ...] / (count + counts[i, ...])
-        if i == 1:
-            mean = (mean * count + means[i, ...] * counts[i, ...]) / (count + counts[i, ...])
-        else:
-            mean = mean + delta * counts[i, ...] / (count + counts[i, ...])
 
         # update the current count
         count = count + counts[i, ...]
@@ -122,7 +119,7 @@ def _stats_welford(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Computes the statistics locally, then uses the Welford online algorithm to reduce them"""
 
         # extract shapes
-        B, C, H, W = x.shape
+        B, C, _, _ = x.shape
 
         # those have the shapes [B, C]
         var, mean = torch.var_mean(x, dim=(-2, -1), unbiased=False, keepdim=False)
@@ -141,9 +138,9 @@ def _stats_welford(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
 
+        xtype = x.dtype
         with amp.autocast(device_type="cuda", enabled=False):
-            dtype = x.dtype
-            x = x.float()
+            x = x.to(torch.float32)
 
             # start by computing std and mean
             var, mean = self._stats_welford(x)
@@ -152,9 +149,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             mean = copy_to_parallel_region(mean, "spatial")
             var = copy_to_parallel_region(var, "spatial")
 
-        x = x.to(dtype)
-        mean = mean.to(dtype)
-        var = var.to(dtype)
+        x = x.to(xtype)
+        mean = mean.to(xtype)
+        var = var.to(xtype)
 
         # apply the normalization
         if self.affine:
@@ -188,7 +185,13 @@ def __init__(
 
         # we only need the weights
         quad_weight = GridQuadrature(
-            quadrature_rule, img_shape=img_shape, crop_shape=crop_shape, crop_offset=crop_offset, normalize=True, pole_mask=pole_mask, distributed=True
+            quadrature_rule, 
+            img_shape=img_shape, 
+            crop_shape=crop_shape, 
+            crop_offset=crop_offset, 
+            normalize=True,
+            pole_mask=pole_mask, 
+            distributed=True
         ).quad_weight
 
         self.register_buffer("quad_weight", quad_weight, persistent=False)
@@ -197,12 +200,12 @@ def _stats_welford(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Computes the statistics locally, then uses the Welford online algorithm to reduce them"""
 
         # extract shapes
-        B, C, H, W = x.shape
+        B, C, _, _ = x.shape
 
         # compute var, mean locally: those have the shapes [B, C]
-        mean = torch.sum(x * self.quad_weight, dim=(-2, -1), keepdim=False)
-        var = torch.sum(torch.square(x - mean.reshape(B, C, 1, 1)) * self.quad_weight, dim=(-2, -1), keepdim=False)
         count = torch.tile(torch.sum(self.quad_weight, dim=(-2, -1), keepdim=False), (B, C))
+        mean = torch.sum(x * self.quad_weight, dim=(-2, -1), keepdim=False) / count
+        var = torch.sum(torch.square(x - mean.reshape(B, C, 1, 1)) * self.quad_weight, dim=(-2, -1), keepdim=False) / count
 
         # compute welford variance
         var, mean, _ = distributed_welford_variance(var, mean, count, "spatial")
@@ -215,9 +218,9 @@ def _stats_welford(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
 
+        xtype = x.dtype
         with amp.autocast(device_type="cuda", enabled=False):
-            dtype = x.dtype
-            x = x.float()
+            x = x.to(torch.float32)
 
             # start by computing std and mean
             var, mean = self._stats_welford(x)
@@ -226,9 +229,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             mean = copy_to_parallel_region(mean, "spatial")
             var = copy_to_parallel_region(var, "spatial")
 
-        x = x.to(dtype)
-        mean = mean.to(dtype)
-        var = var.to(dtype)
+        x = x.to(xtype)
+        mean = mean.to(xtype)
+        var = var.to(xtype)
 
         # apply the normalization
         if self.affine:
diff --git a/tests/distributed/tests_distributed_layers.py b/tests/distributed/tests_distributed_layers.py
@@ -16,6 +16,8 @@
 
 import os
 import unittest
+
+from torch.nn.modules.container import T
 from parameterized import parameterized
 
 import torch
@@ -33,6 +35,10 @@
 
 from makani.mpu.mappings import init_gradient_reduction_hooks
 
+# layer norm imports
+from makani.models.common.layer_norm import GeometricInstanceNormS2
+from makani.mpu.layer_norm import DistributedGeometricInstanceNormS2
+
 from distributed_helpers import split_helper, gather_helper
 
 class TestDistributedLayers(unittest.TestCase):
@@ -96,14 +102,17 @@ def _gather_helper(self, tensor, hdim=-2, wdim=-1):
         return tensor_gather
 
 
-    @parameterized.expand([
-        [256, 512, 256, 512, 32,  8, 1e-5],
-        [181, 360, 181, 360, 1, 10, 1e-5],
-        [256, 512, 128, 256, 32,  8, 1e-5],
-        [181, 360,  91, 180, 1, 10, 1e-5],
-        [128, 256, 256, 512, 32,  8, 1e-5],
-        [ 91, 180, 181, 360, 1, 10, 1e-5],
-    ])
+    @parameterized.expand(
+        [
+            [180, 360, 256, 512, 32,  8, 1e-5],
+            [181, 360, 181, 360, 1, 10, 1e-5],
+            [180, 360, 128, 256, 32,  8, 1e-5],
+            [181, 360,  91, 180, 1, 10, 1e-5],
+            [128, 256, 256, 512, 32,  8, 1e-5],
+            [ 91, 180, 181, 360, 1, 10, 1e-5],
+        ],
+        skip_on_empty=True,
+    )
     def test_distributed_spectral_conv(self, nlat_in, nlon_in, nlat_out, nlon_out, batch_size, num_chan, tol, verbose=True):
         B, C, Hi, Wi, Ho, Wo = batch_size, num_chan, nlat_in, nlon_in, nlat_out, nlon_out
 
@@ -146,7 +155,7 @@ def test_distributed_spectral_conv(self, nlat_in, nlon_in, nlat_out, nlon_out, b
             reduction_buffer_count=1,
             broadcast_buffers=False,
             find_unused_parameters=False,
-	    gradient_as_bucket_view=True,
+            gradient_as_bucket_view=True,
             static_graph=True,
             verbose=False,
         )
@@ -158,7 +167,6 @@ def test_distributed_spectral_conv(self, nlat_in, nlon_in, nlat_out, nlon_out, b
             spect_conv_dist.module.bias.copy_(spect_conv_local.bias)
         
         # input
-        self._init_seed(444)
         inp_full = torch.randn((B, C, Hi, Wi), dtype=torch.float32, device=self.device)
         
         #############################################################
@@ -169,7 +177,6 @@ def test_distributed_spectral_conv(self, nlat_in, nlon_in, nlat_out, nlon_out, b
         out_full, _ = spect_conv_local(inp_full)
 
         # create grad for backward
-        self._init_seed(555)
         with torch.no_grad():
             # create full grad
             ograd_full = torch.randn_like(out_full)
@@ -237,6 +244,163 @@ def test_distributed_spectral_conv(self, nlat_in, nlon_in, nlat_out, nlon_out, b
             if verbose and (self.world_rank == 0):
                 print(f"final relative error of bias gradients: {err.item()}")
         self.assertTrue(err.item() <= tol)
+
+
+    @parameterized.expand(
+        [
+            [181, 360, 1, 4, 1e-5, "equiangular", True],
+            [181, 360, 1, 4, 1e-5, "equiangular", False],
+            [180, 360, 1, 10, 1e-5, "legendre-gauss", True],
+            [180, 360, 1, 10, 1e-5, "legendre-gauss", False],
+        ],
+        skip_on_empty=True,
+    )
+    def test_distributed_geometric_instance_norm_s2(self, nlat, nlon, batch_size, num_chan, tol, grid_type, affine, verbose=True):
+        B, C, H, W = batch_size, num_chan, nlat, nlon
+
+        # set up layer norm parameters
+        img_shape = (H, W)
+        crop_shape = (H, W)
+        crop_offset = (0, 0)
+        pole_mask = 0
+        eps = 1e-5
+
+        self._init_seed(333)
+
+        # create local (serial) layer norm
+        norm_local = GeometricInstanceNormS2(
+            img_shape=img_shape,
+            crop_shape=crop_shape,
+            crop_offset=crop_offset,
+            grid_type=grid_type,
+            pole_mask=pole_mask,
+            num_features=C,
+            eps=eps,
+            affine=affine,
+        ).to(self.device)
+
+        # create distributed layer norm
+        norm_dist = DistributedGeometricInstanceNormS2(
+            img_shape=img_shape,
+            crop_shape=crop_shape,
+            crop_offset=crop_offset,
+            grid_type=grid_type,
+            pole_mask=pole_mask,
+            num_features=C,
+            eps=eps,
+            affine=affine,
+        ).to(self.device)
+
+        # set up gradient reduction hooks for distributed version
+        if affine:
+            norm_dist = init_gradient_reduction_hooks(
+                norm_dist,
+                device=self.device,
+                reduction_buffer_count=1,
+                broadcast_buffers=False,
+                find_unused_parameters=False,
+                gradient_as_bucket_view=True,
+                static_graph=True,
+                verbose=False,
+            )
+            norm_dist_handle = norm_dist.module
+
+        # make sure weights are the same if affine=True
+        if affine:
+            with torch.no_grad():
+                norm_dist.module.weight.copy_(norm_local.weight)
+                norm_dist.module.bias.copy_(norm_local.bias)
+        
+        # input
+        inp_full = torch.randn((B, C, H, W), dtype=torch.float32, device=self.device)
+        
+        #############################################################
+        # local (serial) normalization
+        #############################################################
+        # FWD pass
+        inp_full.requires_grad = True
+        out_full = norm_local(inp_full)
+
+        # create grad for backward
+        with torch.no_grad():
+            # create full grad
+            ograd_full = torch.randn_like(out_full)
+
+        # BWD pass
+        out_full.backward(ograd_full)
+        igrad_full = inp_full.grad.clone()
+
+        if affine:
+            wgrad_full = norm_local.weight.grad.clone()
+            bgrad_full = norm_local.bias.grad.clone()
+        
+        #############################################################
+        # distributed normalization
+        #############################################################
+        # FWD pass
+        inp_local = self._split_helper(inp_full, hdim=-2, wdim=-1)
+        inp_local.requires_grad = True
+        out_local = norm_dist(inp_local)
+
+        # BWD pass
+        ograd_local = self._split_helper(ograd_full, hdim=-2, wdim=-1)
+        out_local.backward(ograd_local)
+        igrad_local = inp_local.grad.clone()
+
+        if affine:
+            wgrad_local = norm_dist.module.weight.grad.clone()
+            bgrad_local = norm_dist.module.bias.grad.clone()
+        
+        #############################################################
+        # evaluate FWD pass
+        #############################################################
+        with torch.no_grad():
+            out_gather_full = self._gather_helper(out_local, hdim=-2, wdim=-1)
+            err = fn.relative_error(out_gather_full, out_full)
+            if verbose and (self.world_rank == 0):
+                print(f"GeometricInstanceNormS2 forward relative error: {err.item()}")
+        self.assertTrue(err.item() <= tol)
+
+        #############################################################
+        # evaluate input grads
+        #############################################################
+        with torch.no_grad():
+            igrad_gather_full = self._gather_helper(igrad_local, hdim=-2, wdim=-1)
+            err = fn.relative_error(igrad_gather_full, igrad_full)
+            if verbose and (self.world_rank == 0):
+                print(f"GeometricInstanceNormS2 input grad relative error: {err.item()}")
+        self.assertTrue(err.item() <= tol)
+
+        #############################################################
+        # evaluate weight and bias grads
+        #############################################################
+        # weight gradients should be the same across all processes
+        if affine:
+            with torch.no_grad():
+                wgrad_gather_list = [torch.empty_like(wgrad_local) for _ in range(self.world_size)]
+                wgrad_gather_list[self.world_rank] = wgrad_local
+                dist.all_gather(wgrad_gather_list, wgrad_local, group=None)
+                errs = []
+                for wgrad_gather_full in wgrad_gather_list:
+                    errs.append(fn.relative_error(wgrad_gather_full, wgrad_full))
+                err = torch.mean(torch.stack(errs, dim=0))
+                if verbose and (self.world_rank == 0):
+                    print(f"GeometricInstanceNormS2 weight grad relative error: {err.item()}")
+            self.assertTrue(err.item() <= tol)
+
+        # bias gradients should be the same across all processes
+        if affine:
+            with torch.no_grad():
+                bgrad_gather_list = [torch.empty_like(bgrad_local) for _ in range(self.world_size)]
+                bgrad_gather_list[self.world_rank] = bgrad_local
+                dist.all_gather(bgrad_gather_list, bgrad_local, group=None)
+                errs = []
+                for bgrad_gather_full in bgrad_gather_list:
+                    errs.append(fn.relative_error(bgrad_gather_full, bgrad_full))
+                err = torch.mean(torch.stack(errs, dim=0))
+                if verbose and (self.world_rank == 0):
+                    print(f"GeometricInstanceNormS2 bias grad relative error: {err.item()}")
+            self.assertTrue(err.item() <= tol)
         
 
 if __name__ == '__main__':