|
 
 import os
 import unittest
-
-from torch.nn.modules.container import T
 from parameterized import parameterized
 
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
 import torch.distributed as dist
 
 
 from makani.utils import comm
 from makani.utils import functions as fn
-from physicsnemo.distributed.utils import split_tensor_along_dim
-from physicsnemo.distributed.mappings import gather_from_parallel_region, scatter_to_parallel_region, \
-    reduce_from_parallel_region
 
 from makani.mpu.mappings import init_gradient_reduction_hooks
 
 # layer norm imports
 from makani.models.common.layer_norm import GeometricInstanceNormS2
-from makani.mpu.layer_norm import DistributedGeometricInstanceNormS2
+from makani.mpu.layer_norm import DistributedGeometricInstanceNormS2, DistributedInstanceNorm2d
 
 from distributed_helpers import split_helper, gather_helper
 
@@ -246,6 +242,149 @@ def test_distributed_spectral_conv(self, nlat_in, nlon_in, nlat_out, nlon_out, b
         self.assertTrue(err.item() <= tol)
 
 
+    @parameterized.expand(
+        [
+            [256, 512, 32, 8, 1e-5, True],
+            [181, 360, 1, 10, 1e-5, True],
+            [256, 512, 32, 8, 1e-5, False],
+            [181, 360, 1, 10, 1e-5, False],
+        ],
+        skip_on_empty=True,
+    )
+    def test_distributed_instance_norm_2d(self, nlat, nlon, batch_size, num_chan, tol, affine, verbose=True):
+        B, C, H, W = batch_size, num_chan, nlat, nlon
+
+        self._init_seed(333)
+
+        # create local (serial) instance norm - using PyTorch's standard InstanceNorm2d
+        norm_local = nn.InstanceNorm2d(
+            num_features=C,
+            eps=1e-5,
+            affine=affine,
+            track_running_stats=False,
+        ).to(self.device)
+
+        # create distributed instance norm
+        norm_dist = DistributedInstanceNorm2d(
+            num_features=C,
+            eps=1e-5,
+            affine=affine,
+        ).to(self.device)
+
+        # set up gradient reduction hooks for distributed version if affine=True
+        if affine:
+            norm_dist = init_gradient_reduction_hooks(
+                norm_dist,
+                device=self.device,
+                reduction_buffer_count=1,
+                broadcast_buffers=False,
+                find_unused_parameters=False,
+                gradient_as_bucket_view=True,
+                static_graph=True,
+                verbose=False,
+            )
+            norm_dist_handle = norm_dist.module
+        else:
+            norm_dist_handle = norm_dist
+
+        # make sure weights are the same if affine=True
+        if affine:
+            with torch.no_grad():
+                norm_dist_handle.weight.copy_(norm_local.weight)
+                norm_dist_handle.bias.copy_(norm_local.bias)
+
+        # input
+        inp_full = torch.randn((B, C, H, W), dtype=torch.float32, device=self.device)
+
+        #############################################################
+        # local (serial) transform
+        #############################################################
+        # FWD pass
+        inp_full.requires_grad = True
+        out_full = norm_local(inp_full)
+
+        # create grad for backward
+        with torch.no_grad():
+            # create full grad
+            ograd_full = torch.randn_like(out_full)
+
+        # BWD pass
+        out_full.backward(ograd_full)
+        igrad_full = inp_full.grad.clone()
+
+        if affine:
+            wgrad_full = norm_local.weight.grad.clone()
+            bgrad_full = norm_local.bias.grad.clone()
+
+        #############################################################
+        # distributed transform
+        #############################################################
+        # FWD pass
+        inp_local = self._split_helper(inp_full, hdim=-2, wdim=-1)
+        inp_local.requires_grad = True
+        out_local = norm_dist(inp_local)
+
+        # BWD pass
+        ograd_local = self._split_helper(ograd_full, hdim=-2, wdim=-1)
+        out_local.backward(ograd_local)
+        igrad_local = inp_local.grad.clone()
+
+        if affine:
+            wgrad_local = norm_dist_handle.weight.grad.clone()
+            bgrad_local = norm_dist_handle.bias.grad.clone()
+
+        #############################################################
+        # evaluate FWD pass
+        #############################################################
+        with torch.no_grad():
+            out_gather_full = self._gather_helper(out_local, hdim=-2, wdim=-1)
+            err = fn.relative_error(out_gather_full, out_full)
+            if verbose and (self.world_rank == 0):
+                print(f"InstanceNorm2d forward relative error: {err.item()}")
+        self.assertTrue(err.item() <= tol)
+
+        #############################################################
+        # evaluate input grads
+        #############################################################
+        with torch.no_grad():
+            igrad_gather_full = self._gather_helper(igrad_local, hdim=-2, wdim=-1)
+            err = fn.relative_error(igrad_gather_full, igrad_full)
+            if verbose and (self.world_rank == 0):
+                print(f"InstanceNorm2d input grad relative error: {err.item()}")
+        self.assertTrue(err.item() <= tol)
+
+        #############################################################
+        # evaluate weight and bias grads
+        #############################################################
+        # weight gradients should be the same across all processes
+        if affine:
+            with torch.no_grad():
+                wgrad_gather_list = [torch.empty_like(wgrad_local) for _ in range(self.world_size)]
+                wgrad_gather_list[self.world_rank] = wgrad_local
+                dist.all_gather(wgrad_gather_list, wgrad_local, group=None)
+                errs = []
+                for wgrad_gather_full in wgrad_gather_list:
+                    errs.append(fn.relative_error(wgrad_gather_full, wgrad_full))
+                err = torch.mean(torch.stack(errs, dim=0))
+                if verbose and (self.world_rank == 0):
+                    print(f"InstanceNorm2d weight grad relative error: {err.item()}")
+            self.assertTrue(err.item() <= tol)
+
+        # bias gradients should be the same across all processes
+        if affine:
+            with torch.no_grad():
+                bgrad_gather_list = [torch.empty_like(bgrad_local) for _ in range(self.world_size)]
+                bgrad_gather_list[self.world_rank] = bgrad_local
+                dist.all_gather(bgrad_gather_list, bgrad_local, group=None)
+                errs = []
+                for bgrad_gather_full in bgrad_gather_list:
+                    errs.append(fn.relative_error(bgrad_gather_full, bgrad_full))
+                err = torch.mean(torch.stack(errs, dim=0))
+                if verbose and (self.world_rank == 0):
+                    print(f"InstanceNorm2d bias grad relative error: {err.item()}")
+            self.assertTrue(err.item() <= tol)
+
+
     @parameterized.expand(
         [
             [181, 360, 1, 4, 1e-5, "equiangular", True],
@@ -303,7 +442,6 @@ def test_distributed_geometric_instance_norm_s2(self, nlat, nlon, batch_size, nu |
             static_graph=True,
             verbose=False,
         )
-        norm_dist_handle = norm_dist.module
 
         #make sure weights are the same if affine=True
         if affine: