fixing distributed instance norm (#33)

azrael417 · web-flow · commit b179910c13d1 · 2025-10-14T16:47:01.000+02:00
* fixing distributed instance norm

* adding test for Euclidean distributed instance norm

* cleaning up imports
diff --git a/makani/models/common/layer_norm.py b/makani/models/common/layer_norm.py
@@ -58,17 +58,23 @@ def __init__(
 
         # we only need the weights
         self.quadrature = GridQuadrature(
-            quadrature_rule, img_shape=img_shape, crop_shape=crop_shape, crop_offset=crop_offset, normalize=True, pole_mask=pole_mask, distributed=False
+            quadrature_rule, 
+            img_shape=img_shape, 
+            crop_shape=crop_shape, 
+            crop_offset=crop_offset, 
+            normalize=True, 
+            pole_mask=pole_mask, 
+            distributed=False
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         # extract shapes
         B, C, H, W = x.shape
 
+        xtype = x.dtype
         with amp.autocast(device_type="cuda", enabled=False):
-            dtype = x.dtype
-            x = x.float()
+            x = x.to(torch.float32)
 
             # compute var and mean
             mean = self.quadrature(x)
@@ -79,9 +85,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         mean = mean.reshape(B, C, 1, 1)
 
         # convert types
-        x = x.to(dtype)
-        mean = mean.to(dtype)
-        var = var.to(dtype)
+        x = x.to(xtype)
+        mean = mean.to(xtype)
+        var = var.to(xtype)
 
         # apply the normalization
         if self.affine:
diff --git a/makani/mpu/layer_norm.py b/makani/mpu/layer_norm.py
@@ -66,11 +66,8 @@ def _welford_kernel(vars: torch.Tensor, means: torch.Tensor, counts: torch.Tenso
     # use Welford's algorithm to accumulate them into a single mean and variance
     for i in range(1, means.shape[0]):
         delta = means[i, ...] - mean
+        mean = mean + delta * counts[i, ...] / (count + counts[i, ...])
         m2 = m2 + m2s[i, ...] + delta**2 * count * counts[i, ...] / (count + counts[i, ...])
-        if i == 1:
-            mean = (mean * count + means[i, ...] * counts[i, ...]) / (count + counts[i, ...])
-        else:
-            mean = mean + delta * counts[i, ...] / (count + counts[i, ...])
 
         # update the current count
         count = count + counts[i, ...]
@@ -122,7 +119,7 @@ def _stats_welford(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Computes the statistics locally, then uses the Welford online algorithm to reduce them"""
 
         # extract shapes
-        B, C, H, W = x.shape
+        B, C, _, _ = x.shape
 
         # those have the shapes [B, C]
         var, mean = torch.var_mean(x, dim=(-2, -1), unbiased=False, keepdim=False)
@@ -141,9 +138,9 @@ def _stats_welford(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
 
+        xtype = x.dtype
         with amp.autocast(device_type="cuda", enabled=False):
-            dtype = x.dtype
-            x = x.float()
+            x = x.to(torch.float32)
 
             # start by computing std and mean
             var, mean = self._stats_welford(x)
@@ -152,9 +149,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             mean = copy_to_parallel_region(mean, "spatial")
             var = copy_to_parallel_region(var, "spatial")
 
-        x = x.to(dtype)
-        mean = mean.to(dtype)
-        var = var.to(dtype)
+        x = x.to(xtype)
+        mean = mean.to(xtype)
+        var = var.to(xtype)
 
         # apply the normalization
         if self.affine:
@@ -188,7 +185,13 @@ def __init__(
 
         # we only need the weights
         quad_weight = GridQuadrature(
-            quadrature_rule, img_shape=img_shape, crop_shape=crop_shape, crop_offset=crop_offset, normalize=True, pole_mask=pole_mask, distributed=True
+            quadrature_rule, 
+            img_shape=img_shape, 
+            crop_shape=crop_shape, 
+            crop_offset=crop_offset, 
+            normalize=True,
+            pole_mask=pole_mask, 
+            distributed=True
         ).quad_weight
 
         self.register_buffer("quad_weight", quad_weight, persistent=False)
@@ -197,12 +200,12 @@ def _stats_welford(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """Computes the statistics locally, then uses the Welford online algorithm to reduce them"""
 
         # extract shapes
-        B, C, H, W = x.shape
+        B, C, _, _ = x.shape
 
         # compute var, mean locally: those have the shapes [B, C]
-        mean = torch.sum(x * self.quad_weight, dim=(-2, -1), keepdim=False)
-        var = torch.sum(torch.square(x - mean.reshape(B, C, 1, 1)) * self.quad_weight, dim=(-2, -1), keepdim=False)
         count = torch.tile(torch.sum(self.quad_weight, dim=(-2, -1), keepdim=False), (B, C))
+        mean = torch.sum(x * self.quad_weight, dim=(-2, -1), keepdim=False) / count
+        var = torch.sum(torch.square(x - mean.reshape(B, C, 1, 1)) * self.quad_weight, dim=(-2, -1), keepdim=False) / count
 
         # compute welford variance
         var, mean, _ = distributed_welford_variance(var, mean, count, "spatial")
@@ -215,9 +218,9 @@ def _stats_welford(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
 
+        xtype = x.dtype
         with amp.autocast(device_type="cuda", enabled=False):
-            dtype = x.dtype
-            x = x.float()
+            x = x.to(torch.float32)
 
             # start by computing std and mean
             var, mean = self._stats_welford(x)
@@ -226,9 +229,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             mean = copy_to_parallel_region(mean, "spatial")
             var = copy_to_parallel_region(var, "spatial")
 
-        x = x.to(dtype)
-        mean = mean.to(dtype)
-        var = var.to(dtype)
+        x = x.to(xtype)
+        mean = mean.to(xtype)
+        var = var.to(xtype)
 
         # apply the normalization
         if self.affine:
diff --git a/makani/utils/metrics/functions.py b/makani/utils/metrics/functions.py
@@ -13,13 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple, List
-from dataclasses import dataclass
+from typing import Optional, Tuple
 
-import math
 import torch
 
-from makani.utils.grids import grid_to_quadrature_rule, GridQuadrature
 from makani.utils import comm
 from physicsnemo.distributed.mappings import scatter_to_parallel_region, reduce_from_parallel_region
 from physicsnemo.distributed.utils import split_tensor_along_dim
diff --git a/tests/distributed/tests_distributed_layers.py b/tests/distributed/tests_distributed_layers.py