@@ -45,34 +45,59 @@ def normalize_weights(model, eps=1e-5):
     return
 
 
-def clip_grads(model, max_grad_norm):
-
+def _compute_total_grad_norm(model, norm_type=2.0):
     # iterate over parameters
-    with torch.no_grad():
-        for param in model.parameters():
+    gnorms = []
+    for param in model.parameters():
 
-            if param.grad is None:
-                continue
+        if param.grad is None:
+            continue
 
-            # compute local norm: compute abs first to support complex grads
+        # compute local norm: compute abs first to support complex grads
+        if norm_type == 2.0:
             gnorm = torch.sum(torch.square(torch.abs(param.grad)))
+        else:
+            gnorm = torch.sum(torch.abs(param.grad))
+
+        # compute global norm
+        if hasattr(param, "sharded_dims_mp"):
+
+            for group in param.sharded_dims_mp:
+                # continue if there is nothing to do
+                if (group is None) or (comm.get_size(group) == 1):
+                    continue
 
-            # compute global norm
-            if hasattr(param, "sharded_dims_mp"):
+                dist.all_reduce(gnorm, group=comm.get_group(group))
 
-                for d, group in enumerate(param.sharded_dims_mp):
-                    # continue if there is nothing to do
-                    if (group is None) or (comm.get_size(group) == 1):
-                        continue
+        gnorms.append(gnorm)
 
-                    dist.all_reduce(gnorm, group=comm.get_group(group))
+    # compute total norm
+    if gnorms:
+        total_gnorm = torch.sum(torch.stack(gnorms))
+    else:
+        total_gnorm = torch.tensor(0.0, device=model.device)
 
-            # compute square root
-            gnorm = torch.sqrt(gnorm)
+    # post-process norm
+    if norm_type == 2.0:
+        total_gnorm = torch.sqrt(total_gnorm)
+
+    return total_gnorm
+
+
+def clip_grads(model, max_grad_norm, norm_type=2.0):
+
+    # iterate over parameters
+    with torch.no_grad():
+        total_gnorm = _compute_total_grad_norm(model, norm_type)
+
+        clip_factor = max_grad_norm / total_gnorm
+        clip_factor = torch.clamp(clip_factor, max=1.0)
+
+        for param in model.parameters():
+            if param.grad is None:
+                continue
 
-            # update grads
-            if gnorm > max_grad_norm:
-                param.grad.mul_(max_grad_norm / gnorm)
+            param.grad.mul_(clip_factor)
 
     return
 
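For reference, a minimal usage sketch of the refactored `clip_grads` in a training step, which applies global-norm clipping: all gradients are scaled in place by min(1, max_grad_norm / total_norm). The `train_step`, `model`, `optimizer`, and `batch` names below are hypothetical and not part of this commit; only `torch` and the `clip_grads` defined above are assumed to be in scope.

```python
import torch

def train_step(model, optimizer, batch, max_grad_norm=1.0):
    # hypothetical training step; batch is an (inputs, targets) pair
    inputs, targets = batch
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    loss.backward()

    # rescale all gradients in place so their combined L2 norm is at most max_grad_norm
    clip_grads(model, max_grad_norm, norm_type=2.0)

    optimizer.step()
    return loss.detach()
```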