@@ -33,8 +33,6 @@
 # use some distributed routines from torch harmonics
 from torch_harmonics.distributed import distributed_transpose_azimuth as distributed_transpose_w
 from torch_harmonics.distributed import distributed_transpose_polar as distributed_transpose_h
-from torch_harmonics.distributed import DistributedAttentionS2 as THDistributedAttentionS2
-
 
 class _DistMatmulHelper(torch.autograd.Function):
     @staticmethod
@@ -509,60 +507,3 @@ def forward(self, x): |
 
         return x
 
-
-class DistributedAttentionS2(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        num_heads: int,
-        in_shape: Tuple[int],
-        out_shape: Tuple[int],
-        grid_in: Optional[str] = "equiangular",
-        grid_out: Optional[str] = "equiangular",
-        scale: Optional[Union[torch.Tensor, float]] = None,
-        bias: Optional[bool] = True,
-        k_channels: Optional[int] = None,
-        out_channels: Optional[int] = None,
-        drop_rate: Optional[float]=0.0,
-    ):
-        super().__init__()
-
-        assert in_channels % num_heads == 0, "in_channels should be divisible by num_heads"
-        assert out_channels % num_heads == 0, "out_channels should be divisible by num_heads"
-
-        self.attn = THDistributedAttentionS2(
-            in_channels=in_channels,
-            num_heads=num_heads,
-            in_shape=in_shape,
-            out_shape=out_shape,
-            grid_in=grid_in,
-            grid_out=grid_out,
-            scale=scale,
-            bias=bias,
-            k_channels=k_channels,
-            out_channels=out_channels,
-            drop_rate=drop_rate,
-        )
-
-        # set up weight sharing
-        if comm.get_size("spatial") > 1:
-            self.attn.q_weights.is_shared_mp = ["spatial"]
-            self.attn.q_weights.sharded_dims_mp = [None, None, None, None]
-            self.attn.k_weights.is_shared_mp = ["spatial"]
-            self.attn.k_weights.sharded_dims_mp = [None, None, None, None]
-            self.attn.v_weights.is_shared_mp = ["spatial"]
-            self.attn.v_weights.sharded_dims_mp = [None, None, None, None]
-            self.attn.proj_weights.is_shared_mp = ["spatial"]
-            self.attn.proj_weights.sharded_dims_mp = [None, None, None, None]
-            if bias:
-                self.attn.q_bias.is_shared_mp = ["spatial"]
-                self.attn.q_bias.sharded_dims_mp = [None]
-                self.attn.k_bias.is_shared_mp = ["spatial"]
-                self.attn.k_bias.sharded_dims_mp = [None]
-                self.attn.v_bias.is_shared_mp = ["spatial"]
-                self.attn.v_bias.sharded_dims_mp = [None]
-                self.attn.proj_bias.is_shared_mp = ["spatial"]
-                self.attn.proj_bias.sharded_dims_mp = [None]
-
-    def forward(self, query: torch.Tensor, key: Optional[torch.Tensor] = None, value: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return self.attn(query, key, value)
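
For context, the removed DistributedAttentionS2 was a thin wrapper that delegated to torch_harmonics' distributed attention module and tagged its q/k/v/proj parameters as replicated across the "spatial" model-parallel group. Below is a minimal, standalone sketch of that tagging pattern only; the nn.Linear stand-in and channel sizes are illustrative assumptions, and the attributes are assumed to be consumed later by the repository's model-parallel utilities (e.g. for gradient reduction), not by PyTorch itself.

    import torch.nn as nn

    # Hypothetical stand-in for a projection layer; the removed wrapper set the
    # same attributes on the q/k/v/proj weights and biases of the wrapped module.
    proj = nn.Linear(256, 256)

    # Mark the parameters as shared (replicated) across the "spatial" model-parallel
    # group rather than sharded along any dimension. PyTorch allows attaching
    # arbitrary Python attributes to tensors, so this is plain metadata.
    proj.weight.is_shared_mp = ["spatial"]
    proj.weight.sharded_dims_mp = [None, None]  # 2-D weight, no dimension sharded
    proj.bias.is_shared_mp = ["spatial"]
    proj.bias.sharded_dims_mp = [None]          # 1-D bias, not sharded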