
Commit f698df4

Merge pull request #494 from datamol-io/fix_mup_attn
Fix mup for the layers with AttentionLayerMup
2 parents: 92281ba + 4045fcf

File tree: 2 files changed, +29 −11 lines

graphium/nn/architectures/global_architectures.py

Lines changed: 6 additions & 0 deletions
@@ -1338,6 +1338,12 @@ def _recursive_divide_dim(x: collections.abc.Mapping):
                     _recursive_divide_dim(v)
                 elif k in ["in_dim", "out_dim", "in_dim_edges", "out_dim_edges"]:
                     x[k] = round(v / divide_factor)
+                elif k in ["embed_dim"]:
+                    num_heads = x.get("num_heads", 1)
+                    x[k] = round(v / divide_factor)
+                    assert (
+                        x[k] % num_heads == 0
+                    ), f"embed_dim={x[k]} is not divisible by num_heads={num_heads}"

         _recursive_divide_dim(kwargs["layer_kwargs"])
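
For context, here is a minimal standalone sketch of the divide logic this hunk touches: when muP scales the model down by `divide_factor`, `embed_dim` must remain divisible by `num_heads`, otherwise the attention layer cannot split the embedding across heads. The function name, example dict, and `divide_factor` default below are illustrative assumptions, not the library code.

    import collections.abc

    # Sketch (not the library code): recursively scale dimension-like keys
    # in a nested kwargs dict, mirroring the muP logic patched above.
    def recursive_divide_dim(x, divide_factor: float = 2.0):
        for k, v in x.items():
            if isinstance(v, collections.abc.Mapping):
                recursive_divide_dim(v, divide_factor)
            elif k in ("in_dim", "out_dim", "in_dim_edges", "out_dim_edges"):
                x[k] = round(v / divide_factor)
            elif k == "embed_dim":
                # embed_dim feeds multi-head attention, so the scaled value
                # must still split evenly across the heads.
                num_heads = x.get("num_heads", 1)
                x[k] = round(v / divide_factor)
                assert x[k] % num_heads == 0, (
                    f"embed_dim={x[k]} is not divisible by num_heads={num_heads}"
                )

    layer_kwargs = {"in_dim": 128, "attn_kwargs": {"embed_dim": 128, "num_heads": 4}}
    recursive_divide_dim(layer_kwargs)
    print(layer_kwargs)  # {'in_dim': 64, 'attn_kwargs': {'embed_dim': 64, 'num_heads': 4}}

The point of asserting early is that a value such as embed_dim=100 with num_heads=8 would otherwise only fail later, inside the attention module, with a less helpful error.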

graphium/nn/pyg_layers/gps_pyg.py

Lines changed: 23 additions & 11 deletions
@@ -68,6 +68,7 @@ def __init__(
        precision: str = "32",
        biased_attention_key: Optional[str] = None,
        attn_kwargs=None,
+       force_consistent_in_dim: bool = True,
        droppath_rate_attn: float = 0.0,
        droppath_rate_ffn: float = 0.0,
        hidden_dim_scaling: float = 4.0,
@@ -93,12 +94,6 @@ def __init__(
            out_dim:
                Output node feature dimensions of the layer

-           in_dim:
-               Input edge feature dimensions of the layer
-
-           out_dim:
-               Output edge feature dimensions of the layer
-
            in_dim_edges:
                input edge-feature dimensions of the layer

@@ -134,6 +129,11 @@ def __init__(
            attn_kwargs:
                kwargs for attention layer

+           force_consistent_in_dim:
+               whether to force the `embed_dim` to be the same as the `in_dim` for the attention and mpnn.
+               The argument is only valid if `attn_type` is not None. If `embed_dim` is not provided,
+               it will be set to `in_dim` by default, so this parameter won't have an effect.
+
            droppath_rate_attn:
                stochastic depth drop rate for attention layer https://arxiv.org/abs/1603.09382

@@ -208,7 +208,9 @@ def __init__(
        self.biased_attention_key = biased_attention_key
        # Initialize the MPNN and Attention layers
        self.mpnn = self._parse_mpnn_layer(mpnn_type, mpnn_kwargs)
-       self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs)
+       self.attn_layer = self._parse_attn_layer(
+           attn_type, self.biased_attention_key, attn_kwargs, force_consistent_in_dim=force_consistent_in_dim
+       )

        self.output_scale = output_scale
        self.use_edges = True if self.in_dim_edges is not None else False
@@ -251,8 +253,6 @@ def forward(self, batch: Batch) -> Batch:
        """
        # pe, feat, edge_index, edge_feat = batch.pos_enc_feats_sign_flip, batch.feat, batch.edge_index, batch.edge_feat
        feat = batch.feat
-       if self.use_edges:
-           edges_feat_in = batch.edge_feat

        feat_in = feat  # for first residual connection

@@ -323,26 +323,38 @@ def _parse_mpnn_layer(self, mpnn_type, mpnn_kwargs: Dict[str, Any]) -> Optional[
        return mpnn_layer

    def _parse_attn_layer(
-       self, attn_type, biased_attention_key: str, attn_kwargs: Dict[str, Any]
+       self,
+       attn_type,
+       biased_attention_key: str,
+       attn_kwargs: Dict[str, Any],
+       force_consistent_in_dim: bool = True,
    ) -> Optional[Module]:
        """
        parse the input attention layer and check if it is valid
        Parameters:
            attn_type: type of the attention layer
            biased_attention_key: key for the attenion bias
+           attn_kwargs: kwargs for the attention layer
+           force_consistent_in_dim: whether to force the `embed_dim` to be the same as the `in_dim`
+
        Returns:
            attn_layer: the attention layer
        """

        # Set the default values for the Attention layer
        if attn_kwargs is None:
            attn_kwargs = {}
-       attn_kwargs.setdefault("embed_dim", self.in_dim)
        attn_kwargs.setdefault("num_heads", 1)
        attn_kwargs.setdefault("dropout", self.dropout)
        attn_kwargs.setdefault("batch_first", True)
        self.attn_kwargs = attn_kwargs

+       # Force the `embed_dim` to be the same as the `in_dim`
+       attn_kwargs.setdefault("embed_dim", self.in_dim)
+       if force_consistent_in_dim:
+           embed_dim = attn_kwargs["embed_dim"]
+           assert embed_dim == self.in_dim, f"embed_dim={embed_dim} must be equal to in_dim={self.in_dim}"
+
        # Initialize the Attention layer
        attn_layer, attn_class = None, None
        if attn_type is not None:
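
To summarize the behavioural change, here is a minimal sketch of the kwargs resolution performed in `_parse_attn_layer`: `embed_dim` now defaults to the node `in_dim`, and when `force_consistent_in_dim` is enabled an explicitly supplied `embed_dim` must match `in_dim`. The standalone helper name `resolve_attn_kwargs` below is hypothetical, used only for illustration outside the class.

    from typing import Any, Dict, Optional

    # Sketch (not the library code) of the consistency check added above.
    def resolve_attn_kwargs(
        in_dim: int,
        attn_kwargs: Optional[Dict[str, Any]] = None,
        force_consistent_in_dim: bool = True,
        dropout: float = 0.0,
    ) -> Dict[str, Any]:
        if attn_kwargs is None:
            attn_kwargs = {}
        attn_kwargs.setdefault("num_heads", 1)
        attn_kwargs.setdefault("dropout", dropout)
        attn_kwargs.setdefault("batch_first", True)

        # Default embed_dim to in_dim; if the caller passed a different value,
        # the optional check rejects the mismatch instead of letting it fail
        # later inside the attention module.
        attn_kwargs.setdefault("embed_dim", in_dim)
        if force_consistent_in_dim:
            embed_dim = attn_kwargs["embed_dim"]
            assert embed_dim == in_dim, f"embed_dim={embed_dim} must be equal to in_dim={in_dim}"
        return attn_kwargs

    print(resolve_attn_kwargs(in_dim=64))  # embed_dim defaults to 64
    print(resolve_attn_kwargs(in_dim=64, attn_kwargs={"embed_dim": 64, "num_heads": 4}))
    # resolve_attn_kwargs(in_dim=64, attn_kwargs={"embed_dim": 32})  # would raise AssertionError

Setting the default after the other `setdefault` calls (rather than before, as in the removed line) keeps the check next to the place where `embed_dim` is decided, which is what the hunk above does.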
