Skip to content

Commit 08f7b0b

Browse files
authored
Add UnifiedRadialMLP for batched radial computation (#1831)
Adds precomputed per-layer radial embeddings to the umas_fast_pytorch and umas_fast_gpu backends. Unfortunately, this only speeds up the first of the ~3 linear layers we have in each of the radial MLPs, since the layers diverge after the first. However, this does give us a 0.8 qps (15.5 -> 16.3 qps) boost on a 2000-carbon system for UMA-S 1.1 when using umas_fast_gpu or umas_fast_pytorch.
1 parent d5d954a commit 08f7b0b

7 files changed

Lines changed: 453 additions & 33 deletions

File tree

src/fairchem/core/models/uma/escn_md.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -759,11 +759,18 @@ def forward(self, data_dict: AtomicData) -> dict[str, torch.Tensor]:
759759
###############################################################
760760
# Update spherical node embeddings
761761
###############################################################
762+
763+
# Get edge embeddings for each layer
764+
# General backend: raw x_edge (rad_func computed inside SO2_Convolution)
765+
# Fast backends: precomputed radials
766+
with record_function("layer_radial_emb"):
767+
x_edge_per_layer = self.backend.get_layer_radial_emb(x_edge, self)
768+
762769
for i in range(self.num_layers):
763770
with record_function(f"message passing {i}"):
764771
x_message = self.blocks[i](
765772
x_message,
766-
x_edge,
773+
x_edge_per_layer[i],
767774
graph_dict["edge_index"],
768775
wigner,
769776
wigner_inv_envelope,

src/fairchem/core/models/uma/escn_md_block.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def forward_chunk(
192192
)
193193
x_message, x_0_gating = self.so2_conv_1(x_message, x_edge)
194194
x_message = self.act(x_0_gating, x_message)
195-
x_message = self.so2_conv_2(x_message, x_edge)
195+
x_message = self.so2_conv_2(x_message)
196196
new_embedding = self.backend.permute_wigner_inv_edge_to_node(
197197
x_message,
198198
wigner_inv_envelope,

src/fairchem/core/models/uma/nn/execution_backends.py

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
import torch
1414

15+
from fairchem.core.models.uma.nn.unified_radial import UnifiedRadialMLP
16+
1517
if TYPE_CHECKING:
1618
from fairchem.core.units.mlip_unit.api.inference import InferenceSettings
1719

@@ -85,6 +87,28 @@ def prepare_model_for_inference(model: torch.nn.Module) -> None:
8587
model: The backbone model to prepare.
8688
"""
8789

90+
@staticmethod
91+
def get_layer_radial_emb(
92+
x_edge: torch.Tensor,
93+
model: torch.nn.Module,
94+
) -> list[torch.Tensor]:
95+
"""
96+
Get edge embeddings for each layer.
97+
98+
Default implementation returns the same raw x_edge for all layers.
99+
SO2_Convolution will compute rad_func(x_edge) internally.
100+
101+
Override in fast backends to precompute radials.
102+
103+
Args:
104+
x_edge: Edge embeddings [E, edge_features]
105+
model: The backbone model
106+
107+
Returns:
108+
List of edge embeddings, one per layer
109+
"""
110+
return [x_edge] * len(model.blocks)
111+
88112
@staticmethod
89113
def prepare_wigner(
90114
wigner: torch.Tensor,
@@ -261,11 +285,13 @@ def validate(
261285
@staticmethod
262286
def prepare_model_for_inference(model: torch.nn.Module) -> None:
263287
"""
264-
Convert SO2_Convolution modules to block-diagonal GEMM variants.
288+
Convert SO2_Convolution modules to block-diagonal GEMM variants
289+
and create unified radial MLP for batched computation.
265290
266291
Replaces so2_conv_1 with SO2_Conv1_WithRadialBlock and
267292
so2_conv_2 with SO2_Conv2_InternalBlock in each block's
268-
Edgewise module.
293+
Edgewise module. Then creates a UnifiedRadialMLP from all
294+
radial functions for efficient batched computation.
269295
"""
270296
from fairchem.core.models.uma.nn.so2_layers import (
271297
convert_so2_conv1,
@@ -276,6 +302,27 @@ def prepare_model_for_inference(model: torch.nn.Module) -> None:
276302
block.edge_wise.so2_conv_1 = convert_so2_conv1(block.edge_wise.so2_conv_1)
277303
block.edge_wise.so2_conv_2 = convert_so2_conv2(block.edge_wise.so2_conv_2)
278304

305+
# Create unified radial MLP for batched computation
306+
rad_funcs = [block.edge_wise.so2_conv_1.rad_func for block in model.blocks]
307+
model._unified_radial_mlp = UnifiedRadialMLP(rad_funcs)
308+
309+
@staticmethod
310+
def get_layer_radial_emb(
311+
x_edge: torch.Tensor,
312+
model: torch.nn.Module,
313+
) -> list[torch.Tensor]:
314+
"""
315+
Compute radial embeddings for all layers using batched UnifiedRadialMLP.
316+
317+
Args:
318+
x_edge: Edge embeddings [E, edge_features]
319+
model: The backbone model with _unified_radial_mlp
320+
321+
Returns:
322+
List of radial embeddings, one per layer [E, radial_features]
323+
"""
324+
return model._unified_radial_mlp(x_edge)
325+
279326

280327
class UMASFastGPUBackend(UMASFastPytorchBackend):
281328
"""

src/fairchem/core/models/uma/nn/so2_layers.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -243,15 +243,15 @@ def forward(
243243
244244
Args:
245245
x: Input features [E, coeffs, channels]
246-
x_edge: Edge embeddings [E, edge_features]
246+
x_edge: Precomputed radial embeddings [E, radial_features]
247247
248248
Returns:
249249
(output, gating): output [E, coeffs, m_output_channels],
250250
gating [E, extra_m0_output_channels]
251251
"""
252-
x_edge_by_m = self.rad_func(x_edge).split(self.edge_split_sizes, dim=1)
252+
x_edge_by_m = x_edge.split(self.edge_split_sizes, dim=1)
253253
x_by_m = x.split(self.m_split_sizes, dim=1)
254-
num_edges = len(x_edge)
254+
num_edges = x.shape[0]
255255

256256
# m=0: apply radial, linear, split gating
257257
x_0 = x_by_m[0].view(num_edges, -1) * x_edge_by_m[0]
@@ -511,18 +511,22 @@ def __init__(
511511
def forward(
512512
self,
513513
x: torch.Tensor,
514-
x_edge: torch.Tensor,
514+
x_edge: torch.Tensor | None = None,
515515
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
516-
# radial function
516+
# Compute radial embedding from raw x_edge if we have external weights
517517
if self.rad_func is not None:
518-
x_edge_by_m = self.rad_func(x_edge).split(self.edge_split_sizes, dim=1)
518+
x_edge = self.rad_func(x_edge)
519519

520520
x_by_m = x.split(self.m_split_sizes, dim=1)
521521

522-
num_edges = len(x_edge)
522+
# Split radial embeddings if provided (external weights mode)
523+
if x_edge is not None:
524+
x_edge_by_m = x_edge.split(self.edge_split_sizes, dim=1)
525+
526+
num_edges = x.shape[0]
523527
# Compute m=0 coefficients separately since they only have real values (no imaginary)
524528
x_0 = x_by_m[0].view(num_edges, -1)
525-
if self.rad_func is not None:
529+
if x_edge is not None:
526530
x_0 = x_0 * x_edge_by_m[0]
527531
x_0 = self.fc_m0(x_0)
528532

@@ -541,7 +545,7 @@ def forward(
541545
# Compute the values for the m > 0 coefficients
542546
for m in range(1, self.mmax + 1):
543547
x_m = x_by_m[m].view(num_edges, 2, -1)
544-
if self.rad_func is not None:
548+
if x_edge is not None:
545549
x_m = x_m * x_edge_by_m[m].unsqueeze(1)
546550
x_m = self.so2_m_conv[m - 1](x_m)
547551
out.extend(x_m)
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
"""
2+
Copyright (c) Meta Platforms, Inc. and affiliates.
3+
4+
This source code is licensed under the MIT license found in the
5+
LICENSE file in the root directory of this source tree.
6+
7+
Unified Radial MLP: Computes all layers' radial functions in a single
8+
batched operation.
9+
10+
Instead of running N separate RadialMLP forward passes:
11+
for layer in layers:
12+
radial_out = layer.so2_conv_1.rad_func(x_edge) # Sequential
13+
14+
We run one batched first layer, then each tail:
15+
all_radial_outs = unified_radial_mlp(x_edge) # list of [E, out]
16+
"""
17+
18+
from __future__ import annotations
19+
20+
from typing import TYPE_CHECKING
21+
22+
import torch
23+
import torch.nn as nn
24+
25+
if TYPE_CHECKING:
26+
from .radial import RadialMLP
27+
28+
__all__ = ["UnifiedRadialMLP", "create_unified_radial_mlp"]
29+
30+
# Expected structure of RadialMLP.net Sequential
31+
_EXPECTED_NET_STRUCTURE = (
32+
nn.Linear, # 0: first linear
33+
nn.LayerNorm, # 1
34+
nn.SiLU, # 2
35+
nn.Linear, # 3: second linear
36+
nn.LayerNorm, # 4
37+
nn.SiLU, # 5
38+
nn.Linear, # 6: third linear
39+
)
40+
41+
42+
def _validate_radial_mlp(mlp: RadialMLP, idx: int, reference: RadialMLP | None) -> None:
43+
"""
44+
Validate a single RadialMLP has expected structure and matches reference.
45+
46+
Args:
47+
mlp: The RadialMLP to validate.
48+
idx: Index in the list (for error messages).
49+
reference: First RadialMLP to compare dimensions against (None for first).
50+
"""
51+
# Check layer count
52+
if len(mlp.net) != 7:
53+
raise ValueError(f"RadialMLP[{idx}]: expected 7 layers, got {len(mlp.net)}")
54+
55+
# Check layer types
56+
for j, expected_type in enumerate(_EXPECTED_NET_STRUCTURE):
57+
if not isinstance(mlp.net[j], expected_type):
58+
raise TypeError(
59+
f"RadialMLP[{idx}].net[{j}]: expected {expected_type.__name__}, "
60+
f"got {type(mlp.net[j]).__name__}"
61+
)
62+
63+
# Check feature dimensions match reference (all MLPs must be identical)
64+
if reference is not None:
65+
for j in (0, 3, 6): # Linear layers
66+
if mlp.net[j].in_features != reference.net[j].in_features:
67+
raise ValueError(
68+
f"RadialMLP[{idx}].net[{j}]: in_features mismatch "
69+
f"({mlp.net[j].in_features} vs {reference.net[j].in_features})"
70+
)
71+
if mlp.net[j].out_features != reference.net[j].out_features:
72+
raise ValueError(
73+
f"RadialMLP[{idx}].net[{j}]: out_features mismatch "
74+
f"({mlp.net[j].out_features} vs {reference.net[j].out_features})"
75+
)
76+
77+
78+
class UnifiedRadialMLP(nn.Module):
    """
    Unified radial MLP that batches the first linear layer across N RadialMLPs.

    All N radial MLPs consume the same input, so their first linear layers are
    concatenated into one weight matrix and run as a single GEMM. The remaining
    layers diverge per MLP; their parameters are stacked into [N, ...] buffers
    so each tail can be applied via indexed functional calls.
    """

    def __init__(self, radial_mlps: list[RadialMLP]) -> None:
        """
        Initialize from a list of RadialMLP modules.

        Args:
            radial_mlps: List of RadialMLP modules with identical architecture.
        """
        super().__init__()

        assert len(radial_mlps) > 0, "Need at least one RadialMLP"

        # Every MLP must have the expected 7-layer structure and identical
        # dimensions, otherwise their weights cannot be batched together.
        for i, mlp in enumerate(radial_mlps):
            _validate_radial_mlp(mlp, i, radial_mlps[0] if i > 0 else None)

        first = radial_mlps[0]
        self.num_layers = len(radial_mlps)
        self.hidden_features = first.net[0].out_features
        # All MLPs share one eps; taken from the first (assumes uniform eps —
        # dimensions are validated above, eps is not).
        self.ln_eps = first.net[1].eps

        # First layer: weights/biases concatenated along the output dimension
        # so a single GEMM produces every layer's hidden activation at once.
        self.register_buffer(
            "W1_cat",
            torch.cat([mlp.net[0].weight.data for mlp in radial_mlps], dim=0),
        )
        self.register_buffer(
            "b1_cat",
            torch.cat([mlp.net[0].bias.data for mlp in radial_mlps], dim=0),
        )

        # Remaining layers: stack each (weight, bias) pair into [N, ...]
        # buffers. Registration order matches the sequential net layout:
        # (buffer prefix, index of the source module inside RadialMLP.net).
        for prefix, net_idx in (("ln1", 1), ("fc2", 3), ("ln2", 4), ("fc3", 6)):
            self.register_buffer(
                f"{prefix}_weight",
                torch.stack(
                    [mlp.net[net_idx].weight.data for mlp in radial_mlps], dim=0
                ),
            )
            self.register_buffer(
                f"{prefix}_bias",
                torch.stack(
                    [mlp.net[net_idx].bias.data for mlp in radial_mlps], dim=0
                ),
            )

    def umas_radial_mlp(self, h: torch.Tensor, i: int) -> torch.Tensor:
        """Apply layers 2+ of MLP i (LN -> SiLU -> Linear -> LN -> SiLU -> Linear)."""
        fn = torch.nn.functional
        norm_shape = (self.hidden_features,)
        h = fn.silu(
            fn.layer_norm(h, norm_shape, self.ln1_weight[i], self.ln1_bias[i], self.ln_eps)
        )
        h = fn.linear(h, self.fc2_weight[i], self.fc2_bias[i])
        h = fn.silu(
            fn.layer_norm(h, norm_shape, self.ln2_weight[i], self.ln2_bias[i], self.ln_eps)
        )
        return fn.linear(h, self.fc3_weight[i], self.fc3_bias[i])

    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        """
        Compute all N radial outputs.

        Args:
            x: Input tensor of shape [E, in_features]

        Returns:
            List of N tensors, each of shape [E, out_features]
        """
        # One batched GEMM covers every MLP's first linear layer; the result
        # is split into per-layer chunks and each per-layer tail is applied.
        hidden = torch.nn.functional.linear(x, self.W1_cat, self.b1_cat)
        chunks = hidden.split(self.hidden_features, dim=1)
        return [self.umas_radial_mlp(chunk, i) for i, chunk in enumerate(chunks)]
178+
179+
180+
def create_unified_radial_mlp(radial_mlps: list) -> UnifiedRadialMLP:
    """
    Build a UnifiedRadialMLP that batches the given RadialMLPs.

    Args:
        radial_mlps: List of RadialMLP modules

    Returns:
        UnifiedRadialMLP instance with shared first layer weights
    """
    unified = UnifiedRadialMLP(radial_mlps)
    return unified

0 commit comments

Comments
 (0)