Skip to content

Commit d8b3334

Browse files
committed
Store fused MoE wi weight as (G,K,2N) when fused_mlp=True
When fused_mlp is enabled, initialize self.wi as a single (G,K,2N) parameter instead of two separate wi_0/wi_1 (G,K,N) tensors. This loads expert weights from HBM once per forward pass; the concat in sparse_matmul becomes a view of adjacent slices that XLA elides.
1 parent 84b4290 commit d8b3334

1 file changed

Lines changed: 7 additions & 2 deletions

File tree

src/maxtext/layers/moe.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -410,7 +410,7 @@ def __init__(
410410
self.wi_0 = jnp.zeros((num_experts, self.moe_expert_input_dim, intermediate_dim))
411411
self.wi_1 = jnp.zeros((num_experts, self.moe_expert_input_dim, intermediate_dim))
412412
self.wo = jnp.zeros((num_experts, intermediate_dim, self.moe_expert_input_dim))
413-
elif self.config.prefuse_moe_weights and self.config.attention == "vllm_rpa":
413+
elif (self.config.prefuse_moe_weights and self.config.attention == "vllm_rpa") or self.config.fused_mlp:
414414
self.wi = nnx.Param(
415415
self.kernel_init(
416416
self.rngs.params(),
@@ -1319,7 +1319,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
13191319
)
13201320

13211321
if self.config.fused_mlp:
1322-
# Fuse wi_0 and wi_1: [G,K,N] + [G,K,N] -> [G,K,2N], one GEMM, split result.
1322+
# Weights are stored as (G,K,2N); w0/w1 are adjacent slices so XLA elides this concat.
13231323
w_fused = jnp.concatenate([w0, w1], axis=-1)
13241324
out = gmm_fn(x, w_fused, tiling=wi_tile_size, weight_gather_axes=wi_gather_axes)
13251325
n = w0.shape[-1]
@@ -2159,6 +2159,11 @@ def __call__(
21592159
w1_kernel = None
21602160
if cfg.prefuse_moe_weights and cfg.attention == "vllm_rpa":
21612161
fused_kernel = jnp.asarray(self.wi[...], self.dtype)
2162+
elif cfg.fused_mlp:
2163+
wi = jnp.asarray(self.wi[...], self.dtype)
2164+
n = wi.shape[-1] // 2
2165+
w0_kernel = wi[..., :n]
2166+
w1_kernel = wi[..., n:]
21622167
else:
21632168
w0_kernel = jnp.asarray(self.wi_0[...], self.dtype)
21642169
w1_kernel = jnp.asarray(self.wi_1[...], self.dtype)

0 commit comments

Comments (0)