26 | 26 | from jax.experimental.shard_map import shard_map |
27 | 27 | from jaxtyping import Array, Float, Int, PRNGKeyArray |
28 | 28 | from levanter.grug.attention import AttentionMask, RotaryConfig, align_kv_heads, apply_rotary_embedding, attention |
29 | | -from levanter.grug.grug_moe import MoeActivation, MoeImplementation, moe_mlp |
| 29 | +from levanter.grug.grug_moe import ( |
| 30 | + MoeActivation, |
| 31 | + MoEExpertMlp, |
| 32 | + MoeImplementation, |
| 33 | + resolve_moe_implementation, |
| 34 | +) |
30 | 35 | from levanter.grug.loss import fused_linear_softmax_cross_entropy_loss |
31 | 36 | from levanter.grug.sharding import Pembed_vocab, Plm_head, unshard |
32 | 37 | from levanter.tracker.histogram import Histogram |
@@ -89,6 +94,7 @@ def __post_init__(self) -> None: |
89 | 94 | raise ValueError("num_experts_per_token must be <= num_experts") |
90 | 95 | if self.shared_expert_intermediate_dim < 0: |
91 | 96 | raise ValueError("shared_expert_intermediate_dim must be non-negative") |
| 97 | + resolve_moe_implementation(self.moe_implementation) |
92 | 98 |
93 | 99 | @property |
94 | 100 | def inferred_head_dim(self) -> int: |
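
Reviewer note: the new `__post_init__` line validates `cfg.moe_implementation` eagerly, so a bad config fails at construction time rather than inside the forward pass. A minimal sketch of the assumed behavior of `resolve_moe_implementation` (the real helper lives in `levanter.grug.grug_moe`; the accepted option names below are illustrative, not the actual values):

```python
# Illustrative sketch only -- the real resolve_moe_implementation (and the
# MoeImplementation enum it works with) are defined in levanter.grug.grug_moe,
# and the accepted option names may differ.
_VALID_IMPLEMENTATIONS = ("dense", "ragged_dot")  # assumed names


def resolve_moe_implementation(impl):
    """Normalize a config value to a canonical implementation name, failing early."""
    name = impl.value if hasattr(impl, "value") else str(impl)
    if name not in _VALID_IMPLEMENTATIONS:
        raise ValueError(
            f"unknown moe_implementation {name!r}; expected one of {list(_VALID_IMPLEMENTATIONS)}"
        )
    return name
```
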
@@ -312,32 +318,33 @@ class MoEMLP(eqx.Module): |
312 | 318 |
313 | 319 | router: jax.Array |
314 | 320 | router_bias: jax.Array |
315 | | - w_gate_up: jax.Array |
316 | | - w_down: jax.Array |
| 321 | + expert_mlp: MoEExpertMlp |
317 | 322 | cfg: GrugModelConfig = eqx.field(static=True) |
318 | 323 |
319 | 324 | @staticmethod |
320 | 325 | def init(cfg: GrugModelConfig, *, key: PRNGKeyArray) -> "MoEMLP": |
321 | | - k_router, k_gate, k_up, k_down = random.split(key, 4) |
| 326 | + k_router, k_expert_mlp = random.split(key, 2) |
322 | 327 | mesh = get_abstract_mesh() |
323 | 328 |
324 | 329 | expert_axis_size = _mesh_axis_size(mesh, "expert") |
325 | 330 | if cfg.num_experts % expert_axis_size != 0: |
326 | 331 | raise ValueError(f"num_experts={cfg.num_experts} must be divisible by expert axis size={expert_axis_size}") |
327 | 332 |
328 | 333 | d, e, i = cfg.hidden_dim, cfg.num_experts, cfg.intermediate_dim |
329 | | - w_gate = _init_weight(k_gate, (e, d, i), cfg.initializer_std) |
330 | | - w_up = _init_weight(k_up, (e, d, i), cfg.initializer_std) |
331 | | - # TODO: Explore whether concatenating gate/up at init (instead of keeping separate params) |
332 | | - # is (1) a meaningful MFU speedup and (2) a meaningful perf hit due to AdamH treating the |
333 | | - # concatenated tensor as a single parameter for its scale-invariant norm computation. |
334 | | - w_gate_up = jnp.concatenate([w_gate, w_up], axis=-1) |
335 | 334 |
336 | 335 | return MoEMLP( |
337 | 336 | router=reshard(_init_weight(k_router, (d, e), cfg.initializer_std), P(None, None)), |
338 | 337 | router_bias=jnp.zeros((e,)), |
339 | | - w_gate_up=reshard(w_gate_up, P("expert", "data", "model")), |
340 | | - w_down=reshard(_init_weight(k_down, (e, i, d), cfg.initializer_std), P("expert", "model", "data")), |
| 338 | + expert_mlp=MoEExpertMlp.init( |
| 339 | + num_experts=e, |
| 340 | + hidden_dim=d, |
| 341 | + intermediate_dim=i, |
| 342 | + initializer_std=cfg.initializer_std, |
| 343 | + key=k_expert_mlp, |
| 344 | + implementation=cfg.moe_implementation, |
| 345 | + activation=ActivationFunctionEnum.silu, |
| 346 | + capacity_factor=_DEFAULT_EP_CAPACITY_FACTOR, |
| 347 | + ), |
341 | 348 | cfg=cfg, |
342 | 349 | ) |
343 | 350 |
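
The fused gate/up and down projections (previously the `w_gate_up` / `w_down` fields on `MoEMLP`) now live inside the new `MoEExpertMlp` module. A rough skeleton of the interface this hunk relies on, inferred only from the two call sites in this diff; field names and init details are assumptions, not the actual `levanter.grug.grug_moe` code:

```python
# Skeleton inferred from the call sites in this diff; not the real
# levanter.grug.grug_moe.MoEExpertMlp implementation.
from typing import Any

import equinox as eqx
import jax
from jax import random


class MoEExpertMlp(eqx.Module):
    w_gate_up: jax.Array  # (num_experts, hidden_dim, 2 * intermediate_dim)
    w_down: jax.Array     # (num_experts, intermediate_dim, hidden_dim)
    implementation: Any = eqx.field(static=True)
    activation: Any = eqx.field(static=True)
    capacity_factor: float = eqx.field(static=True)

    @staticmethod
    def init(*, num_experts, hidden_dim, intermediate_dim, initializer_std,
             key, implementation, activation, capacity_factor) -> "MoEExpertMlp":
        k_gate_up, k_down = random.split(key, 2)
        # simple normal init for the sketch; the real code presumably reuses _init_weight
        w_gate_up = initializer_std * random.normal(k_gate_up, (num_experts, hidden_dim, 2 * intermediate_dim))
        w_down = initializer_std * random.normal(k_down, (num_experts, intermediate_dim, hidden_dim))
        return MoEExpertMlp(w_gate_up, w_down, implementation, activation, capacity_factor)

    def __call__(self, x_flat, selected_experts, combine_weights, *, mesh):
        # dispatch tokens to their routed experts, apply the gated MLP,
        # and combine the per-expert outputs with `combine_weights`
        ...
```
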
@@ -389,16 +396,11 @@ def _local_qb_beta(s_ma): |
389 | 396 | out_specs=P(), |
390 | 397 | )(s_minus_alpha) |
391 | 398 |
392 | | - routed_flat = moe_mlp( |
| 399 | + routed_flat = self.expert_mlp( |
393 | 400 | x_flat, |
394 | 401 | selected_experts.astype(jnp.int32), |
395 | 402 | combine_weights, |
396 | | - self.w_gate_up, |
397 | | - self.w_down, |
398 | | - activation=ActivationFunctionEnum.silu, |
399 | | - implementation=self.cfg.moe_implementation, |
400 | 403 | mesh=get_abstract_mesh(), |
401 | | - capacity_factor=_DEFAULT_EP_CAPACITY_FACTOR, |
402 | 404 | ) |
403 | 405 |
404 | 406 | routed = rearrange(routed_flat, "(b s) d -> b s d", b=b, s=s) |
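
For reference, a hypothetical dense sketch of what the `self.expert_mlp(...)` call above computes, assuming the fused SwiGLU layout of the removed `w_gate_up`/`w_down` parameters (gate and up concatenated along the last axis); the real path dispatches to a sparse or expert-parallel kernel chosen by `cfg.moe_implementation`:

```python
# Hypothetical dense reference for the routed expert MLP; function name and
# structure are assumptions made for illustration only.
import jax
import jax.numpy as jnp


def dense_moe_reference(x_flat, selected_experts, combine_weights, w_gate_up, w_down):
    # x_flat: (T, D); selected_experts, combine_weights: (T, K)
    # w_gate_up: (E, D, 2*I); w_down: (E, I, D)
    gate_up = jnp.einsum("td,edi->tei", x_flat, w_gate_up)        # (T, E, 2*I)
    gate, up = jnp.split(gate_up, 2, axis=-1)
    h = jax.nn.silu(gate) * up                                    # SwiGLU, (T, E, I)
    per_expert = jnp.einsum("tei,eid->ted", h, w_down)            # (T, E, D)
    # pick each token's K routed experts and mix with its combine weights
    picked = jnp.take_along_axis(per_expert, selected_experts[..., None], axis=1)  # (T, K, D)
    return jnp.einsum("tk,tkd->td", combine_weights, picked)      # (T, D)
```

Shapes, as implied by the call site: `x_flat` is `(tokens, hidden_dim)`, `selected_experts` and `combine_weights` are `(tokens, num_experts_per_token)`, and the output matches `x_flat`.
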
@@ -592,5 +594,4 @@ def debug_mesh_and_token_pspec(num_devices: int) -> tuple[jax.sharding.AbstractM |
592 | 594 | "RMSNorm", |
593 | 595 | "Transformer", |
594 | 596 | "debug_mesh_and_token_pspec", |
595 | | - "moe_mlp", |
596 | 597 | ] |