
Commit 8df5916
[moe] Add gated norm support and ablation launch script

Add GatedNorm (low-rank self-gating after RMSNorm) to the MoE grug model behind a new gated_norm_rank config field, and add an ablation launch script comparing baseline vs. gated-norm at ~1e19 FLOPs for the good 10T gate.

Fixes #4026

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Parent: a243fe5

3 files changed, 225 additions, 2 deletions
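The mechanism itself is small: after each RMSNorm, the normalized activations are self-gated through a rank-r bottleneck. A minimal standalone sketch of the computation this commit implements (plain jax.numpy, leaving out the repo's sharding and initialization helpers):

import jax
import jax.numpy as jnp

def gated_norm(x, w_down, w_up):
    # x: (..., D); w_down: (D, r); w_up: (r, D), with r << D.
    gate_hidden = jax.nn.silu(jnp.einsum("...d,dr->...r", x, w_down))
    gate = jax.nn.sigmoid(jnp.einsum("...r,rd->...d", gate_hidden, w_up))
    # Elementwise self-gate: x * sigmoid(up(silu(down(x)))).
    return x * gate.astype(x.dtype)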


Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Ablation: gated norms in the MoE grug model at ~1e19 FLOPs.
+
+Runs two matched configurations:
+- baseline (no gated norms)
+- gated_norm_rank=16
+
+See https://github.com/marin-community/marin/issues/4026
+"""
+
+import dataclasses
+
+from fray.cluster import ResourceConfig
+from levanter.optim import AdamConfig
+from levanter.tracker.wandb import WandbConfig
+
+from experiments.grug.moe.launch import (
+    NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
+    GrugMoeLaunchConfig,
+    GrugTrainerConfig,
+    run_grug_moe,
+)
+from experiments.grug.moe.model import GrugModelConfig
+from experiments.grug.moe.train import GrugEvalConfig
+from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
+
+GATED_NORM_RANK = 16
+
+_BASE_MODEL = GrugModelConfig(
+    vocab_size=128_256,
+    hidden_dim=768,
+    intermediate_dim=2048,
+    shared_expert_intermediate_dim=2048,
+    num_experts=8,
+    num_experts_per_token=2,
+    num_layers=12,
+    num_heads=12,
+    num_kv_heads=12,
+    max_seq_len=4096,
+)
+
+_GATED_NORM_MODEL = dataclasses.replace(_BASE_MODEL, gated_norm_rank=GATED_NORM_RANK)
+
+_OPTIMIZER = AdamConfig(
+    learning_rate=3e-3,
+    weight_decay=0.1,
+    lr_schedule="cosine",
+    decay=0.2,
+    min_lr_ratio=0.1,
+    warmup=500,
+)
+
+_TRAINER = GrugTrainerConfig(
+    z_loss_weight=1e-4,
+    ema_beta=None,
+    log_every=1,
+)
+
+_EVAL = GrugEvalConfig(
+    eval_batch_size=512,
+    steps_per_eval=500,
+    max_eval_batches=8,
+    eval_current=True,
+    eval_ema=False,
+)
+
+_WANDB_TAGS = ["grug", "moe", "good-10t", "ablation", "gated-norm"]
+_STEPS = 2_130
+_BATCH_SIZE = 512
+_RESOURCES = ResourceConfig.with_tpu("v5p-8")
+
+
+def _make_launch_config(
+    model: GrugModelConfig,
+    run_id: str,
+    wandb_group: str,
+    extra_tags: list[str] | None = None,
+) -> GrugMoeLaunchConfig:
+    tags = list(_WANDB_TAGS) + (extra_tags or [])
+    return GrugMoeLaunchConfig(
+        model=versioned(model),
+        data=NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
+        output_path=this_output_path(),
+        run_id=run_id,
+        resources=versioned(_RESOURCES),
+        steps=versioned(_STEPS),
+        batch_size=versioned(_BATCH_SIZE),
+        seed=versioned(0),
+        mp=versioned("params=float32,compute=bfloat16,output=bfloat16"),
+        tracker=WandbConfig(
+            project="marin",
+            tags=tags,
+            group=wandb_group,
+            name=None,
+        ),
+        optimizer=versioned(_OPTIMIZER),
+        grug_trainer=versioned(_TRAINER),
+        eval=versioned(_EVAL),
+    )
+
+
+ablate_gated_norm_baseline = ExecutorStep(
+    name="grug/ablate-gated-norm-baseline",
+    fn=run_grug_moe,
+    config=_make_launch_config(
+        model=_BASE_MODEL,
+        run_id="ablate-gated-norm-baseline",
+        wandb_group="ablate-gated-norm",
+        extra_tags=["baseline"],
+    ),
+)
+
+ablate_gated_norm_enabled = ExecutorStep(
+    name="grug/ablate-gated-norm-enabled",
+    fn=run_grug_moe,
+    config=_make_launch_config(
+        model=_GATED_NORM_MODEL,
+        run_id="ablate-gated-norm-enabled",
+        wandb_group="ablate-gated-norm",
+        extra_tags=[f"gated_norm_rank={GATED_NORM_RANK}"],
+    ),
+)
+
+
+if __name__ == "__main__":
+    executor_main(
+        steps=[ablate_gated_norm_baseline, ablate_gated_norm_enabled],
+        description="Ablation: gated norms in MoE grug at ~1e19 FLOPs (issue #4026).",
+    )
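At this ablation's scale the gating overhead is tiny; a back-of-envelope count, assuming only the two per-block GatedNorm modules the commit adds:

# Each GatedNorm holds w_down (hidden_dim x rank) and w_up (rank x hidden_dim).
hidden_dim, rank, num_layers = 768, 16, 12
params_per_gate = 2 * hidden_dim * rank      # 24,576
gates_per_block = 2                          # one before attention, one before the MLP
extra_params = params_per_gate * gates_per_block * num_layers
print(extra_params)  # 589,824 (~0.6M extra parameters)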

experiments/grug/moe/model.py

Lines changed: 46 additions & 2 deletions
@@ -67,6 +67,7 @@ class GrugModelConfig:
     load_balancing_loss_coef: float | None = 0.01
     router_z_loss_coef: float | None = 0.001
     moe_implementation: MoeImplementation | None = None
+    gated_norm_rank: int | None = None
     rope: RotaryConfig = dataclasses.field(default_factory=RotaryConfig)

     def __post_init__(self) -> None:
@@ -92,6 +93,8 @@ def __post_init__(self) -> None:
             raise ValueError("load_balancing_loss_coef must be non-negative when set")
         if self.router_z_loss_coef is not None and self.router_z_loss_coef < 0:
             raise ValueError("router_z_loss_coef must be non-negative when set")
+        if self.gated_norm_rank is not None and self.gated_norm_rank <= 0:
+            raise ValueError("gated_norm_rank must be positive when set")

     @property
     def inferred_head_dim(self) -> int:
@@ -156,6 +159,32 @@ def __call__(self, x: Float[Array, "... D"]) -> Float[Array, "... D"]:
         return (normed * weight).astype(dtype)


+class GatedNorm(eqx.Module):
+    """Low-rank self-gating applied after RMSNorm.
+
+    Computes: x * sigmoid(up(silu(down(x)))), where down projects from
+    hidden_dim to rank and up projects back.
+    """
+
+    w_down: jax.Array
+    w_up: jax.Array
+
+    @staticmethod
+    def init(hidden_dim: int, rank: int, initializer_std: float, *, key: PRNGKeyArray) -> "GatedNorm":
+        k_down, k_up = random.split(key)
+        return GatedNorm(
+            w_down=reshard(_init_weight(k_down, (hidden_dim, rank), initializer_std), P(None, None)),
+            w_up=reshard(_init_weight(k_up, (rank, hidden_dim), initializer_std), P(None, None)),
+        )
+
+    @named_call
+    def __call__(self, x: Float[Array, "... D"]) -> Float[Array, "... D"]:
+        gate_hidden = jnp.einsum("...d,dr->...r", x, self.w_down)
+        gate_hidden = jax.nn.silu(gate_hidden)
+        gate = jax.nn.sigmoid(jnp.einsum("...r,rd->...d", gate_hidden, self.w_up))
+        return x * gate.astype(x.dtype)
+
+
 class DenseMLP(eqx.Module):
     w_gate: jax.Array
     w_up: jax.Array
@@ -343,14 +372,16 @@ def __call__(

 class Block(eqx.Module):
     rms_attn: RMSNorm
+    gated_norm_attn: GatedNorm | None
     attn: CausalSelfAttention
     rms_mlp: RMSNorm
+    gated_norm_mlp: GatedNorm | None
     mlp: MoEMLP
     shared: DenseMLP | None

     @staticmethod
     def init(cfg: GrugModelConfig, *, key: PRNGKeyArray) -> "Block":
-        attn_key, mlp_key, shared_key = random.split(key, 3)
+        attn_key, mlp_key, shared_key, gn_attn_key, gn_mlp_key = random.split(key, 5)
         shared = None
         if cfg.shared_expert_intermediate_dim > 0:
             shared = DenseMLP.init(
@@ -359,10 +390,17 @@ def init(cfg: GrugModelConfig, *, key: PRNGKeyArray) -> "Block":
                 cfg.initializer_std,
                 key=shared_key,
             )
+        gated_norm_attn = None
+        gated_norm_mlp = None
+        if cfg.gated_norm_rank is not None:
+            gated_norm_attn = GatedNorm.init(cfg.hidden_dim, cfg.gated_norm_rank, cfg.initializer_std, key=gn_attn_key)
+            gated_norm_mlp = GatedNorm.init(cfg.hidden_dim, cfg.gated_norm_rank, cfg.initializer_std, key=gn_mlp_key)
         return Block(
             rms_attn=RMSNorm.init(cfg.hidden_dim, cfg.layer_norm_eps),
+            gated_norm_attn=gated_norm_attn,
             attn=CausalSelfAttention.init(cfg, key=attn_key),
             rms_mlp=RMSNorm.init(cfg.hidden_dim, cfg.layer_norm_eps),
+            gated_norm_mlp=gated_norm_mlp,
             mlp=MoEMLP.init(cfg, key=mlp_key),
             shared=shared,
         )
@@ -373,8 +411,13 @@ def __call__(
         x: Float[Array, "B S D"],
         mask: AttentionMask | jax.Array,
     ) -> tuple[Float[Array, "B S D"], dict[str, jax.Array]]:
-        x = x + self.attn(self.rms_attn(x), mask)
+        attn_in = self.rms_attn(x)
+        if self.gated_norm_attn is not None:
+            attn_in = self.gated_norm_attn(attn_in)
+        x = x + self.attn(attn_in, mask)
         mlp_in = self.rms_mlp(x)
+        if self.gated_norm_mlp is not None:
+            mlp_in = self.gated_norm_mlp(mlp_in)
         mlp_out, router_stats = self.mlp(mlp_in)
         if self.shared is not None:
             mlp_out = mlp_out + self.shared(mlp_in, activation=ActivationFunctionEnum.silu)
@@ -518,6 +561,7 @@ def debug_mesh_and_token_pspec(num_devices: int) -> tuple[jax.sharding.AbstractM
     "Block",
     "CausalSelfAttention",
     "DenseMLP",
+    "GatedNorm",
     "GrugModelConfig",
     "MoEMLP",
     "MoeActivation",

tests/test_grug_variant_contracts.py

Lines changed: 48 additions & 0 deletions
@@ -179,6 +179,54 @@ def build():
     assert with_ema_state_shape.ema_params is not None


+def test_grug_moe_gated_norm_lowers():
+    """Verify that the MoE variant with gated_norm_rank lowers without error."""
+    from experiments.grug.moe.model import GrugModelConfig, debug_mesh_and_token_pspec
+    from experiments.grug.moe.train import initial_state as moe_initial_state, _make_train_step
+
+    cfg = GrugModelConfig(
+        vocab_size=1024,
+        hidden_dim=32,
+        intermediate_dim=64,
+        shared_expert_intermediate_dim=64,
+        num_experts=4,
+        num_experts_per_token=2,
+        num_layers=2,
+        num_heads=2,
+        num_kv_heads=2,
+        max_seq_len=4,
+        gated_norm_rank=8,
+    )
+    optimizer = optax.adam(1e-2)
+    mp = jmp.get_policy("f32")
+    train_step = _make_train_step(optimizer, mp, z_loss_weight=0.0, ema_beta=None)
+    mesh, token_pspec = debug_mesh_and_token_pspec(num_devices=4)
+    batch = GrugLmExample(
+        tokens=jnp.zeros((8, 4), dtype=jnp.int32),
+        loss_weight=jnp.ones((8, 4), dtype=jnp.float32),
+        attn_mask=GrugAttentionMask.causal(),
+    )
+
+    def one_step():
+        sharded_batch = dataclasses.replace(
+            batch,
+            tokens=jax.sharding.reshard(batch.tokens, token_pspec),
+            loss_weight=jax.sharding.reshard(batch.loss_weight, token_pspec),
+        )
+        state = moe_initial_state(cfg, optimizer=optimizer, mp=mp, key=jax.random.PRNGKey(0), ema_beta=None)
+        return train_step(state, sharded_batch, compute_watch=False)
+
+    with _reset_abstract_mesh(), use_abstract_mesh(mesh):
+        out_state_shape, out_metrics_shape, _out_watch_shape = eqx.filter_eval_shape(one_step)
+
+    assert out_state_shape.step.shape == ()
+    assert "train/loss" in out_metrics_shape
+    # Verify gated norm params exist in the model tree
+    block = out_state_shape.params.blocks[0]
+    assert block.gated_norm_attn is not None
+    assert block.gated_norm_mlp is not None
+
+
 def test_grug_base_run_emits_expected_metrics_with_json_tracker(tmp_path: Path):
     train_module = importlib.import_module("experiments.grug.base.train")
     model_module = importlib.import_module("experiments.grug.base.model")
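The lowering test never materializes parameters or runs a real step: eqx.filter_eval_shape traces one_step on abstract shape/dtype values under an abstract mesh, so it checks that the gated-norm model compiles and shards cleanly without devices. The same idea with plain jax.eval_shape (a standalone illustration, not repo code):

import jax
import jax.numpy as jnp

# f is traced on abstract values, never executed: no FLOPs, no data allocated.
x_spec = jax.ShapeDtypeStruct((8, 4), jnp.float32)
out = jax.eval_shape(lambda x: x @ x.T, x_spec)
print(out.shape, out.dtype)  # (8, 8) float32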
