Skip to content

Commit 3c3d0a5

Browse files
[moe] Add headwise attention gate and ablation launch script
Add attention_gate config field to GrugModelConfig with headwise gating support in CausalSelfAttention. When enabled, a learned per-head sigmoid gate modulates attention output before the output projection. Includes ablation launch script comparing baseline vs headwise-gated at ~1e19 FLOPs and a lowering contract test. Fixes #4020 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a243fe5 commit 3c3d0a5

3 files changed

Lines changed: 190 additions & 2 deletions

File tree

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Ablation: attention gate in the MoE grug model at ~1e19 FLOPs.
5+
6+
Runs two matched configurations:
7+
- baseline (no attention gate)
8+
- headwise attention gate
9+
10+
See https://github.com/marin-community/marin/issues/4020
11+
"""
12+
13+
import dataclasses
14+
15+
from fray.cluster import ResourceConfig
16+
from levanter.optim import AdamConfig
17+
from levanter.tracker.wandb import WandbConfig
18+
19+
from experiments.grug.moe.launch import (
20+
NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
21+
GrugMoeLaunchConfig,
22+
GrugTrainerConfig,
23+
run_grug_moe,
24+
)
25+
from experiments.grug.moe.model import GrugModelConfig
26+
from experiments.grug.moe.train import GrugEvalConfig
27+
from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
28+
29+
# Model configuration shared by both ablation arms; the gated arm below
# differs only in the `attention_gate` field.
_BASE_MODEL = GrugModelConfig(
    vocab_size=128_256,
    hidden_dim=768,
    intermediate_dim=2048,
    shared_expert_intermediate_dim=2048,
    num_experts=8,
    num_experts_per_token=2,
    num_layers=12,
    num_heads=12,
    num_kv_heads=12,
    max_seq_len=4096,
)

# Identical to the baseline except for the headwise attention gate under test.
_GATED_MODEL = dataclasses.replace(_BASE_MODEL, attention_gate="headwise")

# Optimizer is held fixed across both arms so any metric difference is
# attributable to the gate.
_OPTIMIZER = AdamConfig(
    learning_rate=3e-3,
    weight_decay=0.1,
    lr_schedule="cosine",
    decay=0.2,
    min_lr_ratio=0.1,
    warmup=500,
)

# Trainer settings shared by both arms (no EMA; log every step).
_TRAINER = GrugTrainerConfig(
    z_loss_weight=1e-4,
    ema_beta=None,
    log_every=1,
)

# Periodic evaluation of the current (non-EMA) parameters only.
_EVAL = GrugEvalConfig(
    eval_batch_size=512,
    steps_per_eval=500,
    max_eval_batches=8,
    eval_current=True,
    eval_ema=False,
)

# Base wandb tags; per-arm tags ("baseline"/"headwise") are appended at launch.
_WANDB_TAGS = ["grug", "moe", "good-10t", "ablation", "attention-gate"]
# Step/batch budget targeting ~1e19 FLOPs per the module docstring — TODO confirm.
_STEPS = 2_130
_BATCH_SIZE = 512
_RESOURCES = ResourceConfig.with_tpu("v5p-8")
71+
72+
73+
def _make_launch_config(
    model: GrugModelConfig,
    run_id: str,
    wandb_group: str,
    extra_tags: list[str] | None = None,
) -> GrugMoeLaunchConfig:
    """Build the launch config for one ablation arm.

    Everything except ``model`` and the wandb identifiers is shared between
    arms, so the two runs differ only in the attention-gate setting.
    """
    run_tags = [*_WANDB_TAGS, *(extra_tags or [])]
    tracker = WandbConfig(
        project="marin",
        tags=run_tags,
        group=wandb_group,
        name=None,
    )
    return GrugMoeLaunchConfig(
        model=versioned(model),
        data=NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
        output_path=this_output_path(),
        run_id=run_id,
        resources=versioned(_RESOURCES),
        steps=versioned(_STEPS),
        batch_size=versioned(_BATCH_SIZE),
        seed=versioned(0),
        mp=versioned("params=float32,compute=bfloat16,output=bfloat16"),
        tracker=tracker,
        optimizer=versioned(_OPTIMIZER),
        grug_trainer=versioned(_TRAINER),
        eval=versioned(_EVAL),
    )
100+
101+
102+
# Arm 1: baseline MoE model with no attention gate.
ablate_attn_gate_baseline = ExecutorStep(
    name="grug/ablate-attn-gate-baseline",
    fn=run_grug_moe,
    config=_make_launch_config(
        model=_BASE_MODEL,
        run_id="ablate-attn-gate-baseline",
        wandb_group="ablate-attn-gate",
        extra_tags=["baseline"],
    ),
)

# Arm 2: matched run with the headwise attention gate enabled; shares the
# same wandb group so the two curves plot side by side.
ablate_attn_gate_headwise = ExecutorStep(
    name="grug/ablate-attn-gate-headwise",
    fn=run_grug_moe,
    config=_make_launch_config(
        model=_GATED_MODEL,
        run_id="ablate-attn-gate-headwise",
        wandb_group="ablate-attn-gate",
        extra_tags=["headwise"],
    ),
)
123+
124+
125+
if __name__ == "__main__":
    # Launch both arms under the executor.
    executor_main(
        steps=[ablate_attn_gate_baseline, ablate_attn_gate_headwise],
        description="Ablation: attention gate in MoE grug at ~1e19 FLOPs (issue #4020).",
    )

experiments/grug/moe/model.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import dataclasses
1212

1313
from dataclasses import dataclass
14-
from typing import get_args
14+
from typing import Literal, get_args
1515
import equinox as eqx
1616
import jax
1717
import jax.numpy as jnp
@@ -67,6 +67,7 @@ class GrugModelConfig:
6767
load_balancing_loss_coef: float | None = 0.01
6868
router_z_loss_coef: float | None = 0.001
6969
moe_implementation: MoeImplementation | None = None
70+
attention_gate: Literal["none", "headwise"] = "none"
7071
rope: RotaryConfig = dataclasses.field(default_factory=RotaryConfig)
7172

7273
def __post_init__(self) -> None:
@@ -92,6 +93,8 @@ def __post_init__(self) -> None:
9293
raise ValueError("load_balancing_loss_coef must be non-negative when set")
9394
if self.router_z_loss_coef is not None and self.router_z_loss_coef < 0:
9495
raise ValueError("router_z_loss_coef must be non-negative when set")
96+
if self.attention_gate not in ("none", "headwise"):
97+
raise ValueError(f"attention_gate must be 'none' or 'headwise', got {self.attention_gate!r}")
9598

9699
@property
97100
def inferred_head_dim(self) -> int:
@@ -109,17 +112,22 @@ class CausalSelfAttention(eqx.Module):
109112
w_k: Float[Array, "D MH"]
110113
w_v: Float[Array, "D MH"]
111114
w_o: Float[Array, "NH D"]
115+
w_gate: Float[Array, "D N"] | None
112116
cfg: GrugModelConfig = eqx.field(static=True)
113117

114118
    @staticmethod
    def init(cfg: GrugModelConfig, *, key: PRNGKeyArray) -> "CausalSelfAttention":
        """Initialize the q/k/v/o projections and, optionally, the headwise gate.

        w_gate is only materialized when cfg.attention_gate == "headwise";
        otherwise it is None and the layer applies no gating.
        """
        # Split five keys unconditionally: the q/k/v/o streams are then
        # identical whether or not the gate is enabled for a given cfg.
        k_q, k_k, k_v, k_o, k_g = random.split(key, 5)
        d, n, m, h = cfg.hidden_dim, cfg.num_heads, cfg.num_kv_heads, cfg.inferred_head_dim
        w_gate = None
        if cfg.attention_gate == "headwise":
            # One gate logit per head: (hidden_dim, num_heads).
            w_gate = reshard(_init_weight(k_g, (d, n), cfg.initializer_std), P("data", "model"))
        return CausalSelfAttention(
            w_q=reshard(_init_weight(k_q, (d, n * h), cfg.initializer_std), P("data", "model")),
            w_k=reshard(_init_weight(k_k, (d, m * h), cfg.initializer_std), P("data", "model")),
            w_v=reshard(_init_weight(k_v, (d, m * h), cfg.initializer_std), P("data", "model")),
            w_o=reshard(_init_weight(k_o, (n * h, d), cfg.initializer_std), P("model", "data")),
            w_gate=w_gate,
            cfg=cfg,
        )
125133

@@ -134,6 +142,11 @@ def __call__(self, x: Float[Array, "B S D"], mask: AttentionMask | jax.Array) ->
134142
v = rearrange(jnp.einsum("bsh,hd->bsd", x, self.w_v), "... (m d) -> ... m d", d=head_dim)
135143
q, k = apply_rotary_embedding(q, k, seq_len=seq_len, head_dim=head_dim, rope=self.cfg.rope)
136144
attn_out = attention(q, k, v, mask)
145+
if self.w_gate is not None:
146+
# Headwise gating: sigmoid(x @ w_gate) produces one scalar per head,
147+
# broadcast across head_dim. Shape: (B, S, N, 1).
148+
gate = jax.nn.sigmoid(jnp.einsum("bsd,dn->bsn", x, self.w_gate))[..., None]
149+
attn_out = attn_out * gate
137150
attn_out = rearrange(attn_out, "... n d -> ... (n d)")
138151
return jnp.einsum("bsh,hd->bsd", attn_out, self.w_o, out_sharding=batch_spec)
139152

tests/test_grug_variant_contracts.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,52 @@ def build():
179179
assert with_ema_state_shape.ema_params is not None
180180

181181

182+
def test_grug_moe_attention_gate_lowers():
    """Verify that the MoE variant with attention_gate='headwise' lowers without error."""
    from experiments.grug.moe.model import GrugModelConfig, debug_mesh_and_token_pspec
    from experiments.grug.moe.train import initial_state as moe_initial_state, _make_train_step

    model_cfg = GrugModelConfig(
        vocab_size=1024,
        hidden_dim=32,
        intermediate_dim=64,
        shared_expert_intermediate_dim=64,
        num_experts=4,
        num_experts_per_token=2,
        num_layers=2,
        num_heads=2,
        num_kv_heads=2,
        max_seq_len=4,
        attention_gate="headwise",
    )

    # Tiny debug mesh and a toy batch; shapes only — everything runs abstractly.
    mesh, token_pspec = debug_mesh_and_token_pspec(num_devices=4)
    example = GrugLmExample(
        tokens=jnp.zeros((8, 4), dtype=jnp.int32),
        loss_weight=jnp.ones((8, 4), dtype=jnp.float32),
        attn_mask=GrugAttentionMask.causal(),
    )

    optimizer = optax.adam(1e-2)
    mp = jmp.get_policy("f32")
    train_step = _make_train_step(optimizer, mp, z_loss_weight=0.0, ema_beta=None)

    def step_once():
        resharded = dataclasses.replace(
            example,
            tokens=jax.sharding.reshard(example.tokens, token_pspec),
            loss_weight=jax.sharding.reshard(example.loss_weight, token_pspec),
        )
        state = moe_initial_state(model_cfg, optimizer=optimizer, mp=mp, key=jax.random.PRNGKey(0), ema_beta=None)
        return train_step(state, resharded, compute_watch=False)

    with _reset_abstract_mesh(), use_abstract_mesh(mesh):
        state_shape, metrics_shape, _watch_shape = eqx.filter_eval_shape(step_once)

    assert state_shape.step.shape == ()
    assert "train/loss" in metrics_shape
    first_block = state_shape.params.blocks[0]
    assert first_block.attn.w_gate is not None
226+
227+
182228
def test_grug_base_run_emits_expected_metrics_with_json_tracker(tmp_path: Path):
183229
train_module = importlib.import_module("experiments.grug.base.train")
184230
model_module = importlib.import_module("experiments.grug.base.model")

0 commit comments

Comments
 (0)