Skip to content

Commit dc6e4c0

Browse files
[moe] Make capacity_factor configurable in GrugModelConfig and add sweep script
Add capacity_factor field to GrugModelConfig (default 1.25, matching the existing hardcoded value) so it can be varied in experiment sweeps. Add sweep_capacity_factor.py to sweep over {1.0, 1.125, 1.25, 1.5, 2.0}.

Fixes #4017

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a243fe5 commit dc6e4c0

3 files changed

Lines changed: 144 additions & 1 deletion

File tree

experiments/grug/moe/model.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class GrugModelConfig:
6666
initializer_std: float = 0.02
6767
load_balancing_loss_coef: float | None = 0.01
6868
router_z_loss_coef: float | None = 0.001
69+
capacity_factor: float = _DEFAULT_EP_CAPACITY_FACTOR
6970
moe_implementation: MoeImplementation | None = None
7071
rope: RotaryConfig = dataclasses.field(default_factory=RotaryConfig)
7172

@@ -92,6 +93,8 @@ def __post_init__(self) -> None:
9293
raise ValueError("load_balancing_loss_coef must be non-negative when set")
9394
if self.router_z_loss_coef is not None and self.router_z_loss_coef < 0:
9495
raise ValueError("router_z_loss_coef must be non-negative when set")
96+
if self.capacity_factor <= 0:
97+
raise ValueError("capacity_factor must be positive")
9598

9699
@property
97100
def inferred_head_dim(self) -> int:
@@ -334,7 +337,7 @@ def __call__(
334337
activation=ActivationFunctionEnum.silu,
335338
implementation=self.cfg.moe_implementation,
336339
mesh=get_abstract_mesh(),
337-
capacity_factor=_DEFAULT_EP_CAPACITY_FACTOR,
340+
capacity_factor=self.cfg.capacity_factor,
338341
)
339342
routed = rearrange(routed_flat, "(b s) d -> b s d", b=b, s=s)
340343
routed = reshard(routed, _batch_spec())
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Sweep capacity_factor for the MoE grug variant.
5+
6+
Runs the trial model at several capacity factors to determine whether the
7+
default 1.25 is safe or whether it masks avoidable overflow or throughput loss.
8+
9+
See: https://github.com/marin-community/marin/issues/4017
10+
"""
11+
12+
import dataclasses
13+
14+
from levanter.tracker.wandb import WandbConfig
15+
from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
16+
17+
from experiments.grug.moe.launch import (
18+
GRUG_MOE_TRIAL_MODEL,
19+
NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
20+
GrugMoeLaunchConfig,
21+
run_grug_moe,
22+
)
23+
from experiments.grug.moe.model import GrugModelConfig
24+
from experiments.grug.moe.train import GrugEvalConfig, GrugTrainerConfig
25+
26+
from fray.cluster import ResourceConfig
27+
from levanter.optim import AdamConfig
28+
29+
CAPACITY_FACTORS = [1.0, 1.125, 1.25, 1.5, 2.0]
30+
31+
32+
def _build_sweep_steps() -> list[ExecutorStep]:
    """Create one executor step per capacity factor in CAPACITY_FACTORS.

    Each step trains the trial MoE model for 2,000 steps with an identical
    recipe; only ``capacity_factor`` varies between runs.
    """
    sweep: list[ExecutorStep] = []
    for capacity_factor in CAPACITY_FACTORS:
        # Encode the factor in run names, e.g. 1.125 -> "cf1p125" (dots are
        # replaced so the tag is safe in paths and run ids).
        suffix = f"cf{capacity_factor:.3f}".replace(".", "p")
        swept_model = dataclasses.replace(GRUG_MOE_TRIAL_MODEL, capacity_factor=capacity_factor)
        launch_config = GrugMoeLaunchConfig(
            model=versioned(swept_model),
            data=NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
            output_path=this_output_path(),
            run_id=f"grug-moe-sweep-cf-{suffix}",
            resources=versioned(ResourceConfig.with_tpu("v5p-8")),
            steps=versioned(2_000),
            batch_size=versioned(512),
            seed=versioned(0),
            mp=versioned("params=float32,compute=bfloat16,output=bfloat16"),
            tracker=WandbConfig(
                project="marin",
                tags=["grug", "moe", "sweep", "capacity-factor"],
                group="grug-moe-sweep-capacity-factor",
                name=None,
            ),
            optimizer=versioned(
                AdamConfig(
                    learning_rate=3e-3,
                    weight_decay=0.1,
                    lr_schedule="cosine",
                    decay=0.2,
                    min_lr_ratio=0.1,
                    warmup=1000,
                )
            ),
            grug_trainer=versioned(
                GrugTrainerConfig(
                    z_loss_weight=1e-4,
                    ema_beta=None,
                    log_every=1,
                )
            ),
            eval=versioned(
                GrugEvalConfig(
                    eval_batch_size=512,
                    steps_per_eval=1000,
                    max_eval_batches=8,
                    eval_current=True,
                    eval_ema=False,
                )
            ),
        )
        sweep.append(
            ExecutorStep(
                name=f"grug/moe-sweep-cf-{suffix}",
                fn=run_grug_moe,
                config=launch_config,
            )
        )
    return sweep
87+
88+
89+
# Built at import time so executor tooling that imports this module can
# discover the sweep steps without running the script.
sweep_steps = _build_sweep_steps()

if __name__ == "__main__":
    executor_main(
        steps=sweep_steps,
        description="Sweep capacity_factor over {1.0, 1.125, 1.25, 1.5, 2.0} for the MoE grug trial model.",
    )

tests/test_grug_variant_contracts.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,3 +263,48 @@ def test_grug_base_run_emits_expected_metrics_with_json_tracker(tmp_path: Path):
263263
]
264264
for key in required_keys:
265265
assert key in summary
266+
267+
268+
@pytest.mark.parametrize("capacity_factor", [1.0, 1.5, 2.0])
def test_moe_capacity_factor_config_propagates_to_lowering(capacity_factor: float):
    """Verify that GrugModelConfig.capacity_factor is accepted and the model lowers with non-default values."""
    # Imported lazily inside the test so collecting the suite does not require
    # the heavy grug modules to import successfully.
    model_module = importlib.import_module("experiments.grug.moe.model")
    train_module = importlib.import_module("experiments.grug.moe.train")
    model_config_cls = model_module.GrugModelConfig
    mesh_fn = model_module.debug_mesh_and_token_pspec

    # The config must carry the swept value verbatim (no clamping/rounding).
    cfg = model_config_cls(vocab_size=1024, capacity_factor=capacity_factor)
    assert cfg.capacity_factor == capacity_factor

    optimizer = optax.adam(1e-2)
    mp = jmp.get_policy("f32")
    train_step = train_module._make_train_step(optimizer, mp, z_loss_weight=0.0, ema_beta=None)
    mesh, token_pspec = mesh_fn(num_devices=4)
    # Tiny batch: 8 sequences of 4 tokens is enough to exercise lowering.
    batch = GrugLmExample(
        tokens=jnp.zeros((8, 4), dtype=jnp.int32),
        loss_weight=jnp.ones((8, 4), dtype=jnp.float32),
        attn_mask=GrugAttentionMask.causal(),
    )

    def one_step():
        # Reshard inputs onto the token partition spec before stepping.
        sharded_batch = dataclasses.replace(
            batch,
            tokens=jax.sharding.reshard(batch.tokens, token_pspec),
            loss_weight=jax.sharding.reshard(batch.loss_weight, token_pspec),
        )
        state = train_module.initial_state(cfg, optimizer=optimizer, mp=mp, key=jax.random.PRNGKey(0), ema_beta=None)
        return train_step(state, sharded_batch, compute_watch=False)

    # filter_eval_shape traces/lowers without executing, so this validates the
    # capacity factor reaches the MoE lowering even on hosts without 4 devices.
    with _reset_abstract_mesh(), use_abstract_mesh(mesh):
        out_state_shape, out_metrics_shape, _ = eqx.filter_eval_shape(one_step)

    # Sanity-check the traced outputs: a scalar step counter and loss metric.
    assert out_state_shape.step.shape == ()
    assert "train/loss" in out_metrics_shape
303+
304+
305+
def test_moe_capacity_factor_rejects_non_positive():
    """Zero and negative capacity factors must be rejected at config construction."""
    model_module = importlib.import_module("experiments.grug.moe.model")
    for bad_value in (0.0, -1.0):
        with pytest.raises(ValueError, match="capacity_factor must be positive"):
            model_module.GrugModelConfig(vocab_size=1024, capacity_factor=bad_value)

0 commit comments

Comments
 (0)