# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""Good 10T experiment: inv-sqrt LR schedule for MoE.

Compares inverse-square-root learning-rate decay against the default cosine
schedule on the standard MoE trial configuration. Everything else (model, data,
resources, training steps) is identical to the cosine baseline in launch.py so
the comparison is apples-to-apples.

Tracking issue: https://github.com/marin-community/marin/issues/4028
"""

from fray.cluster import ResourceConfig
from levanter.optim import AdamConfig
from levanter.tracker.wandb import WandbConfig
from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned

from experiments.grug.moe.launch import (
    GRUG_MOE_TRIAL_MODEL,
    GrugMoeLaunchConfig,
    NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
    _resolve_run_id,
    run_grug_moe,
)

# Run id for this experiment, derived from a fixed base name by the launch
# module's helper. Resolved once at import time and reused in the step config.
RESOLVED_RUN_ID = _resolve_run_id("grug-moe-inv-sqrt-lr")

# Same optimizer as the cosine baseline but with inv_sqrt schedule. The decay
# fraction is omitted because inv_sqrt decays continuously from peak rather
# than using a cosine-style stable/decay split.
#
# NOTE(review): the run below is only 2_000 steps with a 1000-step warmup.
# Confirm against Levanter's inv_sqrt implementation (in particular its decay
# timescale) that the learning rate actually leaves its peak within that
# horizon; otherwise this run does not exercise the decay being compared.
INV_SQRT_OPTIMIZER = AdamConfig(
    learning_rate=3e-3,
    weight_decay=0.1,
    lr_schedule="inv_sqrt",
    min_lr_ratio=0.1,
    warmup=1000,
)

# Executor step that launches the MoE training run with the inv-sqrt optimizer.
# Model, resources, step count, batch size, seed, mixed-precision policy and
# optimizer are wrapped in `versioned(...)`; data, run id and tracker are not.
grug_moe_inv_sqrt_lr = ExecutorStep(
    name="grug/moe-inv-sqrt-lr",
    fn=run_grug_moe,
    config=GrugMoeLaunchConfig(
        model=versioned(GRUG_MOE_TRIAL_MODEL),
        data=NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
        output_path=this_output_path(),
        run_id=RESOLVED_RUN_ID,
        resources=versioned(ResourceConfig.with_tpu("v5p-8")),
        steps=versioned(2_000),
        batch_size=versioned(512),
        seed=versioned(0),
        mp=versioned("params=float32,compute=bfloat16,output=bfloat16"),
        tracker=WandbConfig(
            project="marin",
            tags=["grug", "moe", "inv-sqrt-lr", "good-10t"],
            group="grug-moe-inv-sqrt-lr",
            # No explicit W&B run name is set here.
            name=None,
        ),
        optimizer=versioned(INV_SQRT_OPTIMIZER),
    ),
)


if __name__ == "__main__":
    # Script entry point: hand the single step to the Marin executor.
    executor_main(
        steps=[grug_moe_inv_sqrt_lr],
        description="Good 10T: inv-sqrt LR schedule for MoE (issue #4028).",
    )