Skip to content

Commit 30b49fb

Browse files
[moe] Add inv-sqrt LR schedule experiment for Good 10T gate
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a243fe5 commit 30b49fb

1 file changed

Lines changed: 68 additions & 0 deletions

File tree

Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,68 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Good 10T experiment: inv-sqrt LR schedule for MoE.
5+
6+
Compares inverse-square-root learning-rate decay against the default cosine
7+
schedule on the standard MoE trial configuration. Everything else (model, data,
8+
resources, training steps) is identical to the cosine baseline in launch.py so
9+
the comparison is apples-to-apples.
10+
11+
Tracking issue: https://github.com/marin-community/marin/issues/4028
12+
"""
13+
14+
from fray.cluster import ResourceConfig
15+
from levanter.optim import AdamConfig
16+
from levanter.tracker.wandb import WandbConfig
17+
from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
18+
19+
from experiments.grug.moe.launch import (
20+
GRUG_MOE_TRIAL_MODEL,
21+
GrugMoeLaunchConfig,
22+
NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
23+
_resolve_run_id,
24+
run_grug_moe,
25+
)
26+
27+
# Run identifier for this experiment, built from the default name
# "grug-moe-inv-sqrt-lr" by the helper imported from the baseline launch module.
# NOTE(review): _resolve_run_id is a private helper of launch.py; presumably it
# lets the default be overridden at launch time -- confirm against launch.py.
RESOLVED_RUN_ID = _resolve_run_id("grug-moe-inv-sqrt-lr")
28+
29+
# Adam settings for this run: the cosine baseline's optimizer with only the
# schedule swapped to "inv_sqrt". No decay fraction is passed, because inv_sqrt
# decays continuously from the peak learning rate instead of using a
# cosine-style stable/decay split.
# NOTE(review): warmup=1000 alongside steps=2_000 in the step definition --
# presumably warmup is a step count, so half the run would be warmup; confirm
# this matches the baseline in launch.py.
INV_SQRT_OPTIMIZER = AdamConfig(
    lr_schedule="inv_sqrt",
    learning_rate=3e-3,
    min_lr_ratio=0.1,
    warmup=1000,
    weight_decay=0.1,
)
39+
40+
# Executor step that launches the MoE trial run with the inv-sqrt optimizer.
# Every field other than `optimizer` is meant to mirror the cosine baseline
# (see the module docstring), so the two runs differ only in LR schedule.
grug_moe_inv_sqrt_lr = ExecutorStep(
    name="grug/moe-inv-sqrt-lr",
    fn=run_grug_moe,
    config=GrugMoeLaunchConfig(
        # Run identity and output location.
        run_id=RESOLVED_RUN_ID,
        output_path=this_output_path(),
        # Model and data mixture.
        model=versioned(GRUG_MOE_TRIAL_MODEL),
        data=NEMOTRON_MIX_WITH_DEFAULT_VALIDATION,
        # Training budget, seed and precision policy.
        steps=versioned(2_000),
        batch_size=versioned(512),
        seed=versioned(0),
        mp=versioned("params=float32,compute=bfloat16,output=bfloat16"),
        # The only knob that differs from the baseline.
        optimizer=versioned(INV_SQRT_OPTIMIZER),
        # Hardware.
        resources=versioned(ResourceConfig.with_tpu("v5p-8")),
        # Weights & Biases tracking; name=None leaves the run name unset here.
        tracker=WandbConfig(
            project="marin",
            group="grug-moe-inv-sqrt-lr",
            tags=["grug", "moe", "inv-sqrt-lr", "good-10t"],
            name=None,
        ),
    ),
)
62+
63+
64+
if __name__ == "__main__":
    # Script entry point: hand the single experiment step to the executor.
    executor_main(
        description="Good 10T: inv-sqrt LR schedule for MoE (issue #4028).",
        steps=[grug_moe_inv_sqrt_lr],
    )

0 commit comments

Comments
 (0)