Skip to content

Commit 7f6e5ba

Browse files
authored
Speedrun changes iteration (#2161)
## Description Addresses feedback from Percy (see #1981) - more detailed documentation, comments - eliminate forking from step 1; ask to do in step 3 at submission time
1 parent a6058de commit 7f6e5ba

File tree

4 files changed

+342
-88
lines changed

4 files changed

+342
-88
lines changed

docs/tutorials/submitting-speedrun.md

Lines changed: 178 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,19 @@
22

33
The Marin Speedrun, inspired by the [nanogpt Speedrun](https://github.com/KellerJordan/modded-nanogpt), is a benchmark aimed at improving the compute efficiency of language model training. This tutorial assumes you are familiar with the core premise of the Marin Speedrun—if not, check out [the overview of Marin Speedrun](../explanations/speedrun.md) for a more detailed explanation. Let's walk through how to submit your first speedrun to the leaderboard.
44

5-
![Marin speedrun 3 steps](../images/marin-speedrun-3-steps.png){width=75%}
5+
![Marin speedrun 3 steps](../images/marin-speedrun-3-steps.png){width=50%}
66

77
## Quickstart (GPU environment with CUDA 12)
88

9-
**1. Fork & Setup Marin**
9+
**1. Setup**
1010

11-
[Click here](https://github.com/marin-community/marin/fork) to fork Marin, then clone your fork. Run the Marin setup script:
11+
You can get started by just running:
1212

1313
```bash
1414
curl -LsSf https://raw.githubusercontent.com/marin-community/marin/refs/heads/main/scripts/speedrun/onboarding_setup.sh | bash
1515
```
1616

17-
You can skip manually creating the fork if you have GitHub CLI installed and authenticated, which the script will use.
17+
Note: this clones Marin for local development. When you want to contribute your submission back to Marin, you will need to convert your clone to a fork and submit a PR (see Step 3 below). If you have the GitHub CLI installed and authenticated, a fork will be created for you automatically.
1818

1919
<details><summary>Manual Setup Steps (alternative)</summary>
2020

@@ -43,33 +43,195 @@ Create a subdirectory under <code>experiments/speedrun</code> and copy a starter
4343

4444
</details>
4545

46-
**2. Develop & Test Submission**
46+
**2. Develop**
4747

48-
You can now work on your speedrun submission! You can check your code and your estimated compute cost using a dry run:
48+
You can now work on your speedrun submission! The setup script has prepared the ["hackable transformer"](https://github.com/marin-community/marin/blob/main/experiments/hackable_transformer_starter_template.py) starter file for you. This is a self-contained file containing the implementation of a transformer-based language model, along with configurations for training it at four different sizes (130M, 300M, 520M, 1.2B parameters). Sections that require your attention are marked with TODOs.
49+
50+
You can check your code and your estimated compute cost using a dry run:
4951

5052
```bash
51-
python -m experiments.speedrun.my_submission.main --dry_run true --prefix local_store
53+
python -m experiments.speedrun.${NAME_OF_YOUR_SUBMISSION}.main --dry_run true --prefix local_store
54+
```
55+
56+
which will show the output of `print_run_info()` without starting training.
57+
58+
<details><summary>Example print_run_info() output (1x A100-80G, training 130m configuration)</summary>
59+
60+
```
61+
----- START OF PRINT RUN INFO -----
62+
Speedrun Configuration:
63+
{
64+
"author":
65+
{
66+
"name": "__SUBMISSION_AUTHOR_NAME__",
67+
"affiliation": "__SUBMISSION_AUTHOR_AFFILIATION__",
68+
"url": "__SUBMISSION_AUTHOR_URL__"
69+
},
70+
"description": "__SUBMISSION_DESCRIPTION__ (130m)",
71+
"model_config":
72+
{
73+
"cross_entropy_block_size": 4096,
74+
"seq_len": 4096,
75+
"hidden_dim": 512,
76+
"intermediate_dim": 1792,
77+
"num_layers": 6,
78+
"num_heads": 8,
79+
"num_kv_heads": 8,
80+
"head_dim": null,
81+
"activation_function": "silu",
82+
"use_bias": false,
83+
"use_layer_norm_weight": true,
84+
"layer_norm_epsilon": 0.00001,
85+
"tie_word_embeddings": false,
86+
"input_embedding_norm": false,
87+
"use_attention_sink": false,
88+
"upcast_attn": false,
89+
"attn_backend": "JAX_FLASH",
90+
"flash_attention_block_size": null,
91+
"rope":
92+
{
93+
"theta": 10000,
94+
"factor": 1.0
95+
},
96+
"qk_norm": null,
97+
"gradient_checkpointing": true,
98+
"initializer_range": 0.02,
99+
"reference_checkpoint": "NousResearch/Llama-2-7b-hf",
100+
"tokenizer": null
101+
},
102+
"train_config":
103+
{
104+
"train_batch_size": 128,
105+
"num_train_steps": 5880,
106+
"learning_rate": 0.032,
107+
"data_seed": null,
108+
"weight_decay": null,
109+
"beta1": null,
110+
"beta2": null,
111+
"epsilon": null,
112+
"max_grad_norm": null,
113+
"warmup": null,
114+
"decay": null,
115+
"rewarmup": null,
116+
"lr_schedule": null,
117+
"min_lr_ratio": null,
118+
"cycle_length": null,
119+
"z_loss_weight": null,
120+
"ema_beta": null,
121+
"skip_bad_steps": false,
122+
"steps_per_eval": null,
123+
"steps_per_export": 10000,
124+
"steps_per_task_eval": null,
125+
"steps_per_hf_export": -1,
126+
"per_device_eval_parallelism": null,
127+
"max_eval_batches": null,
128+
"initialize_from_checkpoint_path": null,
129+
"initialize_from_hf": null,
130+
"reset_data_loader_on_init": true,
131+
"allow_partial_checkpoint": false,
132+
"int8": false,
133+
"optimizer_config":
134+
{
135+
"learning_rate": 0.032,
136+
"weight_decay": 0.1,
137+
"min_lr_ratio": 0,
138+
"warmup": 0,
139+
"decay": 0.8,
140+
"rewarmup": 0.0,
141+
"weight_decay_modules": null,
142+
"default_weight_decay_mask": null,
143+
"lr": 0.02,
144+
"adam_lr": 0.0064,
145+
"momentum": 0.95,
146+
"nesterov": true,
147+
"backend_steps": 5,
148+
"adam_weight_decay": null,
149+
"beta1": 0.8,
150+
"beta2": 0.98,
151+
"epsilon": 1E-15,
152+
"muon_epsilon": 0.00001,
153+
"max_grad_norm": 1,
154+
"use_kimi_scaling": false
155+
},
156+
"watch":
157+
{
158+
"watch_targets":
159+
[
160+
"grads",
161+
"params"
162+
],
163+
"include_norms": true,
164+
"include_per_parameter_norms": true,
165+
"include_histograms": false,
166+
"split_scan_layers": true,
167+
"interval": 10
168+
},
169+
"profiler": false,
170+
"profiler_start_step": 5,
171+
"profiler_num_steps": 100
172+
},
173+
"tokenized_dataset": "ExecutorStep(name='tokenized/subcache/fineweb-edu-10B', fn=<function _actually_download_pretokenized_cache at 0x7c3779c26520>, config=PretokenizedCacheDownloadConfig(cache_path=OutputName(name=None), tokenizer=VersionedValue(value='marin-community/marin-tokenizer'), hf_repo_id=VersionedValue(value='marin-community/fineweb-edu-pretokenized-10B'), hf_revision=VersionedValue(value=None), hf_repo_type_prefix='datasets', hf_token=None, format=TextLmDatasetFormat(text_key='text'), cache_options=None, tags=[]), description=None, override_output_path=None, pip_dependency_groups=None)",
174+
"resources":
175+
{
176+
"gpu_count": 1,
177+
"accelerator_type": "A100-80G",
178+
"device_flops_override": null
179+
}
180+
}
181+
The rough estimated compute (calculated as (total model FLOPs / Assumed MFU)) for your run is probably between:
182+
* 4.21e+18 FLOPs assuming an MFU of 0.5, and
183+
* 1.05e+19 FLOPs assuming an MFU of 0.2.
184+
185+
This is calculated based on assumed MFU values and can be used as a rough estimate to guide your config/training setup.
186+
Hardware and Model FLOPS Information:
187+
Number of devices: 1
188+
Number of chips: 1
189+
Device FLOPs: 3.12e+14 FLOP/s
190+
Total peak hardware FLOPs: 3.12e+14 FLOP/s
191+
Model FLOPs: 2.11e+18 FLOP
192+
Model size: 154.15 million parameters
193+
----- END OF PRINT RUN INFO -----
52194
```
53195

54-
then fire off training on your hardware.
196+
</details>
197+
198+
Remove the dry run setting when you are ready and fire off training on your hardware.
55199

56-
**3. Open PR & Merge**
200+
**3. Submit**
57201

58202
When you are ready, open a PR and contribute to Marin. We ask that you:
59203

60204
- Give a brief explanation of your approach (model architecture, training strategy, optimizations)
61-
- Include the output of `print_run_info()` in the PR description (obtainable via a dry run), and `speedrun_results.json` files
62-
- Leave "Allow edits by maintainers" on so we can help work on your code and scale up your ideas on TPU clusters
205+
- Include the output of `print_run_info()` in the PR description, along with your `speedrun_results.json` file(s)
206+
- Leave "Allow edits by maintainers" on so we can help work on your code and scale up your ideas on Marin's clusters
63207

64-
Once the PR is merged, your run will appear on the [public leaderboard](https://marin.community/speedrun/).
208+
!!! info
209+
210+
If you did not create a fork of Marin on GitHub previously, you need to do it now to be able to submit a PR. You can convert the existing repo into a fork using the following steps:
211+
212+
1. Install the GitHub CLI (see [https://github.com/cli/cli#installation](https://github.com/cli/cli#installation)) and log in to your GitHub account with `gh auth login`.
213+
2. Inside the Marin repo, run `gh repo fork`, and press `y` to add a remote. You should see the following:
214+
```
215+
$ gh repo fork
216+
✓ Created fork {YOUR_GITHUB_USER_NAME}/marin
217+
? Would you like to add a remote for the fork? Yes
218+
✓ Renamed remote origin to upstream
219+
✓ Added remote origin
220+
```
221+
222+
3. Run `git push -u origin HEAD` to push your changes to your fork.
223+
4. Run `gh repo set-default` and select `marin-community/marin` to contribute to.
224+
5. Run `gh pr create --web` to create the PR in your browser. Marin staff will then review your submission.
225+
226+
Once the PR is merged, your submission will appear on the [public leaderboard](https://marin.community/speedrun/).
65227

66228
## FAQ
67229

68230
### How do I create my speedrun submission?
69231

70232
1. Create a new directory for your run:
71233
```bash
72-
mkdir -p experiments/speedrun/my_submission
234+
mkdir -p experiments/speedrun/${NAME_OF_YOUR_SUBMISSION}
73235
```
74236

75237
2. Create your training script in this directory. You can start by copying the "[hackable transformer](https://github.com/marin-community/marin/blob/main/experiments/hackable_transformer_starter_template.py)" starter file, where a generic transformer language model is implemented for you to make changes easily. To see examples of other speedruns and configurations, check out the [speedrun directory](https://github.com/marin-community/marin/tree/main/experiments/speedrun). You can also [add new optimizers](https://github.com/marin-community/marin/blob/main/docs/tutorials/add-optimizer.md), change learning rate schedules, play with hyperparameters, etc.
@@ -95,24 +257,23 @@ export WANDB_PROJECT=...
95257
Train your model locally:
96258

97259
```bash
98-
python -m experiments.speedrun.my_submission.main --force_run_failed true --prefix local_store
260+
python -m experiments.speedrun.${NAME_OF_YOUR_SUBMISSION}.main --force_run_failed true --prefix local_store
99261
```
100262

101263
If you have a remote Ray cluster set up:
102264

103265
```bash
104-
python marin/run/ray_run.py -- python -m experiments.speedrun.my_submission.main --force_run_failed true --prefix local_store
266+
python marin/run/ray_run.py -- python -m experiments.speedrun.${NAME_OF_YOUR_SUBMISSION}.main --force_run_failed true --prefix local_store
105267
```
106268

107269
### How do I submit my results?
108270

109271
1. Add the resulting `speedrun_results.json` file to your run directory:
110272
```bash
111-
cp ${MARIN_PREFIX}/checkpoints/speedrun/speedrun_results.json experiments/speedrun/<your_run_name>/
273+
cp ${MARIN_PREFIX}/checkpoints/speedrun/speedrun_results.json experiments/speedrun/${NAME_OF_YOUR_SUBMISSION}/
112274
```
113275

114276
2. Create a pull request including:
115-
116277
- Your run directory (training script and results file)
117278
- A brief explanation of your approach (model architecture, training strategy, optimizations)
118279

experiments/hackable_transformer_starter_template.py

Lines changed: 57 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,18 @@
2727
3) Optional: SR_USE_TPU=1 to use TPU resource presets (default is GPU).
2828
"""
2929

30+
# =========================
31+
# Submission metadata
32+
# TODO: fill out your information when you start
33+
# =========================
34+
35+
SUBMISSION_BRANCH = "__SUBMISSION_BRANCH__"
36+
SUBMISSION_DESCRIPTION = "__SUBMISSION_DESCRIPTION__"
37+
SUBMISSION_AUTHOR_NAME = "__SUBMISSION_AUTHOR_NAME__"
38+
SUBMISSION_AUTHOR_AFFILIATION = "__SUBMISSION_AUTHOR_AFFILIATION__"
39+
SUBMISSION_AUTHOR_URL = "__SUBMISSION_AUTHOR_URL__"
40+
41+
# ruff: noqa: E402
3042
# nodryrun
3143
import sys
3244
import os
@@ -68,19 +80,10 @@
6880

6981
silence_transformer_nag()
7082

71-
# =========================
72-
# Submission metadata (filled by onboarding)
73-
# =========================
74-
# The onboarding workflow replaces these placeholders before committing the file.
75-
SUBMISSION_BRANCH = "__SUBMISSION_BRANCH__"
76-
SUBMISSION_DESCRIPTION = "__SUBMISSION_DESCRIPTION__"
77-
SUBMISSION_AUTHOR_NAME = "__SUBMISSION_AUTHOR_NAME__"
78-
SUBMISSION_AUTHOR_AFFILIATION = "__SUBMISSION_AUTHOR_AFFILIATION__"
79-
SUBMISSION_AUTHOR_URL = "__SUBMISSION_AUTHOR_URL__"
80-
8183

8284
# =========================
8385
# Hackable config & modules
86+
# TODO: make any model architecture changes
8487
# =========================
8588

8689

@@ -365,6 +368,12 @@ def _get_num_train_steps(param_count: int, batch_size: int, seq_len: int, tpp: i
365368
return max(1, total_tokens // (batch_size * seq_len))
366369

367370

371+
# =========================
372+
# Model configuration presets
373+
# TODO: make any model configuration changes
374+
# =========================
375+
376+
368377
def _size_presets() -> dict[str, HackableTransformerConfig]:
369378
base = dict(
370379
seq_len=4096,
@@ -390,6 +399,25 @@ def _size_presets() -> dict[str, HackableTransformerConfig]:
390399
}
391400

392401

402+
# =========================
403+
# Muon optimizer presets
404+
# See https://wandb.ai/marin-community/marin/reports/Fantastic-Optimizers-and-Where-to-Find-Them--VmlldzoxMjgzMzQ2NQ
405+
# TODO: make any optimizer changes. You can use different optimizers: e.g.,
406+
# "130m": AdamHConfig(
407+
# learning_rate=0.02,
408+
# adam_lr=0.008,
409+
# min_lr_ratio=0,
410+
# warmup=1000,
411+
# beta1=0.9,
412+
# beta2=0.98,
413+
# epsilon=1e-20,
414+
# max_grad_norm=1,
415+
# nesterov=False,
416+
# ),
417+
# see available optimizers in lib/levanter/src/levanter/optim
418+
# =========================
419+
420+
393421
def _muon_presets() -> dict[str, MuonConfig]:
394422
return {
395423
"130m": MuonConfig(
@@ -455,6 +483,14 @@ def _muon_presets() -> dict[str, MuonConfig]:
455483
}
456484

457485

486+
# =========================
487+
# Resource presets (IMPORTANT!)
488+
# TODO: edit tpu_type or accelerator_type to match what you have available on your hardware
489+
# e.g., GpuConfig(gpu_count=8, accelerator_type="H100"),
490+
# If there is a mismatch, training cannot start because an unavailable resource will be requested!
491+
# =========================
492+
493+
458494
def _resource_presets(use_tpu: bool = False):
459495
if use_tpu:
460496
return {
@@ -471,6 +507,12 @@ def _resource_presets(use_tpu: bool = False):
471507
}
472508

473509

510+
# =========================
511+
# Batch size presets
512+
# TODO: edit to adjust for your hardware
513+
# =========================
514+
515+
474516
def _batch_sizes() -> dict[str, int]:
475517
return {"130m": 128, "300m": 128, "520m": 128, "1_2b": 256}
476518

@@ -520,7 +562,11 @@ def build_run(size: str, *, use_tpu: bool = False) -> tuple[str, SpeedrunConfig]
520562
_cls.__module__ = _IMPORT_PATH
521563
###
522564

523-
sizes = ["130m", "300m", "520m", "1_2b"]
565+
sizes = [
566+
"130m",
567+
]
568+
# TODO: uncomment to run all sizes
569+
# sizes = ["130m", "300m", "520m", "1_2b"]
524570
use_tpu = bool(int(os.environ.get("SR_USE_TPU", "0")))
525571
steps = []
526572
for s in sizes:

lib/marin/src/marin/speedrun/speedrun.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def print_run_info(self) -> None:
136136
total_peak_flops = device_flops * num_chips
137137

138138
# Print simplified config info
139+
logger.info("----- START OF PRINT RUN INFO -----")
139140
logger.info("Speedrun Configuration:")
140141
logger.info(json.dumps(self.as_json_dict(), indent=4))
141142

@@ -154,6 +155,7 @@ def print_run_info(self) -> None:
154155
logger.info("Model size: unknown (model did not report total_trainable_params).")
155156
else:
156157
logger.info(f"Model size: {model_size/1e6:.2f} million parameters")
158+
logger.info("----- END OF PRINT RUN INFO -----")
157159

158160
def compute_model_flops(self) -> float:
159161
# TODO (Nikil): make this a helper and handle edge-cases

0 commit comments

Comments
 (0)