Skip to content

Commit b304223

Browse files
authored
Merge pull request #334 from alan-turing-institute/2026-04-18/runs
Update config timings and naming resolution
2 parents bb42acc + 58712c4 commit b304223

10 files changed

Lines changed: 103 additions & 41 deletions

File tree

slurm_scripts/comparison/README.md

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,14 +99,32 @@ extracted per-dataset from `*_timing.sh` via
9999
`uv run autocast time-epochs --from-checkpoint <path>/timing.ckpt -b 24`
100100
and live in `COSINE_EPOCHS_BY_DATASET` at the top of each `*_large.sh`.
101101

102-
Ambient (timing 2026-04-17):
102+
Ambient (timing 2026-04-18; CRPS from `timing_efficient_crps/`, FM from `timing/`):
103103

104104
| variant | gray_scott | gpe_laser_only_wake | cond_navier_stokes | advection_diffusion |
105105
|---|---|---|---|---|
106-
| CRPS ambient | **398** (212.3 s/ep) | 477 (177.3) | 471 (179.5) | 486 (174.0) |
107-
| FM ambient | **2619** (32.3 s/ep) | 3097 (27.3) | 2917 (29.0) | 3279 (25.8) |
106+
| CRPS ambient (permute_concat) | **399** (212.0 s/ep) | 477 (177.2) | 473 (178.8) | 478 (177.0) |
107+
| CRPS-via-AE ambient (EPD) | **49** (1724.6 s/ep) | 85 (991.0) | 85 (985.0) | 58 (1436.9) |
108+
| FM ambient | **2631** (32.2 s/ep) | 3171 (26.7) | 2982 (28.4) | 3264 (25.9) |
108109

109-
Latent: placeholders (1080) pending `submit_{crps_latent,fm}_timing.sh`.
110+
Latent (timing 2026-04-18, FM only — full 4-dataset CRPS-latent timing pending):
111+
112+
| variant | gray_scott | gpe_laser_only_wake | cond_navier_stokes | advection_diffusion |
113+
|---|---|---|---|---|
114+
| FM latent (cached) | **2830** (29.9 s/ep) | 3411 (24.8) | 3223 (26.3) | 3314 (25.6) |
115+
| CRPS latent (cached) | 1080 (placeholder) | 1080 | 1080 | 1080 |
116+
117+
CNS-only ablations (timing 2026-04-18):
118+
119+
| variant | conditioned_navier_stokes |
120+
|---|---|
121+
| CRPS ambient (identity + global_cond AdaLN) | 469 (180.3 s/ep) |
122+
| CRPS latent (cached, ablation) | 345 (245.0 s/ep) |
123+
124+
All `s/ep` values are the mean across the n=5 epoch durations recorded by
125+
`TrainingTimerCallback` in `timing.ckpt` (saved after `on_train_end`, so
126+
it captures the final epoch — unlike `last.ckpt`, which is saved during
127+
`on_train_epoch_end` and only holds the first 4 durations).
110128

111129
Each script saves quarter-schedule checkpoints (every `cosine_epochs / 4`)
112130
plus `last.ckpt` at train-end (guaranteed final state). Quarter boundaries

slurm_scripts/comparison/cached_latents/submit_crps_latent_cns_large.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ set -euo pipefail
1414
DATAMODULE="conditioned_navier_stokes"
1515
EXPERIMENT="processor/conditioned_navier_stokes/crps_vit_azula_large"
1616
AE_RUN_DIR="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8"
17-
COSINE_EPOCHS=1080 # placeholder
17+
COSINE_EPOCHS=345 # 245.0 s/ep (timing_efficient_crps, 2026-04-18)
1818

1919
BUDGET_MAX_TIME="00:23:59:00"
2020
# SLURM timeout with 1-min buffer beyond the 24h budget.
@@ -58,5 +58,5 @@ for run_dry in "${RUN_DRY_STATES[@]}"; do
5858
+trainer.max_epochs="${COSINE_EPOCHS}" \
5959
trainer.callbacks.0.every_n_epochs="${quarter_epochs}" \
6060
trainer.callbacks.0.save_top_k=-1 \
61-
trainer.callbacks.0.filename="quarter-{epoch:04d}"
61+
trainer.callbacks.0.filename=\"quarter-{epoch:04d}\"
6262
done

slurm_scripts/comparison/cached_latents/submit_crps_latent_large.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,6 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
8383
+trainer.max_epochs="${cosine_epochs}" \
8484
trainer.callbacks.0.every_n_epochs="${quarter_epochs}" \
8585
trainer.callbacks.0.save_top_k=-1 \
86-
trainer.callbacks.0.filename="quarter-{epoch:04d}"
86+
trainer.callbacks.0.filename=\"quarter-{epoch:04d}\"
8787
done
8888
done

slurm_scripts/comparison/cached_latents/submit_fm_large.sh

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,17 @@ set -euo pipefail
99
# the authoritative hyperparameters.
1010
#
1111
# Per-dataset cosine schedule: each (method, dataset) pair fills its own
12-
# 24h budget so each model gets its best shot within budget. PLACEHOLDERS
13-
# pending submit_fm_timing.sh — extract per-dataset values via
12+
# 24h budget so each model gets its best shot within budget. Values from
13+
# submit_fm_timing.sh (2026-04-18) via
1414
# uv run autocast time-epochs --from-checkpoint <path>/timing.ckpt -b 24
15-
# and replace each entry below.
1615
#
1716
# learning_rate (1e-4) and warmup (0) are baked into each per-dataset
1817
# local_experiment config; adjust the yaml to change them.
1918
declare -A COSINE_EPOCHS_BY_DATASET=(
20-
["gray_scott"]=1080 # placeholder
21-
["gpe_laser_only_wake"]=1080 # placeholder
22-
["conditioned_navier_stokes"]=1080 # placeholder
23-
["advection_diffusion"]=1080 # placeholder
19+
["gray_scott"]=2830 # 29.9 s/ep
20+
["gpe_laser_only_wake"]=3411 # 24.8 s/ep
21+
["conditioned_navier_stokes"]=3223 # 26.3 s/ep
22+
["advection_diffusion"]=3314 # 25.6 s/ep
2423
)
2524
BUDGET_MAX_TIME="00:23:59:00"
2625
# SLURM timeout with 1-min buffer beyond the 24h budget.
@@ -82,6 +81,6 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
8281
+trainer.max_epochs="${cosine_epochs}" \
8382
trainer.callbacks.0.every_n_epochs="${quarter_epochs}" \
8483
trainer.callbacks.0.save_top_k=-1 \
85-
trainer.callbacks.0.filename="quarter-{epoch:04d}"
84+
trainer.callbacks.0.filename=\"quarter-{epoch:04d}\"
8685
done
8786
done

slurm_scripts/comparison/epd/submit_crps_ae_ambient_large.sh

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,27 +11,27 @@ set -euo pipefail
1111
# uv run autocast time-epochs --from-checkpoint <path>/timing.ckpt -b 24
1212

1313
declare -A COSINE_EPOCHS_BY_DATASET=(
14-
["gray_scott"]=1080 # placeholder
15-
["gpe_laser_only_wake"]=1080 # placeholder
16-
["conditioned_navier_stokes"]=1080 # placeholder
17-
["advection_diffusion"]=1080 # placeholder
14+
# ["gray_scott"]=49 # 1724.6 s/ep (timing_efficient_crps, 2026-04-18)
15+
# ["gpe_laser_only_wake"]=85 # 991.0 s/ep (timing_efficient_crps, 2026-04-18)
16+
["conditioned_navier_stokes"]=85 # 985.0 s/ep (timing_efficient_crps, 2026-04-18)
17+
# ["advection_diffusion"]=58 # 1436.9 s/ep (timing_efficient_crps, 2026-04-18)
1818
)
1919
BUDGET_MAX_TIME="00:23:59:00"
2020
# SLURM timeout in minutes: 1439 = 23h59m, matching BUDGET_MAX_TIME and 1 min under 24h.
2121
TIMEOUT_MIN=1439
2222
RUN_DRY_STATES=("true" "false")
2323

2424
declare -A EXPERIMENTS=(
25-
["gray_scott"]="epd/gray_scott/crps_vit_azula_large_ae_ambient"
26-
["gpe_laser_only_wake"]="epd/gpe_laser_wake_only/crps_vit_azula_large_ae_ambient"
25+
# ["gray_scott"]="epd/gray_scott/crps_vit_azula_large_ae_ambient"
26+
# ["gpe_laser_only_wake"]="epd/gpe_laser_wake_only/crps_vit_azula_large_ae_ambient"
2727
["conditioned_navier_stokes"]="epd/conditioned_navier_stokes/crps_vit_azula_large_ae_ambient"
28-
["advection_diffusion"]="epd/advection_diffusion/crps_vit_azula_large_ae_ambient"
28+
# ["advection_diffusion"]="epd/advection_diffusion/crps_vit_azula_large_ae_ambient"
2929
)
3030
declare -A AE_RUN_DIRS=(
31-
["gray_scott"]="$HOME/autocast/outputs/2026-04-17/ae_gs64_3a7999b_ed36b8e"
32-
["gpe_laser_only_wake"]="$HOME/autocast/outputs/2026-04-17/ae_gpe64_3a7999b_31e1c9f"
31+
# ["gray_scott"]="$HOME/autocast/outputs/2026-04-17/ae_gs64_3a7999b_ed36b8e"
32+
# ["gpe_laser_only_wake"]="$HOME/autocast/outputs/2026-04-17/ae_gpe64_3a7999b_31e1c9f"
3333
["conditioned_navier_stokes"]="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8"
34-
["advection_diffusion"]="$HOME/autocast/outputs/2026-04-17/ae_ad64_3a7999b_1a1e300"
34+
# ["advection_diffusion"]="$HOME/autocast/outputs/2026-04-17/ae_ad64_3a7999b_1a1e300"
3535
)
3636

3737
for datamodule in "${!EXPERIMENTS[@]}"; do
@@ -70,6 +70,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
7070
echo " cosine_epochs: ${cosine_epochs}"
7171

7272
uv run autocast epd --mode slurm "${dry_run_arg[@]}" \
73+
datamodule="${datamodule}" \
7374
local_experiment="${experiment}" \
7475
autoencoder_checkpoint="${ckpt}" \
7576
logging.wandb.enabled=true \
@@ -79,6 +80,6 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
7980
+trainer.max_epochs="${cosine_epochs}" \
8081
trainer.callbacks.0.every_n_epochs="${quarter_epochs}" \
8182
trainer.callbacks.0.save_top_k=-1 \
82-
trainer.callbacks.0.filename="quarter-{epoch:04d}"
83+
trainer.callbacks.0.filename=\"quarter-{epoch:04d}\"
8384
done
8485
done

slurm_scripts/comparison/epd/submit_crps_ambient_identity_global_cond_large.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ set -euo pipefail
1414
# and then extracting:
1515
# uv run autocast time-epochs --from-checkpoint <path>/timing.ckpt -b 24
1616
declare -A COSINE_EPOCHS_BY_DATASET=(
17-
["conditioned_navier_stokes"]=471 # seed from CRPS ambient baseline; update from timing
17+
["conditioned_navier_stokes"]=469 # 180.3 s/ep (timing_efficient_crps, 2026-04-18)
1818
)
1919
BUDGET_MAX_TIME="00:23:59:00"
2020
# SLURM timeout with 1-min buffer beyond the 24h budget.
@@ -48,6 +48,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
4848
echo " cosine_epochs: ${cosine_epochs}"
4949

5050
uv run autocast epd --mode slurm "${dry_run_arg[@]}" \
51+
datamodule="${datamodule}" \
5152
local_experiment="${experiment}" \
5253
logging.wandb.enabled=true \
5354
optimizer.cosine_epochs="${cosine_epochs}" \
@@ -56,6 +57,6 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
5657
+trainer.max_epochs="${cosine_epochs}" \
5758
trainer.callbacks.0.every_n_epochs="${quarter_epochs}" \
5859
trainer.callbacks.0.save_top_k=-1 \
59-
trainer.callbacks.0.filename="quarter-{epoch:04d}"
60+
trainer.callbacks.0.filename=\"quarter-{epoch:04d}\"
6061
done
6162
done

slurm_scripts/comparison/epd/submit_crps_large.sh

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,16 @@ set -euo pipefail
99
#
1010
# Per-dataset cosine schedule: each (method, dataset) pair fills its own
1111
# 24h budget so each model gets its best shot within budget. Values from
12-
# submit_crps_timing.sh (2026-04-17) via
12+
# submit_crps_timing.sh (2026-04-18) via
1313
# uv run autocast time-epochs --from-checkpoint <path>/timing.ckpt -b 24
1414
#
1515
# learning_rate (2e-4) and warmup (0) are baked into each per-dataset
1616
# local_experiment config; adjust the yaml to change them.
1717
declare -A COSINE_EPOCHS_BY_DATASET=(
18-
["gray_scott"]=398 # 212.3s/epoch
19-
["gpe_laser_only_wake"]=477 # 177.3s/epoch
20-
["conditioned_navier_stokes"]=471 # 179.5s/epoch
21-
["advection_diffusion"]=486 # 174.0s/epoch
18+
["gray_scott"]=399 # 212.0 s/ep
19+
["gpe_laser_only_wake"]=477 # 177.2 s/ep
20+
["conditioned_navier_stokes"]=473 # 178.8 s/ep
21+
["advection_diffusion"]=478 # 177.0 s/ep
2222
)
2323
BUDGET_MAX_TIME="00:23:59:00"
2424
# SLURM timeout with 1-min buffer beyond the 24h budget.
@@ -56,6 +56,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
5656
echo " cosine_epochs: ${cosine_epochs}"
5757

5858
uv run autocast epd --mode slurm "${dry_run_arg[@]}" \
59+
datamodule="${datamodule}" \
5960
local_experiment="${experiment}" \
6061
logging.wandb.enabled=true \
6162
optimizer.cosine_epochs="${cosine_epochs}" \
@@ -64,6 +65,6 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
6465
+trainer.max_epochs="${cosine_epochs}" \
6566
trainer.callbacks.0.every_n_epochs="${quarter_epochs}" \
6667
trainer.callbacks.0.save_top_k=-1 \
67-
trainer.callbacks.0.filename="quarter-{epoch:04d}"
68+
trainer.callbacks.0.filename=\"quarter-{epoch:04d}\"
6869
done
6970
done

slurm_scripts/comparison/epd/submit_fm_ambient_large.sh

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@ set -euo pipefail
1313
#
1414
# Per-dataset cosine schedule: each (method, dataset) pair fills its own
1515
# 24h budget so each model gets its best shot within budget. Values from
16-
# submit_fm_ambient_timing.sh (2026-04-17) via
16+
# submit_fm_ambient_timing.sh (2026-04-18) via
1717
# uv run autocast time-epochs --from-checkpoint <path>/timing.ckpt -b 24
1818
#
1919
# learning_rate (1e-4) and warmup (0) are baked into each per-dataset
2020
# local_experiment config; adjust the yaml to change them.
2121
declare -A COSINE_EPOCHS_BY_DATASET=(
22-
["gray_scott"]=2619 # 32.3s/epoch
23-
["gpe_laser_only_wake"]=3097 # 27.3s/epoch
24-
["conditioned_navier_stokes"]=2917 # 29.0s/epoch
25-
["advection_diffusion"]=3279 # 25.8s/epoch
22+
["gray_scott"]=2631 # 32.2 s/ep
23+
["gpe_laser_only_wake"]=3171 # 26.7 s/ep
24+
["conditioned_navier_stokes"]=2982 # 28.4 s/ep
25+
["advection_diffusion"]=3264 # 25.9 s/ep
2626
)
2727
BUDGET_MAX_TIME="00:23:59:00"
2828
# SLURM timeout with 1-min buffer beyond the 24h budget.
@@ -60,6 +60,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
6060
echo " cosine_epochs: ${cosine_epochs}"
6161

6262
uv run autocast epd --mode slurm "${dry_run_arg[@]}" \
63+
datamodule="${datamodule}" \
6364
local_experiment="${experiment}" \
6465
logging.wandb.enabled=true \
6566
optimizer.cosine_epochs="${cosine_epochs}" \
@@ -68,6 +69,6 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
6869
+trainer.max_epochs="${cosine_epochs}" \
6970
trainer.callbacks.0.every_n_epochs="${quarter_epochs}" \
7071
trainer.callbacks.0.save_top_k=-1 \
71-
trainer.callbacks.0.filename="quarter-{epoch:04d}"
72+
trainer.callbacks.0.filename=\"quarter-{epoch:04d}\"
7273
done
7374
done

src/autocast/scripts/workflow/naming.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from pathlib import Path
99

1010
from omegaconf import OmegaConf
11+
from omegaconf.errors import OmegaConfBaseException
1112

1213
from autocast.scripts.workflow.constants import DATASET_NAME_TOKENS, NAMING_DEFAULT_KEYS
1314
from autocast.scripts.workflow.overrides import extract_override_value
@@ -85,7 +86,13 @@ def _extract_naming_hints_from_preset(path: Path) -> list[str]:
8586
if not path.exists():
8687
return []
8788

88-
loaded = OmegaConf.to_container(OmegaConf.load(path), resolve=True)
89+
try:
90+
# Naming only needs a few literal fields; avoid resolving unrelated
91+
# interpolations that can fail outside full Hydra composition.
92+
loaded = OmegaConf.to_container(OmegaConf.load(path), resolve=False)
93+
except OmegaConfBaseException:
94+
return []
95+
8996
if not isinstance(loaded, dict):
9097
return []
9198

tests/scripts/test_workflow.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,40 @@ def test_auto_run_name_hidden_dim_included():
332332
assert "256" in name
333333

334334

335+
def test_auto_run_name_local_experiment_ignores_unresolved_interpolation(
336+
tmp_path: Path, monkeypatch
337+
):
338+
local_cfg = tmp_path / "local_hydra" / "local_experiment" / "repro.yaml"
339+
local_cfg.parent.mkdir(parents=True, exist_ok=True)
340+
local_cfg.write_text(
341+
"\n".join(
342+
[
343+
"model:",
344+
" processor:",
345+
" _target_: autocast.nn.vit.TemporalViTBackbone",
346+
" n_steps_input: ${datamodule.n_steps_input}",
347+
" loss_func:",
348+
" _target_: autocast.losses.ensemble.CRPSLoss",
349+
]
350+
),
351+
encoding="utf-8",
352+
)
353+
354+
monkeypatch.chdir(tmp_path)
355+
356+
with (
357+
patch("autocast.scripts.workflow.naming._git_hash", return_value="abc1234"),
358+
patch("autocast.scripts.workflow.naming._short_uuid", return_value="xyz7890"),
359+
):
360+
name = auto_run_name(
361+
"epd",
362+
"reaction_diffusion",
363+
["local_experiment=repro"],
364+
)
365+
366+
assert name == "crps_rd64_vit_abc1234_xyz7890"
367+
368+
335369
# ---------------------------------------------------------------------------
336370
# commands
337371
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)