Skip to content

Commit 72a6369

Browse files
authored
Merge pull request #342 from alan-turing-institute/2026-04-20/runs-part
Refactor and update eval scripts and docs
2 parents 4fe6d1a + 21b779f commit 72a6369

9 files changed

Lines changed: 133 additions & 241 deletions

File tree

slurm_scripts/ablations/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ small edit.
2323
| crps_variants (AlphaFair / Fair / CRPS) | comparison | CNS | 3 | stub |
2424
| fm_vs_diffusion | comparison | CNS | 1 | stub |
2525
| arch_unet_fno_vit | comparison | CNS | 2 | stub |
26-
| model_size | sweep | CNS | 2 | ready |
27-
| cached_latent_crps | comparison | CNS | 1 (done, 2026-04-19) | stub |
26+
| model_size | sweep | CNS | 2 active (+2 staged) | in progress |
27+
| cached_latent_crps | comparison | CNS | 1 (done, 2026-04-20) | stub |
2828
| cond_global_vs_permute | comparison | CNS | 1 (done for CRPS-ViT, 2026-04-18) | stub |
2929
| eval_only/ode_steps | eval-only | FM runs | 0 | stub |
3030
| eval_only/ema | eval-only | EMA ckpts | 0 | stub |

slurm_scripts/ablations/cached_latent_crps/README.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,19 @@ CRPS loss trained in cached-latent space (processor-only training on
44
pre-encoded latents, decoded only at eval time).
55

66
**Status:** CNS data point exists —
7-
`outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be`.
8-
No new training script needed for this pass; eval is handled by
9-
`slurm_scripts/comparison/eval/submit_eval_crps_latent.sh`.
7+
`outputs/2026-04-20/crps_cns64_vit_azula_large_09490da_8b7573d`.
8+
No new training script needed for this pass; comparison eval is handled by
9+
`slurm_scripts/comparison/eval/submit_eval_crps_latent.sh` via the default
10+
`auto -> encode_once` path.
1011

1112
## Baseline
1213

1314
`local_hydra/local_experiment/processor/conditioned_navier_stokes/crps_vit_azula_large.yaml`.
1415

1516
## Next steps
1617

17-
- When the second dataset is added, extend the `DATASETS` map in
18-
`submit_eval_crps_latent.sh` and submit a matching training run via
18+
- When the second dataset is added, extend the `RUN_DIRS` list and `AE_CKPT`
19+
map in `submit_eval_crps_latent.sh` and submit a matching training run via
1920
`slurm_scripts/comparison/cached_latents/submit_crps_latent_large.sh`.
20-
- Decide whether to include `eval.mode=latent` ablation alongside
21-
`eval.mode=ambient` for this ablation specifically — it answers "how
22-
much of the latent-CRPS gap is decode/encode drift?".
21+
- If we ever need a latent-only diagnostic again, use the eval CLI directly
22+
with explicit overrides rather than keeping a dedicated comparison submitter.

slurm_scripts/ablations/model_size/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ baseline comparison design, submission order, and the ~80M processor matrix.
55
This folder records the CNS-only delta for the follow-up model-size sweep
66
around the ~80M baselines.
77

8-
**Status:** ready — timing and 24h submit scripts cover the full scan.
8+
**Status:** in progress — timing covers the full 4-variant scan, while the
9+
current 24h submitter has the `2x` legs enabled and keeps the `0p4x` legs
10+
staged behind commented-out `COSINE_EPOCHS_BY_VARIANT` entries.
911

1012
## Goal
1113

slurm_scripts/comparison/cached_latents/validate_cached_latents_against_ae.sh

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,59 @@ yaml_get_scalar_in_block() {
4646
' "${yaml_file}"
4747
}
4848

49+
# Resolve a leading Hydra/OmegaConf "${oc.env:VAR,fallback}" interpolation in a
# YAML scalar that has not been interpolated yet.
#
# Arguments:
#   $1 - raw scalar, e.g. "${oc.env:VAR,./fallback}/suffix".
# Outputs:
#   Writes the resolved scalar to stdout. When VAR is set and non-empty its
#   value replaces the interpolation, otherwise the fallback does. Scalars
#   that do not start with the "${oc.env:VAR,fallback}" form are echoed as-is.
resolve_oc_env_scalar() {
  local scalar="$1"
  # Groups: 1 = env var name, 2 = fallback, 3 = text after the interpolation.
  local oc_env_re='^\$\{oc\.env:([^,}]+),([^}]*)\}(.*)$'

  # Guard clause: nothing to resolve, pass the scalar through untouched.
  if ! [[ "${scalar}" =~ ${oc_env_re} ]]; then
    printf "%s\n" "${scalar}"
    return 0
  fi

  local env_name="${BASH_REMATCH[1]}"
  local default_value="${BASH_REMATCH[2]}"
  local tail="${BASH_REMATCH[3]}"

  # Indirect expansion reads the variable named by env_name; an unset or
  # empty variable falls back to the default captured from the scalar.
  printf "%s\n" "${!env_name:-${default_value}}${tail}"
}
70+
71+
normalize_path_scalar() {
72+
local raw="$1"
73+
local block_data_path="${2:-}"
74+
local token='${.data_path}'
75+
local resolved
76+
77+
# Handle same-block references such as "${.data_path}/stats.yml".
78+
if [[ -n "${block_data_path}" && "${raw}" == *"${token}"* ]]; then
79+
raw="${raw//$token/${block_data_path}}"
80+
fi
81+
82+
resolved="$(resolve_oc_env_scalar "${raw}")"
83+
84+
# Expand leading "~" so HOME-relative paths compare consistently.
85+
if [[ "${resolved}" == "~/"* ]]; then
86+
resolved="${HOME}/${resolved#~/}"
87+
fi
88+
89+
# Canonicalize when possible; if the path does not exist, still normalize
90+
# relative references against the current working directory.
91+
if [[ -e "${resolved}" ]]; then
92+
resolved="$(realpath "${resolved}")"
93+
elif [[ "${resolved}" != /* ]]; then
94+
resolved="$(realpath -m "${resolved}" 2>/dev/null || printf "%s" "${resolved}")"
95+
fi
96+
97+
# Avoid mismatch from a trailing slash only.
98+
resolved="${resolved%/}"
99+
printf "%s\n" "${resolved}"
100+
}
101+
49102
validate_cached_latents_against_ae() {
50103
local ae_run_dir="$1"
51104
local ae_cfg="${ae_run_dir}/resolved_autoencoder_config.yaml"
@@ -68,22 +121,38 @@ validate_cached_latents_against_ae() {
68121
"use_normalization"
69122
"normalization_path"
70123
)
124+
local ae_data_path_raw
125+
local cache_data_path_raw
126+
ae_data_path_raw="$(yaml_get_scalar_in_block "${ae_cfg}" "datamodule" "data_path")"
127+
cache_data_path_raw="$(yaml_get_scalar_in_block "${cache_cfg}" "datamodule" "data_path")"
71128

72129
local key
73130
for key in "${keys[@]}"; do
74131
local ae_val
75132
local cache_val
133+
local ae_cmp
134+
local cache_cmp
76135
ae_val="$(yaml_get_scalar_in_block "${ae_cfg}" "datamodule" "${key}")"
77136
cache_val="$(yaml_get_scalar_in_block "${cache_cfg}" "datamodule" "${key}")"
78137

79138
if [[ -z "${ae_val}" || -z "${cache_val}" ]]; then
80139
echo "Missing datamodule.${key} in ${ae_cfg} or ${cache_cfg}" >&2
81140
return 1
82141
fi
83-
if [[ "${ae_val}" != "${cache_val}" ]]; then
142+
ae_cmp="${ae_val}"
143+
cache_cmp="${cache_val}"
144+
if [[ "${key}" == "data_path" || "${key}" == "normalization_path" ]]; then
145+
ae_cmp="$(normalize_path_scalar "${ae_val}" "${ae_data_path_raw}")"
146+
cache_cmp="$(normalize_path_scalar "${cache_val}" "${cache_data_path_raw}")"
147+
fi
148+
if [[ "${ae_cmp}" != "${cache_cmp}" ]]; then
84149
echo "Mismatch datamodule.${key}" >&2
85150
echo " AE config: ${ae_val}" >&2
86151
echo " Cached config: ${cache_val}" >&2
152+
if [[ "${key}" == "data_path" || "${key}" == "normalization_path" ]]; then
153+
echo " AE normalized: ${ae_cmp}" >&2
154+
echo " Cached normalized: ${cache_cmp}" >&2
155+
fi
87156
echo " AE cfg: ${ae_cfg}" >&2
88157
echo " Cache cfg: ${cache_cfg}" >&2
89158
return 1

slurm_scripts/comparison/eval/README.md

Lines changed: 18 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ submitter only targets a study-specific ablation run set, keep it under
55
`slurm_scripts/ablations/<name>/eval/` until that run set is promoted into the
66
main comparison.
77

8-
Six submission scripts cover ambient and cached-latent checkpoints produced
9-
under `outputs/2026-04-18/` and `outputs/2026-04-19/`. Each script iterates
8+
Four submission scripts cover ambient and cached-latent checkpoints produced
9+
under `outputs/2026-04-18/` and `outputs/2026-04-20/`. Each script runs with
1010
`--dry-run` first, then submits for real.
1111

1212
All comparison eval submitters explicitly pass `eval.n_members=10` for now so
@@ -16,10 +16,8 @@ comparison numbers do not silently drift if the global eval default changes.
1616
|---|---|---|---|
1717
| `submit_eval_crps_ambient.sh` | `outputs/2026-04-18/crps_*` (4 primary + 2 CNS ablations) | default (auto → ambient) | 8 |
1818
| `submit_eval_fm_ambient.sh` | `outputs/2026-04-18/diff_*` ambient (4 datasets) | default (auto → ambient) | 4 |
19-
| `submit_eval_crps_latent.sh` | `outputs/2026-04-19/crps_*` cached-latent (CNS so far) | `ambient` | 8 |
20-
| `submit_eval_fm_latent.sh` | `outputs/2026-04-18/diff_*` cached-latent (4 datasets) | `ambient` | 4 |
21-
| `submit_eval_crps_latent_rollout_latent.sh` | same runs as `submit_eval_crps_latent.sh` | `latent` (writes to `eval_latent/`) | 8 |
22-
| `submit_eval_fm_latent_rollout_latent.sh` | same runs as `submit_eval_fm_latent.sh` | `latent` (writes to `eval_latent/`) | 4 |
19+
| `submit_eval_crps_latent.sh` | `outputs/2026-04-20/crps_*` cached-latent (CNS so far) | default (`auto -> encode_once`) | 8 |
20+
| `submit_eval_fm_latent.sh` | `outputs/2026-04-20/diff_*` cached-latent (4 datasets) | default (`auto -> encode_once`) | 4 |
2321

2422
## Batch-size rationale
2523

@@ -29,30 +27,22 @@ for 25 steps on 64×64 fields:
2927
- **CRPS** (single forward per step) handles `eval.batch_size=8` fine.
3028
- **FM / diffusion** integrates `flow_ode_steps=50` per rollout step, so
3129
ambient fits `eval.batch_size=4` — drop to 2 if OOM.
32-
- **Cached-latent in ambient mode** still encodes/decodes at every step
33-
but the processor forward is cheaper (64 tokens vs 256 for
34-
ambient-patch4), so the CRPS variant matches ambient CRPS at 8 and the
35-
FM variant matches ambient FM at 4. Can try bumping up if there's
36-
headroom.
37-
- **Cached-latent in latent mode** avoids per-step AE encode/decode and is
38-
typically cheaper. We keep 8 (CRPS) / 4 (FM) for consistency across
39-
comparisons; increase only after confirming cluster headroom.
30+
- **Cached-latent via `auto -> encode_once`** encodes once up front,
31+
decodes per step, and scores against raw ground truth. It is cheaper
32+
than the ambient ablation while still being faithful for processor-only
33+
evaluation, so the CRPS variant stays at 8 and the FM variant stays at 4
34+
for easy comparison with the ambient scripts.
4035

4136
## eval.mode for cached latents
4237

43-
The cached-latent scripts use the `eval.mode` selector that landed via
44-
[PR #327](https://github.com/alan-turing-institute/autocast/pull/327) and is
45-
now available in-tree. `eval.mode=ambient` forces full
46-
`encoder → processor → decoder` rollout, so the decode/encode drift is
47-
included in the metrics — the only fair regime for cross-comparison with
48-
ambient CRPS/FM baselines that roll out in data space natively. Latent-only
49-
rollout (`eval.mode=latent`) is faster and is useful as an additional
50-
diagnostic view when written to a separate subdir (`eval_latent/`).
51-
52-
When `eval.mode=ambient` is set on a cached-latents datamodule, the eval
53-
script auto-substitutes the raw datamodule from
54-
`<cache_dir>/autoencoder_config.yaml`, and the AE weights are supplied via
55-
`autoencoder_checkpoint=<ae.ckpt>` (hard-coded per run in each script).
38+
The cached-latent comparison scripts now rely on the default
39+
`eval.mode=auto`, which resolves to `encode_once` for processor-only
40+
cached-latent runs when `autoencoder_checkpoint=<ae.ckpt>` is supplied.
41+
That behavior landed in
42+
[PR #339](https://github.com/alan-turing-institute/autocast/pull/339).
43+
It keeps metrics in raw data space while avoiding the extra decode/encode
44+
drift charged by the explicit ambient ablation. That is now the only
45+
comparison-suite path we keep under `slurm_scripts/comparison/eval/`.
5646

5747
## Submission order
5848

@@ -61,5 +51,4 @@ checkpoint. There are no branch prerequisites for the cached-latent scripts.
6151

6252
Dry-run everything first, review the printed sbatch commands, then re-run
6353
without `RUN_DRY_STATES` edits to submit. Outputs land under each run's
64-
`eval/` (ambient rollout) or `eval_latent/` (latent rollout) subdirectory
65-
(`evaluation_metrics.csv`, rollout videos, etc.).
54+
`eval/` subdirectory (`evaluation_metrics.csv`, rollout videos, etc.).

slurm_scripts/comparison/eval/submit_eval_crps_latent.sh

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,17 @@
11
#!/bin/bash
22

33
set -euo pipefail
4-
# Evaluate CRPS cached-latent processor runs (2026-04-19) in AMBIENT mode.
4+
# Evaluate CRPS cached-latent processor runs from 2026-04-20 using the
5+
# default eval.mode=auto path.
56
#
6-
# eval.mode=ambient forces encoder->processor->decoder rollout at every
7-
# step, so decode/encode drift is included in the metrics. This makes the
8-
# latent-space CRPS numbers directly comparable with the ambient CRPS and
9-
# FM baselines (see slurm_scripts/comparison/eval/README.md).
7+
# eval.mode=auto resolves to encode_once for processor-only cached-latent runs
8+
# when autoencoder_checkpoint is supplied. That preserves raw-space metrics
9+
# while avoiding the extra per-step decode->encode drift charged by the
10+
# ambient ablation.
1011
#
11-
# The eval.mode selector landed via PR #327 and is now in-tree. When ambient
12-
# is requested on a cached-latents datamodule, eval auto-substitutes the raw
13-
# datamodule from <cache_dir>/autoencoder_config.yaml; the trained AE weights
14-
# are supplied via autoencoder_checkpoint.
15-
#
16-
# Batch size: cached-latent eval pays the ambient AE encode/decode per step
17-
# but processor forward is cheap (64 tokens vs 256 for ambient-patch4), so
18-
# 8/GPU fits comfortably — same as pure-ambient CRPS.
12+
# Batch size: encode_once pays one upfront AE encode and a decode each rollout
13+
# step while still scoring in raw data space. That is cheaper than the
14+
# explicit ambient ablation, so 8/GPU stays aligned with ambient CRPS.
1915
#
2016
# We also pin eval.n_members explicitly here so the comparison scripts do not
2117
# depend on the global eval default staying at 10.
@@ -29,12 +25,11 @@ EVAL_METRICS="[mse,mae,nmse,nmae,rmse,nrmse,vmse,vrmse,linf,psrmse,psrmse_low,ps
2925
# (run_dir, autoencoder_checkpoint) pairs. Extend as more cached-latent CRPS
3026
# runs land (gs, gpe, ad) — the AE paths are the same as training.
3127
RUN_DIRS=(
32-
"outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"
28+
"outputs/2026-04-20/crps_cns64_vit_azula_large_09490da_8b7573d"
3329
)
3430
declare -A AE_CKPT=(
35-
["outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"]="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8/autoencoder.ckpt"
31+
["outputs/2026-04-20/crps_cns64_vit_azula_large_09490da_8b7573d"]="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8/autoencoder.ckpt"
3632
)
37-
3833
for run_dir in "${RUN_DIRS[@]}"; do
3934
ae_ckpt="${AE_CKPT[$run_dir]:-}"
4035
if [[ -z "${ae_ckpt}" ]]; then
@@ -58,18 +53,18 @@ for run_dir in "${RUN_DIRS[@]}"; do
5853
run_label="slurm --dry-run"
5954
fi
6055

61-
echo "Submitting CRPS cached-latent eval (mode=ambient)"
56+
echo "Submitting CRPS cached-latent eval (mode=auto -> encode_once)"
6257
echo " mode: ${run_label}"
6358
echo " run_dir: ${run_dir}"
6459
echo " autoencoder_checkpoint: ${ae_ckpt}"
60+
echo " eval.mode: auto"
6561
echo " eval.batch_size: ${EVAL_BATCH_SIZE}"
6662
echo " eval.n_members: ${EVAL_N_MEMBERS}"
6763
echo " eval.metrics: ${EVAL_METRICS}"
6864

6965
uv run autocast eval --mode slurm "${dry_run_arg[@]}" \
7066
--workdir "${run_dir}" \
7167
eval.checkpoint=processor.ckpt \
72-
++eval.mode=ambient \
7368
+autoencoder_checkpoint="${ae_ckpt}" \
7469
eval.metrics="${EVAL_METRICS}" \
7570
eval.batch_size="${EVAL_BATCH_SIZE}" \

slurm_scripts/comparison/eval/submit_eval_crps_latent_rollout_latent.sh

Lines changed: 0 additions & 77 deletions
This file was deleted.

0 commit comments

Comments
 (0)