Merge pull request #342 from alan-turing-institute/2026-04-20/runs-part

sgreenbury · web-flow · commit 72a636958a1a · 2026-04-22T22:28:00.000+01:00
Refactor and update eval scripts and docs
diff --git a/slurm_scripts/ablations/README.md b/slurm_scripts/ablations/README.md
@@ -23,8 +23,8 @@ small edit.
 | crps_variants (AlphaFair / Fair / CRPS) | comparison | CNS | 3 | stub |
 | fm_vs_diffusion | comparison | CNS | 1 | stub |
 | arch_unet_fno_vit | comparison | CNS | 2 | stub |
-| model_size | sweep | CNS | 2 | ready |
-| cached_latent_crps | comparison | CNS | 1 (done, 2026-04-19) | stub |
+| model_size | sweep | CNS | 2 active (+2 staged) | in progress |
+| cached_latent_crps | comparison | CNS | 1 (done, 2026-04-20) | stub |
 | cond_global_vs_permute | comparison | CNS | 1 (done for CRPS-ViT, 2026-04-18) | stub |
 | eval_only/ode_steps | eval-only | FM runs | 0 | stub |
 | eval_only/ema | eval-only | EMA ckpts | 0 | stub |
diff --git a/slurm_scripts/ablations/cached_latent_crps/README.md b/slurm_scripts/ablations/cached_latent_crps/README.md
@@ -4,19 +4,19 @@ CRPS loss trained in cached-latent space (processor-only training on
 pre-encoded latents, decoded only at eval time).
 
 **Status:** CNS data point exists —
-`outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be`.
-No new training script needed for this pass; eval is handled by
-`slurm_scripts/comparison/eval/submit_eval_crps_latent.sh`.
+`outputs/2026-04-20/crps_cns64_vit_azula_large_09490da_8b7573d`.
+No new training script needed for this pass; comparison eval is handled by
+`slurm_scripts/comparison/eval/submit_eval_crps_latent.sh` via the default
+`auto -> encode_once` path.
 
 ## Baseline
 
 `local_hydra/local_experiment/processor/conditioned_navier_stokes/crps_vit_azula_large.yaml`.
 
 ## Next steps
 
-- When the second dataset is added, extend the `DATASETS` map in
-  `submit_eval_crps_latent.sh` and submit a matching training run via
+- When the second dataset is added, extend the `RUN_DIRS` list and `AE_CKPT`
+  map in `submit_eval_crps_latent.sh` and submit a matching training run via
   `slurm_scripts/comparison/cached_latents/submit_crps_latent_large.sh`.
-- Decide whether to include `eval.mode=latent` ablation alongside
-  `eval.mode=ambient` for this ablation specifically — it answers "how
-  much of the latent-CRPS gap is decode/encode drift?".
+- If we ever need a latent-only diagnostic again, use the eval CLI directly
+  with explicit overrides rather than keeping a dedicated comparison submitter.
diff --git a/slurm_scripts/ablations/model_size/README.md b/slurm_scripts/ablations/model_size/README.md
@@ -5,7 +5,9 @@ baseline comparison design, submission order, and the ~80M processor matrix.
 This folder records the CNS-only delta for the follow-up model-size sweep
 around the ~80M baselines.
 
-**Status:** ready — timing and 24h submit scripts cover the full scan.
+**Status:** in progress — timing covers the full 4-variant scan, while the
+current 24h submitter has the `2x` legs enabled and keeps the `0p4x` legs
+staged behind commented-out `COSINE_EPOCHS_BY_VARIANT` entries.
 
 ## Goal
 
diff --git a/slurm_scripts/comparison/cached_latents/validate_cached_latents_against_ae.sh b/slurm_scripts/comparison/cached_latents/validate_cached_latents_against_ae.sh
@@ -46,6 +46,59 @@ yaml_get_scalar_in_block() {
     ' "${yaml_file}"
 }
 
+resolve_oc_env_scalar() {
+    local raw="$1"
+
+    # Resolve a leading "${oc.env:VAR,fallback}suffix" scalar, as found in
+    # Hydra yaml before interpolation: VAR's value if non-empty, else fallback.
+    # Scalars without the ",fallback" part do not match and print unchanged.
+    if [[ "${raw}" =~ ^\$\{oc\.env:([^,}]+),([^}]*)\}(.*)$ ]]; then
+        local var_name="${BASH_REMATCH[1]}"
+        local fallback="${BASH_REMATCH[2]}"
+        local suffix="${BASH_REMATCH[3]}"
+        local var_value="${!var_name:-}"
+        if [[ -n "${var_value}" ]]; then
+            printf "%s\n" "${var_value}${suffix}"
+        else
+            printf "%s\n" "${fallback}${suffix}"
+        fi
+        return 0
+    fi
+
+    printf "%s\n" "${raw}"
+}
+
+normalize_path_scalar() {
+    local raw="$1"
+    local block_data_path="${2:-}"
+    local token='${.data_path}'
+    local resolved
+
+    # Handle same-block references such as "${.data_path}/stats.yml".
+    if [[ -n "${block_data_path}" && "${raw}" == *"${token}"* ]]; then
+        raw="${raw//$token/${block_data_path}}"
+    fi
+
+    resolved="$(resolve_oc_env_scalar "${raw}")"
+
+    # Expand leading "~/"; the tilde is escaped so bash strips it literally.
+    if [[ "${resolved}" == "~/"* ]]; then
+        resolved="${HOME}/${resolved#\~/}"
+    fi
+
+    # Canonicalize when possible; if the path does not exist, still normalize
+    # relative references against the current working directory.
+    if [[ -e "${resolved}" ]]; then
+        resolved="$(realpath "${resolved}")"
+    elif [[ "${resolved}" != /* ]]; then
+        resolved="$(realpath -m "${resolved}" 2>/dev/null || printf "%s" "${resolved}")"
+    fi
+
+    # Avoid mismatch from a trailing slash only.
+    resolved="${resolved%/}"
+    printf "%s\n" "${resolved}"
+}
+
 validate_cached_latents_against_ae() {
     local ae_run_dir="$1"
     local ae_cfg="${ae_run_dir}/resolved_autoencoder_config.yaml"
@@ -68,22 +121,38 @@ validate_cached_latents_against_ae() {
         "use_normalization"
         "normalization_path"
     )
+    local ae_data_path_raw
+    local cache_data_path_raw
+    ae_data_path_raw="$(yaml_get_scalar_in_block "${ae_cfg}" "datamodule" "data_path")"
+    cache_data_path_raw="$(yaml_get_scalar_in_block "${cache_cfg}" "datamodule" "data_path")"
 
     local key
     for key in "${keys[@]}"; do
         local ae_val
         local cache_val
+        local ae_cmp
+        local cache_cmp
         ae_val="$(yaml_get_scalar_in_block "${ae_cfg}" "datamodule" "${key}")"
         cache_val="$(yaml_get_scalar_in_block "${cache_cfg}" "datamodule" "${key}")"
 
         if [[ -z "${ae_val}" || -z "${cache_val}" ]]; then
             echo "Missing datamodule.${key} in ${ae_cfg} or ${cache_cfg}" >&2
             return 1
         fi
-        if [[ "${ae_val}" != "${cache_val}" ]]; then
+        ae_cmp="${ae_val}"
+        cache_cmp="${cache_val}"
+        if [[ "${key}" == "data_path" || "${key}" == "normalization_path" ]]; then
+            ae_cmp="$(normalize_path_scalar "${ae_val}" "${ae_data_path_raw}")"
+            cache_cmp="$(normalize_path_scalar "${cache_val}" "${cache_data_path_raw}")"
+        fi
+        if [[ "${ae_cmp}" != "${cache_cmp}" ]]; then
             echo "Mismatch datamodule.${key}" >&2
             echo "  AE config:     ${ae_val}" >&2
             echo "  Cached config: ${cache_val}" >&2
+            if [[ "${key}" == "data_path" || "${key}" == "normalization_path" ]]; then
+                echo "  AE normalized:     ${ae_cmp}" >&2
+                echo "  Cached normalized: ${cache_cmp}" >&2
+            fi
             echo "  AE cfg:        ${ae_cfg}" >&2
             echo "  Cache cfg:     ${cache_cfg}" >&2
             return 1
diff --git a/slurm_scripts/comparison/eval/README.md b/slurm_scripts/comparison/eval/README.md
@@ -5,8 +5,8 @@ submitter only targets a study-specific ablation run set, keep it under
 `slurm_scripts/ablations/<name>/eval/` until that run set is promoted into the
 main comparison.
 
-Six submission scripts cover ambient and cached-latent checkpoints produced
-under `outputs/2026-04-18/` and `outputs/2026-04-19/`. Each script iterates
+Four submission scripts cover ambient and cached-latent checkpoints produced
+under `outputs/2026-04-18/` and `outputs/2026-04-20/`. Each script iterates
 `--dry-run` first, then submits for real.
 
 All comparison eval submitters explicitly pass `eval.n_members=10` for now so
@@ -16,10 +16,8 @@ comparison numbers do not silently drift if the global eval default changes.
 |---|---|---|---|
 | `submit_eval_crps_ambient.sh` | `outputs/2026-04-18/crps_*` (4 primary + 2 CNS ablations) | default (auto → ambient) | 8 |
 | `submit_eval_fm_ambient.sh` | `outputs/2026-04-18/diff_*` ambient (4 datasets) | default (auto → ambient) | 4 |
-| `submit_eval_crps_latent.sh` | `outputs/2026-04-19/crps_*` cached-latent (CNS so far) | `ambient` | 8 |
-| `submit_eval_fm_latent.sh` | `outputs/2026-04-18/diff_*` cached-latent (4 datasets) | `ambient` | 4 |
-| `submit_eval_crps_latent_rollout_latent.sh` | same runs as `submit_eval_crps_latent.sh` | `latent` (writes to `eval_latent/`) | 8 |
-| `submit_eval_fm_latent_rollout_latent.sh` | same runs as `submit_eval_fm_latent.sh` | `latent` (writes to `eval_latent/`) | 4 |
+| `submit_eval_crps_latent.sh` | `outputs/2026-04-20/crps_*` cached-latent (CNS so far) | default (`auto -> encode_once`) | 8 |
+| `submit_eval_fm_latent.sh` | `outputs/2026-04-20/diff_*` cached-latent (4 datasets) | default (`auto -> encode_once`) | 4 |
 
 ## Batch-size rationale
 
@@ -29,30 +27,22 @@ for 25 steps on 64×64 fields:
 - **CRPS** (single forward per step) handles `eval.batch_size=8` fine.
 - **FM / diffusion** integrates `flow_ode_steps=50` per rollout step, so
   ambient fits `eval.batch_size=4` — drop to 2 if OOM.
-- **Cached-latent in ambient mode** still encodes/decodes at every step
-  but the processor forward is cheaper (64 tokens vs 256 for
-  ambient-patch4), so the CRPS variant matches ambient CRPS at 8 and the
-  FM variant matches ambient FM at 4. Can try bumping up if there's
-  headroom.
-- **Cached-latent in latent mode** avoids per-step AE encode/decode and is
-  typically cheaper. We keep 8 (CRPS) / 4 (FM) for consistency across
-  comparisons; increase only after confirming cluster headroom.
+- **Cached-latent via `auto -> encode_once`** encodes once up front,
+  decodes per step, and scores against raw ground truth. It is cheaper
+  than the ambient ablation while still being faithful for processor-only
+  evaluation, so the CRPS variant stays at 8 and the FM variant stays at 4
+  for easy comparison with the ambient scripts.
 
 ## eval.mode for cached latents
 
-The cached-latent scripts use the `eval.mode` selector that landed via
-[PR #327](https://github.com/alan-turing-institute/autocast/pull/327) and is
-now available in-tree. `eval.mode=ambient` forces full
-`encoder → processor → decoder` rollout, so the decode/encode drift is
-included in the metrics — the only fair regime for cross-comparison with
-ambient CRPS/FM baselines that roll out in data space natively. Latent-only
-rollout (`eval.mode=latent`) is faster and is useful as an additional
-diagnostic view when written to a separate subdir (`eval_latent/`).
-
-When `eval.mode=ambient` is set on a cached-latents datamodule, the eval
-script auto-substitutes the raw datamodule from
-`<cache_dir>/autoencoder_config.yaml`, and the AE weights are supplied via
-`autoencoder_checkpoint=<ae.ckpt>` (hard-coded per run in each script).
+The cached-latent comparison scripts now rely on the default
+`eval.mode=auto`, which resolves to `encode_once` for processor-only
+cached-latent runs when `autoencoder_checkpoint=<ae.ckpt>` is supplied.
+That behavior landed in
+[PR #339](https://github.com/alan-turing-institute/autocast/pull/339).
+It keeps metrics in raw data space while avoiding the extra decode/encode
+drift charged by the explicit ambient ablation. That is now the only
+comparison-suite path we keep under `slurm_scripts/comparison/eval/`.
 
 ## Submission order
 
@@ -61,5 +51,4 @@ checkpoint. There are no branch prerequisites for the cached-latent scripts.
 
 Dry-run everything first, review the printed sbatch commands, then re-run
 without `RUN_DRY_STATES` edits to submit. Outputs land under each run's
-`eval/` (ambient rollout) or `eval_latent/` (latent rollout) subdirectory
-(`evaluation_metrics.csv`, rollout videos, etc.).
+`eval/` subdirectory (`evaluation_metrics.csv`, rollout videos, etc.).
diff --git a/slurm_scripts/comparison/eval/submit_eval_crps_latent.sh b/slurm_scripts/comparison/eval/submit_eval_crps_latent.sh
@@ -1,21 +1,17 @@
 #!/bin/bash
 
 set -euo pipefail
-# Evaluate CRPS cached-latent processor runs (2026-04-19) in AMBIENT mode.
+# Evaluate CRPS cached-latent processor runs from 2026-04-20 using the
+# default eval.mode=auto path.
 #
-# eval.mode=ambient forces encoder->processor->decoder rollout at every
-# step, so decode/encode drift is included in the metrics. This makes the
-# latent-space CRPS numbers directly comparable with the ambient CRPS and
-# FM baselines (see slurm_scripts/comparison/eval/README.md).
+# eval.mode=auto resolves to encode_once for processor-only cached-latent runs
+# when autoencoder_checkpoint is supplied. That preserves raw-space metrics
+# while avoiding the extra per-step decode->encode drift charged by the
+# ambient ablation.
 #
-# The eval.mode selector landed via PR #327 and is now in-tree. When ambient
-# is requested on a cached-latents datamodule, eval auto-substitutes the raw
-# datamodule from <cache_dir>/autoencoder_config.yaml; the trained AE weights
-# are supplied via autoencoder_checkpoint.
-#
-# Batch size: cached-latent eval pays the ambient AE encode/decode per step
-# but processor forward is cheap (64 tokens vs 256 for ambient-patch4), so
-# 8/GPU fits comfortably — same as pure-ambient CRPS.
+# Batch size: encode_once pays one upfront AE encode and a decode each rollout
+# step while still scoring in raw data space. That is cheaper than the
+# explicit ambient ablation, so 8/GPU stays aligned with ambient CRPS.
 #
 # We also pin eval.n_members explicitly here so the comparison scripts do not
 # depend on the global eval default staying at 10.
@@ -29,12 +25,11 @@ EVAL_METRICS="[mse,mae,nmse,nmae,rmse,nrmse,vmse,vrmse,linf,psrmse,psrmse_low,ps
 # (run_dir, autoencoder_checkpoint) pairs. Extend as more cached-latent CRPS
 # runs land (gs, gpe, ad) — the AE paths are the same as training.
 RUN_DIRS=(
-    "outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"
+    "outputs/2026-04-20/crps_cns64_vit_azula_large_09490da_8b7573d"
 )
 declare -A AE_CKPT=(
-    ["outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"]="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8/autoencoder.ckpt"
+    ["outputs/2026-04-20/crps_cns64_vit_azula_large_09490da_8b7573d"]="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8/autoencoder.ckpt"
 )
-
 for run_dir in "${RUN_DIRS[@]}"; do
     ae_ckpt="${AE_CKPT[$run_dir]:-}"
     if [[ -z "${ae_ckpt}" ]]; then
@@ -58,18 +53,18 @@ for run_dir in "${RUN_DIRS[@]}"; do
             run_label="slurm --dry-run"
         fi
 
-        echo "Submitting CRPS cached-latent eval (mode=ambient)"
+        echo "Submitting CRPS cached-latent eval (mode=auto -> encode_once)"
         echo "  mode: ${run_label}"
         echo "  run_dir: ${run_dir}"
         echo "  autoencoder_checkpoint: ${ae_ckpt}"
+        echo "  eval.mode: auto"
         echo "  eval.batch_size: ${EVAL_BATCH_SIZE}"
         echo "  eval.n_members: ${EVAL_N_MEMBERS}"
         echo "  eval.metrics: ${EVAL_METRICS}"
 
         uv run autocast eval --mode slurm "${dry_run_arg[@]}" \
             --workdir "${run_dir}" \
             eval.checkpoint=processor.ckpt \
-            ++eval.mode=ambient \
             +autoencoder_checkpoint="${ae_ckpt}" \
             eval.metrics="${EVAL_METRICS}" \
             eval.batch_size="${EVAL_BATCH_SIZE}" \
diff --git a/slurm_scripts/comparison/eval/submit_eval_crps_latent_rollout_latent.sh b/slurm_scripts/comparison/eval/submit_eval_crps_latent_rollout_latent.sh
diff --git a/slurm_scripts/comparison/eval/submit_eval_fm_latent.sh b/slurm_scripts/comparison/eval/submit_eval_fm_latent.sh
diff --git a/slurm_scripts/comparison/eval/submit_eval_fm_latent_rollout_latent.sh b/slurm_scripts/comparison/eval/submit_eval_fm_latent_rollout_latent.sh