Commit 724223d

Merge pull request #335 from alan-turing-institute/2026-04-19/eval-scripts
Update eval scripts and default metrics

2 parents: 9830e1d + 5f0537d
13 files changed: 607 additions & 3 deletions

.gitignore (2 additions & 1 deletion)

@@ -18,4 +18,5 @@ core
 .clinerules
 .cursorrules
 .github/copilot-instructions.md
-.agent
+.agent
+.codex
slurm_scripts/comparison/eval/README.md (57 additions & 0 deletions)

# Eval scripts for the comparison study

Six submission scripts cover ambient and cached-latent checkpoints produced
under `outputs/2026-04-18/` and `outputs/2026-04-19/`. Each script runs with
`--dry-run` first, then submits for real.

| script | runs covered | eval.mode | eval.batch_size |
|---|---|---|---|
| `submit_eval_crps_ambient.sh` | `outputs/2026-04-18/crps_*` (4 primary + 2 CNS ablations) | default (auto → ambient) | 8 |
| `submit_eval_fm_ambient.sh` | `outputs/2026-04-18/diff_*` ambient (4 datasets) | default (auto → ambient) | 4 |
| `submit_eval_crps_latent.sh` | `outputs/2026-04-19/crps_*` cached-latent (CNS so far) | `ambient` | 8 |
| `submit_eval_fm_latent.sh` | `outputs/2026-04-18/diff_*` cached-latent (4 datasets) | `ambient` | 4 |
| `submit_eval_crps_latent_rollout_latent.sh` | same runs as `submit_eval_crps_latent.sh` | `latent` (writes to `eval_latent/`) | 8 |
| `submit_eval_fm_latent_rollout_latent.sh` | same runs as `submit_eval_fm_latent.sh` | `latent` (writes to `eval_latent/`) | 4 |
## Batch-size rationale

Empirically, the knobs are tight because eval rolls out with n_members=10
for 25 steps on 64×64 fields:

- **CRPS** (single forward per step) handles `eval.batch_size=8` fine.
- **FM / diffusion** integrates `flow_ode_steps=50` per rollout step, so
  ambient fits `eval.batch_size=4`; drop to 2 if OOM.
- **Cached-latent in ambient mode** still encodes/decodes at every step,
  but the processor forward is cheaper (64 tokens vs 256 for
  ambient-patch4), so the CRPS variant matches ambient CRPS at 8 and the
  FM variant matches ambient FM at 4. Try bumping up if there's headroom.
- **Cached-latent in latent mode** avoids per-step AE encode/decode and is
  typically cheaper. We keep 8 (CRPS) / 4 (FM) for consistency across
  comparisons; increase only after confirming cluster headroom.
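The rationale above can be put into back-of-envelope numbers. A minimal sketch, where "work" (processor forward passes × tokens) is an illustrative proxy for GPU load rather than a measured cost:

```python
# Back-of-envelope work model for one eval sample (illustrative only).
ROLLOUT_STEPS = 25    # rollout length
N_MEMBERS = 10        # ensemble members
AMBIENT_TOKENS = 256  # 64x64 fields, patch 4 -> (64/4)^2 tokens
LATENT_TOKENS = 64    # cached-latent processor

def total_work(ode_substeps: int, tokens: int) -> int:
    """Processor forward passes x tokens over the whole rollout."""
    return ROLLOUT_STEPS * N_MEMBERS * ode_substeps * tokens

crps_ambient = total_work(ode_substeps=1, tokens=AMBIENT_TOKENS)  # single forward/step
fm_ambient = total_work(ode_substeps=50, tokens=AMBIENT_TOKENS)   # flow_ode_steps=50
fm_latent = total_work(ode_substeps=50, tokens=LATENT_TOKENS)

# FM does 50x the CRPS processor work, hence eval.batch_size 4 vs 8;
# the latent processor sees 4x fewer tokens, hence the headroom notes.
print(fm_ambient // crps_ambient, fm_ambient // fm_latent)  # 50 4
```

This ignores AE encode/decode cost and activation memory, so it explains the ordering of the batch sizes, not their absolute values.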
## eval.mode for cached latents

The cached-latent scripts use the `eval.mode` selector that landed via
[PR #327](https://github.com/alan-turing-institute/autocast/pull/327) and is
now available in-tree. `eval.mode=ambient` forces a full
`encoder → processor → decoder` rollout, so decode/encode drift is
included in the metrics; this is the only fair regime for cross-comparison
with the ambient CRPS/FM baselines, which roll out natively in data space.
Latent-only rollout (`eval.mode=latent`) is faster and is useful as an
additional diagnostic view when written to a separate subdir (`eval_latent/`).

When `eval.mode=ambient` is set on a cached-latents datamodule, the eval
script auto-substitutes the raw datamodule from
`<cache_dir>/autoencoder_config.yaml`, and the AE weights are supplied via
`autoencoder_checkpoint=<ae.ckpt>` (hard-coded per run in each script).
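The drift argument can be seen in a toy numerical model. This is purely illustrative: `encode`/`decode` below are stand-ins for the trained AE (a slightly lossy scale), not the project's actual autoencoder.

```python
# Toy model of why eval.mode=ambient includes AE round-trip drift:
# an imperfect AE loses a little signal on every encode/decode pass.

def encode(x: float) -> float:
    return 0.99 * x   # stand-in for a slightly lossy learned encoder

def decode(z: float) -> float:
    return z          # decoder is exact here; the loss sits in encode

def process(z: float) -> float:
    return z          # identity processor isolates the AE effect

x = 1.0

# eval.mode=ambient: encoder -> processor -> decoder at EVERY rollout step,
# so the round-trip error compounds over the 25-step rollout.
ambient = x
for _ in range(25):
    ambient = decode(process(encode(ambient)))

# eval.mode=latent: encode once, stay in latent space, decode at the end.
z = encode(x)
for _ in range(25):
    z = process(z)
latent = decode(z)

print(round(ambient, 3), round(latent, 3))  # 0.778 0.99
```

Ambient-mode metrics see the compounded round-trip error (0.99^25), latent-mode metrics only one round-trip, which is why ambient mode is the fair comparison regime and latent mode is a diagnostic.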
## Submission order

These scripts are all independent; each run is eval'd against its own
checkpoint. There are no branch prerequisites for the cached-latent scripts.

Dry-run everything first, review the printed sbatch commands, then re-run
without editing `RUN_DRY_STATES` to submit. Outputs land under each run's
`eval/` (ambient rollout) or `eval_latent/` (latent rollout) subdirectory
(`evaluation_metrics.csv`, rollout videos, etc.).
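Once jobs finish, the per-run CSVs can be gathered for side-by-side comparison. A hedged sketch: the helper name is mine, and it assumes `evaluation_metrics.csv` has one header row of metric names over one row of values, which is not a documented format of the eval job.

```python
# Gather <run_dir>/<subdir>/evaluation_metrics.csv files into one dict.
# Assumes a one-header-row, one-value-row CSV; adjust to the real layout.
import csv
from pathlib import Path

def collect_metrics(run_dirs, subdir="eval"):
    """Return {run_dir: {metric_name: value}} for runs that have been eval'd."""
    results = {}
    for run_dir in run_dirs:
        path = Path(run_dir) / subdir / "evaluation_metrics.csv"
        if not path.is_file():
            continue  # eval job not finished (or not submitted) yet
        with path.open(newline="") as f:
            row = next(csv.DictReader(f))
        results[run_dir] = {name: float(value) for name, value in row.items()}
    return results
```

Pass `subdir="eval_latent"` to read the latent-rollout diagnostics instead of the ambient ones.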
submit_eval_crps_ambient.sh (57 additions & 0 deletions)

#!/bin/bash

set -euo pipefail
# Evaluate CRPS-in-ambient EPD runs trained on 2026-04-18.
# Covers 4 primary runs (permute_concat across all 4 datasets) plus two CNS
# ablations: AE-ambient (DC encoder/decoder, frozen) and identity+global_cond.
# All are EPD checkpoints (encoder_processor_decoder.ckpt); eval uses the
# resolved_config.yaml written alongside each run, so the trained architecture
# is reproduced exactly for eval.
#
# Batch size: CRPS eval fits 8/GPU comfortably (ambient 64x64, n_members=10,
# single forward pass per rollout step — no ODE).

EVAL_BATCH_SIZE=8
TIMEOUT_MIN=240
RUN_DRY_STATES=("true" "false")
EVAL_METRICS="[mse,mae,nmse,nmae,rmse,nrmse,vmse,vrmse,linf,psrmse,psrmse_low,psrmse_mid,psrmse_high,psrmse_tail,pscc,pscc_low,pscc_mid,pscc_high,pscc_tail,crps,fcrps,afcrps,energy,ssr,winkler]"

# Run dirs (absolute paths work; relative paths resolved from repo root).
RUN_DIRS=(
    # CRPS ambient (permute_concat) — 4 datasets
    "outputs/2026-04-18/crps_gs64_vit_azula_large_0f89f06_779325a"
    "outputs/2026-04-18/crps_gpe64_vit_azula_large_0f89f06_d337bd8"
    "outputs/2026-04-18/crps_cns64_vit_azula_large_0f89f06_5b7332b"
    "outputs/2026-04-18/crps_ad64_vit_azula_large_0f89f06_4667606"
    # CNS ablations
    "outputs/2026-04-18/crps_cns64_vit_azula_large_0f89f06_cf53b48" # identity+global_cond
    "outputs/2026-04-18/crps_cns64_vit_azula_large_0f89f06_e7e60d9" # AE-ambient (DC encoder/decoder)
)

for run_dir in "${RUN_DIRS[@]}"; do
    if [[ ! -f "${run_dir}/resolved_config.yaml" ]]; then
        echo "Skipping ${run_dir}: resolved_config.yaml missing" >&2
        continue
    fi

    for run_dry in "${RUN_DRY_STATES[@]}"; do
        dry_run_arg=()
        run_label="slurm"
        if [[ "${run_dry}" == "true" ]]; then
            dry_run_arg=(--dry-run)
            run_label="slurm --dry-run"
        fi

        echo "Submitting CRPS-ambient eval"
        echo "  mode: ${run_label}"
        echo "  run_dir: ${run_dir}"
        echo "  eval.batch_size: ${EVAL_BATCH_SIZE}"
        echo "  eval.metrics: ${EVAL_METRICS}"

        uv run autocast eval --mode slurm "${dry_run_arg[@]}" \
            --workdir "${run_dir}" \
            eval.metrics="${EVAL_METRICS}" \
            eval.batch_size="${EVAL_BATCH_SIZE}" \
            hydra.launcher.timeout_min="${TIMEOUT_MIN}"
    done
done
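The two-pass dry-run/submit loop shared by all of these scripts boils down to the pattern below (a standalone sketch; `submit` is an `echo` stand-in for the real `uv run autocast eval` command):

```shell
#!/bin/bash
set -euo pipefail

# First pass previews the sbatch commands with --dry-run, second pass
# submits for real. Set RUN_DRY_STATES=("true") to preview only.
RUN_DRY_STATES=("true" "false")

submit() {
    # Stand-in for: uv run autocast eval --mode slurm "$@" ...
    echo "autocast eval --mode slurm" "$@"
}

for run_dry in "${RUN_DRY_STATES[@]}"; do
    dry_run_arg=()
    if [[ "${run_dry}" == "true" ]]; then
        dry_run_arg=(--dry-run)
    fi
    submit "${dry_run_arg[@]}"
done
```

Building the flag as an array (`dry_run_arg=(--dry-run)` vs `()`) keeps the quoted expansion `"${dry_run_arg[@]}"` from injecting an empty argument on the real-submission pass.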
submit_eval_crps_latent.sh (73 additions & 0 deletions)

#!/bin/bash

set -euo pipefail
# Evaluate CRPS cached-latent processor runs (2026-04-19) in AMBIENT mode.
#
# eval.mode=ambient forces encoder->processor->decoder rollout at every
# step, so decode/encode drift is included in the metrics. This makes the
# latent-space CRPS numbers directly comparable with the ambient CRPS and
# FM baselines (see slurm_scripts/comparison/eval/README.md).
#
# The eval.mode selector landed via PR #327 and is now in-tree. When ambient
# is requested on a cached-latents datamodule, eval auto-substitutes the raw
# datamodule from <cache_dir>/autoencoder_config.yaml; the trained AE weights
# are supplied via autoencoder_checkpoint.
#
# Batch size: cached-latent eval pays the ambient AE encode/decode per step
# but processor forward is cheap (64 tokens vs 256 for ambient-patch4), so
# 8/GPU fits comfortably — same as pure-ambient CRPS.

EVAL_BATCH_SIZE=8
TIMEOUT_MIN=240
RUN_DRY_STATES=("true" "false")
EVAL_METRICS="[mse,mae,nmse,nmae,rmse,nrmse,vmse,vrmse,linf,psrmse,psrmse_low,psrmse_mid,psrmse_high,psrmse_tail,pscc,pscc_low,pscc_mid,pscc_high,pscc_tail,crps,fcrps,afcrps,energy,ssr,winkler]"

# (run_dir, autoencoder_checkpoint) pairs. Extend as more cached-latent CRPS
# runs land (gs, gpe, ad) — the AE paths are the same as training.
RUN_DIRS=(
    "outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"
)
declare -A AE_CKPT=(
    ["outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"]="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8/autoencoder.ckpt"
)

for run_dir in "${RUN_DIRS[@]}"; do
    ae_ckpt="${AE_CKPT[$run_dir]:-}"
    if [[ -z "${ae_ckpt}" ]]; then
        echo "Skipping ${run_dir}: no autoencoder_checkpoint mapping" >&2
        continue
    fi
    if [[ ! -f "${run_dir}/resolved_config.yaml" ]]; then
        echo "Skipping ${run_dir}: resolved_config.yaml missing" >&2
        continue
    fi
    if [[ ! -f "${ae_ckpt}" ]]; then
        echo "Skipping ${run_dir}: AE checkpoint missing at ${ae_ckpt}" >&2
        continue
    fi

    for run_dry in "${RUN_DRY_STATES[@]}"; do
        dry_run_arg=()
        run_label="slurm"
        if [[ "${run_dry}" == "true" ]]; then
            dry_run_arg=(--dry-run)
            run_label="slurm --dry-run"
        fi

        echo "Submitting CRPS cached-latent eval (mode=ambient)"
        echo "  mode: ${run_label}"
        echo "  run_dir: ${run_dir}"
        echo "  autoencoder_checkpoint: ${ae_ckpt}"
        echo "  eval.batch_size: ${EVAL_BATCH_SIZE}"
        echo "  eval.metrics: ${EVAL_METRICS}"

        uv run autocast eval --mode slurm "${dry_run_arg[@]}" \
            --workdir "${run_dir}" \
            eval.checkpoint=processor.ckpt \
            ++eval.mode=ambient \
            +autoencoder_checkpoint="${ae_ckpt}" \
            eval.metrics="${EVAL_METRICS}" \
            eval.batch_size="${EVAL_BATCH_SIZE}" \
            hydra.launcher.timeout_min="${TIMEOUT_MIN}"
    done
done
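The "extend as more runs land" note can be followed mechanically: each new run dir needs a paired AE entry. A hypothetical sketch, where the gs run-dir hash is a PLACEHOLDER (only the AE path is a real one from the FM script; the run itself has not landed):

```shell
#!/bin/bash
set -euo pipefail

# Hypothetical extension of the run/AE mapping. The gs run-dir hash below
# is a PLACEHOLDER, not a real checkpoint; the AE path reuses the AE that
# produced the cached latents at training time.
RUN_DIRS=(
    "outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"
    "outputs/2026-04-19/crps_gs64_vit_azula_large_58712c4_PLACEHOLDER"  # hypothetical
)
declare -A AE_CKPT=(
    ["outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"]="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8/autoencoder.ckpt"
    ["outputs/2026-04-19/crps_gs64_vit_azula_large_58712c4_PLACEHOLDER"]="$HOME/autocast/outputs/2026-04-17/ae_gs64_3a7999b_ed36b8e/autoencoder.ckpt"
)

# Sanity check: every run dir needs an AE mapping, or the eval loop skips it.
for run_dir in "${RUN_DIRS[@]}"; do
    if [[ -z "${AE_CKPT[$run_dir]:-}" ]]; then
        echo "unmapped: ${run_dir}" >&2
    fi
done
```

Keeping the mapping in one `declare -A` block next to `RUN_DIRS` makes a missing pair a loud skip at submit time rather than a silent wrong-AE eval.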
submit_eval_crps_latent_rollout_latent.sh (71 additions & 0 deletions)

#!/bin/bash

set -euo pipefail
# Evaluate CRPS cached-latent processor runs (2026-04-19) in LATENT mode.
#
# eval.mode=latent rolls out only in latent space and writes results to
# eval_latent/ so ambient-vs-latent comparisons can coexist per run.
#
# The eval.mode selector landed via PR #327 and is now in-tree. We still pass
# autoencoder_checkpoint to load the trained AE for eval setup/final decode.
#
# Batch size: latent rollout avoids per-step AE encode/decode, so 8/GPU is a
# conservative setting and matches the ambient-compare CRPS script.

EVAL_BATCH_SIZE=8
TIMEOUT_MIN=240
RUN_DRY_STATES=("true" "false")
EVAL_SUBDIR="eval_latent"
EVAL_METRICS="[mse,mae,nmse,nmae,rmse,nrmse,vmse,vrmse,linf,psrmse,psrmse_low,psrmse_mid,psrmse_high,psrmse_tail,pscc,pscc_low,pscc_mid,pscc_high,pscc_tail,crps,fcrps,afcrps,energy,ssr,winkler]"

# (run_dir, autoencoder_checkpoint) pairs. Extend as more cached-latent CRPS
# runs land (gs, gpe, ad) — the AE paths are the same as training.
RUN_DIRS=(
    "outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"
)
declare -A AE_CKPT=(
    ["outputs/2026-04-19/crps_cns64_vit_azula_large_58712c4_71ba7be"]="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8/autoencoder.ckpt"
)

for run_dir in "${RUN_DIRS[@]}"; do
    ae_ckpt="${AE_CKPT[$run_dir]:-}"
    if [[ -z "${ae_ckpt}" ]]; then
        echo "Skipping ${run_dir}: no autoencoder_checkpoint mapping" >&2
        continue
    fi
    if [[ ! -f "${run_dir}/resolved_config.yaml" ]]; then
        echo "Skipping ${run_dir}: resolved_config.yaml missing" >&2
        continue
    fi
    if [[ ! -f "${ae_ckpt}" ]]; then
        echo "Skipping ${run_dir}: AE checkpoint missing at ${ae_ckpt}" >&2
        continue
    fi

    for run_dry in "${RUN_DRY_STATES[@]}"; do
        dry_run_arg=()
        run_label="slurm"
        if [[ "${run_dry}" == "true" ]]; then
            dry_run_arg=(--dry-run)
            run_label="slurm --dry-run"
        fi

        echo "Submitting CRPS cached-latent eval (mode=latent)"
        echo "  mode: ${run_label}"
        echo "  run_dir: ${run_dir}"
        echo "  autoencoder_checkpoint: ${ae_ckpt}"
        echo "  eval.batch_size: ${EVAL_BATCH_SIZE}"
        echo "  eval.metrics: ${EVAL_METRICS}"
        echo "  output_dir: ${run_dir}/${EVAL_SUBDIR}"

        uv run autocast eval --mode slurm "${dry_run_arg[@]}" \
            --workdir "${run_dir}" \
            eval.checkpoint=processor.ckpt \
            ++eval.mode=latent \
            +autoencoder_checkpoint="${ae_ckpt}" \
            eval.metrics="${EVAL_METRICS}" \
            eval.batch_size="${EVAL_BATCH_SIZE}" \
            hydra.sweep.dir="${run_dir}/${EVAL_SUBDIR}" \
            hydra.launcher.timeout_min="${TIMEOUT_MIN}"
    done
done
submit_eval_fm_ambient.sh (50 additions & 0 deletions)

#!/bin/bash

set -euo pipefail
# Evaluate FM-in-ambient (flow matching, identity encoder) EPD runs from
# 2026-04-18 across all 4 datasets. Eval reuses resolved_config.yaml so
# flow_ode_steps (=50), hid_channels, and backbone match training.
#
# Batch size: diffusion rollout is ODE-integrated (flow_ode_steps=50) per
# rollout step, so ambient 64x64 × n_members=10 × 50 ODE substeps is the
# tightest of the three. 4/GPU fits; drop to 2 if OOM.

EVAL_BATCH_SIZE=4
TIMEOUT_MIN=360
RUN_DRY_STATES=("true" "false")
EVAL_METRICS="[mse,mae,nmse,nmae,rmse,nrmse,vmse,vrmse,linf,psrmse,psrmse_low,psrmse_mid,psrmse_high,psrmse_tail,pscc,pscc_low,pscc_mid,pscc_high,pscc_tail,crps,fcrps,afcrps,energy,ssr,winkler]"

RUN_DIRS=(
    "outputs/2026-04-18/diff_gs64_flow_matching_vit_0f89f06_6e3a299"
    "outputs/2026-04-18/diff_gpe64_flow_matching_vit_0f89f06_3b3604d"
    "outputs/2026-04-18/diff_cns64_flow_matching_vit_0f89f06_483bb70"
    "outputs/2026-04-18/diff_ad64_flow_matching_vit_0f89f06_725d44a"
)

for run_dir in "${RUN_DIRS[@]}"; do
    if [[ ! -f "${run_dir}/resolved_config.yaml" ]]; then
        echo "Skipping ${run_dir}: resolved_config.yaml missing" >&2
        continue
    fi

    for run_dry in "${RUN_DRY_STATES[@]}"; do
        dry_run_arg=()
        run_label="slurm"
        if [[ "${run_dry}" == "true" ]]; then
            dry_run_arg=(--dry-run)
            run_label="slurm --dry-run"
        fi

        echo "Submitting FM-ambient eval"
        echo "  mode: ${run_label}"
        echo "  run_dir: ${run_dir}"
        echo "  eval.batch_size: ${EVAL_BATCH_SIZE}"
        echo "  eval.metrics: ${EVAL_METRICS}"

        uv run autocast eval --mode slurm "${dry_run_arg[@]}" \
            --workdir "${run_dir}" \
            eval.metrics="${EVAL_METRICS}" \
            eval.batch_size="${EVAL_BATCH_SIZE}" \
            hydra.launcher.timeout_min="${TIMEOUT_MIN}"
    done
done
submit_eval_fm_latent.sh (77 additions & 0 deletions)

#!/bin/bash

set -euo pipefail
# Evaluate FM cached-latent processor runs (2026-04-18) in AMBIENT mode.
#
# eval.mode=ambient forces encoder->processor->decoder rollout at every
# step, so decode/encode drift is included in the metrics — the
# apples-to-apples regime for comparison with the ambient FM baseline.
#
# The eval.mode selector landed via PR #327 and is now in-tree. When ambient
# is requested on a cached-latents datamodule, eval auto-substitutes the raw
# datamodule from <cache_dir>/autoencoder_config.yaml; the trained AE weights
# are supplied via autoencoder_checkpoint.
#
# Batch size: ambient rollout pays encode/decode every step plus 50 ODE
# substeps through the processor. Cached-latent processor forward is lighter
# (64 tokens vs 256 for ambient FM), so 4/GPU is a safe start; the tight
# spot is the same ODE + AE stack so it mirrors FM-ambient.

EVAL_BATCH_SIZE=4
TIMEOUT_MIN=360
RUN_DRY_STATES=("true" "false")
EVAL_METRICS="[mse,mae,nmse,nmae,rmse,nrmse,vmse,vrmse,linf,psrmse,psrmse_low,psrmse_mid,psrmse_high,psrmse_tail,pscc,pscc_low,pscc_mid,pscc_high,pscc_tail,crps,fcrps,afcrps,energy,ssr,winkler]"

RUN_DIRS=(
    "outputs/2026-04-18/diff_gs64_flow_matching_vit_0f89f06_f6e8f51"
    "outputs/2026-04-18/diff_gpe64_flow_matching_vit_0f89f06_b954f94"
    "outputs/2026-04-18/diff_cns64_flow_matching_vit_0f89f06_0e1c64b"
    "outputs/2026-04-18/diff_ad64_flow_matching_vit_0f89f06_df2137c"
)
declare -A AE_CKPT=(
    ["outputs/2026-04-18/diff_gs64_flow_matching_vit_0f89f06_f6e8f51"]="$HOME/autocast/outputs/2026-04-17/ae_gs64_3a7999b_ed36b8e/autoencoder.ckpt"
    ["outputs/2026-04-18/diff_gpe64_flow_matching_vit_0f89f06_b954f94"]="$HOME/autocast/outputs/2026-04-17/ae_gpe64_3a7999b_31e1c9f/autoencoder.ckpt"
    ["outputs/2026-04-18/diff_cns64_flow_matching_vit_0f89f06_0e1c64b"]="$HOME/autocast/outputs/2026-04-17/ae_cns64_3a7999b_b9c29f8/autoencoder.ckpt"
    ["outputs/2026-04-18/diff_ad64_flow_matching_vit_0f89f06_df2137c"]="$HOME/autocast/outputs/2026-04-17/ae_ad64_3a7999b_1a1e300/autoencoder.ckpt"
)

for run_dir in "${RUN_DIRS[@]}"; do
    ae_ckpt="${AE_CKPT[$run_dir]:-}"
    if [[ -z "${ae_ckpt}" ]]; then
        echo "Skipping ${run_dir}: no autoencoder_checkpoint mapping" >&2
        continue
    fi
    if [[ ! -f "${run_dir}/resolved_config.yaml" ]]; then
        echo "Skipping ${run_dir}: resolved_config.yaml missing" >&2
        continue
    fi
    if [[ ! -f "${ae_ckpt}" ]]; then
        echo "Skipping ${run_dir}: AE checkpoint missing at ${ae_ckpt}" >&2
        continue
    fi

    for run_dry in "${RUN_DRY_STATES[@]}"; do
        dry_run_arg=()
        run_label="slurm"
        if [[ "${run_dry}" == "true" ]]; then
            dry_run_arg=(--dry-run)
            run_label="slurm --dry-run"
        fi

        echo "Submitting FM cached-latent eval (mode=ambient)"
        echo "  mode: ${run_label}"
        echo "  run_dir: ${run_dir}"
        echo "  autoencoder_checkpoint: ${ae_ckpt}"
        echo "  eval.batch_size: ${EVAL_BATCH_SIZE}"
        echo "  eval.metrics: ${EVAL_METRICS}"

        uv run autocast eval --mode slurm "${dry_run_arg[@]}" \
            --workdir "${run_dir}" \
            eval.checkpoint=processor.ckpt \
            ++eval.mode=ambient \
            +autoencoder_checkpoint="${ae_ckpt}" \
            eval.metrics="${EVAL_METRICS}" \
            eval.batch_size="${EVAL_BATCH_SIZE}" \
            hydra.launcher.timeout_min="${TIMEOUT_MIN}"
    done
done
