alan-turing-institute · sgreenbury · Apr 23, 2026 · Apr 23, 2026
diff --git a/slurm_scripts/ablations/ensemble_size/README.md b/slurm_scripts/ablations/ensemble_size/README.md
@@ -54,7 +54,8 @@ batch) CRPS ablations on the other comparison datasets.
 | `submit_ensemble_timing.sh` | 5-epoch timing for the current `eff_bs1024` timing set (`conditioned_navier_stokes`, `gray_scott`, `gpe_laser_only_wake`, `advection_diffusion`) -> `timing.ckpt` per run |
 | `submit_ensemble_large.sh`  | 24h production runs for the same three active runs, using cached or timing-derived cosine schedules |
 | `eval/submit_eval_crps_ambient.sh` | ambient eval for the current `m=16` CRPS run set (CNS `fixed_bs32` pilot plus all available `eff_bs1024` runs), with conservative `eval.batch_size=4` and explicit `eval.n_members=10` to match the comparison-study eval regime |
-| `eval_0p75/submit_eval_crps_ambient.sh` | ambient eval for the same run set, but against each run's third `quarter-*.ckpt` (the 75% schedule checkpoint), with outputs isolated under `eval_0p75/` |
+| `eval_0p50/submit_eval_crps_ambient.sh` | ambient eval for each run's `snapshot-0p50-*.ckpt` (50% progress) checkpoint, falling back to the second sorted legacy `quarter-*.ckpt` for older runs, with outputs isolated under `eval_0p50/` |
+| `eval_0p75/submit_eval_crps_ambient.sh` | ambient eval for each run's `snapshot-0p75-*.ckpt` (75% progress) checkpoint, falling back to the third sorted legacy `quarter-*.ckpt` for older runs, with outputs isolated under `eval_0p75/` |
 
 ## Extending the sweep
 
@@ -78,7 +79,8 @@ the run set is still partly ablation-only (`fixed_bs32`) even though the
 We keep two sibling eval directories here:
 
 - `eval/` for the standard final-checkpoint evals.
-- `eval_0p75/` for the third quarter-checkpoint (`75%`) evals, so those
+- `eval_0p50/` and `eval_0p75/` for 50% and 75% progress-checkpoint
+  evals, so those
   partial-schedule outputs do not mix with the canonical final-checkpoint
   metrics and videos.
 

diff --git a/slurm_scripts/ablations/ensemble_size/eval_0p50/submit_eval_crps_ambient.sh b/slurm_scripts/ablations/ensemble_size/eval_0p50/submit_eval_crps_ambient.sh
@@ -1,19 +1,18 @@
 #!/bin/bash
 
 set -euo pipefail
-# Ambient eval submitter for the 50%-schedule checkpoints from the current
+# Ambient eval submitter for the 50%-progress checkpoints from the current
 # CRPS ensemble-size runs.
 #
 # This mirrors ../eval/submit_eval_crps_ambient.sh but keeps outputs under a
 # sibling eval_0p50/ folder so partial-schedule metrics, rollout videos, and
 # SLURM logs do not mix with the standard final-checkpoint evals.
 #
-# The large-run training scripts save checkpoints at 25/50/75/100% of the
-# cosine schedule via quarter-*.ckpt filenames. Rather than hard-coding epoch
-# numbers per dataset, we sort the available quarter checkpoints and pick the
-# second one (the 0.50 checkpoint) for each run.
+# Current large-run training scripts save a checkpoint every 5% of training
+# progress as snapshot-<progress>-*.ckpt; the last sorted 0p50 match is used.
+# Older quarter-schedule runs fall back to the second sorted quarter-*.ckpt.
 #
-# Force eval.mode=ambient here. These quarter checkpoints can look
+# Force eval.mode=ambient here. These intermediate checkpoints can look
 # processor-only to the early eval.mode=auto dispatcher because stateless
 # encoders/decoders (PermuteConcat / ChannelsLast) contribute no
 # encoder_decoder.* weights, even though the full raw-space ambient path is the
@@ -41,17 +40,28 @@ RUN_DIRS=(
 
 resolve_half_checkpoint() {
     local run_dir="$1"
+    local -a snapshot_ckpts=()  # sorted paths matching snapshot-0p50-*.ckpt
     local -a quarter_ckpts=()
 
+    mapfile -t snapshot_ckpts < <(
+        find "${run_dir}" -type f -path '*/checkpoints/snapshot-0p50-*.ckpt' | sort
+    )
+
+    if (( ${#snapshot_ckpts[@]} >= 1 )); then  # prefer the new-style snapshot
+        printf '%s\n' "${snapshot_ckpts[$(( ${#snapshot_ckpts[@]} - 1 ))]}"  # last sorted match
+        return 0
+    fi
+
     mapfile -t quarter_ckpts < <(
         find "${run_dir}" -type f -path '*/checkpoints/quarter-*.ckpt' | sort
     )
 
-    if (( ${#quarter_ckpts[@]} < 2 )); then
-        return 1
+    if (( ${#quarter_ckpts[@]} > 1 )); then  # legacy fallback needs at least two files
+        printf '%s\n' "${quarter_ckpts[1]}"  # index 1 = second sorted quarter checkpoint
+        return 0
     fi
 
-    printf '%s\n' "${quarter_ckpts[1]}"
+    return 1  # nothing usable found; the caller skips this run
 }
 
 for run_dir in "${RUN_DIRS[@]}"; do
@@ -62,7 +72,7 @@ for run_dir in "${RUN_DIRS[@]}"; do
     fi
 
     if ! eval_ckpt="$(resolve_half_checkpoint "${run_dir_abs}")"; then
-        echo "Skipping ${run_dir}: fewer than two quarter-*.ckpt files found" >&2
+        echo "Skipping ${run_dir}: neither snapshot-0p50-*.ckpt nor legacy second quarter-*.ckpt found" >&2
         continue
     fi
     eval_ckpt_abs="$(realpath "${eval_ckpt}")"

diff --git a/slurm_scripts/ablations/ensemble_size/eval_0p75/submit_eval_crps_ambient.sh b/slurm_scripts/ablations/ensemble_size/eval_0p75/submit_eval_crps_ambient.sh
@@ -1,19 +1,18 @@
 #!/bin/bash
 
 set -euo pipefail
-# Ambient eval submitter for the 75%-schedule checkpoints from the current
+# Ambient eval submitter for the 75%-progress checkpoints from the current
 # CRPS ensemble-size runs.
 #
 # This mirrors ../eval/submit_eval_crps_ambient.sh but keeps outputs under a
 # sibling eval_0p75/ folder so partial-schedule metrics, rollout videos, and
 # SLURM logs do not mix with the standard final-checkpoint evals.
 #
-# The large-run training scripts save checkpoints at 25/50/75/100% of the
-# cosine schedule via quarter-*.ckpt filenames. Rather than hard-coding epoch
-# numbers per dataset, we sort the available quarter checkpoints and pick the
-# third one (the 0.75 checkpoint) for each run.
+# Current large-run training scripts save a checkpoint every 5% of training
+# progress as snapshot-<progress>-*.ckpt; the last sorted 0p75 match is used.
+# Older quarter-schedule runs fall back to the third sorted quarter-*.ckpt.
 #
-# Force eval.mode=ambient here. These quarter checkpoints can look
+# Force eval.mode=ambient here. These intermediate checkpoints can look
 # processor-only to the early eval.mode=auto dispatcher because stateless
 # encoders/decoders (PermuteConcat / ChannelsLast) contribute no
 # encoder_decoder.* weights, even though the full raw-space ambient path is the
@@ -39,19 +38,32 @@ RUN_DIRS=(
     "outputs/2026-04-21/ensemble_size/crps_ad64_vit_azula_large_ac1bb06_ef6368d"
 )
 
-resolve_three_quarter_checkpoint() {
+resolve_progress_checkpoint() {  # prints the chosen checkpoint path; returns 1 if none
     local run_dir="$1"
+    local progress_token="$2"  # progress tag inside the snapshot filename, e.g. "0p75"
+    local legacy_quarter_index="$3"  # 0-based index into the sorted quarter-*.ckpt list
+    local -a snapshot_ckpts=()  # sorted paths matching snapshot-<progress_token>-*.ckpt
     local -a quarter_ckpts=()
 
+    mapfile -t snapshot_ckpts < <(
+        find "${run_dir}" -type f -path "*/checkpoints/snapshot-${progress_token}-*.ckpt" | sort
+    )
+
+    if (( ${#snapshot_ckpts[@]} >= 1 )); then  # prefer the new-style snapshot
+        printf '%s\n' "${snapshot_ckpts[$(( ${#snapshot_ckpts[@]} - 1 ))]}"  # last sorted match
+        return 0
+    fi
+
     mapfile -t quarter_ckpts < <(
         find "${run_dir}" -type f -path '*/checkpoints/quarter-*.ckpt' | sort
     )
 
-    if (( ${#quarter_ckpts[@]} < 3 )); then
-        return 1
+    if (( ${#quarter_ckpts[@]} > legacy_quarter_index )); then  # legacy fallback: index must exist
+        printf '%s\n' "${quarter_ckpts[$legacy_quarter_index]}"  # caller passes 2 for the third file
+        return 0
     fi
 
-    printf '%s\n' "${quarter_ckpts[2]}"
+    return 1  # nothing usable found; the caller skips this run
 }
 
 for run_dir in "${RUN_DIRS[@]}"; do
@@ -61,8 +73,8 @@ for run_dir in "${RUN_DIRS[@]}"; do
         continue
     fi
 
-    if ! eval_ckpt="$(resolve_three_quarter_checkpoint "${run_dir_abs}")"; then
-        echo "Skipping ${run_dir}: fewer than three quarter-*.ckpt files found" >&2
+    if ! eval_ckpt="$(resolve_progress_checkpoint "${run_dir_abs}" "0p75" 2)"; then
+        echo "Skipping ${run_dir}: neither snapshot-0p75-*.ckpt nor legacy third quarter-*.ckpt found" >&2
         continue
     fi
     eval_ckpt_abs="$(realpath "${eval_ckpt}")"

diff --git a/slurm_scripts/ablations/ensemble_size/submit_ensemble_large.sh b/slurm_scripts/ablations/ensemble_size/submit_ensemble_large.sh
@@ -162,7 +162,6 @@ for datamodule in "${!DATASETS[@]}"; do
             continue
         fi
 
-        quarter_epochs=$((cosine_epochs / 4))
         wandb_name="ensemble_m${n_members}_${regime}"
 
         for run_dry in "${RUN_DRY_STATES[@]}"; do
@@ -192,9 +191,10 @@ for datamodule in "${!DATASETS[@]}"; do
                 hydra.launcher.timeout_min="${TIMEOUT_MIN}" \
                 trainer.max_time="${BUDGET_MAX_TIME}" \
                 +trainer.max_epochs="${cosine_epochs}" \
-                trainer.callbacks.0.every_n_epochs="${quarter_epochs}" \
+                trainer.callbacks.0.every_n_train_steps_fraction=0.05 \
+                +trainer.callbacks.0.every_n_epochs=0 \
                 trainer.callbacks.0.save_top_k=-1 \
-                trainer.callbacks.0.filename=\"quarter-{epoch:04d}\"
+                trainer.callbacks.0.filename=\"snapshot-{progress_token}-{epoch:04d}-{step:08d}\"
         done
     done
 done
diff --git a/slurm_scripts/ablations/model_size/submit_model_size_large.sh b/slurm_scripts/ablations/model_size/submit_model_size_large.sh
@@ -97,7 +97,6 @@ for datamodule in "${!DATASETS[@]}"; do
             continue
         fi
 
-        quarter_epochs=$((cosine_epochs / 4))
         wandb_name="model_size_${variant}"
 
         for run_dry in "${RUN_DRY_STATES[@]}"; do
@@ -127,9 +126,10 @@ for datamodule in "${!DATASETS[@]}"; do
                 hydra.launcher.timeout_min="${TIMEOUT_MIN}" \
                 trainer.max_time="${BUDGET_MAX_TIME}" \
                 +trainer.max_epochs="${cosine_epochs}" \
-                trainer.callbacks.0.every_n_epochs="${quarter_epochs}" \
+                trainer.callbacks.0.every_n_train_steps_fraction=0.05 \
+                +trainer.callbacks.0.every_n_epochs=0 \
                 trainer.callbacks.0.save_top_k=-1 \
-                trainer.callbacks.0.filename=\"quarter-{epoch:04d}\"
+                trainer.callbacks.0.filename=\"snapshot-{progress_token}-{epoch:04d}-{step:08d}\"
         done
     done
 done
diff --git a/slurm_scripts/comparison/cached_latents/submit_crps_latent_cns_large.sh b/slurm_scripts/comparison/cached_latents/submit_crps_latent_cns_large.sh
@@ -61,7 +61,7 @@ for run_dry in "${RUN_DRY_STATES[@]}"; do
         trainer.max_time="${BUDGET_MAX_TIME}" \
         +trainer.max_epochs="${COSINE_EPOCHS}" \
         trainer.callbacks.0.every_n_train_steps_fraction=0.05 \
-        trainer.callbacks.0.every_n_epochs=0 \
+        +trainer.callbacks.0.every_n_epochs=0 \
         trainer.callbacks.0.save_top_k=-1 \
         trainer.callbacks.0.filename=\"snapshot-{progress_token}-{epoch:04d}-{step:08d}\"
 done
diff --git a/slurm_scripts/comparison/cached_latents/submit_crps_latent_large.sh b/slurm_scripts/comparison/cached_latents/submit_crps_latent_large.sh
@@ -86,7 +86,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
             trainer.max_time="${BUDGET_MAX_TIME}" \
             +trainer.max_epochs="${cosine_epochs}" \
             trainer.callbacks.0.every_n_train_steps_fraction=0.05 \
-            trainer.callbacks.0.every_n_epochs=0 \
+            +trainer.callbacks.0.every_n_epochs=0 \
             trainer.callbacks.0.save_top_k=-1 \
             trainer.callbacks.0.filename=\"snapshot-{progress_token}-{epoch:04d}-{step:08d}\"
     done

diff --git a/slurm_scripts/comparison/cached_latents/submit_fm_large.sh b/slurm_scripts/comparison/cached_latents/submit_fm_large.sh
@@ -84,7 +84,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
             trainer.max_time="${BUDGET_MAX_TIME}" \
             +trainer.max_epochs="${cosine_epochs}" \
             trainer.callbacks.0.every_n_train_steps_fraction=0.05 \
-            trainer.callbacks.0.every_n_epochs=0 \
+            +trainer.callbacks.0.every_n_epochs=0 \
             trainer.callbacks.0.save_top_k=-1 \
             trainer.callbacks.0.filename=\"snapshot-{progress_token}-{epoch:04d}-{step:08d}\"
     done

diff --git a/slurm_scripts/comparison/epd/submit_crps_ae_ambient_large.sh b/slurm_scripts/comparison/epd/submit_crps_ae_ambient_large.sh
@@ -78,7 +78,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
             trainer.max_time="${BUDGET_MAX_TIME}" \
             +trainer.max_epochs="${cosine_epochs}" \
             trainer.callbacks.0.every_n_train_steps_fraction=0.05 \
-            trainer.callbacks.0.every_n_epochs=0 \
+            +trainer.callbacks.0.every_n_epochs=0 \
             trainer.callbacks.0.save_top_k=-1 \
             trainer.callbacks.0.filename=\"snapshot-{progress_token}-{epoch:04d}-{step:08d}\"
     done

diff --git a/slurm_scripts/comparison/epd/submit_crps_ambient_identity_global_cond_large.sh b/slurm_scripts/comparison/epd/submit_crps_ambient_identity_global_cond_large.sh
@@ -55,7 +55,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
             trainer.max_time="${BUDGET_MAX_TIME}" \
             +trainer.max_epochs="${cosine_epochs}" \
             trainer.callbacks.0.every_n_train_steps_fraction=0.05 \
-            trainer.callbacks.0.every_n_epochs=0 \
+            +trainer.callbacks.0.every_n_epochs=0 \
             trainer.callbacks.0.save_top_k=-1 \
             trainer.callbacks.0.filename=\"snapshot-{progress_token}-{epoch:04d}-{step:08d}\"
     done

diff --git a/slurm_scripts/comparison/epd/submit_crps_large.sh b/slurm_scripts/comparison/epd/submit_crps_large.sh
@@ -63,7 +63,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
             trainer.max_time="${BUDGET_MAX_TIME}" \
             +trainer.max_epochs="${cosine_epochs}" \
             trainer.callbacks.0.every_n_train_steps_fraction=0.05 \
-            trainer.callbacks.0.every_n_epochs=0 \
+            +trainer.callbacks.0.every_n_epochs=0 \
             trainer.callbacks.0.save_top_k=-1 \
             trainer.callbacks.0.filename=\"snapshot-{progress_token}-{epoch:04d}-{step:08d}\"
     done

diff --git a/slurm_scripts/comparison/epd/submit_fm_ambient_large.sh b/slurm_scripts/comparison/epd/submit_fm_ambient_large.sh
@@ -67,7 +67,7 @@ for datamodule in "${!EXPERIMENTS[@]}"; do
             trainer.max_time="${BUDGET_MAX_TIME}" \
             +trainer.max_epochs="${cosine_epochs}" \
             trainer.callbacks.0.every_n_train_steps_fraction=0.05 \
-            trainer.callbacks.0.every_n_epochs=0 \
+            +trainer.callbacks.0.every_n_epochs=0 \
             trainer.callbacks.0.save_top_k=-1 \
             trainer.callbacks.0.filename=\"snapshot-{progress_token}-{epoch:04d}-{step:08d}\"
     done