Commit 9830e1d

Merge pull request #327 from alan-turing-institute/add-eval-modes
Add eval.mode selector for ambient vs latent rollout
2 parents b304223 + d53d411 commit 9830e1d

5 files changed

Lines changed: 527 additions & 1 deletion

src/autocast/configs/eval/README.md

Lines changed: 51 additions & 0 deletions
@@ -44,6 +44,9 @@ python -m autocast.scripts.eval.encoder_processor_decoder \

All eval configs support these parameters:

- `checkpoint`: Path to model checkpoint (required for evaluation)
- `mode`: Evaluation regime (`auto` | `ambient` | `latent`). Controls the
  **rollout space**, not just the metrics space. See
  [Ambient vs latent rollout](#ambient-vs-latent-rollout) below.
- `metrics`: List of metrics to compute (default includes mse/mae/rmse/vrmse,
  power spectrum scores `psrmse*`, cross-correlation spectrum scores `pscc*`,
  and ensemble scores `crps`, `fcrps`, `afcrps`, `energy`, `ssr`; `variogram`

@@ -79,3 +82,51 @@ On SLURM, `srun` propagates `LOCAL_RANK` / `WORLD_SIZE` into the

process so Fabric DDP initialises automatically — no extra flags needed.

- `max_rollout_steps`: Maximum number of rollout steps
- `free_running_only`: Whether to disable teacher forcing

## Ambient vs latent rollout

Processor checkpoints trained on cached latents can be evaluated in two
qualitatively different regimes. The `eval.mode` knob makes the choice
explicit and surfaces clear errors when the rest of the config is
inconsistent with the request.

- `eval.mode=auto` (default) preserves historical behavior: the script picks
  a path based on `(checkpoint type, datamodule batch type,
  autoencoder_checkpoint)`.
- `eval.mode=ambient` forces full `encoder -> processor -> decoder` rollout.
  Each rollout step decodes to ambient fields and re-encodes on the next
  step, so decode/encode drift is included in the metrics. **This is the
  apples-to-apples regime for comparing against baselines that natively roll
  out in data space (e.g. a CRPS comparison against a non-autoencoder
  model).** Requires `autoencoder_checkpoint=<ae.ckpt>` and a raw-Batch
  datamodule. When the current datamodule yields `EncodedBatch` (cached
  latents), eval auto-substitutes the datamodule from
  `<cache_dir>/autoencoder_config.yaml` saved by `autocast cache-latents`.
  Pass `datamodule=...` explicitly to override the default.
- `eval.mode=latent` forces latent-space rollout: the processor's predicted
  latent is fed back as the next latent input; the encoder is invoked only
  once. Metrics are decoded to data space via the decoder saved alongside
  the cached latents when available, otherwise they are reported in latent
  space. Requires an `EncodedBatch` / cached-latents datamodule.

### Running the ambient ablation

Given an autoencoder checkpoint and a processor checkpoint trained on its
cached latents, a minimal invocation is:

```bash
# Ambient (encoder -> processor -> decoder at every rollout step)
autocast eval --workdir <processor_workdir> \
    eval.mode=ambient \
    eval.checkpoint=<processor.ckpt> \
    autoencoder_checkpoint=<autoencoder.ckpt>

# Latent (processor rollout stays in latent space; decoded only for metrics)
autocast eval --workdir <processor_workdir> \
    eval.mode=latent \
    eval.checkpoint=<processor.ckpt>
```

The ambient run will differ from the latent run by exactly the
decode/encode drift accumulated over rollout steps, which is the relevant
delta when comparing against purely-ambient baselines.
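That accumulation of decode/encode drift can be illustrated with a toy linear "autoencoder" and a damped "processor" (a self-contained NumPy sketch; `encode`, `decode`, and `step` here are illustrative stand-ins, not autocast APIs):

```python
import numpy as np

rng = np.random.default_rng(0)
# Lossy linear "autoencoder": 8-dim ambient field <-> 4-dim latent.
# Because W is not orthonormal, encode(decode(z)) != z, so ambient
# rollout accumulates round-trip error that latent rollout never sees.
W = rng.standard_normal((8, 4)) / np.sqrt(8)

def encode(x):
    return x @ W

def decode(z):
    return z @ W.T

def step(z):
    return 0.9 * z  # stand-in processor: damped latent dynamics

x0 = rng.standard_normal(8)
n_steps = 5

# Latent rollout: encode once, iterate in latent space, decode for metrics.
z = encode(x0)
latent_preds = []
for _ in range(n_steps):
    z = step(z)
    latent_preds.append(decode(z))

# Ambient rollout: decode the prediction and re-encode it at every step.
x = x0
ambient_preds = []
for _ in range(n_steps):
    x = decode(step(encode(x)))
    ambient_preds.append(x)

# Step 1 performs identical operations in both regimes; from step 2 on,
# the extra decode->encode round trips make the trajectories diverge.
drift = [float(np.linalg.norm(a - b)) for a, b in zip(ambient_preds, latent_preds)]
print(drift)  # first entry is 0.0, later entries are nonzero
```

In this linear toy the two rollouts agree exactly on the first step and then separate, which is the same qualitative pattern the ambient-vs-latent ablation measures.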

src/autocast/configs/eval/default.yaml

Lines changed: 20 additions & 0 deletions
@@ -2,6 +2,26 @@

# Path to checkpoint for evaluation (required for eval)
checkpoint: null

# Evaluation mode selector (controls rollout space, not just metrics space).
#
# auto     (default) Infer from checkpoint type + batch type + autoencoder_checkpoint.
#          Preserves historical behavior.
# ambient  Force full encoder -> processor -> decoder rollout. Each rollout step
#          decodes and re-encodes, so decode/encode drift is included in the
#          metrics -- this is the apples-to-apples regime for comparing against
#          models that natively roll out in ambient/data space (e.g. CRPS baselines).
#          Requires `autoencoder_checkpoint=<ae.ckpt>` and a raw-Batch datamodule.
#          When the datamodule yields EncodedBatch (cached latents), the eval
#          script auto-substitutes the datamodule from
#          `<cache_dir>/autoencoder_config.yaml` written by `autocast cache-latents`.
#          Pass `datamodule=...` explicitly to override that default.
# latent   Force latent-space rollout (processor predictions are fed back as
#          latents; encoder is not re-invoked). Metrics are decoded to data
#          space via the decoder saved alongside the cached latents if
#          available, otherwise computed in latent space. Requires an
#          EncodedBatch datamodule (cached latents).
mode: auto

# Evaluation metrics to compute
metrics:
  - mse
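The selector's normalization contract (None falls back to `auto`; matching is case- and whitespace-insensitive; anything else fails loudly) mirrors the `_normalize_eval_mode` helper this commit adds to the eval script. Restated here as a standalone sketch for illustration, not imported from autocast:

```python
EVAL_MODES = ("auto", "ambient", "latent")

def normalize_eval_mode(mode):
    """Mirror of the eval script's validation: None -> "auto",
    case/whitespace-insensitive matching, loud error otherwise."""
    if mode is None:
        return "auto"
    mode_str = str(mode).strip().lower()
    if mode_str not in EVAL_MODES:
        raise ValueError(
            f"Unknown eval.mode={mode!r}. Valid values: {', '.join(EVAL_MODES)}."
        )
    return mode_str

print(normalize_eval_mode(" Ambient "))  # -> ambient
print(normalize_eval_mode(None))         # -> auto
```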

src/autocast/scripts/eval/encoder_processor_decoder.py

Lines changed: 150 additions & 0 deletions
@@ -137,6 +137,14 @@

MEMORY_INTENSIVE_METRICS = {"variogram"}

EVAL_MODES = ("auto", "ambient", "latent")

# Resolved eval paths exposed for validation / testing. Each corresponds to
# exactly one branch in `run_evaluation`'s model-selection block.
EVAL_PATH_AMBIENT_EPD = "ambient_epd"  # full EPD checkpoint or processor+AE
EVAL_PATH_LATENT_CACHED_WITH_DECODER = "latent_cached_with_decoder"  # Mode 2
EVAL_PATH_LATENT_CACHED_LATENT_ONLY = "latent_cached_latent_only"  # fallback


def _decode_tensor(
    x: torch.Tensor,

@@ -1071,6 +1079,121 @@ def _load_autoencoder_config_from_cache(cache_dir: Path) -> DictConfig | None:

    return None


def _normalize_eval_mode(mode: Any) -> str:
    """Normalize and validate the eval.mode config value."""
    if mode is None:
        return "auto"
    mode_str = str(mode).strip().lower()
    if mode_str not in EVAL_MODES:
        msg = f"Unknown eval.mode={mode!r}. Valid values: {', '.join(EVAL_MODES)}."
        raise ValueError(msg)
    return mode_str


def _maybe_swap_to_ambient_datamodule(
    cfg: DictConfig,
    *,
    eval_mode: str,
    example_batch: Any,
) -> DictConfig:
    """Substitute the raw-data datamodule from `autoencoder_config.yaml`.

    When the user requests ``eval.mode=ambient`` but the current datamodule
    yields ``EncodedBatch`` (cached latents), we cannot run
    encoder -> processor -> decoder in ambient space: the encoder needs raw
    fields. This helper reads the ``autoencoder_config.yaml`` written next
    to the cached latents by ``autocast cache-latents`` and overwrites
    ``cfg.datamodule`` with the datamodule the autoencoder was trained on,
    which guarantees matching normalization and field layout.

    Modifies ``cfg`` in place and returns it. Raises a descriptive error
    when the swap is needed but ``autoencoder_config.yaml`` is absent;
    callers should pass ``datamodule=...`` explicitly in that case.
    """
    if eval_mode != "ambient" or not isinstance(example_batch, EncodedBatch):
        return cfg

    data_path = cfg.get("datamodule", {}).get("data_path")
    if not data_path:
        msg = (
            "eval.mode=ambient requires a raw-data datamodule, but the current "
            "datamodule yields EncodedBatch and has no data_path to locate the "
            "original autoencoder config. Pass datamodule=<raw> explicitly."
        )
        raise ValueError(msg)

    ae_cfg = _load_autoencoder_config_from_cache(Path(data_path))
    if ae_cfg is None:
        msg = (
            "eval.mode=ambient requested but the cached-latents directory "
            f"{data_path} has no 'autoencoder_config.yaml'. Either regenerate "
            "the cache with a recent `autocast cache-latents` (which saves the "
            "autoencoder config), or pass datamodule=<raw> explicitly."
        )
        raise FileNotFoundError(msg)

    ae_datamodule = ae_cfg.get("datamodule")
    if ae_datamodule is None:
        msg = (
            f"autoencoder_config.yaml at {data_path} is missing a 'datamodule' "
            "section; cannot auto-wire ambient eval. Pass datamodule=<raw> "
            "explicitly."
        )
        raise ValueError(msg)

    log.info(
        "eval.mode=ambient: substituting cached_latents datamodule with the "
        "raw-data datamodule from %s/autoencoder_config.yaml so the encoder "
        "sees the same fields/normalization it was trained on.",
        data_path,
    )
    with open_dict(cfg):
        cfg.datamodule = ae_datamodule
    return cfg


def _resolve_eval_path(
    *,
    processor_only: bool,
    example_batch: Any,
    has_autoencoder_checkpoint: bool,
    decode_fn_loaded: bool,
) -> str:
    """Map the auto-detected branch in `run_evaluation` to a stable label."""
    if not processor_only:
        return EVAL_PATH_AMBIENT_EPD
    if isinstance(example_batch, Batch) and has_autoencoder_checkpoint:
        return EVAL_PATH_AMBIENT_EPD
    if decode_fn_loaded:
        return EVAL_PATH_LATENT_CACHED_WITH_DECODER
    return EVAL_PATH_LATENT_CACHED_LATENT_ONLY


def _validate_resolved_eval_path(*, eval_mode: str, resolved_path: str) -> None:
    """Raise if the resolved code path disagrees with the user-requested mode."""
    if eval_mode == "auto":
        return
    if eval_mode == "ambient" and resolved_path != EVAL_PATH_AMBIENT_EPD:
        msg = (
            "eval.mode=ambient but the resolved eval path is "
            f"{resolved_path!r}. Ambient eval requires a full EPD checkpoint, "
            "OR a processor-only checkpoint combined with "
            "autoencoder_checkpoint=<ae.ckpt> AND a raw-Batch datamodule. "
            "Double-check eval.checkpoint, autoencoder_checkpoint, and "
            "datamodule=."
        )
        raise ValueError(msg)
    if eval_mode == "latent" and resolved_path == EVAL_PATH_AMBIENT_EPD:
        msg = (
            "eval.mode=latent but the resolved eval path is "
            f"{resolved_path!r}. Latent-space eval requires a processor-only "
            "checkpoint paired with an EncodedBatch (cached_latents) "
            "datamodule. Use datamodule=cached_latents and remove "
            "autoencoder_checkpoint=, or switch to eval.mode=ambient/auto."
        )
        raise ValueError(msg)


def _try_build_decode_fn(
    cfg: DictConfig,
) -> "tuple[Any, Any] | tuple[None, None]":

@@ -1180,11 +1303,13 @@ def run_evaluation(cfg: DictConfig, work_dir: Path | None = None) -> None:  # no

    eval_batch_size: int = eval_cfg.get("batch_size", 1)
    max_test_batches = eval_cfg.get("max_test_batches")
    max_rollout_batches = _resolve_rollout_batch_limit(eval_cfg)
    eval_mode = _normalize_eval_mode(eval_cfg.get("mode", "auto"))
    log.info(
        "Batch limits: max_test_batches=%s, max_rollout_batches=%s",
        max_test_batches,
        max_rollout_batches,
    )
    log.info("eval.mode=%s", eval_mode)

    checkpoint_path = resolve_checkpoint_path(
        eval_cfg,

@@ -1220,6 +1345,19 @@ def run_evaluation(cfg: DictConfig, work_dir: Path | None = None) -> None:  # no

    # Setup datamodule and resolve config
    datamodule, cfg, stats = setup_datamodule(cfg)

    # If the user asked for ambient eval but the resolved datamodule yields
    # EncodedBatch (cached_latents), substitute the raw-data datamodule stored
    # in the cache dir's autoencoder_config.yaml and rebuild. Honors an
    # explicit `datamodule=...` override implicitly: when the override targets
    # a raw-Batch datamodule the swap becomes a no-op.
    cfg = _maybe_swap_to_ambient_datamodule(
        cfg,
        eval_mode=eval_mode,
        example_batch=stats.get("example_batch"),
    )
    if eval_mode == "ambient" and isinstance(stats.get("example_batch"), EncodedBatch):
        datamodule, cfg, stats = setup_datamodule(cfg)

    # Override model n_members from eval config if specified
    if "n_members" in eval_cfg:
        with open_dict(cfg.model):

@@ -1318,6 +1456,18 @@ def run_evaluation(cfg: DictConfig, work_dir: Path | None = None) -> None:  # no

        )
        raise RuntimeError(msg)

    resolved_eval_path = _resolve_eval_path(
        processor_only=processor_only,
        example_batch=example_batch,
        has_autoencoder_checkpoint=bool(cfg.get("autoencoder_checkpoint")),
        decode_fn_loaded=decode_fn is not None,
    )
    log.info("Resolved eval path: %s", resolved_eval_path)
    _validate_resolved_eval_path(
        eval_mode=eval_mode,
        resolved_path=resolved_eval_path,
    )

    # Get eval parameters from config
    metrics_list = eval_cfg.get("metrics", DEFAULT_EVAL_METRICS)
    batch_indices = eval_cfg.get("batch_indices", [])
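The branch selection above can be summarized as a small truth table. The sketch below is a simplified, self-contained mirror of `_resolve_eval_path` for illustration; the real helper lives in `autocast.scripts.eval.encoder_processor_decoder` and checks `isinstance(example_batch, Batch)` rather than taking a boolean flag:

```python
EVAL_PATH_AMBIENT_EPD = "ambient_epd"
EVAL_PATH_LATENT_CACHED_WITH_DECODER = "latent_cached_with_decoder"
EVAL_PATH_LATENT_CACHED_LATENT_ONLY = "latent_cached_latent_only"

def resolve_eval_path(*, processor_only, batch_is_raw, has_ae_ckpt, decoder_loaded):
    if not processor_only:
        return EVAL_PATH_AMBIENT_EPD  # full EPD checkpoint: always ambient
    if batch_is_raw and has_ae_ckpt:
        return EVAL_PATH_AMBIENT_EPD  # processor + AE checkpoint on raw batches
    if decoder_loaded:
        return EVAL_PATH_LATENT_CACHED_WITH_DECODER  # decode only for metrics
    return EVAL_PATH_LATENT_CACHED_LATENT_ONLY  # metrics stay in latent space

# A processor-only checkpoint on cached latents with a saved decoder:
print(resolve_eval_path(
    processor_only=True, batch_is_raw=False,
    has_ae_ckpt=False, decoder_loaded=True,
))  # -> latent_cached_with_decoder
```

With this table in hand, `_validate_resolved_eval_path` is just a consistency check: `ambient` must land on the first label, `latent` must not.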

tests/models/test_encoder_processor_decoder.py

Lines changed: 83 additions & 0 deletions
@@ -265,3 +265,86 @@ def test_encoder_processor_decoder_rollout_handles_short_trajectory(

    # Ground truth only for windows where data was available
    assert gts is not None
    assert gts.shape == (batch_size, expected_gt_windows * n_steps_output, 32, 32, 1)


class CountingPermuteConcat(PermuteConcat):
    """PermuteConcat encoder that tracks how many times ``encode`` is called."""

    def __init__(
        self, in_channels: int, n_steps_input: int, with_constants: bool = False
    ) -> None:
        super().__init__(
            in_channels=in_channels,
            n_steps_input=n_steps_input,
            with_constants=with_constants,
        )
        self.encode_calls = 0

    def encode(self, batch: Batch) -> Tensor:  # type: ignore[override]
        self.encode_calls += 1
        return super().encode(batch)


def test_encoder_processor_decoder_rollout_re_encodes_each_step(make_toy_batch):
    """Ambient rollout must re-invoke the encoder at every rollout step.

    This is the invariant the whole ``eval.mode=ambient`` path rests on: in
    ambient rollout each step decodes the prediction and re-encodes it as
    the next input, so decode/encode drift accumulates. If a future refactor
    ever collapsed this into a latent-only loop, latent and ambient eval
    would silently report the same numbers and ambient-vs-latent ablations
    would be meaningless. This test pins the contract.
    """
    max_rollout_steps = 3
    n_steps_input = 2
    n_steps_output = 2
    stride = 2
    batch_size = 2
    trajectory_length = 20

    batch = make_toy_batch(
        batch_size=batch_size,
        t_in=n_steps_input,
        t_out=trajectory_length - n_steps_input,
    )
    output_channels = batch.output_fields.shape[-1]
    merged_input_channels = output_channels * n_steps_input
    merged_output_channels = output_channels * n_steps_output

    encoder = CountingPermuteConcat(
        in_channels=output_channels,
        n_steps_input=n_steps_input,
        with_constants=False,
    )
    decoder = ChannelsLast(output_channels=output_channels, time_steps=n_steps_output)
    loss = nn.MSELoss()
    encoder_decoder = EncoderDecoder(encoder=encoder, decoder=decoder, loss_func=loss)
    processor = TinyProcessor(
        in_channels=merged_input_channels, out_channels=merged_output_channels
    )
    model = EncoderProcessorDecoder(
        encoder_decoder=encoder_decoder,
        processor=processor,
        loss_func=loss,
        optimizer_config=get_optimizer_config(),
        stride=stride,
        max_rollout_steps=max_rollout_steps,
    )
    model.eval()

    calls_before = encoder.encode_calls
    preds, _ = model.rollout(
        batch,
        stride=stride,
        max_rollout_steps=max_rollout_steps,
        free_running_only=True,
    )
    calls_during = encoder.encode_calls - calls_before

    assert calls_during >= max_rollout_steps, (
        "Ambient rollout must invoke the encoder at least once per rollout "
        f"step; got {calls_during} encode calls for "
        f"{max_rollout_steps} rollout steps."
    )
    assert preds.shape[0] == batch_size
    assert preds.shape[1] == max_rollout_steps * n_steps_output
