Merge pull request #349 from alan-turing-institute/add-render-rollout-snapshot

sgreenbury · web-flow · commit dd64554297de · 2026-04-25T15:40:16.000+01:00
Add rollout snapshot rendering to eval
diff --git a/src/autocast/configs/eval/README.md b/src/autocast/configs/eval/README.md
@@ -62,6 +62,11 @@ All eval configs support these parameters:
 - `video_format`: Video format (mp4 or gif)
 - `video_sample_index`: Sample index within batch to visualize
 - `fps`: Frames per second for videos
+- `save_rollout_snapshots`: Save still rollout panels from raw tensors
+- `rollout_snapshot_timesteps`: Timestep indices shown in each still panel
+- `rollout_snapshot_channels`: Channel indices to render (`null` means all)
+- `rollout_snapshot_dir`: Custom snapshot directory (default:
+  work_dir/videos/snapshots)
 - `accelerator`: Accelerator for evaluation (auto, cpu, cuda, mps)
 - `devices`: Number of GPUs for DDP evaluation (default: 1; set explicitly,
   e.g. 4, for multi-GPU runs)
diff --git a/src/autocast/configs/eval/default.yaml b/src/autocast/configs/eval/default.yaml
@@ -54,6 +54,14 @@ video_format: mp4  # gif or mp4
 video_sample_index: 0
 fps: 5
 
+# Rollout snapshot settings. When enabled, eval saves still panels for the
+# requested rollout samples using raw tensors rather than extracting MP4 frames.
+save_rollout_snapshots: false
+rollout_snapshot_dir: null  # default: <video_dir>/snapshots
+rollout_snapshot_timesteps: [0, 4, 12, 30, 99]
+rollout_snapshot_channels: null  # null means all channels
+rollout_snapshot_format: png
+
 # Accelerator for evaluation (mirrors Lightning Fabric API)
 accelerator: auto  # auto, cpu, cuda, or mps
 devices: 1 # default single-GPU eval; override with int (e.g. 4) for multi-GPU DDP
diff --git a/src/autocast/configs/eval/encoder_processor_decoder.yaml b/src/autocast/configs/eval/encoder_processor_decoder.yaml
@@ -59,4 +59,5 @@ compute_rollout_metrics: true
 metric_windows: [null]
 metric_windows_rollout: [[0, 1], [0, 4], [6, 12], [13, 30], [31, 99]]
 batch_indices: [0, 1, 2, 3, 4, 5, 6, 7]
-preserve_aspect: true
+preserve_aspect: true
+save_rollout_snapshots: true
diff --git a/src/autocast/scripts/eval/encoder_processor_decoder.py b/src/autocast/scripts/eval/encoder_processor_decoder.py
@@ -75,7 +75,7 @@
 from autocast.scripts.training import apply_float32_matmul_precision
 from autocast.scripts.utils import get_default_config_path
 from autocast.types.batch import Batch, EncodedBatch
-from autocast.utils import plot_spatiotemporal_video
+from autocast.utils import plot_spatiotemporal_snapshots, plot_spatiotemporal_video
 from autocast.utils.plots import (
     compute_metrics_from_dataloader,
     compute_metrics_per_timestep_from_dataloader,
@@ -298,6 +298,13 @@ def _resolve_video_dir(eval_cfg: DictConfig, work_dir: Path) -> Path:
     return (work_dir / "videos").resolve()
 
 
+def _resolve_rollout_snapshot_dir(eval_cfg: DictConfig, video_dir: Path) -> Path:
+    snapshot_dir = eval_cfg.get("rollout_snapshot_dir")
+    if snapshot_dir is not None:
+        return Path(snapshot_dir).expanduser().resolve()
+    return (video_dir / "snapshots").resolve()
+
+
 def _unwrap_module(module: Any) -> Any:
     """Return the underlying model when wrapped by Fabric/DDP-style wrappers."""
     unwrapped = module
@@ -500,6 +507,125 @@ def _collect_rollout_sample_targets_for_batch(
     return sample_targets
 
 
+def _select_snapshot_timesteps(
+    timesteps: Sequence[int] | None,
+    n_timesteps: int,
+) -> list[int]:
+    if not timesteps:
+        return []
+
+    selected: list[int] = []
+    invalid: list[int] = []
+    seen: set[int] = set()
+    for raw_timestep in timesteps:
+        timestep = int(raw_timestep)
+        if 0 <= timestep < n_timesteps:
+            if timestep not in seen:
+                selected.append(timestep)
+                seen.add(timestep)
+        else:
+            invalid.append(timestep)
+
+    if invalid:
+        log.warning(
+            "Ignoring rollout snapshot timestep(s) outside [0, %s]: %s",
+            n_timesteps - 1,
+            invalid,
+        )
+    return selected
+
+
+def _select_snapshot_channels(
+    channels: Sequence[int] | None,
+    n_channels: int,
+) -> list[int]:
+    if channels is None:
+        return list(range(n_channels))
+
+    selected: list[int] = []
+    invalid: list[int] = []
+    seen: set[int] = set()
+    for raw_channel in channels:
+        channel = int(raw_channel)
+        if 0 <= channel < n_channels:
+            if channel not in seen:
+                selected.append(channel)
+                seen.add(channel)
+        else:
+            invalid.append(channel)
+
+    if invalid:
+        log.warning(
+            "Ignoring rollout snapshot channel(s) outside [0, %s]: %s",
+            n_channels - 1,
+            invalid,
+        )
+    return selected
+
+
+def _prepare_rollout_snapshot_plan(
+    *,
+    snapshot_dir: Path | None,
+    snapshot_timesteps: Sequence[int] | None,
+    snapshot_channels: Sequence[int] | None,
+    n_timesteps: int,
+    n_channels: int,
+) -> tuple[Path | None, list[int], list[int]]:
+    if snapshot_dir is None:
+        return None, [], []
+
+    selected_timesteps = _select_snapshot_timesteps(snapshot_timesteps, n_timesteps)
+    if not selected_timesteps:
+        return None, [], []
+
+    selected_channels = _select_snapshot_channels(snapshot_channels, n_channels)
+    if not selected_channels:
+        return None, [], []
+
+    snapshot_dir.mkdir(parents=True, exist_ok=True)
+    return snapshot_dir, selected_timesteps, selected_channels
+
+
+def _save_rollout_snapshot_panels(
+    *,
+    trues_mean: torch.Tensor,
+    preds_mean: torch.Tensor,
+    preds_uq: torch.Tensor | None,
+    local_idx: int,
+    target_idx: int,
+    snapshot_dir: Path,
+    snapshot_timesteps: Sequence[int],
+    snapshot_channels: Sequence[int],
+    snapshot_ext: str,
+    saved_paths: list[Path],
+    names_for_plot: list[str] | None,
+    preserve_aspect: bool,
+) -> None:
+    for channel_idx in snapshot_channels:
+        snapshot_filename = snapshot_dir / (
+            f"batch_{target_idx}_sample_{local_idx}_"
+            f"channel_{channel_idx}_snapshots.{snapshot_ext}"
+        )
+        plot_spatiotemporal_snapshots(
+            true=trues_mean[local_idx : local_idx + 1].cpu(),
+            pred=preds_mean[local_idx : local_idx + 1].cpu(),
+            pred_uq=(
+                preds_uq[local_idx : local_idx + 1].cpu()
+                if preds_uq is not None
+                else None
+            ),
+            timesteps=snapshot_timesteps,
+            channel=channel_idx,
+            batch_idx=0,
+            save_path=str(snapshot_filename),
+            title="Rollout snapshots",
+            channel_names=names_for_plot,
+            preserve_aspect=preserve_aspect,
+        )
+        saved_paths.append(snapshot_filename)
+        log.info("Saved rollout snapshot panel to %s", snapshot_filename)
+
+
 def _render_rollouts(  # noqa: PLR0912
     model: (
         EncoderProcessorDecoder
@@ -520,6 +646,10 @@ def _render_rollouts(  # noqa: PLR0912
     channel_names: list[str] | None = None,
     preserve_aspect: bool = False,
     decode_fn: Callable | None = None,
+    snapshot_timesteps: Sequence[int] | None = None,
+    snapshot_dir: Path | None = None,
+    snapshot_format: str = "png",
+    snapshot_channels: Sequence[int] | None = None,
 ) -> list[Path]:
     # Return early if no rollout indices are requested
     if not batch_indices:
@@ -533,6 +663,7 @@ def _render_rollouts(  # noqa: PLR0912
     rendered_targets: set[int] = set()
     global_sample_offset = 0
     video_dir.mkdir(parents=True, exist_ok=True)
+    snapshot_ext = snapshot_format.removeprefix(".") or "png"
 
     # Perform rollouts and save videos for requested target indices.
     with torch.no_grad():
@@ -599,6 +730,17 @@ def _render_rollouts(  # noqa: PLR0912
                 sample_index=sample_index,
                 global_sample_offset=global_sample_offset,
             )
+            (
+                snapshot_dir_for_batch,
+                snapshot_timesteps_for_batch,
+                snapshot_channels_for_batch,
+            ) = _prepare_rollout_snapshot_plan(
+                snapshot_dir=snapshot_dir,
+                snapshot_timesteps=snapshot_timesteps,
+                snapshot_channels=snapshot_channels,
+                n_timesteps=int(trues_mean.shape[1]),
+                n_channels=n_channels,
+            )
 
             for target_idx, local_idx in sample_targets.items():
                 if target_idx in rendered_targets:
@@ -627,6 +769,22 @@ def _render_rollouts(  # noqa: PLR0912
                 rendered_targets.add(target_idx)
                 log.info("Saved rollout visualization to %s", filename)
 
+                if snapshot_dir_for_batch is not None:
+                    _save_rollout_snapshot_panels(
+                        trues_mean=trues_mean,
+                        preds_mean=preds_mean,
+                        preds_uq=preds_uq,
+                        local_idx=local_idx,
+                        target_idx=target_idx,
+                        snapshot_dir=snapshot_dir_for_batch,
+                        snapshot_timesteps=snapshot_timesteps_for_batch,
+                        snapshot_channels=snapshot_channels_for_batch,
+                        snapshot_ext=snapshot_ext,
+                        saved_paths=saved_paths,
+                        names_for_plot=names_for_plot,
+                        preserve_aspect=preserve_aspect,
+                    )
+
             global_sample_offset += batch_size
 
     # Check for any missing rollout sample indices that were requested
@@ -1920,6 +2078,22 @@ def coverage_factory() -> Metric:
                 rollout_test_loader,
                 max_rollout_batches,
             )
+            save_rollout_snapshots = bool(eval_cfg.get("save_rollout_snapshots", False))
+            rollout_snapshot_dir = (
+                _resolve_rollout_snapshot_dir(eval_cfg, video_dir)
+                if save_rollout_snapshots
+                else None
+            )
+            rollout_snapshot_timesteps = (
+                eval_cfg.get("rollout_snapshot_timesteps", [])
+                if save_rollout_snapshots
+                else None
+            )
+            rollout_snapshot_channels = (
+                eval_cfg.get("rollout_snapshot_channels", None)
+                if save_rollout_snapshots
+                else None
+            )
             _render_rollouts(
                 model,  # pyright: ignore[reportArgumentType]
                 rollout_loader,
@@ -1935,6 +2109,10 @@ def coverage_factory() -> Metric:
                 channel_names=rollout_channel_names,
                 preserve_aspect=eval_cfg.get("preserve_aspect", False),
                 decode_fn=decode_fn,
+                snapshot_timesteps=rollout_snapshot_timesteps,
+                snapshot_dir=rollout_snapshot_dir,
+                snapshot_format=eval_cfg.get("rollout_snapshot_format", "png"),
+                snapshot_channels=rollout_snapshot_channels,
             )
 
         # Prepare metric functions for rollouts
diff --git a/src/autocast/utils/__init__.py b/src/autocast/utils/__init__.py
@@ -1,4 +1,8 @@
 from .optimizer import get_optimizer_config
-from .plots import plot_spatiotemporal_video
+from .plots import plot_spatiotemporal_snapshots, plot_spatiotemporal_video
 
-__all__ = ["get_optimizer_config", "plot_spatiotemporal_video"]
+__all__ = [
+    "get_optimizer_config",
+    "plot_spatiotemporal_snapshots",
+    "plot_spatiotemporal_video",
+]
diff --git a/src/autocast/utils/plots.py b/src/autocast/utils/plots.py