Updating Hydra config further to support flow matching and diffusion

sgreenbury · sgreenbury · commit a47286608eb5 · 2025-12-11T16:31:31.000Z
diff --git a/configs/decoder/identity.yaml b/configs/decoder/identity.yaml
@@ -0,0 +1 @@
+_target_: autocast.decoders.identity.IdentityDecoder
diff --git a/configs/encoder/identity.yaml b/configs/encoder/identity.yaml
@@ -0,0 +1 @@
+_target_: autocast.encoders.identity.IdentityEncoder
diff --git a/configs/model/encoder_processor_decoder.yaml b/configs/model/encoder_processor_decoder.yaml
@@ -0,0 +1,12 @@
+defaults:
+  - /encoder: identity
+  - /decoder: identity
+  - /processor: flow_matching
+  - _self_
+
+learning_rate: 0.001
+train_processor_only: false
+teacher_forcing_ratio: 0.5
+max_rollout_steps: 10
+loss_func:
+  _target_: torch.nn.MSELoss
diff --git a/configs/processor.yaml b/configs/processor.yaml
@@ -1,8 +1,6 @@
 defaults:
   - data: reaction_diffusion
-  - encoder: permute_concat
-  - decoder: channels_last
-  - processor: flow_matching
+  - model: encoder_processor_decoder
   - trainer: default
   - logging: wandb
   - _self_
@@ -14,18 +12,12 @@ output:
   save_config: true
 
 training:
-  n_steps_input: 4
+  n_steps_input: 1
   n_steps_output: 4
   stride: 4
   autoencoder_checkpoint: null
   freeze_autoencoder: false
 
-encoder_processor_decoder:
-  learning_rate: 0.001
-  train_processor_only: false
-  loss_func:
-    _target_: torch.nn.MSELoss
-
 hydra:
   run:
     dir: outputs/${experiment_name}/${now:%Y-%m-%d_%H-%M-%S}
diff --git a/src/autocast/eval/processor.py b/src/autocast/eval/processor.py
@@ -30,6 +30,7 @@
 )
 from autocast.models.encoder_decoder import EncoderDecoder
 from autocast.models.encoder_processor_decoder import EncoderProcessorDecoder
+from autocast.processors.utils import initialize_flow_matching_backbone
 from autocast.train.configuration import (
     compose_training_config,
     configure_module_dimensions,
@@ -330,17 +331,28 @@ def _load_state_dict(checkpoint_path: Path) -> OrderedDict[str, torch.Tensor]:
     return state_dict
 
 
-def _load_model(cfg: DictConfig, checkpoint_path: Path) -> EncoderProcessorDecoder:
-    encoder = instantiate(cfg.encoder)
-    decoder = instantiate(cfg.decoder)
+def _load_model(
+    cfg: DictConfig,
+    checkpoint_path: Path,
+    n_steps_input: int,
+    channel_count: int,
+    spatial_shape: Sequence[int],
+) -> EncoderProcessorDecoder:
+    model_cfg = cfg.get("model") or cfg
+    encoder = instantiate(model_cfg.encoder)
+    decoder = instantiate(model_cfg.decoder)
     encoder_decoder = EncoderDecoder(encoder=encoder, decoder=decoder)
-    processor = instantiate(cfg.processor)
-    epd_cfg = cfg.get("encoder_processor_decoder") or {}
+    processor = instantiate(model_cfg.processor)
+    initialize_flow_matching_backbone(
+        processor,
+        n_steps_input,
+        channel_count,
+        spatial_shape,
+    )
+    epd_cfg = model_cfg
     learning_rate = epd_cfg.get("learning_rate", 1e-3)
-    training_cfg = cfg.get("training")
-    stride = 1
-    if isinstance(training_cfg, DictConfig):
-        stride = training_cfg.get("stride", 1)
+    training_cfg = cfg.get("training") or {}
+    stride = training_cfg.get("stride", 1)
     teacher_forcing_ratio = epd_cfg.get("teacher_forcing_ratio", 0.5)
     max_rollout_steps = epd_cfg.get("max_rollout_steps", 10)
     loss_cfg = epd_cfg.get("loss_func")
@@ -495,8 +507,8 @@ def main() -> None:
         channel_count,
         inferred_n_steps_input,
         inferred_n_steps_output,
-        _,
-        _,
+        input_shape,
+        output_shape,
     ) = prepare_datamodule(cfg)
 
     configure_module_dimensions(
@@ -509,7 +521,14 @@ def main() -> None:
 
     metrics = _build_metrics(args.metrics or ("mse", "rmse"))
 
-    model = _load_model(cfg, args.checkpoint)
+    spatial_shape = tuple(input_shape[2:-1])
+    model = _load_model(
+        cfg,
+        args.checkpoint,
+        inferred_n_steps_input,
+        channel_count,
+        spatial_shape,
+    )
     device = _resolve_device(args.device)
     model.to(device)
 
diff --git a/src/autocast/processors/flow_matching.py b/src/autocast/processors/flow_matching.py
@@ -3,6 +3,7 @@
 from typing import Any
 
 import torch
+from omegaconf import DictConfig, OmegaConf
 from torch import nn
 
 from autocast.nn.unet import TemporalUNetBackbone
@@ -28,7 +29,7 @@ def __init__(
         flow_ode_steps: int = 1,
         n_steps_output: int = 4,
         n_channels_out: int = 1,
-        backbone_kwargs: dict[str, Any] | None = None,
+        backbone_kwargs: dict[str, Any] | DictConfig | None = None,
         **kwargs: Any,
     ) -> None:
         # Store core hyperparameters and optional prebuilt backbone.
@@ -46,7 +47,19 @@ def __init__(
         self.flow_ode_steps = max(flow_ode_steps, 1)
         self.n_steps_output = n_steps_output
         self.n_channels_out = n_channels_out
-        self.backbone_kwargs = backbone_kwargs or {}
+        processed_kwargs: dict[str, Any] = {}
+        raw_kwargs: Any | None
+        if isinstance(backbone_kwargs, DictConfig):
+            raw_kwargs = OmegaConf.to_container(backbone_kwargs, resolve=True)
+        else:
+            raw_kwargs = backbone_kwargs
+        if isinstance(raw_kwargs, dict):
+            processed_kwargs = {str(k): v for k, v in raw_kwargs.items()}
+            for field in ("hid_channels", "hid_blocks"):
+                value = processed_kwargs.get(field)
+                if isinstance(value, list):
+                    processed_kwargs[field] = tuple(value)
+        self.backbone_kwargs = processed_kwargs
 
     def _maybe_build_backbone(self, x: Tensor) -> None:
         """Lazily build TemporalUNetBackbone when no model is provided."""
diff --git a/src/autocast/processors/utils.py b/src/autocast/processors/utils.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import torch
+
+
+def initialize_flow_matching_backbone(
+    processor,
+    n_steps_input: int | None,
+    channel_count: int | None,
+    spatial_shape: Sequence[int] | None,
+) -> None:
+    """Instantiate the flow-matching backbone before optimizers are created."""
+    builder = getattr(processor, "_maybe_build_backbone", None)
+    has_model = getattr(processor, "flow_matching_model", None) is not None
+    if builder is None or has_model:
+        return
+    if n_steps_input is None or channel_count is None:
+        return
+    spatial = tuple(spatial_shape) if spatial_shape is not None else ()
+    dummy = torch.zeros(
+        (1, n_steps_input, *spatial, channel_count), dtype=torch.float32
+    )
+    builder(dummy)
diff --git a/src/autocast/train/configuration.py b/src/autocast/train/configuration.py
@@ -65,16 +65,26 @@ def _maybe_set(cfg_node: DictConfig | None, key: str, value: int) -> None:
         cfg_node[key] = value
 
 
+def _model_cfg(cfg: DictConfig) -> DictConfig:
+    """Return the nested model config when present, else the root config."""
+    model_cfg = cfg.get("model")
+    if isinstance(model_cfg, DictConfig):
+        return model_cfg
+    return cfg
+
+
 def configure_module_dimensions(
     cfg: DictConfig,
     channel_count: int,
     n_steps_input: int,
     n_steps_output: int,
 ) -> None:
     """Populate missing dimension hints for encoder/decoder/processor modules."""
-    _maybe_set(cfg.decoder, "output_channels", channel_count)
-    _maybe_set(cfg.decoder, "time_steps", n_steps_output)
-    processor_cfg = cfg.get("processor")
+    model_cfg = _model_cfg(cfg)
+    decoder_cfg = model_cfg.get("decoder")
+    _maybe_set(decoder_cfg, "output_channels", channel_count)
+    _maybe_set(decoder_cfg, "time_steps", n_steps_output)
+    processor_cfg = model_cfg.get("processor")
     _maybe_set(processor_cfg, "in_channels", channel_count * n_steps_input)
     _maybe_set(processor_cfg, "out_channels", channel_count * n_steps_output)
     _maybe_set(processor_cfg, "n_steps_output", n_steps_output)
@@ -88,7 +98,7 @@ def configure_module_dimensions(
 
 def normalize_processor_cfg(cfg: DictConfig) -> None:
     """Force config values into the shapes expected by processor classes."""
-    processor_cfg = cfg.get("processor")
+    processor_cfg = _model_cfg(cfg).get("processor")
     if processor_cfg is None:
         return
     tuple_fields = ("n_modes",)
diff --git a/src/autocast/train/processor.py b/src/autocast/train/processor.py
@@ -17,6 +17,7 @@
 from autocast.models.ae import AE, AELoss
 from autocast.models.encoder_decoder import EncoderDecoder
 from autocast.models.encoder_processor_decoder import EncoderProcessorDecoder
+from autocast.processors.utils import initialize_flow_matching_backbone
 from autocast.train.configuration import (
     compose_training_config,
     configure_module_dimensions,
@@ -165,7 +166,7 @@ def instantiate_trainer(
     )
 
 
-def main() -> None:
+def main() -> None:  # noqa: PLR0915
     """CLI entrypoint for training the processor."""
     args = parse_args()
     logging.basicConfig(level=logging.INFO)
@@ -175,6 +176,7 @@ def main() -> None:
 
     cfg = compose_training_config(args)
     resolved_cfg = OmegaConf.to_container(cfg, resolve=True)
+    model_cfg = cfg.get("model") or cfg
     wandb_logger, watch_cfg = create_wandb_logger(
         cfg.get("logging"),
         experiment_name=cfg.get("experiment_name", "processor"),
@@ -225,8 +227,8 @@ def main() -> None:
     normalize_processor_cfg(cfg)
 
     encoder, decoder = build_autoencoder_modules(
-        cfg.encoder,
-        cfg.decoder,
+        model_cfg.encoder,
+        model_cfg.decoder,
         training_params.autoencoder_checkpoint,
     )
     encoder_decoder = EncoderDecoder(encoder=encoder, decoder=decoder)
@@ -236,17 +238,33 @@ def main() -> None:
         _freeze_module(encoder_decoder.encoder)
         _freeze_module(encoder_decoder.decoder)
 
-    processor = instantiate(cfg.processor)
+    processor = instantiate(model_cfg.processor)
+    spatial_shape = tuple(input_shape[2:-1])
+    initialize_flow_matching_backbone(
+        processor,
+        inferred_n_steps_input,
+        channel_count,
+        spatial_shape,
+    )
 
-    epd_cfg = cfg.get("encoder_processor_decoder")
-    learning_rate = epd_cfg.get("learning_rate", 1e-3) if epd_cfg is not None else 1e-3
-    loss_cfg = epd_cfg.get("loss_func") if epd_cfg is not None else None
+    epd_cfg = model_cfg
+    learning_rate = epd_cfg.get("learning_rate", 1e-3)
+    train_processor_only = epd_cfg.get("train_processor_only", False)
+    teacher_forcing_ratio = epd_cfg.get("teacher_forcing_ratio", 0.5)
+    max_rollout_steps = epd_cfg.get("max_rollout_steps", 10)
+    loss_cfg = epd_cfg.get("loss_func")
     loss_func = instantiate(loss_cfg) if loss_cfg is not None else nn.MSELoss()
+    training_cfg = cfg.get("training") or {}
+    stride = training_cfg.get("stride", 1)
 
     model = EncoderProcessorDecoder(
         encoder_decoder=encoder_decoder,
         processor=processor,
         learning_rate=learning_rate,
+        train_processor_only=train_processor_only,
+        stride=stride,
+        teacher_forcing_ratio=teacher_forcing_ratio,
+        max_rollout_steps=max_rollout_steps,
         loss_func=loss_func,
     )
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -0,0 +1 @@` |
| | `1` | `+_target_: autocast.decoders.identity.IdentityDecoder` |
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -0,0 +1 @@` |
| | `1` | `+_target_: autocast.encoders.identity.IdentityEncoder` |