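Bump version to 0.0.12; fix guidance cache update (keep the other cache instead of zeroing it); rename in_channels to n_channels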

homerjed · homerjed · commit 81264c27e774 · 2025-07-08T16:48:13.000+02:00
diff --git a/examples/main.py b/examples/main.py
@@ -186,7 +186,7 @@ def get_config(dataset_name: str) -> ConfigDict:
     # Model
     config.model = model = ConfigDict()
     model.img_size             = data.img_size
-    model.in_channels          = data.n_channels
+    model.n_channels          = data.n_channels
     model.patch_size           = 4 
     model.channels             = {"CIFAR10" : 512, "MNIST" : 128, "FLOWERS" : 512}[dataset_name]
     model.y_dim                = {"CIFAR10" : 1, "MNIST" : 1, "FLOWERS" : 1}[dataset_name] 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "transformer-flows"
-version = "0.0.11"
+version = "0.0.12"
 description = "Implementation of Transformer Flows (Apple ML) in JAX and Equinox."
 readme = "README.md"
 authors = [
diff --git a/src/transformer_flows/attention.py b/src/transformer_flows/attention.py
@@ -14,6 +14,8 @@
 
 typecheck = jaxtyped(typechecker=typechecker)
 
+KQVCacheType = Literal["conditional", "unconditional"] # Guidance caches
+
 
 @typecheck
 def standard_attention(
@@ -245,9 +247,9 @@ def _make_autoregressive_cache(**_):
             else:
                 _int = jnp.int32
 
-            # return jnp.empty(key_shape), jnp.empty(value_shape), jnp.zeros((), _int)
             initial_cache = (jnp.empty(key_shape), jnp.empty(value_shape), jnp.zeros((), _int))
-            return dict(uncond=initial_cache, cond=initial_cache)
+
+            return dict(unconditional=initial_cache, conditional=initial_cache)
 
         query_proj_out_size = qk_size
         key_proj_out_size = qk_size
@@ -317,7 +319,7 @@ def __call__(
         *,
         key: Optional[PRNGKeyArray] = None,
         temperature: Optional[float] = 1.,
-        which_cache: Literal["cond", "uncond"],
+        which_cache: KQVCacheType,
         inference: Optional[bool] = None,
         deterministic: Optional[bool] = None,
         process_heads: Optional[
@@ -391,20 +393,23 @@ def __call__(
             causal_mask_offset = index # Offset shifts attention lower-tril
             index = index + kv_seq_length # i -> i + 1, nudging autoregression
 
-            other_cache = "cond" if which_cache == "uncond" else "uncond"
-            empty_cache = jax.tree.map(
-                lambda x: jnp.zeros_like(x), (key_state, value_state, index)
-            )
+            if which_cache == "unconditional":
+                other_cache = "conditional" 
+            else: 
+                other_cache = "unconditional"
+
+            # empty_cache = jax.tree.map(
+            #     lambda x: jnp.zeros_like(x), (key_state, value_state, index)
+            # )
+
             state = state.set(
                 self.autoregressive_index, 
-                {which_cache : (key_state, value_state, index), other_cache : empty_cache}
+                {
+                    which_cache : (key_state, value_state, index), 
+                    other_cache : state.get(self.autoregressive_index)[other_cache] # empty_cache
+                }
             )
 
-            # if sample:
-            #     state = state.set(
-            #         self.autoregressive_index, (key_state, value_state, index)
-            #     )
-
             # The keys and values stack the preceeding keys and values, 
             # key-value sequence length updated; masking adopts this
             key_heads = key_state
diff --git a/src/transformer_flows/transformer_flow.py b/src/transformer_flows/transformer_flow.py
@@ -22,7 +22,7 @@
 import matplotlib.pyplot as plt
 from tqdm.auto import trange
 
-from .attention import MultiheadAttention, self_attention
+from .attention import MultiheadAttention, self_attention, KQVCacheType
 
 
 if os.getenv("TYPECHECK", "").lower() in ["1", "true"]:
@@ -42,8 +42,6 @@
 
 NoiseType = Union[Literal["gaussian", "uniform"], None]
 
-CacheType = Literal["conditional", "unconditional"] # Guidance caches
-
 MaskArray = Union[
     Float[Array, "s s"], Int[Array, "s s"], Bool[Array, "s s"]
 ]
@@ -123,23 +121,27 @@ def clear_and_get_results_dir(
     run_dir: Optional[Path] = None, 
     clear_old: bool = False 
 ) -> Path:
+
     if not exists(run_dir):
         run_dir = Path.cwd()
 
     # Image save directories
     imgs_dir = run_dir / "imgs" / dataset_name.lower()
 
+    # Clear old ones
     if clear_old:
-        rmtree(str(imgs_dir), ignore_errors=True) # Clear old ones
+        rmtree(str(imgs_dir), ignore_errors=True) 
 
     if not imgs_dir.exists():
-
         imgs_dir.mkdir(exist_ok=True, parents=True)
 
-        for _dir in ["samples", "warps", "latents"]:
-            (imgs_dir / _dir).mkdir(exist_ok=True)
+    # Image type directories
+    for _dir in ["samples", "warps", "latents"]:
+        (imgs_dir / _dir).mkdir(exist_ok=True, parents=True)
 
-    return imgs_dir
+    print("Saving samples in:\n\t", imgs_dir)
+
+    return imgs_dir 
 
 
 def count_parameters(model: eqx.Module) -> int:
@@ -289,6 +291,7 @@ def __init__(
         key: PRNGKeyArray
     ):
         key_weight, key_bias = jr.split(key)
+
         l = math.sqrt(1. / in_size)
         dtype = default(dtype, jnp.float32)
 
@@ -432,7 +435,7 @@ def __call__(
         mask: Optional[Union[MaskArray, Literal["causal"]]], 
         state: Optional[eqx.nn.State],
         *,
-        which_cache: CacheType,
+        which_cache: KQVCacheType,
         attention_temperature: Optional[float] = 1.
     ) -> Tuple[
         Float[Array, "#s q"], Optional[eqx.nn.State] # Autoregression
@@ -480,6 +483,7 @@ def __init__(
         key: PRNGKeyArray
     ):
         keys = jr.split(key, 3)
+
         self.y_dim = y_dim
         self.conditioning_type = conditioning_type
 
@@ -575,7 +579,7 @@ def __call__(
         ] = None, 
         state: Optional[eqx.nn.State] = None, # No state during forward pass
         *,
-        which_cache: CacheType = "conditional",
+        which_cache: KQVCacheType = "conditional",
         attention_temperature: Optional[float] = 1.
     ) -> Union[
         Float[Array, "#{self.n_patches} {self.sequence_dim}"],
@@ -613,9 +617,10 @@ def __init__(
         sequence_length: int
     ):
         self.permute = permute # Flip if true else pass
-        assert jnp.isscalar(self.permute)
         self.sequence_length = sequence_length
 
+        assert jnp.isscalar(self.permute)
+
     @property
     def permute_idx(self):
         permute = maybe_stop_grad(self.permute, stop=True)
@@ -786,7 +791,7 @@ def reverse_step(
         s: Int[Array, ""],
         state: eqx.nn.State,
         *,
-        which_cache: CacheType = "conditional",
+        which_cache: KQVCacheType = "conditional",
         attention_temperature: Optional[float] = 1.
     ) -> Tuple[
         Float[Array, "1 {self.sequence_dim}"], 
@@ -847,7 +852,7 @@ def reverse(
         ],
         state: eqx.nn.State, 
         *,
-        which_cache: CacheType = "conditional",
+        which_cache: KQVCacheType = "conditional",
         guidance: float = 0.,
         attention_temperature: Optional[float] = 1.0,
         guide_what: Optional[Literal["ab", "a", "b"]] = "ab",
@@ -942,7 +947,7 @@ class TransformerFlow(eqx.Module):
     @typecheck
     def __init__(
         self,
-        in_channels: int,
+        n_channels: int,
         img_size: int,
         patch_size: int,
         channels: int,
@@ -958,11 +963,11 @@ def __init__(
         key: PRNGKeyArray
     ):
         self.img_size = img_size
-        self.n_channels = in_channels
+        self.n_channels = n_channels
 
         self.patch_size = patch_size
         self.n_patches = int(img_size / patch_size) ** 2
-        self.sequence_dim = in_channels * patch_size ** 2
+        self.sequence_dim = n_channels * patch_size ** 2
         self.n_blocks = n_blocks
 
         self.y_dim = y_dim
@@ -1785,6 +1790,7 @@ def filter_spikes(l: list, loss_max: float = 10.0) -> list[float]:
                     plt.savefig(imgs_dir / "losses.png", bbox_inches="tight")
                     plt.close()
 
-                save_fn(model=ema_model if use_ema else model)
+                if exists(save_fn):
+                    save_fn(model=ema_model if use_ema else model)
 
     return model
diff --git a/uv.lock b/uv.lock