Allow config-driven creation of Imagen from the start; also, the researcher must now always pass an explicit list of image resolutions (`image_sizes`).

lucidrains · lucidrains · commit 2e2f87a66e9b · 2022-05-25T10:43:06.000-07:00
diff --git a/imagen_pytorch/configs.py b/imagen_pytorch/configs.py
@@ -0,0 +1,55 @@
+import json
+from pydantic import BaseModel, validator, root_validator
+from typing import List, Iterable, Optional, Union, Tuple, Dict, Any
+from imagen_pytorch.imagen_pytorch import Imagen, Unet
+
+# helper functions
+
+def exists(val):  # True for every value except None (0, '' and [] all count as existing)
+    return not (val is None)
+
+def default(val, d):  # fall back to d only when val is None; d is returned as-is, never called
+    return d if val is None else val
+
+def ListOrTuple(inner_type):  # annotation factory: a list, or a tuple of any length, of inner_type
+    return Union[List[inner_type], Tuple[inner_type, ...]]  # a bare Tuple[T] would mean exactly one element
+
+# imagen pydantic classes
+
+class UnetConfig(BaseModel):  # pydantic schema for the keyword arguments of a single Unet
+    dim: int
+    dim_mults: ListOrTuple(int)
+    text_embed_dim: int = 1024
+    cond_dim: Optional[int] = None  # None is the default, so the annotation has to admit it
+    channels: int = 3
+    attn_dim_head: int = 32
+    attn_heads: int = 16
+
+    class Config:
+        extra = "allow"  # keys not declared above are kept on the model rather than rejected
+
+class ImagenConfig(BaseModel):  # pydantic schema for the arguments create() hands to Imagen
+    unets: ListOrTuple(UnetConfig)  # declared before image_sizes so its validator can read it from `values`
+    image_sizes: ListOrTuple(int)
+    channels: int = 3
+    timesteps: int = 1000
+    loss_type: str = 'l2'
+    beta_schedule: str = 'cosine'
+    learned_variance: bool = True
+    cond_drop_prob: float = 0.5
+
+    @validator('image_sizes')
+    def check_image_sizes(cls, image_sizes, values):
+        unets = values.get('unets')  # None when `unets` failed its own validation; that error is already reported
+        if exists(unets) and len(image_sizes) != len(unets):
+            raise ValueError(f'image sizes length {len(image_sizes)} must be equivalent to the number of unets {len(unets)}')
+        return image_sizes
+
+    def create(self):
+        # .dict() also turns the nested unet configs into plain dicts, ready to be splatted into Unet
+        decoder_kwargs = self.dict()
+        unets = [Unet(**config) for config in decoder_kwargs.pop('unets')]
+        return Imagen(unets, **decoder_kwargs)
+
+    class Config:
+        extra = "allow"  # keys not declared above are kept on the model rather than rejected
diff --git a/imagen_pytorch/imagen_pytorch.py b/imagen_pytorch/imagen_pytorch.py
@@ -620,7 +620,7 @@ def __init__(
         self,
         dim,
         *,
-        image_embed_dim = None,
+        image_embed_dim = 1024,
         text_embed_dim = 512,
         cond_dim = None,
         num_image_tokens = 4,
@@ -940,14 +940,13 @@ def __init__(
         self,
         unets,
         *,
+        image_sizes,                                # for cascading ddpm, image size at each stage
         text_encoder_name = 't5-small',
-        image_size = None,
         channels = 3,
         timesteps = 1000,
         cond_drop_prob = 0.1,
         loss_type = 'l2',
         beta_schedule = 'cosine',
-        image_sizes = None,                         # for cascading ddpm, image size at each stage
         random_crop_sizes = None,                   # whether to random crop the image at that stage in the cascade (super resoluting convolutions at the end may be able to generalize on smaller crops)
         lowres_sample_noise_level = 0.2,            # in the paper, they present a new trick where they noise the lowres conditioning image, and at sample time, fix it to a certain level (0.1 or 0.3) - the unets are also made to be conditioned on this noise level
         condition_on_text = True,
@@ -966,14 +965,6 @@ def __init__(
         self.condition_on_text = condition_on_text
         self.unconditional = not condition_on_text
 
-        # determine image size, with image_size and image_sizes taking precedence
-
-        if exists(image_size) or exists(image_sizes):
-            assert exists(image_size) ^ exists(image_sizes), 'only one of image_size or image_sizes must be given'
-            image_size = default(image_size, lambda: image_sizes[-1])
-        else:
-            raise Error('either image_size or image sizes must be given to imagen')
-
         # channels
 
         self.channels = channels
@@ -1016,11 +1007,8 @@ def __init__(
 
         # unet image sizes
 
-        image_sizes = default(image_sizes, (image_size,))
-        image_sizes = tuple(sorted(set(image_sizes)))
-
         assert len(self.unets) == len(image_sizes), f'you did not supply the correct number of u-nets ({len(self.unets)}) for resolutions {image_sizes}'
-        self.image_sizes = image_sizes
+        self.image_sizes = cast_tuple(image_sizes)
         self.sample_channels = cast_tuple(self.channels, len(image_sizes))
 
         # random crop sizes (for super-resoluting unets at the end of cascade?)
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'imagen-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.12',
+  version = '0.0.15',
   license='MIT',
   description = 'Imagen - unprecedented photorealism × deep level of language understanding',
   author = 'Phil Wang',
@@ -22,6 +22,7 @@
     'einops-exts',
     'kornia',
     'numpy',
+    'pydantic',
     'resize-right',
     'torch>=1.6',
     'torchvision',