@@ -601,6 +601,7 @@ def __init__(
         *,
         image_embed_dim = 1024,
         text_embed_dim = get_encoded_dim(DEFAULT_T5_NAME),
+        num_resnet_blocks = 1,
         cond_dim = None,
         num_image_tokens = 4,
         num_time_tokens = 2,
@@ -706,6 +707,7 @@ def __init__(

         # resnet block klass

+        num_resnet_blocks = cast_tuple(num_resnet_blocks, len(in_out))
         resnet_groups = cast_tuple(resnet_groups, len(in_out))

         assert len(resnet_groups) == len(in_out)
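
The new `num_resnet_blocks` argument accepts either a single int or a per-stage tuple; `cast_tuple` broadcasts it to one entry per resolution stage (`len(in_out)`), mirroring how `resnet_groups` is handled on the next line. A minimal sketch of the broadcasting behaviour assumed here (the real helper lives elsewhere in the repo and is not part of this diff):

```python
# Minimal sketch of the assumed cast_tuple behaviour, for illustration only.
def cast_tuple(val, length = None):
    # a tuple passes through unchanged; a scalar is repeated `length` times
    if isinstance(val, tuple):
        return val
    return (val,) * (length if length is not None else 1)

# with four resolution stages (len(in_out) == 4):
print(cast_tuple(1, 4))             # (1, 1, 1, 1)   - same count at every stage
print(cast_tuple((2, 4, 8, 8), 4))  # (2, 4, 8, 8)   - a per-stage schedule
```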
@@ -722,15 +724,15 @@ def __init__(
         self.ups = nn.ModuleList([])
         num_resolutions = len(in_out)

-        for ind, ((dim_in, dim_out), groups) in enumerate(zip(in_out, resnet_groups)):
+        for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups) in enumerate(zip(in_out, num_resnet_blocks, resnet_groups)):
             is_first = ind == 0
             is_last = ind >= (num_resolutions - 1)
             layer_cond_dim = cond_dim if not is_first else None

             self.downs.append(nn.ModuleList([
                 ResnetBlock(dim_in, dim_out, time_cond_dim = time_cond_dim, groups = groups),
                 Residual(LinearAttention(dim_out, **attn_kwargs)) if sparse_attn else nn.Identity(),
-                ResnetBlock(dim_out, dim_out, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim, groups = groups),
+                nn.ModuleList([ResnetBlock(dim_out, dim_out, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim, groups = groups) for _ in range(layer_num_resnet_blocks)]),
                 downsample_klass(dim_out) if not is_last else nn.Identity()
             ]))

@@ -740,14 +742,14 @@ def __init__(
         self.mid_attn = EinopsToAndFrom('b c h w', 'b (h w) c', Residual(Attention(mid_dim, **attn_kwargs))) if attend_at_middle else None
         self.mid_block2 = ResnetBlock(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[-1])

-        for ind, ((dim_in, dim_out), groups) in enumerate(zip(reversed(in_out[1:]), reversed(resnet_groups))):
+        for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups) in enumerate(zip(reversed(in_out[1:]), reversed(num_resnet_blocks), reversed(resnet_groups))):
             is_last = ind >= (num_resolutions - 2)
             layer_cond_dim = cond_dim if not is_last else None

             self.ups.append(nn.ModuleList([
                 ResnetBlock(dim_out * 2, dim_in, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim, groups = groups),
                 Residual(LinearAttention(dim_in, **attn_kwargs)) if sparse_attn else nn.Identity(),
-                ResnetBlock(dim_in, dim_in, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim, groups = groups),
+                nn.ModuleList([ResnetBlock(dim_in, dim_in, cond_dim = layer_cond_dim, time_cond_dim = time_cond_dim, groups = groups) for _ in range(layer_num_resnet_blocks)]),
                 Upsample(dim_in)
             ]))

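
With each entry of `self.downs` and `self.ups` now carrying an `nn.ModuleList` of ResnetBlocks instead of a single second block, the block count can differ per resolution. A hypothetical construction, assuming the `Unet` constructor's usual `dim`/`dim_mults` arguments (not shown in this diff); the values are illustrative only:

```python
# Hypothetical usage sketch; dim / dim_mults are assumed constructor arguments.
# Four resolution stages -> four per-stage block counts.
unet = Unet(
    dim = 128,
    dim_mults = (1, 2, 4, 8),
    num_resnet_blocks = (2, 4, 8, 8)  # or a single int to use the same count everywhere
)
```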
@@ -891,10 +893,13 @@ def forward(

        hiddens = []

-        for block1, sparse_attn, block2, downsample in self.downs:
-            x = block1(x, c, t)
+        for init_block, sparse_attn, resnet_blocks, downsample in self.downs:
+            x = init_block(x, c, t)
             x = sparse_attn(x)
-            x = block2(x, c, t)
+
+            for resnet_block in resnet_blocks:
+                x = resnet_block(x, c, t)
+
             hiddens.append(x)
             x = downsample(x)

@@ -905,11 +910,14 @@ def forward(

        x = self.mid_block2(x, mid_c, t)

-        for block1, sparse_attn, block2, upsample in self.ups:
+        for init_block, sparse_attn, resnet_blocks, upsample in self.ups:
             x = torch.cat((x, hiddens.pop()), dim = 1)
-            x = block1(x, c, t)
+            x = init_block(x, c, t)
             x = sparse_attn(x)
-            x = block2(x, c, t)
+
+            for resnet_block in resnet_blocks:
+                x = resnet_block(x, c, t)
+
             x = upsample(x)

        return self.final_conv(x)
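
In the forward pass each stage now runs its initial block, the optional sparse attention, and then every block in the per-stage `nn.ModuleList`; the skip connection is still recorded once per stage (after the last block), so the up path pops exactly one hidden per stage as before. A standalone sketch of the nested-ModuleList pattern with stand-in modules, illustrating why `nn.ModuleList` (rather than a plain Python list) is used so the extra blocks stay registered as parameters:

```python
# Standalone sketch with stand-in modules, not the real ResnetBlock / attention.
import torch
from torch import nn

stage = nn.ModuleList([
    nn.Conv2d(8, 8, 3, padding = 1),                                     # stand-in for the init block
    nn.Identity(),                                                       # stand-in for sparse attention
    nn.ModuleList([nn.Conv2d(8, 8, 3, padding = 1) for _ in range(3)]),  # per-stage resnet blocks
    nn.Identity()                                                        # stand-in for the downsample
])

x = torch.randn(1, 8, 16, 16)
init_block, sparse_attn, resnet_blocks, downsample = stage
x = init_block(x)
x = sparse_attn(x)
for resnet_block in resnet_blocks:   # every parameter here is visible to .parameters()
    x = resnet_block(x)
x = downsample(x)
```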
@@ -962,7 +970,7 @@ def __init__(
        # get text encoder

        self.text_encoder_name = text_encoder_name
-        text_embed_dim = get_encoded_dim(text_encoder_name)
+        self.text_embed_dim = get_encoded_dim(text_encoder_name)

        # construct unets

@@ -977,7 +985,7 @@ def __init__(
            one_unet = one_unet.cast_model_parameters(
                lowres_cond = not is_first,
                cond_on_text = self.condition_on_text,
-                text_embed_dim = text_embed_dim if self.condition_on_text else None,
+                text_embed_dim = self.text_embed_dim if self.condition_on_text else None,
                channels = self.channels,
                channels_out = unet_channels_out
            )
@@ -1211,6 +1219,7 @@ def sample(

        assert not (self.condition_on_text and not exists(text_embeds)), 'text or text encodings must be passed into imagen if specified'
        assert not (not self.condition_on_text and exists(text_embeds)), 'imagen specified not to be conditioned on text, yet it is presented'
+        assert not (exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'

        img = None
        is_cuda = next(self.parameters()).is_cuda
@@ -1282,6 +1291,8 @@ def forward(
        assert not (self.condition_on_text and not exists(text_embeds)), 'text or text encodings must be passed into decoder if specified'
        assert not (not self.condition_on_text and exists(text_embeds)), 'decoder specified not to be conditioned on text, yet it is presented'

+        assert not (exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'
+
        lowres_cond_img = lowres_aug_times = None
        if exists(prev_image_size):
            lowres_cond_img = resize_image_to(image, prev_image_size)
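
With the encoder dimension stored as `self.text_embed_dim`, the assertions added to `sample` and `forward` catch text embeddings whose last dimension does not match what `get_encoded_dim(text_encoder_name)` reports, failing fast with a readable message instead of a shape error deep inside the unet. A minimal standalone reproduction of the guard; the `exists` helper shown here and the expected dimension of 1024 are assumptions for illustration:

```python
# Minimal standalone sketch of the new dimension guard; values are illustrative.
import torch

def exists(val):
    return val is not None

expected_text_embed_dim = 1024          # stand-in for get_encoded_dim(text_encoder_name)
text_embeds = torch.randn(2, 256, 768)  # deliberately the wrong last dimension

assert not (exists(text_embeds) and text_embeds.shape[-1] != expected_text_embed_dim), \
    f'invalid text embedding dimension being passed in (should be {expected_text_embed_dim})'
# raises AssertionError here, i.e. the mismatch is reported up front at call time
```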