allow use of a larger T5, and make the text encoder extensible to any text encoder model

lucidrains · lucidrains · commit 322496f7e306 · 2022-05-25T07:15:05.000-07:00
diff --git a/README.md b/README.md
@@ -39,7 +39,7 @@ unet2 = Unet(
 # imagen, which contains the unets above (base unet and super resoluting ones)
 
 imagen = Imagen(
-    unet = (unet1, unet2),
+    unets = (unet1, unet2),
     image_sizes = (64, 256),
     timesteps = 100,
     cond_drop_prob = 0.5
@@ -73,7 +73,7 @@ images.shape # (3, 3, 256, 256)
 - [x] use huggingface transformers for T5-small text embeddings
 - [x] add dynamic thresholding
 - [x] add dynamic thresholding DALLE2 and video-diffusion repository as well
-- [ ] allow for one to set T5-large (and perhaps small factory method to take in any huggingface transformer)
+- [x] allow for one to set T5-large (and perhaps small factory method to take in any huggingface transformer)
 - [ ] separate unet into base unet and SR3 unet
 - [ ] build whatever efficient unet they came up with
 - [ ] add the noise level conditioning with the pseudocode in appendix, and figure out what is this sweep they do at inference time
diff --git a/imagen_pytorch/imagen_pytorch.py b/imagen_pytorch/imagen_pytorch.py
@@ -22,7 +22,7 @@
 
 from resize_right import resize
 
-from imagen_pytorch.t5 import t5_encode_text, T5_SMALL_EMBED_DIM
+from imagen_pytorch.t5 import t5_encode_text, get_encoded_dim
 
 # constants
 
@@ -233,7 +233,7 @@ def __init__(self, *, beta_schedule, timesteps, loss_type):
 
         # register buffer helper function to cast double back to float
 
-        register_buffer = lambda name, val: self.register_buffer(name, val.to(torch.float32))
+        register_buffer = lambda name, val: self.register_buffer(name, val.to(torch.float32), persistent = False)
 
         register_buffer('betas', betas)
         register_buffer('alphas_cumprod', alphas_cumprod)
@@ -691,7 +691,7 @@ def __init__(
         dim,
         *,
         image_embed_dim = None,
-        text_embed_dim = T5_SMALL_EMBED_DIM,
+        text_embed_dim = 512,
         cond_dim = None,
         num_image_tokens = 4,
         num_time_tokens = 2,
@@ -842,18 +842,21 @@ def cast_model_parameters(
         self,
         *,
         lowres_cond,
+        text_embed_dim,
         channels,
         channels_out,
         cond_on_text
     ):
         if lowres_cond == self.lowres_cond and \
             channels == self.channels and \
             cond_on_text == self.cond_on_text and \
+            text_embed_dim == self._locals['text_embed_dim'] and \
             channels_out == self.channels_out:
             return self
 
         updated_kwargs = dict(
             lowres_cond = lowres_cond,
+            text_embed_dim = text_embed_dim,
             channels = channels,
             channels_out = channels_out,
             cond_on_text = cond_on_text
@@ -1020,8 +1023,9 @@ def forward(
 class Imagen(BaseGaussianDiffusion):
     def __init__(
         self,
-        unet,
+        unets,
         *,
+        text_encoder_name = 't5-small',
         image_size = None,
         channels = 3,
         timesteps = 1000,
@@ -1068,14 +1072,18 @@ def __init__(
         # automatically take care of ensuring that first unet is unconditional
         # while the rest of the unets are conditioned on the low resolution image produced by previous unet
 
-        unets = cast_tuple(unet)
+        unets = cast_tuple(unets)
 
         # whether to use learned variance, defaults to True for the first unet in the cascade, as in paper
 
         learned_variance = pad_tuple_to_length(cast_tuple(learned_variance), len(unets), fillvalue = False)
         self.learned_variance = learned_variance
         self.vb_loss_weight = vb_loss_weight
 
+        # get text encoder
+
+        text_embed_dim = get_encoded_dim(text_encoder_name)
+
         # construct unets
 
         self.unets = nn.ModuleList([])
@@ -1089,6 +1097,7 @@ def __init__(
             one_unet = one_unet.cast_model_parameters(
                 lowres_cond = not is_first,
                 cond_on_text = one_unet.cond_on_text and not unconditional,
+                text_embed_dim = text_embed_dim,
                 channels = self.channels,
                 channels_out = unet_channels_out
             )
diff --git a/imagen_pytorch/t5.py b/imagen_pytorch/t5.py
@@ -4,38 +4,73 @@
 def exists(val):
     return val is not None
 
+# config
+
+MAX_LENGTH = 256
+
+T5_CONFIGS = {
+    't5-small': {
+        'dim': 512
+    },
+    't5-large': {
+        'dim': 1024
+    }
+}
+
 # singleton globals
 
-MODEL = None
-TOKENIZER = None
-T5_SMALL_EMBED_DIM = 512
+def get_tokenizer(name):
+    """Return the pretrained T5 tokenizer for checkpoint `name`, which must be a key of T5_CONFIGS."""
+    assert name in T5_CONFIGS, f'{name} model is not found in the configuration'
+    return T5Tokenizer.from_pretrained(name)  # load the requested checkpoint rather than a hard-coded "t5-small"
+
+def get_model(name):
+    """Return the pretrained T5ForConditionalGeneration for checkpoint `name`, which must be a key of T5_CONFIGS."""
+    assert name in T5_CONFIGS, f'{name} model is not found in the configuration'
+    return T5ForConditionalGeneration.from_pretrained(name)  # must be the requested checkpoint so its width matches T5_CONFIGS[name]['dim']
+
+def get_model_and_tokenizer(name):
+    global T5_CONFIGS
+    assert name in T5_CONFIGS, f'{name} model is not found in the configuration'
+    config = T5_CONFIGS[name]
 
-def get_tokenizer():
-    global TOKENIZER
-    if not exists(TOKENIZER):
-        TOKENIZER = T5Tokenizer.from_pretrained("t5-small")
-    return TOKENIZER
+    if not 'model' in config:
+        model = get_model(name)
+        config['model'] = model
 
-def get_t5():
-    global MODEL
-    if not exists(MODEL):
-        MODEL = T5ForConditionalGeneration.from_pretrained("t5-small")
-        if torch.cuda.is_available():
-            MODEL = MODEL.cuda()
+    if not 'tokenizer' in config:
+        tokenizer = get_tokenizer(name)
+        config['tokenizer'] = tokenizer
 
-    return MODEL
+    return config['model'], config['tokenizer']
+
+def get_encoded_dim(name):  # look up the registered embedding dimension for text encoder `name`
+    assert name in T5_CONFIGS, f'{name} model is not found in configuration'  # fail early on names missing from T5_CONFIGS
+    return T5_CONFIGS[name]['dim']  # the 'dim' entry registered for this encoder
 
 # encoding text
 
-def t5_encode_text(texts):
-    t5 = get_t5()
-    tokenizer = get_tokenizer()
+def t5_encode_text(texts, name = 't5-small'):
+    t5, tokenizer = get_model_and_tokenizer(name)
+
+    if torch.cuda.is_available():
+        t5 = t5.cuda()
+
+    device = next(t5.parameters()).device
+
+    encoded = tokenizer.batch_encode_plus(
+        texts,
+        return_tensors = "pt",
+        padding = 'longest',
+        max_length = MAX_LENGTH,
+        truncation = True
+    )
 
-    input_ids = tokenizer.batch_encode_plus(texts, return_tensors = "pt", padding = True, truncation = True).input_ids
-    input_ids = input_ids.to(next(t5.parameters()).device)
+    input_ids = encoded.input_ids.to(device)
+    attn_mask = encoded.attention_mask.to(device)
 
     t5.eval()
     with torch.no_grad():
-        output = t5(input_ids = input_ids, decoder_input_ids = input_ids[:, :1]) # too lazy to figure out how to make it work without decoder inputs
+        output = t5(input_ids = input_ids, attention_mask = attn_mask, decoder_input_ids = input_ids[:, :1]) # too lazy to figure out how to make it work without decoder inputs
 
     return output.encoder_last_hidden_state
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'imagen-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.5',
+  version = '0.0.6',
   license='MIT',
   description = 'Imagen - unprecedented photorealism × deep level of language understanding',
   author = 'Phil Wang',