Remove dead code: the text mask is not needed when training DALL-E, so drop the `mask` argument from `forward` and `generate_images`

lucidrains · lucidrains · commit 4511d2df4429 · 2022-01-04T12:23:58.000-08:00
diff --git a/README.md b/README.md
@@ -123,14 +123,13 @@ dalle = DALLE(
 
 text = torch.randint(0, 10000, (4, 256))
 images = torch.randn(4, 3, 256, 256)
-mask = torch.ones_like(text).bool()
 
-loss = dalle(text, images, mask = mask, return_loss = True)
+loss = dalle(text, images, return_loss = True)
 loss.backward()
 
 # do the above for a long time with a lot of data ... then
 
-images = dalle.generate_images(text, mask = mask)
+images = dalle.generate_images(text)
 images.shape # (4, 3, 256, 256)
 ```
 
@@ -141,7 +140,6 @@ img_prime = torch.randn(4, 3, 256, 256)
 
 images = dalle.generate_images(
     text,
-    mask = mask,
     img = img_prime,
     num_init_img_tokens = (14 * 32)  # you can set the size of the initial crop, defaults to a little less than ~1/2 of the tokens, as done in the paper
 )
@@ -179,9 +177,8 @@ dalle = DALLE(
 
 text = torch.randint(0, 10000, (4, 256))
 images = torch.randn(4, 3, 256, 256)
-mask = torch.ones_like(text).bool()
 
-loss = dalle(text, images, mask = mask, return_loss = True)
+loss = dalle(text, images, return_loss = True)
 loss.backward()
 ```
 
diff --git a/dalle_pytorch/dalle_pytorch.py b/dalle_pytorch/dalle_pytorch.py
@@ -466,7 +466,6 @@ def generate_images(
         text,
         *,
         clip = None,
-        mask = None,
         filter_thres = 0.5,
         temperature = 1.,
         img = None,
@@ -494,7 +493,7 @@ def generate_images(
 
             text, image = out[:, :text_seq_len], out[:, text_seq_len:]
 
-            logits = self(text, image, mask = mask)
+            logits = self(text, image)
             logits = logits[:, -1, :]
 
             filtered_logits = top_k(logits, thres = filter_thres)
@@ -503,9 +502,6 @@ def generate_images(
             sample -= (num_text_tokens if is_image else 0) # offset sampled token if it is an image token, since logit space is composed of text and then image tokens
             out = torch.cat((out, sample[:, None]), dim=-1)
 
-            if out.shape[1] <= text_seq_len:
-                mask = F.pad(mask, (0, 1), value = True)
-
         text_seq = out[:, :text_seq_len]
 
         img_seq = out[:, -image_seq_len:]
@@ -521,7 +517,6 @@ def forward(
         self,
         text,
         image = None,
-        mask = None,
         return_loss = False
     ):
         assert text.shape[-1] == self.text_seq_len, f'the length {text.shape[-1]} of the text tokens you passed in does not have the correct length ({self.text_seq_len})'
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
   name = 'dalle-pytorch',
   packages = find_packages(),
   include_package_data = True,
-  version = '1.1.8',
+  version = '1.2.0',
   license='MIT',
   description = 'DALL-E - Pytorch',
   author = 'Phil Wang',