Commit 459c46a

add super-conditioning feature, idea originated from @crowsonkb
1 parent 4511d2d · commit 459c46a

File tree

3 files changed (+96 −4 lines)

README.md (+74)

@@ -200,6 +200,63 @@ vae = VQGanVAE()
 
 The default VQGan is the codebook size 1024 one trained on ImageNet. If you wish to use a different one, you can use the `vqgan_model_path` and `vqgan_config_path` to pass the .ckpt file and the .yaml file. These options can be used either in the train-dalle script or as arguments to the VQGanVAE class. Other pretrained VQGANs can be found in the [taming transformers readme](https://github.com/CompVis/taming-transformers#overview-of-pretrained-models). If you want to train a custom one, you can [follow this guide](https://github.com/CompVis/taming-transformers/pull/54)
 
+## Adjust text conditioning strength
+
+A <a href="https://openreview.net/forum?id=qw8AKxfYbI">new technique</a> has recently surfaced for guiding diffusion models without a classifier. The gist of the technique is to randomly drop out the text condition during training, and at inference time to derive a rough direction from the unconditional to the conditional distribution.
+
+<a href="https://github.com/crowsonkb">Katherine Crowson</a> outlined in a <a href="https://twitter.com/RiversHaveWings/status/1478093658716966912">tweet</a> how this could work for autoregressive attention models. I have decided to include her idea in this repository for further exploration. One only has to account for two extra keyword arguments, one at training time (`null_cond_prob`) and one at generation time (`cond_scale`).
+
+```python
+import torch
+from dalle_pytorch import DiscreteVAE, DALLE
+
+vae = DiscreteVAE(
+    image_size = 256,
+    num_layers = 3,
+    num_tokens = 8192,
+    codebook_dim = 1024,
+    hidden_dim = 64,
+    num_resnet_blocks = 1,
+    temperature = 0.9
+)
+
+dalle = DALLE(
+    dim = 1024,
+    vae = vae,
+    num_text_tokens = 10000,
+    text_seq_len = 256,
+    depth = 12,
+    heads = 16,
+    dim_head = 64,
+    attn_dropout = 0.1,
+    ff_dropout = 0.1
+)
+
+text = torch.randint(0, 10000, (4, 256))
+images = torch.randn(4, 3, 256, 256)
+
+loss = dalle(
+    text,
+    images,
+    return_loss = True,
+    null_cond_prob = 0.2  # firstly, set this to the probability of dropping out the condition, 20% is recommended as a default
+)
+
+loss.backward()
+
+# do the above for a long time with a lot of data ... then
+
+images = dalle.generate_images(
+    text,
+    cond_scale = 3.  # secondly, set this to a value greater than 1 to increase the conditioning beyond average
+)
+
+images.shape # (4, 3, 256, 256)
+```
+
+That's it!
+
 ## Ranking the generations
 
 Train CLIP
@@ -673,4 +730,21 @@ $ python generate.py --chinese --text '追老鼠的猫'
 }
 ```
 
+```bibtex
+@inproceedings{ho2021classifierfree,
+    title     = {Classifier-Free Diffusion Guidance},
+    author    = {Jonathan Ho and Tim Salimans},
+    booktitle = {NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications},
+    year      = {2021},
+    url       = {https://openreview.net/forum?id=qw8AKxfYbI}
+}
+```
+
+```bibtex
+@misc{crowson2022,
+    author = {Katherine Crowson},
+    url    = {https://twitter.com/RiversHaveWings/status/1478093658716966912}
+}
+```
+
 *Those who do not want to imitate anything, produce nothing.* - Dali
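To make the mechanics concrete before the implementation diff below: at each decoding step the model is run twice, once with the text condition and once with it nulled out, and the two sets of logits are linearly extrapolated. Here is a minimal sketch of just that combination step, with random tensors standing in for the two forward passes (the batch size and vocabulary size are illustrative):

```python
import torch

# stand-ins for one decoding step's logits; in the model these come from
# two forward passes, one with the text intact and one with the text dropped
cond_logits      = torch.randn(4, 8192)   # conditioned on the text
null_cond_logits = torch.randn(4, 8192)   # null (unconditional) pass

cond_scale = 3.

# cond_scale = 1 recovers the plain conditional logits; cond_scale > 1
# extrapolates past them, away from the unconditional distribution
logits = null_cond_logits + (cond_logits - null_cond_logits) * cond_scale
```

Since the null pass only runs when `cond_scale != 1`, super-conditioned sampling costs roughly two forward passes per generated token instead of one.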

dalle_pytorch/dalle_pytorch.py (+21 −3)

@@ -32,6 +32,9 @@ def masked_mean(t, mask, dim = 1):
     t = t.masked_fill(~mask[:, :, None], 0.)
     return t.sum(dim = 1) / mask.sum(dim = 1)[..., None]
 
+def prob_mask_like(shape, prob, device):
+    return torch.zeros(shape, device = device).float().uniform_(0, 1) < prob
+
 def set_requires_grad(model, value):
     for param in model.parameters():
         param.requires_grad = value
@@ -469,7 +472,8 @@ def generate_images(
         filter_thres = 0.5,
         temperature = 1.,
         img = None,
-        num_init_img_tokens = None
+        num_init_img_tokens = None,
+        cond_scale = 1.
     ):
         vae, text_seq_len, image_seq_len, num_text_tokens = self.vae, self.text_seq_len, self.image_seq_len, self.num_text_tokens
         total_len = text_seq_len + image_seq_len
@@ -494,6 +498,13 @@
             text, image = out[:, :text_seq_len], out[:, text_seq_len:]
 
             logits = self(text, image)
+
+            if cond_scale != 1:
+                # discovery by Katherine Crowson
+                # https://twitter.com/RiversHaveWings/status/1478093658716966912
+                null_cond_logits = self(text, image, null_cond_prob = 1.)
+                logits = null_cond_logits + (logits - null_cond_logits) * cond_scale
+
             logits = logits[:, -1, :]
 
             filtered_logits = top_k(logits, thres = filter_thres)
@@ -517,10 +528,17 @@ def forward(
         self,
         text,
         image = None,
-        return_loss = False
+        return_loss = False,
+        null_cond_prob = 0.
     ):
         assert text.shape[-1] == self.text_seq_len, f'the length {text.shape[-1]} of the text tokens you passed in does not have the correct length ({self.text_seq_len})'
-        device, total_seq_len = text.device, self.total_seq_len
+        batch, device, total_seq_len = text.shape[0], text.device, self.total_seq_len
+
+        # randomly remove text condition with <null_cond_prob> probability
+
+        if null_cond_prob > 0:
+            null_mask = prob_mask_like((batch,), null_cond_prob, device = device)
+            text *= rearrange(~null_mask, 'b -> b 1')
 
         # make sure padding in text tokens get unique padding token id
 
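For the training-time half of the change, here is the condition dropout from the diff above in isolation, as a self-contained sketch; the batch size, vocabulary size, and sequence length are illustrative, and `einops.rearrange` is used just as in the repository:

```python
import torch
from einops import rearrange

def prob_mask_like(shape, prob, device):
    # boolean mask where each entry is True with probability `prob`
    return torch.zeros(shape, device = device).float().uniform_(0, 1) < prob

batch, text_seq_len = 4, 256                           # illustrative sizes
text = torch.randint(1, 10000, (batch, text_seq_len))  # toy text token ids

# with probability 0.2, zero out a sample's entire text sequence, so the
# model periodically trains without any text condition at all
null_mask = prob_mask_like((batch,), 0.2, device = text.device)
text = text * rearrange(~null_mask, 'b -> b 1')
```

Zeroed text tokens act as the null condition here, which is what lets `generate_images` later request a fully unconditional pass with `null_cond_prob = 1.`.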
setup.py (+1 −1)

@@ -4,7 +4,7 @@
     name = 'dalle-pytorch',
     packages = find_packages(),
     include_package_data = True,
-    version = '1.2.0',
+    version = '1.2.1',
     license='MIT',
     description = 'DALL-E - Pytorch',
     author = 'Phil Wang',
