Commit

Merge branch 'main' into stalker-modular_freeu
RyanJDick committed Jul 23, 2024
2 parents 5f0fe3c + de39c5e commit db52f56
Showing 50 changed files with 1,430 additions and 255 deletions.
5 changes: 5 additions & 0 deletions invokeai/app/invocations/denoise_latents.py
@@ -60,6 +60,7 @@
from invokeai.backend.stable_diffusion.extension_callback_type import ExtensionCallbackType
from invokeai.backend.stable_diffusion.extensions.freeu import FreeUExt
from invokeai.backend.stable_diffusion.extensions.preview import PreviewExt
from invokeai.backend.stable_diffusion.extensions.rescale_cfg import RescaleCFGExt
from invokeai.backend.stable_diffusion.extensions_manager import ExtensionsManager
from invokeai.backend.stable_diffusion.schedulers import SCHEDULER_MAP
from invokeai.backend.stable_diffusion.schedulers.schedulers import SCHEDULER_NAME_VALUES
@@ -791,6 +792,10 @@ def step_callback(state: PipelineIntermediateState) -> None:

ext_manager.add_extension(PreviewExt(step_callback))

### cfg rescale
if self.cfg_rescale_multiplier > 0:
ext_manager.add_extension(RescaleCFGExt(self.cfg_rescale_multiplier))

### freeu
if self.unet.freeu_config:
ext_manager.add_extension(FreeUExt(self.unet.freeu_config))
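
For context, the extensions registered here follow a small callback pattern used throughout this commit: an extension subclasses ExtensionBase, marks handler methods with the @callback decorator and an ExtensionCallbackType, and the ExtensionsManager fires those handlers at named points in the denoise loop. Below is a minimal sketch of that pattern using a hypothetical LogLatentsExt (not part of this commit); only the class and method names are invented, while the decorator, enum, and registration calls are the ones visible in this diff.

    from invokeai.backend.stable_diffusion.extension_callback_type import ExtensionCallbackType
    from invokeai.backend.stable_diffusion.extensions.base import ExtensionBase, callback


    class LogLatentsExt(ExtensionBase):
        """Hypothetical extension: report the latent shape after each denoising step."""

        @callback(ExtensionCallbackType.POST_STEP)
        def log_latents(self, ctx):
            # ctx.latents is None until the denoise loop has started.
            if ctx.latents is not None:
                print(f"step {ctx.step_index}: latents shape = {tuple(ctx.latents.shape)}")


    # Registered the same way as RescaleCFGExt and FreeUExt above:
    #     ext_manager.add_extension(LogLatentsExt())
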
196 changes: 137 additions & 59 deletions invokeai/app/invocations/spandrel_image_to_image.py
@@ -1,3 +1,5 @@
from typing import Callable

import numpy as np
import torch
from PIL import Image
@@ -21,7 +23,7 @@
from invokeai.backend.tiles.utils import TBLR, Tile


@invocation("spandrel_image_to_image", title="Image-to-Image", tags=["upscale"], category="upscale", version="1.1.0")
@invocation("spandrel_image_to_image", title="Image-to-Image", tags=["upscale"], category="upscale", version="1.2.0")
class SpandrelImageToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
"""Run any spandrel image-to-image model (https://github.com/chaiNNer-org/spandrel)."""

@@ -34,8 +36,19 @@ class SpandrelImageToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
tile_size: int = InputField(
default=512, description="The tile size for tiled image-to-image. Set to 0 to disable tiling."
)
scale: float = InputField(
default=4.0,
gt=0.0,
le=16.0,
description="The final scale of the output image. If the model does not upscale the image, this will be ignored.",
)
fit_to_multiple_of_8: bool = InputField(
default=False,
description="If true, the output image will be resized to the nearest multiple of 8 in both dimensions.",
)

def _scale_tile(self, tile: Tile, scale: int) -> Tile:
@classmethod
def scale_tile(cls, tile: Tile, scale: int) -> Tile:
return Tile(
coords=TBLR(
top=tile.coords.top * scale,
@@ -51,20 +64,22 @@ def _scale_tile(self, tile: Tile, scale: int) -> Tile:
),
)

@torch.inference_mode()
def invoke(self, context: InvocationContext) -> ImageOutput:
# Images are converted to RGB, because most models don't support an alpha channel. In the future, we may want to
# revisit this.
image = context.images.get_pil(self.image.image_name, mode="RGB")

@classmethod
def upscale_image(
cls,
image: Image.Image,
tile_size: int,
spandrel_model: SpandrelImageToImageModel,
is_canceled: Callable[[], bool],
) -> Image.Image:
# Compute the image tiles.
if self.tile_size > 0:
if tile_size > 0:
min_overlap = 20
tiles = calc_tiles_min_overlap(
image_height=image.height,
image_width=image.width,
tile_height=self.tile_size,
tile_width=self.tile_size,
tile_height=tile_size,
tile_width=tile_size,
min_overlap=min_overlap,
)
else:
@@ -85,60 +100,123 @@ def invoke(self, context: InvocationContext) -> ImageOutput:
# Prepare input image for inference.
image_tensor = SpandrelImageToImageModel.pil_to_tensor(image)

# Load the model.
spandrel_model_info = context.models.load(self.image_to_image_model)

# Run the model on each tile.
with spandrel_model_info as spandrel_model:
assert isinstance(spandrel_model, SpandrelImageToImageModel)
# Scale the tiles for re-assembling the final image.
scale = spandrel_model.scale
scaled_tiles = [cls.scale_tile(tile, scale=scale) for tile in tiles]

# Scale the tiles for re-assembling the final image.
scale = spandrel_model.scale
scaled_tiles = [self._scale_tile(tile, scale=scale) for tile in tiles]
# Prepare the output tensor.
_, channels, height, width = image_tensor.shape
output_tensor = torch.zeros(
(height * scale, width * scale, channels), dtype=torch.uint8, device=torch.device("cpu")
)

# Prepare the output tensor.
_, channels, height, width = image_tensor.shape
output_tensor = torch.zeros(
(height * scale, width * scale, channels), dtype=torch.uint8, device=torch.device("cpu")
)
image_tensor = image_tensor.to(device=spandrel_model.device, dtype=spandrel_model.dtype)

image_tensor = image_tensor.to(device=spandrel_model.device, dtype=spandrel_model.dtype)

for tile, scaled_tile in tqdm(list(zip(tiles, scaled_tiles, strict=True)), desc="Upscaling Tiles"):
# Exit early if the invocation has been canceled.
if context.util.is_canceled():
raise CanceledException

# Extract the current tile from the input tensor.
input_tile = image_tensor[
:, :, tile.coords.top : tile.coords.bottom, tile.coords.left : tile.coords.right
].to(device=spandrel_model.device, dtype=spandrel_model.dtype)

# Run the model on the tile.
output_tile = spandrel_model.run(input_tile)

# Convert the output tile into the output tensor's format.
# (N, C, H, W) -> (C, H, W)
output_tile = output_tile.squeeze(0)
# (C, H, W) -> (H, W, C)
output_tile = output_tile.permute(1, 2, 0)
output_tile = output_tile.clamp(0, 1)
output_tile = (output_tile * 255).to(dtype=torch.uint8, device=torch.device("cpu"))

# Merge the output tile into the output tensor.
# We only keep half of the overlap on the top and left side of the tile. We do this in case there are
# edge artifacts. We don't bother with any 'blending' in the current implementation - for most upscalers
# it seems unnecessary, but we may find a need in the future.
top_overlap = scaled_tile.overlap.top // 2
left_overlap = scaled_tile.overlap.left // 2
output_tensor[
scaled_tile.coords.top + top_overlap : scaled_tile.coords.bottom,
scaled_tile.coords.left + left_overlap : scaled_tile.coords.right,
:,
] = output_tile[top_overlap:, left_overlap:, :]
# Run the model on each tile.
for tile, scaled_tile in tqdm(list(zip(tiles, scaled_tiles, strict=True)), desc="Upscaling Tiles"):
# Exit early if the invocation has been canceled.
if is_canceled():
raise CanceledException

# Extract the current tile from the input tensor.
input_tile = image_tensor[
:, :, tile.coords.top : tile.coords.bottom, tile.coords.left : tile.coords.right
].to(device=spandrel_model.device, dtype=spandrel_model.dtype)

# Run the model on the tile.
output_tile = spandrel_model.run(input_tile)

# Convert the output tile into the output tensor's format.
# (N, C, H, W) -> (C, H, W)
output_tile = output_tile.squeeze(0)
# (C, H, W) -> (H, W, C)
output_tile = output_tile.permute(1, 2, 0)
output_tile = output_tile.clamp(0, 1)
output_tile = (output_tile * 255).to(dtype=torch.uint8, device=torch.device("cpu"))

# Merge the output tile into the output tensor.
# We only keep half of the overlap on the top and left side of the tile. We do this in case there are
# edge artifacts. We don't bother with any 'blending' in the current implementation - for most upscalers
# it seems unnecessary, but we may find a need in the future.
top_overlap = scaled_tile.overlap.top // 2
left_overlap = scaled_tile.overlap.left // 2
output_tensor[
scaled_tile.coords.top + top_overlap : scaled_tile.coords.bottom,
scaled_tile.coords.left + left_overlap : scaled_tile.coords.right,
:,
] = output_tile[top_overlap:, left_overlap:, :]

# Convert the output tensor to a PIL image.
np_image = output_tensor.detach().numpy().astype(np.uint8)
pil_image = Image.fromarray(np_image)

return pil_image

@torch.inference_mode()
def invoke(self, context: InvocationContext) -> ImageOutput:
# Images are converted to RGB, because most models don't support an alpha channel. In the future, we may want to
# revisit this.
image = context.images.get_pil(self.image.image_name, mode="RGB")

# Load the model.
spandrel_model_info = context.models.load(self.image_to_image_model)

# The target size of the image, determined by the provided scale. We'll run the upscaler until we hit this size.
# Later, we may mutate this value if the model doesn't upscale the image or if the user requested a multiple of 8.
target_width = int(image.width * self.scale)
target_height = int(image.height * self.scale)

# Do the upscaling.
with spandrel_model_info as spandrel_model:
assert isinstance(spandrel_model, SpandrelImageToImageModel)

# First pass of upscaling. Note: `pil_image` will be mutated.
pil_image = self.upscale_image(image, self.tile_size, spandrel_model, context.util.is_canceled)

# Some models don't upscale the image, but we have no way to know this in advance. We'll check if the model
# upscaled the image and run the loop below if it did. We'll require the model to upscale both dimensions
# to be considered an upscale model.
is_upscale_model = pil_image.width > image.width and pil_image.height > image.height

if is_upscale_model:
# This is an upscale model, so we should keep upscaling until we reach the target size.
iterations = 1
while pil_image.width < target_width or pil_image.height < target_height:
pil_image = self.upscale_image(pil_image, self.tile_size, spandrel_model, context.util.is_canceled)
iterations += 1

# Sanity check to prevent excessive or infinite loops. All known upscaling models are at least 2x.
# Our max scale is 16x, so with a 2x model, we should never exceed 16x == 2^4 -> 4 iterations.
# We'll allow one extra iteration "just in case" and bail at 5 upscaling iterations. In practice,
# we should never reach this limit.
if iterations >= 5:
context.logger.warning(
"Upscale loop reached maximum iteration count of 5, stopping upscaling early."
)
break
else:
# This model doesn't upscale the image. We should ignore the scale parameter, modifying the output size
# to be the same as the processed image size.

# The output size is now the size of the processed image.
target_width = pil_image.width
target_height = pil_image.height

# Warn the user if they requested a scale greater than 1.
if self.scale > 1:
context.logger.warning(
"Model does not increase the size of the image, but a greater scale than 1 was requested. Image will not be scaled."
)

# We may need to resize the image to a multiple of 8. Use floor division to ensure we don't scale the image up
        # in the final resize.
if self.fit_to_multiple_of_8:
target_width = int(target_width // 8 * 8)
target_height = int(target_height // 8 * 8)

# Final resize. Per PIL documentation, Lanczos provides the best quality for both upscale and downscale.
# See: https://pillow.readthedocs.io/en/stable/handbook/concepts.html#filters-comparison-table
pil_image = pil_image.resize((target_width, target_height), resample=Image.Resampling.LANCZOS)

image_dto = context.images.save(image=pil_image)
return ImageOutput.build(image_dto)
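
An aside on the half-overlap merge in upscale_image above: each upscaled tile overwrites the canvas from the midpoint of its top/left overlap through to its bottom/right edge, so seams land in the middle of the overlap region rather than at a tile border. A toy, self-contained illustration of that indexing — the tile size, coordinates, and 8 px overlap below are invented for the example, not InvokeAI values:

    import torch

    # A 16x16 input tile upscaled 2x -> 32x32, merged into a 64x64 canvas.
    tile_out = torch.ones(32, 32, 3, dtype=torch.uint8)
    canvas = torch.zeros(64, 64, 3, dtype=torch.uint8)

    # Scaled coords of this tile, with 8 px of scaled overlap on top and left.
    top, left, bottom, right = 16, 16, 48, 48
    top_overlap = 8 // 2   # keep only half of the overlap on the top...
    left_overlap = 8 // 2  # ...and the left, as in the code above

    canvas[top + top_overlap : bottom, left + left_overlap : right, :] = (
        tile_out[top_overlap:, left_overlap:, :]
    )
    assert canvas[20:48, 20:48].all()
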
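The bail-out constant in the upscale loop is also easy to sanity-check numerically: the scale field is capped at 16 and the comment assumes every upscale model is at least 2x, so at most ceil(log2(16)) = 4 passes are ever needed, and stopping at 5 leaves one spare. A quick worked check of that arithmetic:

    import math

    min_model_scale = 2          # smallest known upscale factor (per the comment)
    max_requested_scale = 16.0   # upper bound of the `scale` InputField
    passes_needed = math.ceil(math.log(max_requested_scale, min_model_scale))
    assert passes_needed == 4    # so bailing out at 5 iterations is safe
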
20 changes: 10 additions & 10 deletions invokeai/backend/stable_diffusion/denoise_context.py
@@ -83,47 +83,47 @@ class DenoiseContext:
unet: Optional[UNet2DConditionModel] = None

# Current state of latent-space image in denoising process.
# None until `pre_denoise_loop` callback.
# None until `PRE_DENOISE_LOOP` callback.
# Shape: [batch, channels, latent_height, latent_width]
latents: Optional[torch.Tensor] = None

# Current denoising step index.
# None until `pre_step` callback.
# None until `PRE_STEP` callback.
step_index: Optional[int] = None

# Current denoising step timestep.
# None until `pre_step` callback.
# None until `PRE_STEP` callback.
timestep: Optional[torch.Tensor] = None

# Arguments which will be passed to UNet model.
# Available in `pre_unet`/`post_unet` callbacks, otherwise will be None.
# Available in `PRE_UNET`/`POST_UNET` callbacks, otherwise will be None.
unet_kwargs: Optional[UNetKwargs] = None

    # SchedulerOutput class returned from the step function (normally generated by the scheduler).
# Supposed to be used only in `post_step` callback, otherwise can be None.
# Supposed to be used only in `POST_STEP` callback, otherwise can be None.
step_output: Optional[SchedulerOutput] = None

# Scaled version of `latents`, which will be passed to unet_kwargs initialization.
# Available in events inside step(between `pre_step` and `post_stop`).
    # Available in events inside step (between `PRE_STEP` and `POST_STEP`).
# Shape: [batch, channels, latent_height, latent_width]
latent_model_input: Optional[torch.Tensor] = None

    # [TMP] Defines which conditionings the current unet call will be run on.
# Available in `pre_unet`/`post_unet` callbacks, otherwise will be None.
# Available in `PRE_UNET`/`POST_UNET` callbacks, otherwise will be None.
conditioning_mode: Optional[ConditioningMode] = None

# [TMP] Noise predictions from negative conditioning.
# Available in `apply_cfg` and `post_apply_cfg` callbacks, otherwise will be None.
# Available in `POST_COMBINE_NOISE_PREDS` callback, otherwise will be None.
# Shape: [batch, channels, latent_height, latent_width]
negative_noise_pred: Optional[torch.Tensor] = None

# [TMP] Noise predictions from positive conditioning.
# Available in `apply_cfg` and `post_apply_cfg` callbacks, otherwise will be None.
# Available in `POST_COMBINE_NOISE_PREDS` callback, otherwise will be None.
# Shape: [batch, channels, latent_height, latent_width]
positive_noise_pred: Optional[torch.Tensor] = None

# Combined noise prediction from passed conditionings.
# Available in `apply_cfg` and `post_apply_cfg` callbacks, otherwise will be None.
# Available in `POST_COMBINE_NOISE_PREDS` callback, otherwise will be None.
# Shape: [batch, channels, latent_height, latent_width]
noise_pred: Optional[torch.Tensor] = None

8 changes: 4 additions & 4 deletions invokeai/backend/stable_diffusion/diffusion_backend.py
@@ -76,12 +76,12 @@ def step(self, ctx: DenoiseContext, ext_manager: ExtensionsManager) -> SchedulerOutput:
both_noise_pred = self.run_unet(ctx, ext_manager, ConditioningMode.Both)
ctx.negative_noise_pred, ctx.positive_noise_pred = both_noise_pred.chunk(2)

# ext: override apply_cfg
ctx.noise_pred = self.apply_cfg(ctx)
# ext: override combine_noise_preds
ctx.noise_pred = self.combine_noise_preds(ctx)

# ext: cfg_rescale [modify_noise_prediction]
# TODO: rename
ext_manager.run_callback(ExtensionCallbackType.POST_APPLY_CFG, ctx)
ext_manager.run_callback(ExtensionCallbackType.POST_COMBINE_NOISE_PREDS, ctx)

# compute the previous noisy sample x_t -> x_t-1
step_output = ctx.scheduler.step(ctx.noise_pred, ctx.timestep, ctx.latents, **ctx.inputs.scheduler_step_kwargs)
@@ -95,7 +95,7 @@ def step(self, ctx: DenoiseContext, ext_manager: ExtensionsManager) -> SchedulerOutput:
return step_output

@staticmethod
def apply_cfg(ctx: DenoiseContext) -> torch.Tensor:
def combine_noise_preds(ctx: DenoiseContext) -> torch.Tensor:
guidance_scale = ctx.inputs.conditioning_data.guidance_scale
if isinstance(guidance_scale, list):
guidance_scale = guidance_scale[ctx.step_index]
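
The renamed combine_noise_preds is classifier-free guidance; its full body is not shown in this diff, but conventionally (and consistent with the guidance_scale handling visible above) the combination amounts to the sketch below — treat it as the textbook formula, not necessarily the exact InvokeAI implementation:

    import torch

    def combine_noise_preds_sketch(
        negative_noise_pred: torch.Tensor,
        positive_noise_pred: torch.Tensor,
        guidance_scale: float,
    ) -> torch.Tensor:
        # Start from the unconditional (negative) prediction and push it toward
        # the conditional (positive) one, scaled by the guidance weight.
        return negative_noise_pred + guidance_scale * (positive_noise_pred - negative_noise_pred)
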
2 changes: 1 addition & 1 deletion invokeai/backend/stable_diffusion/extension_callback_type.py
@@ -9,4 +9,4 @@ class ExtensionCallbackType(Enum):
POST_STEP = "post_step"
PRE_UNET = "pre_unet"
POST_UNET = "post_unet"
POST_APPLY_CFG = "post_apply_cfg"
POST_COMBINE_NOISE_PREDS = "post_combine_noise_preds"
36 changes: 36 additions & 0 deletions invokeai/backend/stable_diffusion/extensions/rescale_cfg.py
@@ -0,0 +1,36 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import torch

from invokeai.backend.stable_diffusion.extension_callback_type import ExtensionCallbackType
from invokeai.backend.stable_diffusion.extensions.base import ExtensionBase, callback

if TYPE_CHECKING:
from invokeai.backend.stable_diffusion.denoise_context import DenoiseContext


class RescaleCFGExt(ExtensionBase):
def __init__(self, rescale_multiplier: float):
super().__init__()
self._rescale_multiplier = rescale_multiplier

@staticmethod
def _rescale_cfg(total_noise_pred: torch.Tensor, pos_noise_pred: torch.Tensor, multiplier: float = 0.7):
"""Implementation of Algorithm 2 from https://arxiv.org/pdf/2305.08891.pdf."""
ro_pos = torch.std(pos_noise_pred, dim=(1, 2, 3), keepdim=True)
ro_cfg = torch.std(total_noise_pred, dim=(1, 2, 3), keepdim=True)

x_rescaled = total_noise_pred * (ro_pos / ro_cfg)
x_final = multiplier * x_rescaled + (1.0 - multiplier) * total_noise_pred
return x_final

@callback(ExtensionCallbackType.POST_COMBINE_NOISE_PREDS)
def rescale_noise_pred(self, ctx: DenoiseContext):
if self._rescale_multiplier > 0:
ctx.noise_pred = self._rescale_cfg(
ctx.noise_pred,
ctx.positive_noise_pred,
self._rescale_multiplier,
)
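
In math form, _rescale_cfg above is Algorithm 2 of the cited paper: with phi the rescale multiplier and sigma(.) the per-sample standard deviation over the channel and spatial dimensions,

    \[
    x_{\text{rescaled}} = x_{\text{cfg}} \cdot \frac{\sigma(x_{\text{pos}})}{\sigma(x_{\text{cfg}})},
    \qquad
    x_{\text{final}} = \phi \, x_{\text{rescaled}} + (1 - \phi) \, x_{\text{cfg}},
    \]

which pulls the variance of the guided prediction back toward that of the positive-conditioning prediction, blending by phi (0.7 by default in the code above).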