This repository was archived by the owner on Mar 10, 2026. It is now read-only.
[DeepVision Port] SegFormer and Mix-Transformers #1946
Merged
Changes from 46 commits
Commits (57)
dc41892
initial dump
DavidLandup0 e5677e6
add all basic layers, port roughly to keras core ops
DavidLandup0 7bd1056
updated .gitignore
DavidLandup0 03470df
segformer head and formatting
DavidLandup0 cb1c702
cleanup
DavidLandup0 22f8fdf
remove tf call
DavidLandup0 5c9803a
remove tf
DavidLandup0 314dc6b
migrating to more keras ops
DavidLandup0 7a0151b
cleanups and fixes
DavidLandup0 44f01af
fix reshaping
DavidLandup0 eb5b5ae
comments
DavidLandup0 ea0239f
from presets api, keras.ops -> ops
DavidLandup0 b6128a5
embed_dims -> embedding_dims
DavidLandup0 8322109
addressing some PR comments
DavidLandup0 75bb4a2
docstrings, argument update
DavidLandup0 97daf7c
depths arg
DavidLandup0 5f9dc0c
sync
DavidLandup0 efbbd49
compute output shapes
DavidLandup0 d3b43c6
segformer progress
DavidLandup0 dab4e74
head
DavidLandup0 1dba059
softmax
DavidLandup0 bdc3687
remove softmax
DavidLandup0 ddfa315
undo compute_output_shapes()
DavidLandup0 5a091b6
efficientmultiheadattention -> segformermultiheadattention
DavidLandup0 4e9df16
docstrings
DavidLandup0 278875c
softmax output
DavidLandup0 884c376
Merge branch 'master' into segformer_tf
DavidLandup0 6618a65
segformer presets
DavidLandup0 e1fbdb0
Merge branch 'segformer_tf' of https://github.com/DavidLandup0/keras-…
DavidLandup0 00ecd92
updating segformer presets
DavidLandup0 97d9d4a
segformer presets
DavidLandup0 c10963f
import aliases
DavidLandup0 f882b3e
Merge branch 'master' into segformer_tf
DavidLandup0 ab10136
refactoring
DavidLandup0 094189e
pr comments
DavidLandup0 a4df0a6
pr comments
DavidLandup0 e22a15e
add aliases
DavidLandup0 5d63d18
aliases ot init
DavidLandup0 03a177f
refactor fix
DavidLandup0 d1cdd5d
import keras_cv_export
DavidLandup0 ff32d63
fix presets/aliases and add copyright
DavidLandup0 5f3fc22
linter warnings
DavidLandup0 c6b454f
linter errors
DavidLandup0 5ac7f77
consistency in presets
DavidLandup0 b2a76ce
return config
DavidLandup0 0ad5879
fix serialization
DavidLandup0 eea5e3c
Some cleanup + more tests
ianstenbit 8e62cf6
Fix DropPath layer (need to update tests + add shim for tf.keras
ianstenbit b9efeb1
Finish DropPath layer
ianstenbit bd5a99f
Use static shape in backbone
ianstenbit 3d29b0a
Formatting
ianstenbit 4e2c4e8
Switch back to ops.shape
ianstenbit b32e0cf
documentation
DavidLandup0 743a3bb
documentation
DavidLandup0 c640fc9
remove default num classes
DavidLandup0 f1b5ffa
fix docs
DavidLandup0 e32704b
Merge branch 'master' into segformer_tf
ianstenbit
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,3 +16,4 @@ __pycache__/ | |
| .vscode/ | ||
| .devcontainer/ | ||
| .coverage | ||
| .history | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,140 @@ | ||
| # Copyright 2023 The KerasCV Authors | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # https://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from keras_cv.api_export import keras_cv_export | ||
| from keras_cv.backend import keras | ||
| from keras_cv.backend import ops | ||
| from keras_cv.layers.regularization.drop_path import DropPath | ||
| from keras_cv.layers.segformer_multihead_attention import ( | ||
| SegFormerMultiheadAttention, | ||
| ) | ||
|
|
||
|
|
||
| @keras_cv_export("keras_cv.layers.HierarchicalTransformerEncoder") | ||
| class HierarchicalTransformerEncoder(keras.layers.Layer): | ||
| """ | ||
| Hierarchical transformer encoder block implementation as a Keras Layer. | ||
| The layer uses `SegFormerMultiheadAttention` as a `MultiHeadAttention` | ||
| alternative for computational efficiency, and is meant to be used | ||
| within the SegFormer architecture. | ||
|
|
||
| References: | ||
| - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) # noqa: E501 | ||
| - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) # noqa: E501 | ||
| - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) # noqa: E501 | ||
|
|
||
| Args: | ||
| project_dim: the dimensionality of the projection of the encoder, and | ||
| output of the `SegFormerMultiheadAttention` layer. Due to the | ||
| residual addition the input dimensionality has to be equal to | ||
| the output dimensionality. | ||
| num_heads: the number of heads for the `SegFormerMultiheadAttention` | ||
| layer | ||
| drop_prob: default 0.0, the probability of dropping a random sample | ||
| using the `DropPath` layer. | ||
| layer_norm_epsilon: default 1e-06, the epsilon for `LayerNormalization` | ||
| layers | ||
| sr_ratio: default 1, the ratio to use within `SegFormerMultiheadAttention`. # noqa: E501 | ||
| If set to > 1, a `Conv2D` layer is used to reduce the length of | ||
| the sequence. | ||
|
|
||
| Basic usage: | ||
|
|
||
| ``` | ||
| project_dim = 1024 | ||
| num_heads = 4 | ||
| patch_size = 16 | ||
|
|
||
| encoded_patches = keras_cv.layers.OverlappingPatchingAndEmbedding( | ||
| project_dim=project_dim, patch_size=patch_size)(img_batch) | ||
|
|
||
| trans_encoded = keras_cv.layers.HierarchicalTransformerEncoder(project_dim=project_dim, | ||
| num_heads=num_heads, | ||
| sr_ratio=1)(encoded_patches) | ||
|
|
||
| print(trans_encoded.shape) # (1, 3136, 1024) | ||
| ``` | ||
| """ | ||
|
|
||
| def __init__( | ||
| self, | ||
| project_dim, | ||
| num_heads, | ||
| sr_ratio=1, | ||
| drop_prob=0.0, | ||
| layer_norm_epsilon=1e-6, | ||
| **kwargs, | ||
| ): | ||
| super().__init__(**kwargs) | ||
| self.project_dim = project_dim | ||
| self.num_heads = num_heads | ||
| self.drop_prop = drop_prob | ||
|
|
||
| self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) | ||
| self.attn = SegFormerMultiheadAttention( | ||
| project_dim, num_heads, sr_ratio | ||
| ) | ||
| self.drop_path = DropPath(drop_prob) | ||
| self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) | ||
| self.mlp = self.MixFFN( | ||
| channels=project_dim, | ||
| mid_channels=int(project_dim * 4), | ||
| ) | ||
|
|
||
| def build(self, input_shape): | ||
| super().build(input_shape) | ||
| # Both spatial dims are recovered from the sequence length (dim 1); | ||
| # dim 2 holds channels, not a spatial extent. | ||
| self.H = ops.sqrt(ops.cast(input_shape[1], "float32")) | ||
| self.W = ops.sqrt(ops.cast(input_shape[1], "float32")) | ||
|
|
||
| def call(self, x): | ||
| x = x + self.drop_path(self.attn(self.norm1(x))) | ||
| x = x + self.drop_path(self.mlp(self.norm2(x))) | ||
| return x | ||
|
|
||
| def get_config(self): | ||
| config = super().get_config() | ||
| config.update( | ||
| { | ||
| "mlp": keras.saving.serialize_keras_object(self.mlp), | ||
| "project_dim": self.project_dim, | ||
| "num_heads": self.num_heads, | ||
| "drop_prop": self.drop_prop, | ||
| } | ||
| ) | ||
| return config | ||
|
|
||
| class MixFFN(keras.layers.Layer): | ||
| def __init__(self, channels, mid_channels): | ||
| super().__init__() | ||
| self.fc1 = keras.layers.Dense(mid_channels) | ||
| self.dwconv = keras.layers.DepthwiseConv2D( | ||
| kernel_size=3, | ||
| strides=1, | ||
| padding="same", | ||
| ) | ||
| self.fc2 = keras.layers.Dense(channels) | ||
|
|
||
| def call(self, x): | ||
| x = self.fc1(x) | ||
| shape = ops.shape(x) | ||
| B, C = shape[0], shape[-1] | ||
| # Cast the recovered side length back to int: `ops.reshape` | ||
| # rejects float dimensions. | ||
| H = W = ops.cast( | ||
| ops.sqrt(ops.cast(shape[1], "float32")), "int32" | ||
| ) | ||
| x = ops.reshape(x, (B, H, W, C)) | ||
| x = self.dwconv(x) | ||
| x = ops.reshape(x, (B, -1, C)) | ||
| x = ops.nn.gelu(x) | ||
| x = self.fc2(x) | ||
| return x | ||
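The MixFFN `call` above round-trips tokens through a spatial layout so the depthwise convolution can see a 2D neighborhood. A minimal numpy sketch of that shape round-trip (hypothetical helper names; the real layer does this inline with `ops.reshape`), assuming the sequence length is a perfect square as the layer requires:

```python
import numpy as np

def tokens_to_map(x):
    # (B, N, C) -> (B, H, W, C); N must be a perfect square, H == W == sqrt(N)
    b, n, c = x.shape
    h = w = int(np.sqrt(n))
    assert h * w == n, "sequence length must be a perfect square"
    return x.reshape(b, h, w, c)

def map_to_tokens(x):
    # (B, H, W, C) -> (B, N, C), inverse of tokens_to_map
    b, h, w, c = x.shape
    return x.reshape(b, h * w, c)

x = np.zeros((1, 3136, 64))           # 3136 tokens == a 56x56 grid
spatial = tokens_to_map(x)
print(spatial.shape)                  # (1, 56, 56, 64)
print(map_to_tokens(spatial).shape)   # (1, 3136, 64)
```

This is also why `build` derives both `H` and `W` from the sequence dimension: the token count alone determines the (square) spatial grid.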
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,87 @@ | ||
| # Copyright 2023 The KerasCV Authors | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # https://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from keras_cv.api_export import keras_cv_export | ||
| from keras_cv.backend import keras | ||
| from keras_cv.backend import ops | ||
|
|
||
|
|
||
| @keras_cv_export("keras_cv.layers.OverlappingPatchingAndEmbedding") | ||
| class OverlappingPatchingAndEmbedding(keras.layers.Layer): | ||
| def __init__(self, project_dim=32, patch_size=7, stride=4, **kwargs): | ||
| """ | ||
| Overlapping Patching and Embedding layer. Differs from `PatchingAndEmbedding` | ||
| in that the patch size does not affect the sequence length. It's fully derived | ||
| from the `stride` parameter. Additionally, no positional embedding is done | ||
| as part of the layer - only a projection using a `Conv2D` layer. | ||
|
|
||
| References: | ||
| - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) # noqa: E501 | ||
| - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) # noqa: E501 | ||
| - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) # noqa: E501 | ||
|
|
||
| Args: | ||
| project_dim: default 32, the dimensionality of the projection | ||
| produced by the embedding `Conv2D` layer | ||
| patch_size: default 7, the kernel size of the `Conv2D` used to | ||
| extract (overlapping) patches | ||
| stride: default 4, the stride of the `Conv2D`; the output | ||
| sequence length is derived from it alone | ||
|
|
||
| Basic usage: | ||
|
|
||
| ``` | ||
| project_dim = 1024 | ||
| patch_size = 16 | ||
|
|
||
| encoded_patches = keras_cv.layers.OverlappingPatchingAndEmbedding( | ||
| project_dim=project_dim, patch_size=patch_size)(img_batch) | ||
|
|
||
| print(encoded_patches.shape) # (1, 3136, 1024) | ||
| ``` | ||
| """ | ||
| super().__init__(**kwargs) | ||
|
|
||
| self.project_dim = project_dim | ||
| self.patch_size = patch_size | ||
| self.stride = stride | ||
|
|
||
| self.proj = keras.layers.Conv2D( | ||
| filters=project_dim, | ||
| kernel_size=patch_size, | ||
| strides=stride, | ||
| padding="same", | ||
| ) | ||
| self.norm = keras.layers.LayerNormalization() | ||
|
|
||
| def call(self, x): | ||
| x = self.proj(x) | ||
| # B, H, W, C | ||
| shape = x.shape | ||
| x = ops.reshape(x, (-1, shape[1] * shape[2], shape[3])) | ||
| x = self.norm(x) | ||
| return x | ||
|
|
||
| def get_config(self): | ||
| config = super().get_config() | ||
| config.update( | ||
| { | ||
| "project_dim": self.project_dim, | ||
| "patch_size": self.patch_size, | ||
| "stride": self.stride, | ||
| } | ||
| ) | ||
| return config |
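The docstring's claim that the sequence length is "fully derived from the `stride` parameter" follows from the `Conv2D` using `padding="same"`: each spatial side becomes `ceil(side / stride)` regardless of kernel size. A small sketch (helper name is hypothetical), assuming a square input:

```python
import math

def sequence_length(size, stride):
    # "same"-padded Conv2D output side is ceil(size / stride); the token
    # count is the product of the two (equal) output sides.
    side = math.ceil(size / stride)
    return side * side

print(sequence_length(224, 4))   # 3136 -- matches the docstring example
print(sequence_length(224, 2))   # 12544
```

With a 224x224 input and `stride=4`, this reproduces the `(1, 3136, project_dim)` shape shown in the usage example above.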
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,130 @@ | ||
| # Copyright 2023 The KerasCV Authors | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # https://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from keras_cv.api_export import keras_cv_export | ||
| from keras_cv.backend import keras | ||
| from keras_cv.backend import ops | ||
|
|
||
|
|
||
| @keras_cv_export("keras_cv.layers.SegFormerMultiheadAttention") | ||
| class SegFormerMultiheadAttention(keras.layers.Layer): | ||
| def __init__(self, project_dim, num_heads, sr_ratio): | ||
| """ | ||
| Efficient MultiHeadAttention implementation as a Keras layer. | ||
| A huge bottleneck in scaling transformers is the self-attention layer | ||
| with an O(n^2) complexity. | ||
|
|
||
| SegFormerMultiheadAttention performs a sequence reduction (SR) operation | ||
| with a given ratio, to reduce the sequence length before performing key and value projections, | ||
| reducing the O(n^2) complexity to O(n^2/R) where R is the sequence reduction ratio. | ||
|
|
||
| References: | ||
| - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) # noqa: E501 | ||
| - [NVlabs' official implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) # noqa: E501 | ||
| - [@sithu31296's reimplementation](https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py) # noqa: E501 | ||
| - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/efficient_attention.py) # noqa: E501 | ||
|
|
||
| Args: | ||
| project_dim: the dimensionality of the projection of the `SegFormerMultiheadAttention` layer. | ||
| num_heads: the number of heads to use in the attention computation. | ||
| sr_ratio: the sequence reduction ratio to perform on the sequence before key and value projections. | ||
|
|
||
| Basic usage: | ||
|
|
||
| ``` | ||
| tensor = tf.random.uniform([1, 196, 32]) | ||
| output = keras_cv.layers.SegFormerMultiheadAttention(project_dim=32, | ||
| num_heads=2, | ||
| sr_ratio=4)(tensor) | ||
| print(output.shape) # (1, 196, 32) | ||
| ``` | ||
| """ | ||
| super().__init__() | ||
| self.num_heads = num_heads | ||
| self.sr_ratio = sr_ratio | ||
| self.scale = (project_dim // num_heads) ** -0.5 | ||
| self.q = keras.layers.Dense(project_dim) | ||
| self.k = keras.layers.Dense(project_dim) | ||
| self.v = keras.layers.Dense(project_dim) | ||
| self.proj = keras.layers.Dense(project_dim) | ||
|
|
||
| if sr_ratio > 1: | ||
| self.sr = keras.layers.Conv2D( | ||
| filters=project_dim, | ||
| kernel_size=sr_ratio, | ||
| strides=sr_ratio, | ||
| padding="same", | ||
| ) | ||
| self.norm = keras.layers.LayerNormalization() | ||
|
|
||
| def call(self, x): | ||
| input_shape = ops.shape(x) | ||
| # Recover integer spatial dims from the sequence length; float | ||
| # dims would be rejected by the reshapes below. | ||
| H = W = ops.cast( | ||
| ops.sqrt(ops.cast(input_shape[1], "float32")), "int32" | ||
| ) | ||
| B, C = input_shape[0], input_shape[2] | ||
|
|
||
| q = self.q(x) | ||
| q = ops.reshape( | ||
| q, | ||
| ( | ||
| input_shape[0], | ||
| input_shape[1], | ||
| self.num_heads, | ||
| input_shape[2] // self.num_heads, | ||
| ), | ||
| ) | ||
| q = ops.transpose(q, [0, 2, 1, 3]) | ||
|
|
||
| if self.sr_ratio > 1: | ||
| # Channels-last layout: fold the sequence back into a | ||
| # (B, H, W, C) map, reduce it, then flatten it again. | ||
| x = ops.reshape(x, (B, H, W, C)) | ||
| x = self.sr(x) | ||
| x = ops.reshape(x, (B, -1, C)) | ||
| x = self.norm(x) | ||
|
|
||
| k = self.k(x) | ||
| v = self.v(x) | ||
|
|
||
| k = ops.transpose( | ||
| ops.reshape( | ||
| k, | ||
| [B, -1, self.num_heads, C // self.num_heads], | ||
| ), | ||
| [0, 2, 1, 3], | ||
| ) | ||
|
|
||
| v = ops.transpose( | ||
| ops.reshape( | ||
| v, | ||
| [B, -1, self.num_heads, C // self.num_heads], | ||
| ), | ||
| [0, 2, 1, 3], | ||
| ) | ||
|
|
||
| attn = (q @ ops.transpose(k, [0, 1, 3, 2])) * self.scale | ||
| attn = ops.nn.softmax(attn, axis=-1) | ||
|
|
||
| attn = attn @ v | ||
| attn = ops.reshape( | ||
| ops.transpose(attn, [0, 2, 1, 3]), | ||
| [input_shape[0], input_shape[1], input_shape[2]], | ||
| ) | ||
| x = self.proj(attn) | ||
| return x |
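The docstring's complexity claim can be checked with simple arithmetic: the stride-`sr_ratio`, "same"-padded `Conv2D` shrinks each spatial side by the ratio, so the key/value sequence shrinks by roughly `sr_ratio**2`, and the attention score matrix goes from `N x N` to `N x (N / sr_ratio**2)`. A back-of-the-envelope sketch (hypothetical helper, square token grid assumed):

```python
import math

def attention_matrix_sizes(n, sr):
    # n tokens form a sqrt(n) x sqrt(n) grid; SR reduces each side by `sr`
    # ("same" padding -> ceil), so keys/values shrink by roughly sr**2.
    side = int(math.isqrt(n))
    reduced = math.ceil(side / sr) ** 2   # key/value length after reduction
    return n * n, n * reduced             # full vs. reduced score-matrix size

full, reduced = attention_matrix_sizes(3136, 4)
print(full)     # 9834496 scores without reduction
print(reduced)  # 614656 scores with sr_ratio=4, a 16x (= 4**2) saving
```

For the 3136-token grids used elsewhere in this PR, `sr_ratio=4` cuts the score matrix by a factor of 16, which is the `O(n^2 / R)` saving the docstring describes (with `R` being the sequence-length reduction factor).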