# paddleseg/models/bisenetv1.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.cvlibs import manager
from paddleseg.models import layers
from paddleseg.utils import utils


@manager.MODELS.add_component
class BiseNetV1(nn.Layer):
    """
    The BiSeNetV1 implementation based on PaddlePaddle.

    The original article refers to
    Yu, Changqian, et al. "BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation"
    (https://paperswithcode.com/paper/bisenet-bilateral-segmentation-network-for)

    Args:
        num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently support Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
            Its forward output is used as a sequence of at least three feature maps: the last one is fed to
            512-channel layers, the second to last to a 256-channel layer, and the third to last only provides
            a target spatial size.
            NOTE(review): the 512/256 widths are hard-coded here; confirm that every listed backbone actually
            produces feature maps with these channel counts.
        conv_channel (int, optional): The number of channels used inside the context path (global context,
            attention refinement and refine convolutions). Default: 128.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
    """

    def __init__(self,
                 num_classes,
                 backbone,
                 conv_channel=128,
                 pretrained=None):
        super().__init__()
        # Output width of the spatial path. It is independent of
        # `conv_channel`, so the fusion module below must be sized from both.
        spatial_channel = 128

        self.backbone = backbone
        self.spatial_path = SpatialPath(3, spatial_channel)
        self.global_context = nn.Sequential(
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(512, conv_channel, 1, bias_attr=False),
        )

        self.arms = nn.LayerList([
            AttentionRefinement(512, conv_channel),
            AttentionRefinement(256, conv_channel),
        ])
        self.refines = nn.LayerList([
            layers.ConvBNReLU(conv_channel,
                              conv_channel,
                              3,
                              stride=1,
                              padding=1,
                              bias_attr=False),
            layers.ConvBNReLU(conv_channel,
                              conv_channel,
                              3,
                              stride=1,
                              padding=1,
                              bias_attr=False),
        ])

        # Two auxiliary heads on the context-path stages, then the main head
        # on the fused feature map.
        self.heads = nn.LayerList([
            BiSeNetHead(conv_channel, num_classes, 8, True),
            BiSeNetHead(conv_channel, num_classes, 8, True),
            BiSeNetHead(conv_channel * 2, num_classes, 8, False),
        ])

        # The fusion input is concat(spatial path, context path), i.e.
        # `spatial_channel + conv_channel` channels. For the default
        # conv_channel=128 this equals the previous `conv_channel * 2`.
        self.ffm = FeatureFusion(spatial_channel + conv_channel,
                                 conv_channel * 2, 1)

        self.pretrained = pretrained
        # Actually load the weights; without this call `pretrained` was
        # stored but never used.
        self.init_weight()

    def init_weight(self):
        """Load the whole model from `self.pretrained` when it is set."""
        if self.pretrained is not None:
            utils.load_entire_model(self, self.pretrained)

    def forward(self, x):
        """
        Run the network on an input batch.

        Args:
            x (Tensor): The input fed to both the spatial path and the backbone.

        Returns:
            list[Tensor]: In training mode, one output per head in the order
                of `self.heads` (two auxiliary outputs followed by the main
                output). Otherwise a single-element list holding the main
                head output.
        """
        spatial_out = self.spatial_path(x)
        # Last backbone feature first. A new list is built instead of calling
        # `.reverse()` so the backbone's return value is not mutated and a
        # tuple return value is accepted as well.
        context_blocks = list(reversed(self.backbone(x)))

        # Pool to 1x1, project, then resize back to the size of the last
        # backbone feature map.
        global_context = self.global_context(context_blocks[0])
        global_context = F.interpolate(global_context,
                                       size=context_blocks[0].shape[2:],
                                       mode='bilinear',
                                       align_corners=True)
        last_fm = global_context
        pred_out = []

        for i, (fm, arm, refine) in enumerate(
                zip(context_blocks[:2], self.arms, self.refines)):
            fm = arm(fm)
            fm = fm + last_fm
            # Resize to the spatial size of the next feature map in the
            # reversed list before refining.
            last_fm = F.interpolate(fm,
                                    size=context_blocks[i + 1].shape[2:],
                                    mode='bilinear',
                                    align_corners=True)
            last_fm = refine(last_fm)
            pred_out.append(last_fm)
        context_out = last_fm

        concate_fm = self.ffm(spatial_out, context_out)
        pred_out.append(concate_fm)

        if self.training:
            # heads[i] consumes pred_out[i]; both sequences have three items.
            return [head(feat) for head, feat in zip(self.heads, pred_out)]
        # Only the main head is evaluated outside of training.
        return [self.heads[-1](pred_out[-1])]


class SpatialPath(nn.Layer):
    """
    SpatialPath module of BiseNetV1 model.

    Three stride-2 ConvBNReLU layers (one 7x7 followed by two 3x3) and a
    final 1x1 ConvBNReLU that maps to the requested output width.

    Args:
        in_channels (int): The number of input channels in spatial path module.
        out_channels (int): The number of output channels in spatial path module.
        inner_channel (int, optional): The number of channels produced by the
            three stride-2 layers. Default: 64.
    """

    def __init__(self, in_channels, out_channels, inner_channel=64):
        super().__init__()

        def strided_block(src, dst, kernel, pad):
            # Every downsampling stage shares stride=2 and has no conv bias.
            return layers.ConvBNReLU(src,
                                     dst,
                                     kernel,
                                     stride=2,
                                     padding=pad,
                                     bias_attr=False)

        self.conv_7x7 = strided_block(in_channels, inner_channel, 7, 3)
        self.conv_3x3_1 = strided_block(inner_channel, inner_channel, 3, 1)
        self.conv_3x3_2 = strided_block(inner_channel, inner_channel, 3, 1)
        self.conv_1x1 = layers.ConvBNReLU(inner_channel,
                                          out_channels,
                                          1,
                                          bias_attr=False)

    def forward(self, x):
        """Apply the four convolution stages in sequence and return the result."""
        stages = (self.conv_7x7, self.conv_3x3_1, self.conv_3x3_2,
                  self.conv_1x1)
        out = x
        for stage in stages:
            out = stage(out)
        return out


class BiSeNetHead(nn.Layer):
    """
    BiSeNet head of BiseNetV1 model.

    A 3x3 ConvBNReLU followed by a 1x1 convolution, optionally followed by
    bilinear upsampling.

    Args:
        in_channels (int): The number of channels of the input feature map.
        out_channels (int): The number of channels produced by the final 1x1 convolution.
        scale (int, float): The scale factor of interpolation. Upsampling is only applied when it is greater than 1.
        is_aux (bool, optional): Selects the hidden width of the head: 128 when True, 64 otherwise. Default: False.
    """

    def __init__(self, in_channels, out_channels, scale, is_aux=False):
        super().__init__()
        if is_aux:
            hidden = 128
        else:
            hidden = 64
        self.conv_3x3 = layers.ConvBNReLU(in_channels,
                                          hidden,
                                          3,
                                          stride=1,
                                          padding=1,
                                          bias_attr=False)
        self.conv_1x1 = nn.Conv2D(hidden, out_channels, 1)
        self.scale = scale

    def forward(self, x):
        """Return the head output for `x`, upsampled by `self.scale` when that is > 1."""
        logits = self.conv_1x1(self.conv_3x3(x))
        if self.scale > 1:
            return F.interpolate(logits,
                                 scale_factor=self.scale,
                                 mode='bilinear',
                                 align_corners=True)
        return logits


class AttentionRefinement(nn.Layer):
    """
    AttentionRefinement module of BiseNetV1 model.

    Projects the input with a 3x3 ConvBNReLU and rescales the result with
    per-channel weights computed from its globally pooled features.

    Args:
        in_channels (int): The number of channels of the input feature map.
        out_channels (int): The number of channels of the refined output.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_3x3 = layers.ConvBNReLU(in_channels,
                                          out_channels,
                                          3,
                                          stride=1,
                                          padding=1,
                                          bias_attr=False)
        # Global average pool -> 1x1 ConvBNReLU -> sigmoid.
        gate_layers = [
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(out_channels, out_channels, 1, bias_attr=False),
            nn.Sigmoid(),
        ]
        self.channel_attention = nn.Sequential(*gate_layers)

    def forward(self, x):
        """Return the projected features multiplied by their channel attention."""
        projected = self.conv_3x3(x)
        return projected * self.channel_attention(projected)


class FeatureFusion(nn.Layer):
    """
    FeatureFusion module of BiseNetV1 model.

    Concatenates two feature maps along the channel axis, projects them with
    a 1x1 ConvBNReLU and adds a channel-attention-weighted copy of the
    projection to itself.

    Args:
        in_channels (int): The number of channels of the concatenated input (channels of x1 plus channels of x2).
        out_channels (int): The number of channels of the fused output.
        reduction (int): A factor shrinks convolutional channels. Default: 1.
    """

    def __init__(self, in_channels, out_channels, reduction=1):
        super().__init__()
        squeezed = out_channels // reduction
        self.conv_1x1 = layers.ConvBNReLU(in_channels,
                                          out_channels,
                                          1,
                                          bias_attr=False)
        # Global average pool -> squeeze -> expand -> sigmoid.
        attention_layers = [
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(out_channels, squeezed, 1, bias_attr=False),
            layers.ConvBNReLU(squeezed, out_channels, 1, bias_attr=False),
            nn.Sigmoid(),
        ]
        self.channel_attention = nn.Sequential(*attention_layers)

    def forward(self, x1, x2):
        """Fuse `x1` and `x2` (concatenated on axis 1) and return the result."""
        fused = self.conv_1x1(paddle.concat([x1, x2], axis=1))
        weights = self.channel_attention(fused)
        return fused + fused * weights