paddleseg/models/ginet.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
from paddle.nn import functional as F

from paddleseg.utils import utils
from paddleseg.models import layers
from paddleseg.cvlibs import manager


@manager.MODELS.add_component
class GINet(nn.Layer):
    """
    The GINet implementation based on PaddlePaddle.
    The original article refers to
    Wu, Tianyi, Yu Lu, Yu Zhu, Chuang Zhang, Ming Wu, Zhanyu Ma, and Guodong Guo. "GINet: Graph interaction network for scene parsing." In European Conference on Computer Vision, pp. 34-51. Springer, Cham, 2020.
    (https://arxiv.org/pdf/2009.06160).
    Args:
        num_classes (int): The unique number of target classes.
        backbone (Paddle.nn.Layer): Backbone network.
        backbone_indices (tuple, optional): Values in the tuple indicate the indices of output of backbone.
        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss.
            If true, auxiliary loss will be added after LearningToDownsample module. Default: False.
        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.. Default: False.
        jpu (bool, optional)): whether to use jpu unit in the base forward. Default:True.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
    """

    def __init__(self,
                 num_classes,
                 backbone,
                 backbone_indices=[0, 1, 2, 3],
                 enable_auxiliary_loss=True,
                 align_corners=True,
                 jpu=True,
                 pretrained=None):
        super().__init__()
        self.nclass = num_classes
        self.aux = enable_auxiliary_loss
        self.jpu = jpu

        self.backbone = backbone
        self.backbone_indices = backbone_indices
        self.align_corners = align_corners

        self.jpu = layers.JPU([512, 1024, 2048], width=512) if jpu else None
        self.head = GIHead(in_channels=2048, nclass=num_classes)

        if self.aux:
            self.auxlayer = layers.AuxLayer(1024,
                                            1024 // 4,
                                            num_classes,
                                            bias_attr=False)

        self.pretrained = pretrained
        self.init_weight()

    def base_forward(self, x):
        feat_list = self.backbone(x)

        c1, c2, c3, c4 = [feat_list[i] for i in self.backbone_indices]

        if self.jpu:
            return self.jpu(c1, c2, c3, c4)
        else:
            return c1, c2, c3, c4

    def forward(self, x):
        _, _, h, w = x.shape
        _, _, c3, c4 = self.base_forward(x)

        logit_list = []
        x, _ = self.head(c4)
        logit_list.append(x)

        if self.aux:
            auxout = self.auxlayer(c3)

            logit_list.append(auxout)

        return [
            F.interpolate(logit, [h, w],
                          mode='bilinear',
                          align_corners=self.align_corners)
            for logit in logit_list
        ]

    def init_weight(self):
        if self.pretrained is not None:
            utils.load_entire_model(self, self.pretrained)


class GIHead(nn.Layer):
    """The Graph Interaction Network head."""

    def __init__(self, in_channels, nclass):
        super().__init__()
        self.nclass = nclass
        inter_channels = in_channels // 4
        self.inp = paddle.zeros(shape=(nclass, 300), dtype='float32')
        self.inp = paddle.create_parameter(
            shape=self.inp.shape,
            dtype=str(self.inp.numpy().dtype),
            default_initializer=paddle.nn.initializer.Assign(self.inp))
        self.inp.stop_gradient = True

        self.fc1 = nn.Sequential(nn.Linear(300, 128), nn.BatchNorm1D(128),
                                 nn.ReLU())
        self.fc2 = nn.Sequential(nn.Linear(128, 256), nn.BatchNorm1D(256),
                                 nn.ReLU())
        self.conv5 = layers.ConvBNReLU(in_channels,
                                       inter_channels,
                                       3,
                                       padding=1,
                                       bias_attr=False,
                                       stride=1)

        self.gloru = GlobalReasonUnit(in_channels=inter_channels,
                                      num_state=256,
                                      num_node=84,
                                      nclass=nclass)
        self.conv6 = nn.Sequential(nn.Dropout(0.1),
                                   nn.Conv2D(inter_channels, nclass, 1))

    def forward(self, x):

        B, C, H, W = x.shape
        inp = self.inp

        inp = self.fc1(inp)
        inp = self.fc2(inp).unsqueeze(axis=0).transpose((0, 2, 1))\
                           .expand((B, 256, self.nclass))

        out = self.conv5(x)

        out, se_out = self.gloru(out, inp)
        out = self.conv6(out)
        return out, se_out


class GlobalReasonUnit(nn.Layer):
    """
        The original paper refers to:
            Chen, Yunpeng, et al. "Graph-Based Global Reasoning Networks" (https://arxiv.org/abs/1811.12814)
    """

    def __init__(self, in_channels, num_state=256, num_node=84, nclass=59):
        super().__init__()
        self.num_state = num_state
        self.conv_theta = nn.Conv2D(in_channels,
                                    num_node,
                                    kernel_size=1,
                                    stride=1,
                                    padding=0)
        self.conv_phi = nn.Conv2D(in_channels,
                                  num_state,
                                  kernel_size=1,
                                  stride=1,
                                  padding=0)
        self.graph = GraphLayer(num_state, num_node, nclass)
        self.extend_dim = nn.Conv2D(num_state,
                                    in_channels,
                                    kernel_size=1,
                                    bias_attr=False)

        self.bn = layers.SyncBatchNorm(in_channels)

    def forward(self, x, inp):
        B = self.conv_theta(x)
        sizeB = B.shape
        B = paddle.flatten(B, 2, 3)

        sizex = paddle.shape(x)
        x_reduce = self.conv_phi(x)

        x_reduce = paddle.flatten(x_reduce, 2, 3).transpose((0, 2, 1))

        V = paddle.bmm(B, x_reduce).transpose((0, 2, 1))
        V = paddle.divide(V, (sizex[2] * sizex[3]).astype('float32'))

        class_node, new_V = self.graph(inp, V)
        D = B.transpose((0, 2, 1))
        Y = paddle.bmm(D, new_V.transpose((0, 2, 1)))
        Y = Y.transpose((0, 2, 1)).reshape((sizex[0], self.num_state, \
                                            sizex[2], -1))
        Y = self.extend_dim(Y)
        Y = self.bn(Y)
        out = Y + x

        return out, class_node


class GraphLayer(nn.Layer):

    def __init__(self, num_state, num_node, num_class):
        super().__init__()
        self.vis_gcn = GCN(num_state, num_node)
        self.word_gcn = GCN(num_state, num_class)
        self.transfer = GraphTransfer(num_state)
        self.gamma_vis = paddle.zeros([num_node])
        self.gamma_word = paddle.zeros([num_class])
        self.gamma_vis = paddle.create_parameter(
            shape=self.gamma_vis.shape,
            dtype=str(self.gamma_vis.numpy().dtype),
            default_initializer=paddle.nn.initializer.Assign(self.gamma_vis))
        self.gamma_word = paddle.create_parameter(
            shape=self.gamma_word.shape,
            dtype=str(self.gamma_word.numpy().dtype),
            default_initializer=paddle.nn.initializer.Assign(self.gamma_word))

    def forward(self, inp, vis_node):
        inp = self.word_gcn(inp)
        new_V = self.vis_gcn(vis_node)
        class_node, vis_node = self.transfer(inp, new_V)

        class_node = self.gamma_word * inp + class_node
        new_V = self.gamma_vis * vis_node + new_V
        return class_node, new_V


class GCN(nn.Layer):

    def __init__(self, num_state=128, num_node=64, bias=False):
        super().__init__()
        self.conv1 = nn.Conv1D(
            num_node,
            num_node,
            kernel_size=1,
            padding=0,
            stride=1,
            groups=1,
        )
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1D(num_state,
                               num_state,
                               kernel_size=1,
                               padding=0,
                               stride=1,
                               groups=1,
                               bias_attr=bias)

    def forward(self, x):
        h = self.conv1(x.transpose((0, 2, 1))).transpose((0, 2, 1))
        h = h + x
        h = self.relu(h)
        h = self.conv2(h)
        return h


class GraphTransfer(nn.Layer):
    """Transfer vis graph to class node, transfer class node to vis feature"""

    def __init__(self, in_dim):
        super().__init__()
        self.channle_in = in_dim
        self.query_conv = nn.Conv1D(in_channels=in_dim,
                                    out_channels=in_dim // 2,
                                    kernel_size=1)
        self.key_conv = nn.Conv1D(in_channels=in_dim,
                                  out_channels=in_dim // 2,
                                  kernel_size=1)
        self.value_conv_vis = nn.Conv1D(in_channels=in_dim,
                                        out_channels=in_dim,
                                        kernel_size=1)
        self.value_conv_word = nn.Conv1D(in_channels=in_dim,
                                         out_channels=in_dim,
                                         kernel_size=1)
        self.softmax_vis = nn.Softmax(axis=-1)
        self.softmax_word = nn.Softmax(axis=-2)

    def forward(self, word, vis_node):
        m_batchsize, C, Nc = word.shape
        m_batchsize, C, Nn = vis_node.shape

        proj_query = self.query_conv(word).reshape((m_batchsize, -1, Nc))\
                                          .transpose((0, 2, 1))
        proj_key = self.key_conv(vis_node).reshape((m_batchsize, -1, Nn))

        energy = paddle.bmm(proj_query, proj_key)
        attention_vis = self.softmax_vis(energy).transpose((0, 2, 1))
        attention_word = self.softmax_word(energy)

        proj_value_vis = self.value_conv_vis(vis_node).reshape(
            (m_batchsize, -1, Nn))
        proj_value_word = self.value_conv_word(word).reshape(
            (m_batchsize, -1, Nc))

        class_out = paddle.bmm(proj_value_vis, attention_vis)
        node_out = paddle.bmm(proj_value_word, attention_word)
        return class_out, node_out