diff --git a/robosat/efficientnet.py b/robosat/efficientnet.py
new file mode 100644
index 00000000..c9cfac39
--- /dev/null
+++ b/robosat/efficientnet.py
@@ -0,0 +1,211 @@
+"""EfficientNet architecture.
+
+See:
+- https://arxiv.org/abs/1905.11946 - EfficientNet
+- https://arxiv.org/abs/1801.04381 - MobileNet V2
+- https://arxiv.org/abs/1905.02244 - MobileNet V3
+- https://arxiv.org/abs/1709.01507 - Squeeze-and-Excitation
+- https://arxiv.org/abs/1803.02579 - Concurrent spatial and channel squeeze-and-excitation
+- https://arxiv.org/abs/1812.01187 - Bag of Tricks for Image Classification with Convolutional Neural Networks
+
+
+Known issues:
+
+- Not using the swish activation function: it is unclear where, if, and how
+  much it helps. Needs more experimentation. See also MobileNet V3.
+
+- Not using squeeze-and-excitation blocks: I had significantly worse results
+  with scse blocks, and cse blocks alone did not help either. Needs more
+  experimentation, as it was only evaluated on small datasets.
+
+- Not using DropConnect: there is no efficient native implementation in
+  PyTorch, and it is unclear if and how much it helps over Dropout.
+"""
+
+import math
+import collections
+
+import torch
+import torch.nn as nn
+
+
+EfficientNetParam = collections.namedtuple("EfficientNetParam", [
+    "width", "depth", "resolution", "dropout"])
+
+EfficientNetParams = {
+    "B0": EfficientNetParam(1.0, 1.0, 224, 0.2),
+    "B1": EfficientNetParam(1.0, 1.1, 240, 0.2),
+    "B2": EfficientNetParam(1.1, 1.2, 260, 0.3),
+    "B3": EfficientNetParam(1.2, 1.4, 300, 0.3),
+    "B4": EfficientNetParam(1.4, 1.8, 380, 0.4),
+    "B5": EfficientNetParam(1.6, 2.2, 456, 0.4),
+    "B6": EfficientNetParam(1.8, 2.6, 528, 0.5),
+    "B7": EfficientNetParam(2.0, 3.1, 600, 0.5)}
+
+
+def efficientnet0(pretrained=False, progress=False, num_classes=1000):
+    return EfficientNet(param=EfficientNetParams["B0"], num_classes=num_classes)
+
+def efficientnet1(pretrained=False, progress=False, num_classes=1000):
+    return EfficientNet(param=EfficientNetParams["B1"], num_classes=num_classes)
+
+def efficientnet2(pretrained=False, progress=False, num_classes=1000):
+    return EfficientNet(param=EfficientNetParams["B2"], num_classes=num_classes)
+
+def efficientnet3(pretrained=False, progress=False, num_classes=1000):
+    return EfficientNet(param=EfficientNetParams["B3"], num_classes=num_classes)
+
+def efficientnet4(pretrained=False, progress=False, num_classes=1000):
+    return EfficientNet(param=EfficientNetParams["B4"], num_classes=num_classes)
+
+def efficientnet5(pretrained=False, progress=False, num_classes=1000):
+    return EfficientNet(param=EfficientNetParams["B5"], num_classes=num_classes)
+
+def efficientnet6(pretrained=False, progress=False, num_classes=1000):
+    return EfficientNet(param=EfficientNetParams["B6"], num_classes=num_classes)
+
+def efficientnet7(pretrained=False, progress=False, num_classes=1000):
+    return EfficientNet(param=EfficientNetParams["B7"], num_classes=num_classes)
+
+
+class EfficientNet(nn.Module):
+    def __init__(self, param, num_classes=1000):
+        super().__init__()
+
+        # For the exact scaling technique we follow the official implementation, since the paper does not spell it out:
+        # https://github.com/tensorflow/tpu/blob/01574500090fa9c011cb8418c61d442286720211/models/official/efficientnet/efficientnet_model.py#L101-L125
+
+        def scaled_depth(n):
+            return int(math.ceil(n * param.depth))
+
+        # Snap the number of channels to a multiple of 8 for optimized implementations
+        def scaled_width(n):
+            n = n * param.width
+            m = max(8, int(n + 8 / 2) // 8 * 8)
+
+            if m < 0.9 * n:
+                m = m + 8
+
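+            # Illustrative worked example, e.g. for B2 (width multiplier 1.1):
+            # 32 channels -> 35.2 -> snaps to the nearest multiple of 8, i.e. 32,
+            # which is kept since 32 >= 0.9 * 35.2; the +8 correction above only
+            # fires when snapping down would lose more than 10% of the channels.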
+            return int(m)
+
+        self.conv1 = nn.Conv2d(3, scaled_width(32), kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(scaled_width(32))
+        self.relu = nn.ReLU6(inplace=True)
+
+        self.layer1 = self._make_layer(n=scaled_depth(1), expansion=1, cin=scaled_width(32), cout=scaled_width(16), kernel_size=3, stride=1)
+        self.layer2 = self._make_layer(n=scaled_depth(2), expansion=6, cin=scaled_width(16), cout=scaled_width(24), kernel_size=3, stride=2)
+        self.layer3 = self._make_layer(n=scaled_depth(2), expansion=6, cin=scaled_width(24), cout=scaled_width(40), kernel_size=5, stride=2)
+        self.layer4 = self._make_layer(n=scaled_depth(3), expansion=6, cin=scaled_width(40), cout=scaled_width(80), kernel_size=3, stride=2)
+        self.layer5 = self._make_layer(n=scaled_depth(3), expansion=6, cin=scaled_width(80), cout=scaled_width(112), kernel_size=5, stride=1)
+        self.layer6 = self._make_layer(n=scaled_depth(4), expansion=6, cin=scaled_width(112), cout=scaled_width(192), kernel_size=5, stride=2)
+        self.layer7 = self._make_layer(n=scaled_depth(1), expansion=6, cin=scaled_width(192), cout=scaled_width(320), kernel_size=3, stride=1)
+
+        self.features = nn.Conv2d(scaled_width(320), scaled_width(1280), kernel_size=1, bias=False)
+
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.dropout = nn.Dropout(param.dropout, inplace=True)
+        self.fc = nn.Linear(scaled_width(1280), num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.zeros_(m.bias)
+
+        # Zero the last BatchNorm weight in each res-block: identity by default
+        # See https://arxiv.org/abs/1812.01187 Section 3.1
+        for m in self.modules():
+            if isinstance(m, Bottleneck):
+                nn.init.zeros_(m.linear[1].weight)
+
+
+    def _make_layer(self, n, expansion, cin, cout, kernel_size=3, stride=1):
+        layers = []
+
+        for i in range(n):
+            if i == 0:
+                planes = cin
+                expand = cin * expansion
+                squeeze = cout
+            else:
+                planes = cout
+                expand = cout * expansion
+                squeeze = cout
+                stride = 1
+
+            layers += [Bottleneck(planes, expand, squeeze, kernel_size=kernel_size, stride=stride)]
+
+        return nn.Sequential(*layers)
+
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.layer5(x)
+        x = self.layer6(x)
+        x = self.layer7(x)
+
+        x = self.features(x)
+
+        x = self.avgpool(x)
+        x = x.reshape(x.size(0), -1)
+        x = self.dropout(x)
+        x = self.fc(x)
+
+        return x
+
+
+class Bottleneck(nn.Module):
+    def __init__(self, planes, expand, squeeze, kernel_size, stride):
+        super().__init__()
+
+        self.expand = nn.Identity() if planes == expand else nn.Sequential(
+            nn.Conv2d(planes, expand, kernel_size=1, bias=False),
+            nn.BatchNorm2d(expand),
+            nn.ReLU6(inplace=True))
+
+        self.depthwise = nn.Sequential(
+            nn.Conv2d(expand, expand, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=expand, bias=False),
+            nn.BatchNorm2d(expand),
+            nn.ReLU6(inplace=True))
+
+        self.linear = nn.Sequential(
+            nn.Conv2d(expand, squeeze, kernel_size=1, bias=False),
+            nn.BatchNorm2d(squeeze))
+
+        # Make all blocks skippable via AvgPool + 1x1 Conv
+        # See https://arxiv.org/abs/1812.01187 Figure 2 c
+
+        downsample = []
+
+        if stride != 1:
+            downsample += [nn.AvgPool2d(kernel_size=stride, stride=stride)]
+
+        if planes != squeeze:
+            downsample += [
+                nn.Conv2d(planes, squeeze, kernel_size=1, stride=1, bias=False),
+                nn.BatchNorm2d(squeeze)]
+
+        self.downsample = nn.Identity() if not downsample else nn.Sequential(*downsample)
+
+
+    def forward(self, x):
+        xx = self.expand(x)
+        xx = self.depthwise(xx)
+        xx = self.linear(xx)
+
+        x = self.downsample(x)
+        xx.add_(x)
+
+        return xx
diff --git a/robosat/unet.py b/robosat/unet.py
index 1acc4ac7..9cda4ba6 100644
--- a/robosat/unet.py
+++ b/robosat/unet.py
@@ -12,7 +12,7 @@
 import torch
 import torch.nn as nn
 
-from torchvision.models import resnet50
+from robosat.efficientnet import efficientnet0
 
 
 class ConvRelu(nn.Module):
@@ -91,17 +91,17 @@ def __init__(self, num_classes, num_filters=32, pretrained=True):
 
         # Todo: make input channels configurable, not hard-coded to three channels for RGB
 
-        self.resnet = resnet50(pretrained=pretrained)
+        self.net = efficientnet0(pretrained=pretrained)
 
-        # Access resnet directly in forward pass; do not store refs here due to
+        # Access backbone directly in forward pass; do not store refs here due to
         # https://github.com/pytorch/pytorch/issues/8392
 
-        self.center = DecoderBlock(2048, num_filters * 8)
+        self.center = DecoderBlock(1280, num_filters * 8)
 
-        self.dec0 = DecoderBlock(2048 + num_filters * 8, num_filters * 8)
-        self.dec1 = DecoderBlock(1024 + num_filters * 8, num_filters * 8)
-        self.dec2 = DecoderBlock(512 + num_filters * 8, num_filters * 2)
-        self.dec3 = DecoderBlock(256 + num_filters * 2, num_filters * 2 * 2)
+        self.dec0 = DecoderBlock(1280 + num_filters * 8, num_filters * 8)
+        self.dec1 = DecoderBlock(112 + num_filters * 8, num_filters * 8)
+        self.dec2 = DecoderBlock(40 + num_filters * 8, num_filters * 2)
+        self.dec3 = DecoderBlock(24 + num_filters * 2, num_filters * 2 * 2)
 
         self.dec4 = DecoderBlock(num_filters * 2 * 2, num_filters)
         self.dec5 = ConvRelu(num_filters, num_filters)
@@ -117,17 +117,33 @@ def forward(self, x):
           The networks output tensor.
         """
 
         size = x.size()
-        assert size[-1] % 32 == 0 and size[-2] % 32 == 0, "image resolution has to be divisible by 32 for resnet"
-
-        enc0 = self.resnet.conv1(x)
-        enc0 = self.resnet.bn1(enc0)
-        enc0 = self.resnet.relu(enc0)
-        enc0 = self.resnet.maxpool(enc0)
-
-        enc1 = self.resnet.layer1(enc0)
-        enc2 = self.resnet.layer2(enc1)
-        enc3 = self.resnet.layer3(enc2)
-        enc4 = self.resnet.layer4(enc3)
+        assert size[-1] % 32 == 0 and size[-2] % 32 == 0, "image resolution has to be divisible by 32 for the backbone"
+
+        # 1, 3, 512, 512
+        enc0 = self.net.conv1(x)
+        enc0 = self.net.bn1(enc0)
+        enc0 = self.net.relu(enc0)
+        # 1, 32, 256, 256
+        enc0 = self.net.layer1(enc0)
+        # 1, 16, 256, 256
+
+        enc1 = self.net.layer2(enc0)
+        # 1, 24, 128, 128
+
+        enc2 = self.net.layer3(enc1)
+        # 1, 40, 64, 64
+
+        enc3 = self.net.layer4(enc2)
+        # 1, 80, 32, 32
+        enc3 = self.net.layer5(enc3)
+        # 1, 112, 32, 32
+
+        enc4 = self.net.layer6(enc3)
+        # 1, 192, 16, 16
+        enc4 = self.net.layer7(enc4)
+        # 1, 320, 16, 16
+        enc4 = self.net.features(enc4)
+        # 1, 1280, 16, 16
 
         center = self.center(nn.functional.max_pool2d(enc4, kernel_size=2, stride=2))
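
A quick way to see the Bag of Tricks shortcut at work: every Bottleneck is
residual, and the AvgPool + 1x1 Conv downsample path aligns the identity branch
whenever the stride or the channel count changes. The sketch below is
illustrative only, not part of the patch; it assumes the patch is applied so
that robosat.efficientnet is importable. It mirrors the first block of layer3
in B0:

import torch

from robosat.efficientnet import Bottleneck

# First block of layer3 in B0: 24 -> 40 channels, 5x5 depthwise, stride 2,
# expansion factor 6 (so the expanded width is 24 * 6 = 144).
block = Bottleneck(planes=24, expand=24 * 6, squeeze=40, kernel_size=5, stride=2)

x = torch.rand(2, 24, 64, 64)

with torch.no_grad():
    y = block(x)

# Main path and shortcut both end up at 40 channels and half the resolution,
# so the in-place residual add lines up.
print(y.shape)  # torch.Size([2, 40, 32, 32])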
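
The DecoderBlock widths in unet.py (24, 40, 112, 1280) come from tapping the
backbone after layer2, layer3, layer5, and the final features conv. The inline
shape comments can be reproduced with another illustrative sketch, again
assuming the patch is applied:

import torch

from robosat.efficientnet import efficientnet0

net = efficientnet0().eval()

x = torch.rand(1, 3, 512, 512)

with torch.no_grad():
    x = net.relu(net.bn1(net.conv1(x)))
    enc0 = net.layer1(x)                               # 1, 16, 256, 256
    enc1 = net.layer2(enc0)                            # 1, 24, 128, 128
    enc2 = net.layer3(enc1)                            # 1, 40, 64, 64
    enc3 = net.layer5(net.layer4(enc2))                # 1, 112, 32, 32
    enc4 = net.features(net.layer7(net.layer6(enc3)))  # 1, 1280, 16, 16

print(enc1.size(1), enc2.size(1), enc3.size(1), enc4.size(1))  # 24 40 112 1280

A plain net(x) forward pass at B0's native 224x224 resolution likewise comes
out as a (1, 1000) logits tensor, so the classification head also works
stand-alone.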