From ba9b183fbbaf69725607709446fb828380cf18b8 Mon Sep 17 00:00:00 2001 From: phborba Date: Sat, 18 Apr 2020 17:41:28 -0300 Subject: [PATCH 1/7] update on requirements to get noisy students weights --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f5678e05..2d29b61c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ keras_applications>=1.0.7,<=1.0.8 image-classifiers==1.0.0 -efficientnet==1.0.0 +efficientnet==1.1.0 From 766be51d9ca71f20b77608c61451df727e998438 Mon Sep 17 00:00:00 2001 From: phborba Date: Sat, 18 Apr 2020 17:41:50 -0300 Subject: [PATCH 2/7] activation_dtype on inception_resnet and inception_v3 --- .../backbones/inception_resnet_v2.py | 119 +++++++------ segmentation_models/backbones/inception_v3.py | 164 ++++++++++-------- 2 files changed, 158 insertions(+), 125 deletions(-) diff --git a/segmentation_models/backbones/inception_resnet_v2.py b/segmentation_models/backbones/inception_resnet_v2.py index 5f6cebb6..499c7279 100644 --- a/segmentation_models/backbones/inception_resnet_v2.py +++ b/segmentation_models/backbones/inception_resnet_v2.py @@ -44,6 +44,7 @@ def conv2d_bn(x, strides=1, padding='same', activation='relu', + activation_dtype=None, use_bias=False, name=None): """Utility function to apply conv + BN. @@ -54,6 +55,8 @@ def conv2d_bn(x, strides: strides in `Conv2D`. padding: padding mode in `Conv2D`. activation: activation in `Conv2D`. + activation_dtype: Optional type parameter to force activations + to be treated in certain type. Used when mixed_precision is enabled. use_bias: whether to use a bias in `Conv2D`. name: name of the ops; will become `name + '_ac'` for the activation and `name + '_bn'` for the batch norm layer. @@ -74,11 +77,14 @@ def conv2d_bn(x, name=bn_name)(x) if activation is not None: ac_name = None if name is None else name + '_ac' - x = layers.Activation(activation, name=ac_name)(x) + if activation_dtype is None: + x = layers.Activation(activation, name=ac_name)(x) + else: + x = layers.Activation(activation, name=ac_name, dtype=activation_dtype)(x) return x -def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'): +def inception_resnet_block(x, scale, block_type, block_idx, activation='relu', activation_dtype=None): """Adds a Inception-ResNet block. This function builds 3 types of Inception-ResNet blocks mentioned in the paper, controlled by the `block_type` argument (which is the @@ -108,6 +114,8 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'): (see [activations](../activations.md)). When `activation=None`, no activation is applied (i.e., "linear" activation: `a(x) = x`). + activation_dtype: Optional type parameter to force activations + to be treated in certain type. Used when mixed_precision is enabled. # Returns Output tensor for the block. # Raises @@ -115,24 +123,24 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'): `'block17'` or `'block8'`. 
""" if block_type == 'block35': - branch_0 = conv2d_bn(x, 32, 1) - branch_1 = conv2d_bn(x, 32, 1) - branch_1 = conv2d_bn(branch_1, 32, 3) - branch_2 = conv2d_bn(x, 32, 1) - branch_2 = conv2d_bn(branch_2, 48, 3) - branch_2 = conv2d_bn(branch_2, 64, 3) + branch_0 = conv2d_bn(x, 32, 1, activation_dtype=activation_dtype) + branch_1 = conv2d_bn(x, 32, 1, activation_dtype=activation_dtype) + branch_1 = conv2d_bn(branch_1, 32, 3, activation_dtype=activation_dtype) + branch_2 = conv2d_bn(x, 32, 1, activation_dtype=activation_dtype) + branch_2 = conv2d_bn(branch_2, 48, 3, activation_dtype=activation_dtype) + branch_2 = conv2d_bn(branch_2, 64, 3, activation_dtype=activation_dtype) branches = [branch_0, branch_1, branch_2] elif block_type == 'block17': - branch_0 = conv2d_bn(x, 192, 1) - branch_1 = conv2d_bn(x, 128, 1) - branch_1 = conv2d_bn(branch_1, 160, [1, 7]) - branch_1 = conv2d_bn(branch_1, 192, [7, 1]) + branch_0 = conv2d_bn(x, 192, 1, activation_dtype=activation_dtype) + branch_1 = conv2d_bn(x, 128, 1, activation_dtype=activation_dtype) + branch_1 = conv2d_bn(branch_1, 160, [1, 7], activation_dtype=activation_dtype) + branch_1 = conv2d_bn(branch_1, 192, [7, 1], activation_dtype=activation_dtype) branches = [branch_0, branch_1] elif block_type == 'block8': - branch_0 = conv2d_bn(x, 192, 1) - branch_1 = conv2d_bn(x, 192, 1) - branch_1 = conv2d_bn(branch_1, 224, [1, 3]) - branch_1 = conv2d_bn(branch_1, 256, [3, 1]) + branch_0 = conv2d_bn(x, 192, 1, activation_dtype=activation_dtype) + branch_1 = conv2d_bn(x, 192, 1, activation_dtype=activation_dtype) + branch_1 = conv2d_bn(branch_1, 224, [1, 3], activation_dtype=activation_dtype) + branch_1 = conv2d_bn(branch_1, 256, [3, 1], activation_dtype=activation_dtype) branches = [branch_0, branch_1] else: raise ValueError('Unknown Inception-ResNet block type. ' @@ -148,14 +156,18 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'): 1, activation=None, use_bias=True, - name=block_name + '_conv') + name=block_name + '_conv', + activation_dtype=activation_dtype) x = layers.Lambda(lambda inputs, scale: inputs[0] + inputs[1] * scale, output_shape=backend.int_shape(x)[1:], arguments={'scale': scale}, name=block_name)([x, up]) if activation is not None: - x = layers.Activation(activation, name=block_name + '_ac')(x) + if activation_dtype is None: + x = layers.Activation(activation, name=block_name + '_ac')(x) + else: + x = layers.Activation(activation, name=block_name + '_ac', dtype=activation_dtype)(x) return x @@ -165,6 +177,7 @@ def InceptionResNetV2(include_top=True, input_shape=None, pooling=None, classes=1000, + activation_dtype=None, **kwargs): """Instantiates the Inception-ResNet v2 architecture. Optionally loads weights pre-trained on ImageNet. 
@@ -234,23 +247,23 @@ def InceptionResNetV2(include_top=True, img_input = input_tensor # Stem block: 35 x 35 x 192 - x = conv2d_bn(img_input, 32, 3, strides=2, padding='same') - x = conv2d_bn(x, 32, 3, padding='same') - x = conv2d_bn(x, 64, 3, padding='same') - x = layers.MaxPooling2D(3, strides=2, padding='same')(x) - x = conv2d_bn(x, 80, 1, padding='same') - x = conv2d_bn(x, 192, 3, padding='same') - x = layers.MaxPooling2D(3, strides=2, padding='same')(x) + x = conv2d_bn(img_input, 32, 3, strides=2, padding='same', activation_dtype=activation_dtype) + x = conv2d_bn(x, 32, 3, padding='same', activation_dtype=activation_dtype) + x = conv2d_bn(x, 64, 3, padding='same', activation_dtype=activation_dtype) + x = layers.MaxPooling2D(3, strides=2, padding='same', activation_dtype=activation_dtype)(x) + x = conv2d_bn(x, 80, 1, padding='same', activation_dtype=activation_dtype) + x = conv2d_bn(x, 192, 3, padding='same', activation_dtype=activation_dtype) + x = layers.MaxPooling2D(3, strides=2, padding='same', activation_dtype=activation_dtype)(x) # Mixed 5b (Inception-A block): 35 x 35 x 320 - branch_0 = conv2d_bn(x, 96, 1, padding='same') - branch_1 = conv2d_bn(x, 48, 1, padding='same') - branch_1 = conv2d_bn(branch_1, 64, 5, padding='same') - branch_2 = conv2d_bn(x, 64, 1, padding='same') - branch_2 = conv2d_bn(branch_2, 96, 3, padding='same') - branch_2 = conv2d_bn(branch_2, 96, 3, padding='same') - branch_pool = layers.AveragePooling2D(3, strides=1, padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 64, 1, padding='same') + branch_0 = conv2d_bn(x, 96, 1, padding='same', activation_dtype=activation_dtype) + branch_1 = conv2d_bn(x, 48, 1, padding='same', activation_dtype=activation_dtype) + branch_1 = conv2d_bn(branch_1, 64, 5, padding='same', activation_dtype=activation_dtype) + branch_2 = conv2d_bn(x, 64, 1, padding='same', activation_dtype=activation_dtype) + branch_2 = conv2d_bn(branch_2, 96, 3, padding='same', activation_dtype=activation_dtype) + branch_2 = conv2d_bn(branch_2, 96, 3, padding='same', activation_dtype=activation_dtype) + branch_pool = layers.AveragePooling2D(3, strides=1, padding='same', activation_dtype=activation_dtype)(x) + branch_pool = conv2d_bn(branch_pool, 64, 1, padding='same', activation_dtype=activation_dtype) branches = [branch_0, branch_1, branch_2, branch_pool] channel_axis = 1 if backend.image_data_format() == 'channels_first' else 3 x = layers.Concatenate(axis=channel_axis, name='mixed_5b')(branches) @@ -263,11 +276,11 @@ def InceptionResNetV2(include_top=True, block_idx=block_idx) # Mixed 6a (Reduction-A block): 17 x 17 x 1088 - branch_0 = conv2d_bn(x, 384, 3, strides=2, padding='same') - branch_1 = conv2d_bn(x, 256, 1, padding='same') - branch_1 = conv2d_bn(branch_1, 256, 3, padding='same') - branch_1 = conv2d_bn(branch_1, 384, 3, strides=2, padding='same') - branch_pool = layers.MaxPooling2D(3, strides=2, padding='same')(x) + branch_0 = conv2d_bn(x, 384, 3, strides=2, padding='same', activation_dtype=activation_dtype) + branch_1 = conv2d_bn(x, 256, 1, padding='same', activation_dtype=activation_dtype) + branch_1 = conv2d_bn(branch_1, 256, 3, padding='same', activation_dtype=activation_dtype) + branch_1 = conv2d_bn(branch_1, 384, 3, strides=2, padding='same', activation_dtype=activation_dtype) + branch_pool = layers.MaxPooling2D(3, strides=2, padding='same', activation_dtype=activation_dtype)(x) branches = [branch_0, branch_1, branch_pool] x = layers.Concatenate(axis=channel_axis, name='mixed_6a')(branches) @@ -279,14 +292,14 @@ def 
InceptionResNetV2(include_top=True, block_idx=block_idx) # Mixed 7a (Reduction-B block): 8 x 8 x 2080 - branch_0 = conv2d_bn(x, 256, 1, padding='same') - branch_0 = conv2d_bn(branch_0, 384, 3, strides=2, padding='same') - branch_1 = conv2d_bn(x, 256, 1, padding='same') - branch_1 = conv2d_bn(branch_1, 288, 3, strides=2, padding='same') - branch_2 = conv2d_bn(x, 256, 1, padding='same') - branch_2 = conv2d_bn(branch_2, 288, 3, padding='same') - branch_2 = conv2d_bn(branch_2, 320, 3, strides=2, padding='same') - branch_pool = layers.MaxPooling2D(3, strides=2, padding='same')(x) + branch_0 = conv2d_bn(x, 256, 1, padding='same', activation_dtype=activation_dtype) + branch_0 = conv2d_bn(branch_0, 384, 3, strides=2, padding='same', activation_dtype=activation_dtype) + branch_1 = conv2d_bn(x, 256, 1, padding='same', activation_dtype=activation_dtype) + branch_1 = conv2d_bn(branch_1, 288, 3, strides=2, padding='same', activation_dtype=activation_dtype) + branch_2 = conv2d_bn(x, 256, 1, padding='same', activation_dtype=activation_dtype) + branch_2 = conv2d_bn(branch_2, 288, 3, padding='same', activation_dtype=activation_dtype) + branch_2 = conv2d_bn(branch_2, 320, 3, strides=2, padding='same', activation_dtype=activation_dtype) + branch_pool = layers.MaxPooling2D(3, strides=2, padding='same', activation_dtype=activation_dtype)(x) branches = [branch_0, branch_1, branch_2, branch_pool] x = layers.Concatenate(axis=channel_axis, name='mixed_7a')(branches) @@ -295,20 +308,26 @@ def InceptionResNetV2(include_top=True, x = inception_resnet_block(x, scale=0.2, block_type='block8', - block_idx=block_idx) + block_idx=block_idx, + activation_dtype=activation_dtype) x = inception_resnet_block(x, scale=1., activation=None, block_type='block8', - block_idx=10) + block_idx=10, + activation_dtype=activation_dtype) # Final convolution block: 8 x 8 x 1536 - x = conv2d_bn(x, 1536, 1, name='conv_7b') + x = conv2d_bn(x, 1536, 1, name='conv_7b', activation_dtype=activation_dtype) if include_top: # Classification block x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - x = layers.Dense(classes, activation='softmax', name='predictions')(x) + if activation_dtype is None: + x = layers.Dense(classes, activation='softmax', name='predictions')(x) + else: + x = layers.Dense(classes, name='dense_logists')(x) + x = layers.Activation('softmax', dtype=activation_dtype, name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) diff --git a/segmentation_models/backbones/inception_v3.py b/segmentation_models/backbones/inception_v3.py index db8b567f..1b8e714c 100644 --- a/segmentation_models/backbones/inception_v3.py +++ b/segmentation_models/backbones/inception_v3.py @@ -36,6 +36,7 @@ def conv2d_bn(x, num_col, padding='same', strides=(1, 1), + activation_dtype=None, name=None): """Utility function to apply conv + BN. # Arguments @@ -45,6 +46,8 @@ def conv2d_bn(x, num_col: width of the convolution kernel. padding: padding mode in `Conv2D`. strides: strides in `Conv2D`. + activation_dtype: Optional type parameter to force activations + to be treated in certain type. Used when mixed_precision is enabled. name: name of the ops; will become `name + '_conv'` for the convolution and `name + '_bn'` for the batch norm layer. 
@@ -68,7 +71,10 @@ def conv2d_bn(x, use_bias=False, name=conv_name)(x) x = layers.BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x) - x = layers.Activation('relu', name=name)(x) + if activation_dtype is None: + x = layers.Activation('relu', name=name)(x) + else: + x = layers.Activation('relu', name=name, dtype=activation_dtype)(x) return x @@ -77,6 +83,7 @@ def InceptionV3(include_top=True, input_tensor=None, input_shape=None, pooling=None, + activation_dtype=None, classes=1000, **kwargs): """Instantiates the Inception v3 architecture. @@ -109,6 +116,8 @@ def InceptionV3(include_top=True, the output of the model will be a 2D tensor. - `max` means that global max pooling will be applied. + activation_dtype: Optional type parameter to force activations + to be treated in certain type. Used when mixed_precision is enabled. classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. @@ -153,79 +162,79 @@ def InceptionV3(include_top=True, else: channel_axis = 3 - x = conv2d_bn(img_input, 32, 3, 3, strides=(2, 2), padding='same') - x = conv2d_bn(x, 32, 3, 3, padding='same') - x = conv2d_bn(x, 64, 3, 3, padding='same') + x = conv2d_bn(img_input, 32, 3, 3, strides=(2, 2), padding='same', activation_dtype=activation_dtype) + x = conv2d_bn(x, 32, 3, 3, padding='same', activation_dtype=activation_dtype) + x = conv2d_bn(x, 64, 3, 3, padding='same', activation_dtype=activation_dtype) x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) - x = conv2d_bn(x, 80, 1, 1, padding='same') - x = conv2d_bn(x, 192, 3, 3, padding='same') + x = conv2d_bn(x, 80, 1, 1, padding='same', activation_dtype=activation_dtype) + x = conv2d_bn(x, 192, 3, 3, padding='same', activation_dtype=activation_dtype) x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) # mixed 0: 35 x 35 x 256 - branch1x1 = conv2d_bn(x, 64, 1, 1) + branch1x1 = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) - branch5x5 = conv2d_bn(x, 48, 1, 1) - branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) + branch5x5 = conv2d_bn(x, 48, 1, 1, activation_dtype=activation_dtype) + branch5x5 = conv2d_bn(branch5x5, 64, 5, 5, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(x, 64, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 32, 1, 1) + branch_pool = conv2d_bn(branch_pool, 32, 1, 1, activation_dtype=activation_dtype) x = layers.concatenate( [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=channel_axis, name='mixed0') # mixed 1: 35 x 35 x 288 - branch1x1 = conv2d_bn(x, 64, 1, 1) + branch1x1 = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) - branch5x5 = conv2d_bn(x, 48, 1, 1) - branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) + branch5x5 = conv2d_bn(x, 48, 1, 1, activation_dtype=activation_dtype) + branch5x5 = conv2d_bn(branch5x5, 64, 5, 5, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(x, 64, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) + 
branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 64, 1, 1) + branch_pool = conv2d_bn(branch_pool, 64, 1, 1, activation_dtype=activation_dtype) x = layers.concatenate( [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=channel_axis, name='mixed1') # mixed 2: 35 x 35 x 288 - branch1x1 = conv2d_bn(x, 64, 1, 1) + branch1x1 = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) - branch5x5 = conv2d_bn(x, 48, 1, 1) - branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) + branch5x5 = conv2d_bn(x, 48, 1, 1, activation_dtype=activation_dtype) + branch5x5 = conv2d_bn(branch5x5, 64, 5, 5, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(x, 64, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 64, 1, 1) + branch_pool = conv2d_bn(branch_pool, 64, 1, 1, activation_dtype=activation_dtype) x = layers.concatenate( [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=channel_axis, name='mixed2') # mixed 3: 17 x 17 x 768 - branch3x3 = conv2d_bn(x, 384, 3, 3, strides=(2, 2), padding='same') + branch3x3 = conv2d_bn(x, 384, 3, 3, strides=(2, 2), padding='same', activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(x, 64, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) branch3x3dbl = conv2d_bn( - branch3x3dbl, 96, 3, 3, strides=(2, 2), padding='same') + branch3x3dbl, 96, 3, 3, strides=(2, 2), padding='same', activation_dtype=activation_dtype) branch_pool = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) x = layers.concatenate( @@ -234,22 +243,22 @@ def InceptionV3(include_top=True, name='mixed3') # mixed 4: 17 x 17 x 768 - branch1x1 = conv2d_bn(x, 192, 1, 1) + branch1x1 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) - branch7x7 = conv2d_bn(x, 128, 1, 1) - branch7x7 = conv2d_bn(branch7x7, 128, 1, 7) - branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) + branch7x7 = conv2d_bn(x, 128, 1, 1, activation_dtype=activation_dtype) + branch7x7 = conv2d_bn(branch7x7, 128, 1, 7, activation_dtype=activation_dtype) + branch7x7 = conv2d_bn(branch7x7, 192, 7, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(x, 128, 1, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7) - branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) + branch7x7dbl = conv2d_bn(x, 128, 1, 1, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7, activation_dtype=activation_dtype) 
branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1, activation_dtype=activation_dtype) x = layers.concatenate( [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=channel_axis, @@ -257,58 +266,58 @@ def InceptionV3(include_top=True, # mixed 5, 6: 17 x 17 x 768 for i in range(2): - branch1x1 = conv2d_bn(x, 192, 1, 1) + branch1x1 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) - branch7x7 = conv2d_bn(x, 160, 1, 1) - branch7x7 = conv2d_bn(branch7x7, 160, 1, 7) - branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) + branch7x7 = conv2d_bn(x, 160, 1, 1, activation_dtype=activation_dtype) + branch7x7 = conv2d_bn(branch7x7, 160, 1, 7, activation_dtype=activation_dtype) + branch7x7 = conv2d_bn(branch7x7, 192, 7, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(x, 160, 1, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7) - branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) + branch7x7dbl = conv2d_bn(x, 160, 1, 1, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7, activation_dtype=activation_dtype) branch_pool = layers.AveragePooling2D( (3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1, activation_dtype=activation_dtype) x = layers.concatenate( [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=channel_axis, name='mixed' + str(5 + i)) # mixed 7: 17 x 17 x 768 - branch1x1 = conv2d_bn(x, 192, 1, 1) + branch1x1 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) - branch7x7 = conv2d_bn(x, 192, 1, 1) - branch7x7 = conv2d_bn(branch7x7, 192, 1, 7) - branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) + branch7x7 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) + branch7x7 = conv2d_bn(branch7x7, 192, 1, 7, activation_dtype=activation_dtype) + branch7x7 = conv2d_bn(branch7x7, 192, 7, 1, activation_dtype=activation_dtype) branch7x7dbl = conv2d_bn(x, 192, 1, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7, activation_dtype=activation_dtype) branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1, activation_dtype=activation_dtype) x = layers.concatenate( [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=channel_axis, name='mixed7') # mixed 8: 8 x 8 x 1280 - branch3x3 = conv2d_bn(x, 192, 1, 1) + branch3x3 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) branch3x3 = conv2d_bn(branch3x3, 320, 3, 3, - strides=(2, 2), padding='same') + strides=(2, 2), 
padding='same', activation_dtype=activation_dtype) branch7x7x3 = conv2d_bn(x, 192, 1, 1) - branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7) - branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1) + branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7, activation_dtype=activation_dtype) + branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1, activation_dtype=activation_dtype) branch7x7x3 = conv2d_bn( - branch7x7x3, 192, 3, 3, strides=(2, 2), padding='same') + branch7x7x3, 192, 3, 3, strides=(2, 2), padding='same', activation_dtype=activation_dtype) branch_pool = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) x = layers.concatenate( @@ -318,26 +327,26 @@ def InceptionV3(include_top=True, # mixed 9: 8 x 8 x 2048 for i in range(2): - branch1x1 = conv2d_bn(x, 320, 1, 1) + branch1x1 = conv2d_bn(x, 320, 1, 1, activation_dtype=activation_dtype) - branch3x3 = conv2d_bn(x, 384, 1, 1) - branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3) - branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1) + branch3x3 = conv2d_bn(x, 384, 1, 1, activation_dtype=activation_dtype) + branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3, activation_dtype=activation_dtype) + branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1, activation_dtype=activation_dtype) branch3x3 = layers.concatenate( [branch3x3_1, branch3x3_2], axis=channel_axis, name='mixed9_' + str(i)) - branch3x3dbl = conv2d_bn(x, 448, 1, 1) - branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3) - branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3) - branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1) + branch3x3dbl = conv2d_bn(x, 448, 1, 1, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3, activation_dtype=activation_dtype) + branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3, activation_dtype=activation_dtype) + branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1, activation_dtype=activation_dtype) branch3x3dbl = layers.concatenate( [branch3x3dbl_1, branch3x3dbl_2], axis=channel_axis) branch_pool = layers.AveragePooling2D( (3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1, activation_dtype=activation_dtype) x = layers.concatenate( [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=channel_axis, @@ -345,7 +354,12 @@ def InceptionV3(include_top=True, if include_top: # Classification block x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - x = layers.Dense(classes, activation='softmax', name='predictions')(x) + if activation_dtype is None: + x = layers.Dense(classes, activation='softmax', name='predictions')(x) + else: + x = layers.Dense(classes, name='dense_logits')(x) + x = layers.Activation('softmax', dtype=activation_dtype, name='predictions')(x) + else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) From a43d0814f977cc0fd10df4b4881e693e7fd16e7f Mon Sep 17 00:00:00 2001 From: phborba Date: Sat, 18 Apr 2020 18:11:12 -0300 Subject: [PATCH 3/7] fpn and linknet with activation_dtype --- segmentation_models/models/_common_blocks.py | 6 +- segmentation_models/models/fpn.py | 63 +++++++++++++++----- segmentation_models/models/linknet.py | 60 +++++++++++++------ 3 files changed, 94 insertions(+), 35 deletions(-) diff --git a/segmentation_models/models/_common_blocks.py b/segmentation_models/models/_common_blocks.py index 221d83bd..1a2eb3e9 100644 --- a/segmentation_models/models/_common_blocks.py +++ b/segmentation_models/models/_common_blocks.py @@ -9,6 +9,7 @@ def Conv2dBn( data_format=None, dilation_rate=(1, 1), activation=None, + 
activation_dtype=None, kernel_initializer='glorot_uniform', bias_initializer='zeros', kernel_regularizer=None, @@ -62,7 +63,10 @@ def wrapper(input_tensor): x = layers.BatchNormalization(axis=bn_axis, name=bn_name)(x) if activation: - x = layers.Activation(activation, name=act_name)(x) + if activation_dtype is None: + x = layers.Activation(activation, name=act_name)(x) + else: + x = layers.Activation(activation, name=act_name, dtype=activation_dtype)(x) return x diff --git a/segmentation_models/models/fpn.py b/segmentation_models/models/fpn.py index deab7f54..bcce1fee 100644 --- a/segmentation_models/models/fpn.py +++ b/segmentation_models/models/fpn.py @@ -27,7 +27,7 @@ def get_submodules(): # Blocks # --------------------------------------------------------------------- -def Conv3x3BnReLU(filters, use_batchnorm, name=None): +def Conv3x3BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -35,6 +35,7 @@ def wrapper(input_tensor): filters, kernel_size=3, activation='relu', + activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -45,15 +46,25 @@ def wrapper(input_tensor): return wrapper -def DoubleConv3x3BnReLU(filters, use_batchnorm, name=None): +def DoubleConv3x3BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): name1, name2 = None, None if name is not None: name1 = name + 'a' name2 = name + 'b' def wrapper(input_tensor): - x = Conv3x3BnReLU(filters, use_batchnorm, name=name1)(input_tensor) - x = Conv3x3BnReLU(filters, use_batchnorm, name=name2)(x) + x = Conv3x3BnReLU( + filters, + use_batchnorm, + name=name1, + activation_dtype=activation_dtype + )(input_tensor) + x = Conv3x3BnReLU( + filters, + use_batchnorm, + name=name2, + activation_dtype=activation_dtype + )(x) return x return wrapper @@ -106,6 +117,7 @@ def build_fpn( segmentation_filters=128, classes=1, activation='sigmoid', + activation_dtype=None, use_batchnorm=True, aggregation='sum', dropout=None, @@ -124,22 +136,30 @@ def build_fpn( p2 = FPNBlock(pyramid_filters, stage=2)(p3, skips[3]) # add segmentation head to each - s5 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, name='segm_stage5')(p5) - s4 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, name='segm_stage4')(p4) - s3 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, name='segm_stage3')(p3) - s2 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, name='segm_stage2')(p2) + s5 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, + name='segm_stage5', activation_dtype=activation_dtype)(p5) + s4 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, + name='segm_stage4', activation_dtype=activation_dtype)(p4) + s3 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, + name='segm_stage3', activation_dtype=activation_dtype)(p3) + s2 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, + name='segm_stage2', activation_dtype=activation_dtype)(p2) # upsampling to same resolution - s5 = layers.UpSampling2D((8, 8), interpolation='nearest', name='upsampling_stage5')(s5) - s4 = layers.UpSampling2D((4, 4), interpolation='nearest', name='upsampling_stage4')(s4) - s3 = layers.UpSampling2D((2, 2), interpolation='nearest', name='upsampling_stage3')(s3) + s5 = layers.UpSampling2D( + (8, 8), interpolation='nearest', name='upsampling_stage5')(s5) + s4 = layers.UpSampling2D( + (4, 4), interpolation='nearest', name='upsampling_stage4')(s4) + s3 = layers.UpSampling2D( + (2, 2), 
interpolation='nearest', name='upsampling_stage3')(s3) # aggregating results if aggregation == 'sum': x = layers.Add(name='aggregation_sum')([s2, s3, s4, s5]) elif aggregation == 'concat': concat_axis = 3 if backend.image_data_format() == 'channels_last' else 1 - x = layers.Concatenate(axis=concat_axis, name='aggregation_concat')([s2, s3, s4, s5]) + x = layers.Concatenate( + axis=concat_axis, name='aggregation_concat')([s2, s3, s4, s5]) else: raise ValueError('Aggregation parameter should be in ("sum", "concat"), ' 'got {}'.format(aggregation)) @@ -148,8 +168,10 @@ def build_fpn( x = layers.SpatialDropout2D(dropout, name='pyramid_dropout')(x) # final stage - x = Conv3x3BnReLU(segmentation_filters, use_batchnorm, name='final_stage')(x) - x = layers.UpSampling2D(size=(2, 2), interpolation='bilinear', name='final_upsampling')(x) + x = Conv3x3BnReLU(segmentation_filters, use_batchnorm, + name='final_stage', activation_dtype=activation_dtype)(x) + x = layers.UpSampling2D( + size=(2, 2), interpolation='bilinear', name='final_upsampling')(x) # model head (define number of output classes) x = layers.Conv2D( @@ -160,7 +182,11 @@ def build_fpn( kernel_initializer='glorot_uniform', name='head_conv', )(x) - x = layers.Activation(activation, name=activation)(x) + if activation_dtype is None: + x = layers.Activation(activation, name=activation)(x) + else: + x = layers.Activation(activation, name=activation, + dtype=activation_dtype)(x) # create keras model instance model = models.Model(input_, x) @@ -177,6 +203,7 @@ def FPN( input_shape=(None, None, 3), classes=21, activation='softmax', + activation_dtype=None, weights=None, encoder_weights='imagenet', encoder_freeze=False, @@ -198,6 +225,8 @@ def FPN( classes: a number of classes for output (output shape - ``(h, w, classes)``). weights: optional, path to model weights. activation: name of one of ``keras.activations`` for last model layer (e.g. ``sigmoid``, ``softmax``, ``linear``). + activation_dtype: Optional type parameter to force activations + to be treated in certain type. Used when mixed_precision is enabled. encoder_weights: one of ``None`` (random initialization), ``imagenet`` (pre-training on ImageNet). encoder_freeze: if ``True`` set all layers of encoder (backbone model) as non-trainable. encoder_features: a list of layer numbers or names starting from top of the model. 
@@ -218,7 +247,8 @@ def FPN( """ global backend, layers, models, keras_utils submodule_args = filter_keras_submodules(kwargs) - backend, layers, models, keras_utils = get_submodules_from_kwargs(submodule_args) + backend, layers, models, keras_utils = get_submodules_from_kwargs( + submodule_args) backbone = Backbones.get_backbone( backbone_name, @@ -239,6 +269,7 @@ def FPN( use_batchnorm=pyramid_use_batchnorm, dropout=pyramid_dropout, activation=activation, + activation_dtype=activation_dtype, classes=classes, aggregation=pyramid_aggregation, ) diff --git a/segmentation_models/models/linknet.py b/segmentation_models/models/linknet.py index 74c533c9..95255042 100644 --- a/segmentation_models/models/linknet.py +++ b/segmentation_models/models/linknet.py @@ -27,7 +27,7 @@ def get_submodules(): # Blocks # --------------------------------------------------------------------- -def Conv3x3BnReLU(filters, use_batchnorm, name=None): +def Conv3x3BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -35,6 +35,7 @@ def wrapper(input_tensor): filters, kernel_size=3, activation='relu', + activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -45,7 +46,7 @@ def wrapper(input_tensor): return wrapper -def Conv1x1BnReLU(filters, use_batchnorm, name=None): +def Conv1x1BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -53,6 +54,7 @@ def wrapper(input_tensor): filters, kernel_size=1, activation='relu', + activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -63,7 +65,7 @@ def wrapper(input_tensor): return wrapper -def DecoderUpsamplingX2Block(filters, stage, use_batchnorm): +def DecoderUpsamplingX2Block(filters, stage, use_batchnorm, activation_dtype=None): conv_block1_name = 'decoder_stage{}a'.format(stage) conv_block2_name = 'decoder_stage{}b'.format(stage) conv_block3_name = 'decoder_stage{}c'.format(stage) @@ -74,12 +76,16 @@ def DecoderUpsamplingX2Block(filters, stage, use_batchnorm): def wrapper(input_tensor, skip=None): input_filters = backend.int_shape(input_tensor)[channels_axis] - output_filters = backend.int_shape(skip)[channels_axis] if skip is not None else filters + output_filters = backend.int_shape( + skip)[channels_axis] if skip is not None else filters - x = Conv1x1BnReLU(input_filters // 4, use_batchnorm, name=conv_block1_name)(input_tensor) + x = Conv1x1BnReLU(input_filters // 4, use_batchnorm, name=conv_block1_name, + activation_dtype=activation_dtype)(input_tensor) x = layers.UpSampling2D((2, 2), name=up_name)(x) - x = Conv3x3BnReLU(input_filters // 4, use_batchnorm, name=conv_block2_name)(x) - x = Conv1x1BnReLU(output_filters, use_batchnorm, name=conv_block3_name)(x) + x = Conv3x3BnReLU(input_filters // 4, use_batchnorm, + name=conv_block2_name, activation_dtype=activation_dtype)(x) + x = Conv1x1BnReLU(output_filters, use_batchnorm, + name=conv_block3_name, activation_dtype=activation_dtype)(x) if skip is not None: x = layers.Add(name=add_name)([x, skip]) @@ -88,7 +94,7 @@ def wrapper(input_tensor, skip=None): return wrapper -def DecoderTransposeX2Block(filters, stage, use_batchnorm): +def DecoderTransposeX2Block(filters, stage, use_batchnorm, activation_dtype=None): conv_block1_name = 'decoder_stage{}a'.format(stage) transpose_name = 'decoder_stage{}b_transpose'.format(stage) bn_name = 'decoder_stage{}b_bn'.format(stage) @@ 
-100,9 +106,11 @@ def DecoderTransposeX2Block(filters, stage, use_batchnorm): def wrapper(input_tensor, skip=None): input_filters = backend.int_shape(input_tensor)[channels_axis] - output_filters = backend.int_shape(skip)[channels_axis] if skip is not None else filters + output_filters = backend.int_shape( + skip)[channels_axis] if skip is not None else filters - x = Conv1x1BnReLU(input_filters // 4, use_batchnorm, name=conv_block1_name)(input_tensor) + x = Conv1x1BnReLU(input_filters // 4, use_batchnorm, + name=conv_block1_name, activation_dtype=activation_dtype)(input_tensor) x = layers.Conv2DTranspose( filters=input_filters // 4, kernel_size=(4, 4), @@ -114,9 +122,13 @@ def wrapper(input_tensor, skip=None): if use_batchnorm: x = layers.BatchNormalization(axis=bn_axis, name=bn_name)(x) - - x = layers.Activation('relu', name=relu_name)(x) - x = Conv1x1BnReLU(output_filters, use_batchnorm, name=conv_block3_name)(x) + if activation_dtype is None: + x = layers.Activation('relu', name=relu_name)(x) + else: + x = layers.Activation('relu', name=relu_name, + dtype=activation_dtype)(x) + x = Conv1x1BnReLU(output_filters, use_batchnorm, + name=conv_block3_name)(x) if skip is not None: x = layers.Add(name=add_name)([x, skip]) @@ -138,6 +150,7 @@ def build_linknet( n_upsample_blocks=5, classes=1, activation='sigmoid', + activation_dtype=None, use_batchnorm=True, ): input_ = backbone.input @@ -149,8 +162,10 @@ def build_linknet( # add center block if previous operation was maxpooling (for vgg models) if isinstance(backbone.layers[-1], layers.MaxPooling2D): - x = Conv3x3BnReLU(512, use_batchnorm, name='center_block1')(x) - x = Conv3x3BnReLU(512, use_batchnorm, name='center_block2')(x) + x = Conv3x3BnReLU(512, use_batchnorm, name='center_block1', + activation_dtype=activation_dtype)(x) + x = Conv3x3BnReLU(512, use_batchnorm, name='center_block2', + activation_dtype=activation_dtype)(x) # building decoder blocks for i in range(n_upsample_blocks): @@ -160,7 +175,8 @@ def build_linknet( else: skip = None - x = decoder_block(decoder_filters[i], stage=i, use_batchnorm=use_batchnorm)(x, skip) + x = decoder_block( + decoder_filters[i], stage=i, use_batchnorm=use_batchnorm)(x, skip) # model head (define number of output classes) x = layers.Conv2D( @@ -170,7 +186,10 @@ def build_linknet( use_bias=True, kernel_initializer='glorot_uniform' )(x) - x = layers.Activation(activation, name=activation)(x) + if activation_dtype is None: + x = layers.Activation(activation, name=activation)(x) + else: + x = layers.Activation(activation, name=activation, dtype=activation_dtype)(x) # create keras model instance model = models.Model(input_, x) @@ -187,6 +206,7 @@ def Linknet( input_shape=(None, None, 3), classes=1, activation='sigmoid', + activation_dtype=None, weights=None, encoder_weights='imagenet', encoder_freeze=False, @@ -210,6 +230,8 @@ def Linknet( classes: a number of classes for output (output shape - ``(h, w, classes)``). activation: name of one of ``keras.activations`` for last model layer (e.g. ``sigmoid``, ``softmax``, ``linear``). + activation_dtype: Optional type parameter to force activations + to be treated in certain type. Used when mixed_precision is enabled. weights: optional, path to model weights. encoder_weights: one of ``None`` (random initialization), ``imagenet`` (pre-training on ImageNet). encoder_freeze: if ``True`` set all layers of encoder (backbone model) as non-trainable. 
@@ -234,7 +256,8 @@ def Linknet( global backend, layers, models, keras_utils submodule_args = filter_keras_submodules(kwargs) - backend, layers, models, keras_utils = get_submodules_from_kwargs(submodule_args) + backend, layers, models, keras_utils = get_submodules_from_kwargs( + submodule_args) if decoder_block_type == 'upsampling': decoder_block = DecoderUpsamplingX2Block @@ -262,6 +285,7 @@ def Linknet( decoder_filters=decoder_filters, classes=classes, activation=activation, + activation_dtype=activation_dtype, n_upsample_blocks=len(decoder_filters), use_batchnorm=decoder_use_batchnorm, ) From e52c86df0acf9508c4790376d4a21386530df512 Mon Sep 17 00:00:00 2001 From: phborba Date: Sat, 18 Apr 2020 18:21:45 -0300 Subject: [PATCH 4/7] unet and pspnet --- segmentation_models/models/pspnet.py | 69 +++++++++++++++++++--------- segmentation_models/models/unet.py | 51 ++++++++++++++------ 2 files changed, 85 insertions(+), 35 deletions(-) diff --git a/segmentation_models/models/pspnet.py b/segmentation_models/models/pspnet.py index 001b28c9..0997d124 100644 --- a/segmentation_models/models/pspnet.py +++ b/segmentation_models/models/pspnet.py @@ -25,14 +25,16 @@ def get_submodules(): def check_input_shape(input_shape, factor): if input_shape is None: - raise ValueError("Input shape should be a tuple of 3 integers, not None!") + raise ValueError( + "Input shape should be a tuple of 3 integers, not None!") - h, w = input_shape[:2] if backend.image_data_format() == 'channels_last' else input_shape[1:] + h, w = input_shape[:2] if backend.image_data_format( + ) == 'channels_last' else input_shape[1:] min_size = factor * 6 is_wrong_shape = ( - h % min_size != 0 or w % min_size != 0 or - h < min_size or w < min_size + h % min_size != 0 or w % min_size != 0 or + h < min_size or w < min_size ) if is_wrong_shape: @@ -44,7 +46,7 @@ def check_input_shape(input_shape, factor): # Blocks # --------------------------------------------------------------------- -def Conv1x1BnReLU(filters, use_batchnorm, name=None): +def Conv1x1BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -52,6 +54,7 @@ def wrapper(input_tensor): filters, kernel_size=1, activation='relu', + activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -67,6 +70,7 @@ def SpatialContextBlock( conv_filters=512, pooling_type='avg', use_batchnorm=True, + activation_dtype=None ): if pooling_type not in ('max', 'avg'): raise ValueError('Unsupported pooling type - `{}`.'.format(pooling_type) + @@ -81,17 +85,22 @@ def SpatialContextBlock( def wrapper(input_tensor): # extract input feature maps size (h, and w dimensions) input_shape = backend.int_shape(input_tensor) - spatial_size = input_shape[1:3] if backend.image_data_format() == 'channels_last' else input_shape[2:] + spatial_size = input_shape[1:3] if backend.image_data_format( + ) == 'channels_last' else input_shape[2:] # Compute the kernel and stride sizes according to how large the final feature map will be # When the kernel factor and strides are equal, then we can compute the final feature map factor # by simply dividing the current factor by the kernel or stride factor # The final feature map sizes are 1x1, 2x2, 3x3, and 6x6. 
- pool_size = up_size = [spatial_size[0] // level, spatial_size[1] // level] - - x = Pooling2D(pool_size, strides=pool_size, padding='same', name=pooling_name)(input_tensor) - x = Conv1x1BnReLU(conv_filters, use_batchnorm, name=conv_block_name)(x) - x = layers.UpSampling2D(up_size, interpolation='bilinear', name=upsampling_name)(x) + pool_size = up_size = [spatial_size[0] // + level, spatial_size[1] // level] + + x = Pooling2D(pool_size, strides=pool_size, + padding='same', name=pooling_name)(input_tensor) + x = Conv1x1BnReLU(conv_filters, use_batchnorm, + name=conv_block_name, activation_dtype=activation_dtype)(x) + x = layers.UpSampling2D( + up_size, interpolation='bilinear', name=upsampling_name)(x) return x return wrapper @@ -110,6 +119,7 @@ def build_psp( final_upsampling_factor=8, classes=21, activation='softmax', + activation_dtype=None, dropout=None, ): input_ = backbone.input @@ -117,15 +127,21 @@ def build_psp( else backbone.get_layer(index=psp_layer_idx).output) # build spatial pyramid - x1 = SpatialContextBlock(1, conv_filters, pooling_type, use_batchnorm)(x) - x2 = SpatialContextBlock(2, conv_filters, pooling_type, use_batchnorm)(x) - x3 = SpatialContextBlock(3, conv_filters, pooling_type, use_batchnorm)(x) - x6 = SpatialContextBlock(6, conv_filters, pooling_type, use_batchnorm)(x) + x1 = SpatialContextBlock(1, conv_filters, pooling_type, + use_batchnorm, activation_dtype=activation_dtype)(x) + x2 = SpatialContextBlock(2, conv_filters, pooling_type, + use_batchnorm, activation_dtype=activation_dtype)(x) + x3 = SpatialContextBlock(3, conv_filters, pooling_type, + use_batchnorm, activation_dtype=activation_dtype)(x) + x6 = SpatialContextBlock(6, conv_filters, pooling_type, + use_batchnorm, activation_dtype=activation_dtype)(x) # aggregate spatial pyramid concat_axis = 3 if backend.image_data_format() == 'channels_last' else 1 - x = layers.Concatenate(axis=concat_axis, name='psp_concat')([x, x1, x2, x3, x6]) - x = Conv1x1BnReLU(conv_filters, use_batchnorm, name='aggregation')(x) + x = layers.Concatenate(axis=concat_axis, name='psp_concat')( + [x, x1, x2, x3, x6]) + x = Conv1x1BnReLU(conv_filters, use_batchnorm, + name='aggregation', activation_dtype=activation_dtype)(x) # model regularization if dropout is not None: @@ -140,8 +156,13 @@ def build_psp( name='final_conv', )(x) - x = layers.UpSampling2D(final_upsampling_factor, name='final_upsampling', interpolation='bilinear')(x) - x = layers.Activation(activation, name=activation)(x) + x = layers.UpSampling2D(final_upsampling_factor, + name='final_upsampling', interpolation='bilinear')(x) + if activation_dtype is None: + x = layers.Activation(activation, name=activation)(x) + else: + x = layers.Activation(activation, name=activation, + dtype=activation_dtype)(x) model = models.Model(input_, x) @@ -157,6 +178,7 @@ def PSPNet( input_shape=(384, 384, 3), classes=21, activation='softmax', + activation_dtype=None, weights=None, encoder_weights='imagenet', encoder_freeze=False, @@ -177,6 +199,8 @@ def PSPNet( classes: a number of classes for output (output shape - ``(h, w, classes)``). activation: name of one of ``keras.activations`` for last model layer (e.g. ``sigmoid``, ``softmax``, ``linear``). + activation_dtype: Optional type parameter to force activations + to be treated in certain type. Used when mixed_precision is enabled. weights: optional, path to model weights. encoder_weights: one of ``None`` (random initialization), ``imagenet`` (pre-training on ImageNet). 
encoder_freeze: if ``True`` set all layers of encoder (backbone model) as non-trainable. @@ -198,7 +222,8 @@ def PSPNet( global backend, layers, models, keras_utils submodule_args = filter_keras_submodules(kwargs) - backend, layers, models, keras_utils = get_submodules_from_kwargs(submodule_args) + backend, layers, models, keras_utils = get_submodules_from_kwargs( + submodule_args) # control image input shape check_input_shape(input_shape, downsample_factor) @@ -220,7 +245,8 @@ def PSPNet( elif downsample_factor == 4: psp_layer_idx = feature_layers[2] else: - raise ValueError('Unsupported factor - `{}`, Use 4, 8 or 16.'.format(downsample_factor)) + raise ValueError( + 'Unsupported factor - `{}`, Use 4, 8 or 16.'.format(downsample_factor)) model = build_psp( backbone, @@ -231,6 +257,7 @@ def PSPNet( final_upsampling_factor=downsample_factor, classes=classes, activation=activation, + activation_dtype=activation_dtype, dropout=psp_dropout, ) diff --git a/segmentation_models/models/unet.py b/segmentation_models/models/unet.py index 7da2b391..d4facac0 100644 --- a/segmentation_models/models/unet.py +++ b/segmentation_models/models/unet.py @@ -27,7 +27,7 @@ def get_submodules(): # Blocks # --------------------------------------------------------------------- -def Conv3x3BnReLU(filters, use_batchnorm, name=None): +def Conv3x3BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -35,6 +35,7 @@ def wrapper(input_tensor): filters, kernel_size=3, activation='relu', + activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -45,7 +46,7 @@ def wrapper(input_tensor): return wrapper -def DecoderUpsamplingX2Block(filters, stage, use_batchnorm=False): +def DecoderUpsamplingX2Block(filters, stage, use_batchnorm=False, activation_dtype=None): up_name = 'decoder_stage{}_upsampling'.format(stage) conv1_name = 'decoder_stage{}a'.format(stage) conv2_name = 'decoder_stage{}b'.format(stage) @@ -57,17 +58,20 @@ def wrapper(input_tensor, skip=None): x = layers.UpSampling2D(size=2, name=up_name)(input_tensor) if skip is not None: - x = layers.Concatenate(axis=concat_axis, name=concat_name)([x, skip]) + x = layers.Concatenate( + axis=concat_axis, name=concat_name)([x, skip]) - x = Conv3x3BnReLU(filters, use_batchnorm, name=conv1_name)(x) - x = Conv3x3BnReLU(filters, use_batchnorm, name=conv2_name)(x) + x = Conv3x3BnReLU(filters, use_batchnorm, name=conv1_name, + activation_dtype=activation_dtype)(x) + x = Conv3x3BnReLU(filters, use_batchnorm, name=conv2_name, + activation_dtype=activation_dtype)(x) return x return wrapper -def DecoderTransposeX2Block(filters, stage, use_batchnorm=False): +def DecoderTransposeX2Block(filters, stage, use_batchnorm=False, activation_dtype=None): transp_name = 'decoder_stage{}a_transpose'.format(stage) bn_name = 'decoder_stage{}a_bn'.format(stage) relu_name = 'decoder_stage{}a_relu'.format(stage) @@ -90,12 +94,18 @@ def layer(input_tensor, skip=None): if use_batchnorm: x = layers.BatchNormalization(axis=bn_axis, name=bn_name)(x) - x = layers.Activation('relu', name=relu_name)(x) + if activation_dtype is None: + x = layers.Activation('relu', name=relu_name)(x) + else: + x = layers.Activation('relu', name=relu_name, + dtype=activation_dtype)(x) if skip is not None: - x = layers.Concatenate(axis=concat_axis, name=concat_name)([x, skip]) + x = layers.Concatenate( + axis=concat_axis, name=concat_name)([x, skip]) - x = Conv3x3BnReLU(filters, use_batchnorm, 
name=conv_block_name)(x) + x = Conv3x3BnReLU(filters, use_batchnorm, name=conv_block_name, + activation_dtype=activation_dtype)(x) return x @@ -114,6 +124,7 @@ def build_unet( n_upsample_blocks=5, classes=1, activation='sigmoid', + activation_dtype=None, use_batchnorm=True, ): input_ = backbone.input @@ -125,8 +136,10 @@ def build_unet( # add center block if previous operation was maxpooling (for vgg models) if isinstance(backbone.layers[-1], layers.MaxPooling2D): - x = Conv3x3BnReLU(512, use_batchnorm, name='center_block1')(x) - x = Conv3x3BnReLU(512, use_batchnorm, name='center_block2')(x) + x = Conv3x3BnReLU(512, use_batchnorm, name='center_block1', + activation_dtype=activation_dtype)(x) + x = Conv3x3BnReLU(512, use_batchnorm, name='center_block2', + activation_dtype=activation_dtype)(x) # building decoder blocks for i in range(n_upsample_blocks): @@ -136,7 +149,8 @@ def build_unet( else: skip = None - x = decoder_block(decoder_filters[i], stage=i, use_batchnorm=use_batchnorm)(x, skip) + x = decoder_block( + decoder_filters[i], stage=i, use_batchnorm=use_batchnorm)(x, skip) # model head (define number of output classes) x = layers.Conv2D( @@ -147,7 +161,11 @@ def build_unet( kernel_initializer='glorot_uniform', name='final_conv', )(x) - x = layers.Activation(activation, name=activation)(x) + if activation_dtype is None: + x = layers.Activation(activation, name=activation)(x) + else: + x = layers.Activation(activation, name=activation, + dtype=activation_dtype)(x) # create keras model instance model = models.Model(input_, x) @@ -164,6 +182,7 @@ def Unet( input_shape=(None, None, 3), classes=1, activation='sigmoid', + activation_dtype=None, weights=None, encoder_weights='imagenet', encoder_freeze=False, @@ -184,6 +203,8 @@ def Unet( classes: a number of classes for output (output shape - ``(h, w, classes)``). activation: name of one of ``keras.activations`` for last model layer (e.g. ``sigmoid``, ``softmax``, ``linear``). + activation_dtype: Optional type parameter to force activations + to be treated in certain type. Used when mixed_precision is enabled. weights: optional, path to model weights. encoder_weights: one of ``None`` (random initialization), ``imagenet`` (pre-training on ImageNet). encoder_freeze: if ``True`` set all layers of encoder (backbone model) as non-trainable. 
@@ -209,7 +230,8 @@ def Unet( global backend, layers, models, keras_utils submodule_args = filter_keras_submodules(kwargs) - backend, layers, models, keras_utils = get_submodules_from_kwargs(submodule_args) + backend, layers, models, keras_utils = get_submodules_from_kwargs( + submodule_args) if decoder_block_type == 'upsampling': decoder_block = DecoderUpsamplingX2Block @@ -237,6 +259,7 @@ def Unet( decoder_filters=decoder_filters, classes=classes, activation=activation, + activation_dtype=activation_dtype, n_upsample_blocks=len(decoder_filters), use_batchnorm=decoder_use_batchnorm, ) From cda0ec1b361df8e6f50af45f787a32945a981887 Mon Sep 17 00:00:00 2001 From: phborba Date: Sat, 18 Apr 2020 20:41:32 -0300 Subject: [PATCH 5/7] setting restrictions only to softmax activations --- .../backbones/inception_resnet_v2.py | 4 +- segmentation_models/backbones/inception_v3.py | 155 +++++++++--------- segmentation_models/models/_common_blocks.py | 2 +- segmentation_models/models/fpn.py | 2 +- segmentation_models/models/linknet.py | 4 +- segmentation_models/models/pspnet.py | 2 +- segmentation_models/models/unet.py | 4 +- 7 files changed, 84 insertions(+), 89 deletions(-) diff --git a/segmentation_models/backbones/inception_resnet_v2.py b/segmentation_models/backbones/inception_resnet_v2.py index 499c7279..035571ba 100644 --- a/segmentation_models/backbones/inception_resnet_v2.py +++ b/segmentation_models/backbones/inception_resnet_v2.py @@ -77,7 +77,7 @@ def conv2d_bn(x, name=bn_name)(x) if activation is not None: ac_name = None if name is None else name + '_ac' - if activation_dtype is None: + if activation_dtype is None or activation != 'softmax': x = layers.Activation(activation, name=ac_name)(x) else: x = layers.Activation(activation, name=ac_name, dtype=activation_dtype)(x) @@ -164,7 +164,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu', a arguments={'scale': scale}, name=block_name)([x, up]) if activation is not None: - if activation_dtype is None: + if activation_dtype is None or activation != 'softmax': x = layers.Activation(activation, name=block_name + '_ac')(x) else: x = layers.Activation(activation, name=block_name + '_ac', dtype=activation_dtype)(x) diff --git a/segmentation_models/backbones/inception_v3.py b/segmentation_models/backbones/inception_v3.py index 1b8e714c..543d91ac 100644 --- a/segmentation_models/backbones/inception_v3.py +++ b/segmentation_models/backbones/inception_v3.py @@ -36,7 +36,6 @@ def conv2d_bn(x, num_col, padding='same', strides=(1, 1), - activation_dtype=None, name=None): """Utility function to apply conv + BN. # Arguments @@ -46,8 +45,6 @@ def conv2d_bn(x, num_col: width of the convolution kernel. padding: padding mode in `Conv2D`. strides: strides in `Conv2D`. - activation_dtype: Optional type parameter to force activations - to be treated in certain type. Used when mixed_precision is enabled. name: name of the ops; will become `name + '_conv'` for the convolution and `name + '_bn'` for the batch norm layer. 
@@ -71,10 +68,7 @@ def conv2d_bn(x, use_bias=False, name=conv_name)(x) x = layers.BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x) - if activation_dtype is None: - x = layers.Activation('relu', name=name)(x) - else: - x = layers.Activation('relu', name=name, dtype=activation_dtype)(x) + x = layers.Activation('relu', name=name)(x) return x @@ -162,79 +156,79 @@ def InceptionV3(include_top=True, else: channel_axis = 3 - x = conv2d_bn(img_input, 32, 3, 3, strides=(2, 2), padding='same', activation_dtype=activation_dtype) - x = conv2d_bn(x, 32, 3, 3, padding='same', activation_dtype=activation_dtype) - x = conv2d_bn(x, 64, 3, 3, padding='same', activation_dtype=activation_dtype) + x = conv2d_bn(img_input, 32, 3, 3, strides=(2, 2), padding='same') + x = conv2d_bn(x, 32, 3, 3, padding='same') + x = conv2d_bn(x, 64, 3, 3, padding='same') x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) - x = conv2d_bn(x, 80, 1, 1, padding='same', activation_dtype=activation_dtype) - x = conv2d_bn(x, 192, 3, 3, padding='same', activation_dtype=activation_dtype) + x = conv2d_bn(x, 80, 1, 1, padding='same') + x = conv2d_bn(x, 192, 3, 3, padding='same') x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) # mixed 0: 35 x 35 x 256 - branch1x1 = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) + branch1x1 = conv2d_bn(x, 64, 1, 1) - branch5x5 = conv2d_bn(x, 48, 1, 1, activation_dtype=activation_dtype) - branch5x5 = conv2d_bn(branch5x5, 64, 5, 5, activation_dtype=activation_dtype) + branch5x5 = conv2d_bn(x, 48, 1, 1) + branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) - branch3x3dbl = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(x, 64, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 32, 1, 1, activation_dtype=activation_dtype) + branch_pool = conv2d_bn(branch_pool, 32, 1, 1) x = layers.concatenate( [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=channel_axis, name='mixed0') # mixed 1: 35 x 35 x 288 - branch1x1 = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) + branch1x1 = conv2d_bn(x, 64, 1, 1) - branch5x5 = conv2d_bn(x, 48, 1, 1, activation_dtype=activation_dtype) - branch5x5 = conv2d_bn(branch5x5, 64, 5, 5, activation_dtype=activation_dtype) + branch5x5 = conv2d_bn(x, 48, 1, 1) + branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) - branch3x3dbl = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(x, 64, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 64, 1, 1, activation_dtype=activation_dtype) + branch_pool = conv2d_bn(branch_pool, 64, 1, 1) x = layers.concatenate( [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=channel_axis, name='mixed1') # mixed 2: 35 x 35 x 288 - branch1x1 = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) + branch1x1 = conv2d_bn(x, 64, 1, 1) - branch5x5 = 
conv2d_bn(x, 48, 1, 1, activation_dtype=activation_dtype) - branch5x5 = conv2d_bn(branch5x5, 64, 5, 5, activation_dtype=activation_dtype) + branch5x5 = conv2d_bn(x, 48, 1, 1) + branch5x5 = conv2d_bn(branch5x5, 64, 5, 5) - branch3x3dbl = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(x, 64, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 64, 1, 1, activation_dtype=activation_dtype) + branch_pool = conv2d_bn(branch_pool, 64, 1, 1) x = layers.concatenate( [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=channel_axis, name='mixed2') # mixed 3: 17 x 17 x 768 - branch3x3 = conv2d_bn(x, 384, 3, 3, strides=(2, 2), padding='same', activation_dtype=activation_dtype) + branch3x3 = conv2d_bn(x, 384, 3, 3, strides=(2, 2), padding='same') - branch3x3dbl = conv2d_bn(x, 64, 1, 1, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(x, 64, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 96, 3, 3) branch3x3dbl = conv2d_bn( - branch3x3dbl, 96, 3, 3, strides=(2, 2), padding='same', activation_dtype=activation_dtype) + branch3x3dbl, 96, 3, 3, strides=(2, 2), padding='same') branch_pool = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) x = layers.concatenate( @@ -243,22 +237,22 @@ def InceptionV3(include_top=True, name='mixed3') # mixed 4: 17 x 17 x 768 - branch1x1 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) + branch1x1 = conv2d_bn(x, 192, 1, 1) - branch7x7 = conv2d_bn(x, 128, 1, 1, activation_dtype=activation_dtype) - branch7x7 = conv2d_bn(branch7x7, 128, 1, 7, activation_dtype=activation_dtype) - branch7x7 = conv2d_bn(branch7x7, 192, 7, 1, activation_dtype=activation_dtype) + branch7x7 = conv2d_bn(x, 128, 1, 1) + branch7x7 = conv2d_bn(branch7x7, 128, 1, 7) + branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) - branch7x7dbl = conv2d_bn(x, 128, 1, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(x, 128, 1, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 1, 7) + branch7x7dbl = conv2d_bn(branch7x7dbl, 128, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1, activation_dtype=activation_dtype) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1) x = layers.concatenate( [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=channel_axis, @@ -266,58 +260,58 @@ def InceptionV3(include_top=True, # mixed 5, 6: 17 x 17 x 768 for i in range(2): - branch1x1 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) + branch1x1 = conv2d_bn(x, 192, 1, 1) - branch7x7 = conv2d_bn(x, 160, 1, 1, activation_dtype=activation_dtype) - branch7x7 = conv2d_bn(branch7x7, 160, 1, 7, 
activation_dtype=activation_dtype) - branch7x7 = conv2d_bn(branch7x7, 192, 7, 1, activation_dtype=activation_dtype) + branch7x7 = conv2d_bn(x, 160, 1, 1) + branch7x7 = conv2d_bn(branch7x7, 160, 1, 7) + branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) - branch7x7dbl = conv2d_bn(x, 160, 1, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(x, 160, 1, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 1, 7) + branch7x7dbl = conv2d_bn(branch7x7dbl, 160, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) branch_pool = layers.AveragePooling2D( (3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1, activation_dtype=activation_dtype) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1) x = layers.concatenate( [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=channel_axis, name='mixed' + str(5 + i)) # mixed 7: 17 x 17 x 768 - branch1x1 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) + branch1x1 = conv2d_bn(x, 192, 1, 1) - branch7x7 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) - branch7x7 = conv2d_bn(branch7x7, 192, 1, 7, activation_dtype=activation_dtype) - branch7x7 = conv2d_bn(branch7x7, 192, 7, 1, activation_dtype=activation_dtype) + branch7x7 = conv2d_bn(x, 192, 1, 1) + branch7x7 = conv2d_bn(branch7x7, 192, 1, 7) + branch7x7 = conv2d_bn(branch7x7, 192, 7, 1) branch7x7dbl = conv2d_bn(x, 192, 1, 1) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1, activation_dtype=activation_dtype) - branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7, activation_dtype=activation_dtype) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 7, 1) + branch7x7dbl = conv2d_bn(branch7x7dbl, 192, 1, 7) branch_pool = layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1, activation_dtype=activation_dtype) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1) x = layers.concatenate( [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=channel_axis, name='mixed7') # mixed 8: 8 x 8 x 1280 - branch3x3 = conv2d_bn(x, 192, 1, 1, activation_dtype=activation_dtype) + branch3x3 = conv2d_bn(x, 192, 1, 1) branch3x3 = conv2d_bn(branch3x3, 320, 3, 3, - strides=(2, 2), padding='same', activation_dtype=activation_dtype) + strides=(2, 2), padding='same') branch7x7x3 = conv2d_bn(x, 192, 1, 1) - branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7, activation_dtype=activation_dtype) - branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1, activation_dtype=activation_dtype) + branch7x7x3 = conv2d_bn(branch7x7x3, 192, 1, 7) + branch7x7x3 = conv2d_bn(branch7x7x3, 192, 7, 1) branch7x7x3 = conv2d_bn( - branch7x7x3, 192, 3, 3, strides=(2, 2), padding='same', activation_dtype=activation_dtype) + branch7x7x3, 192, 3, 3, strides=(2, 2), padding='same') branch_pool = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) x = layers.concatenate( @@ 
-327,26 +321,26 @@ def InceptionV3(include_top=True, # mixed 9: 8 x 8 x 2048 for i in range(2): - branch1x1 = conv2d_bn(x, 320, 1, 1, activation_dtype=activation_dtype) + branch1x1 = conv2d_bn(x, 320, 1, 1) - branch3x3 = conv2d_bn(x, 384, 1, 1, activation_dtype=activation_dtype) - branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3, activation_dtype=activation_dtype) - branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1, activation_dtype=activation_dtype) + branch3x3 = conv2d_bn(x, 384, 1, 1) + branch3x3_1 = conv2d_bn(branch3x3, 384, 1, 3) + branch3x3_2 = conv2d_bn(branch3x3, 384, 3, 1) branch3x3 = layers.concatenate( [branch3x3_1, branch3x3_2], axis=channel_axis, name='mixed9_' + str(i)) - branch3x3dbl = conv2d_bn(x, 448, 1, 1, activation_dtype=activation_dtype) - branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3, activation_dtype=activation_dtype) - branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3, activation_dtype=activation_dtype) - branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1, activation_dtype=activation_dtype) + branch3x3dbl = conv2d_bn(x, 448, 1, 1) + branch3x3dbl = conv2d_bn(branch3x3dbl, 384, 3, 3) + branch3x3dbl_1 = conv2d_bn(branch3x3dbl, 384, 1, 3) + branch3x3dbl_2 = conv2d_bn(branch3x3dbl, 384, 3, 1) branch3x3dbl = layers.concatenate( [branch3x3dbl_1, branch3x3dbl_2], axis=channel_axis) branch_pool = layers.AveragePooling2D( (3, 3), strides=(1, 1), padding='same')(x) - branch_pool = conv2d_bn(branch_pool, 192, 1, 1, activation_dtype=activation_dtype) + branch_pool = conv2d_bn(branch_pool, 192, 1, 1) x = layers.concatenate( [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=channel_axis, @@ -357,6 +351,7 @@ def InceptionV3(include_top=True, if activation_dtype is None: x = layers.Dense(classes, activation='softmax', name='predictions')(x) else: + #only softmax activation must be cast when using mixed precision x = layers.Dense(classes, name='dense_logits')(x) x = layers.Activation('softmax', dtype=activation_dtype, name='predictions')(x) diff --git a/segmentation_models/models/_common_blocks.py b/segmentation_models/models/_common_blocks.py index 1a2eb3e9..9c379d4e 100644 --- a/segmentation_models/models/_common_blocks.py +++ b/segmentation_models/models/_common_blocks.py @@ -63,7 +63,7 @@ def wrapper(input_tensor): x = layers.BatchNormalization(axis=bn_axis, name=bn_name)(x) if activation: - if activation_dtype is None: + if activation_dtype is None or activation != 'softmax': x = layers.Activation(activation, name=act_name)(x) else: x = layers.Activation(activation, name=act_name, dtype=activation_dtype)(x) diff --git a/segmentation_models/models/fpn.py b/segmentation_models/models/fpn.py index bcce1fee..b904751f 100644 --- a/segmentation_models/models/fpn.py +++ b/segmentation_models/models/fpn.py @@ -182,7 +182,7 @@ def build_fpn( kernel_initializer='glorot_uniform', name='head_conv', )(x) - if activation_dtype is None: + if activation_dtype is None or activation != 'softmax': x = layers.Activation(activation, name=activation)(x) else: x = layers.Activation(activation, name=activation, diff --git a/segmentation_models/models/linknet.py b/segmentation_models/models/linknet.py index 95255042..53821e18 100644 --- a/segmentation_models/models/linknet.py +++ b/segmentation_models/models/linknet.py @@ -122,7 +122,7 @@ def wrapper(input_tensor, skip=None): if use_batchnorm: x = layers.BatchNormalization(axis=bn_axis, name=bn_name)(x) - if activation_dtype is None: + if activation_dtype is None or activation != 'softmax': x = layers.Activation('relu', name=relu_name)(x) else: x = 
layers.Activation('relu', name=relu_name, @@ -186,7 +186,7 @@ def build_linknet( use_bias=True, kernel_initializer='glorot_uniform' )(x) - if activation_dtype is None: + if activation_dtype is None or activation != 'softmax': x = layers.Activation(activation, name=activation)(x) else: x = layers.Activation(activation, name=activation, dtype=activation_dtype)(x) diff --git a/segmentation_models/models/pspnet.py b/segmentation_models/models/pspnet.py index 0997d124..c05d0b03 100644 --- a/segmentation_models/models/pspnet.py +++ b/segmentation_models/models/pspnet.py @@ -158,7 +158,7 @@ def build_psp( x = layers.UpSampling2D(final_upsampling_factor, name='final_upsampling', interpolation='bilinear')(x) - if activation_dtype is None: + if activation_dtype is None or activation != 'softmax': x = layers.Activation(activation, name=activation)(x) else: x = layers.Activation(activation, name=activation, diff --git a/segmentation_models/models/unet.py b/segmentation_models/models/unet.py index d4facac0..faa759b2 100644 --- a/segmentation_models/models/unet.py +++ b/segmentation_models/models/unet.py @@ -94,7 +94,7 @@ def layer(input_tensor, skip=None): if use_batchnorm: x = layers.BatchNormalization(axis=bn_axis, name=bn_name)(x) - if activation_dtype is None: + if activation_dtype is None or activation != 'softmax': x = layers.Activation('relu', name=relu_name)(x) else: x = layers.Activation('relu', name=relu_name, @@ -161,7 +161,7 @@ def build_unet( kernel_initializer='glorot_uniform', name='final_conv', )(x) - if activation_dtype is None: + if activation_dtype is None or activation != 'softmax': x = layers.Activation(activation, name=activation)(x) else: x = layers.Activation(activation, name=activation, From f342915008d7b030659a101a3f1903d9eee96883 Mon Sep 17 00:00:00 2001 From: phborba Date: Sat, 18 Apr 2020 21:08:12 -0300 Subject: [PATCH 6/7] removing parameter where it does not make sense --- segmentation_models/models/fpn.py | 29 ++++++++++++-------------- segmentation_models/models/linknet.py | 29 ++++++++------------------ segmentation_models/models/pspnet.py | 24 +++++++-------------- segmentation_models/models/unet.py | 30 +++++++++------------------ 4 files changed, 40 insertions(+), 72 deletions(-) diff --git a/segmentation_models/models/fpn.py b/segmentation_models/models/fpn.py index b904751f..2f687237 100644 --- a/segmentation_models/models/fpn.py +++ b/segmentation_models/models/fpn.py @@ -27,7 +27,7 @@ def get_submodules(): # Blocks # --------------------------------------------------------------------- -def Conv3x3BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): +def Conv3x3BnReLU(filters, use_batchnorm, name=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -35,7 +35,6 @@ def wrapper(input_tensor): filters, kernel_size=3, activation='relu', - activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -46,7 +45,7 @@ def wrapper(input_tensor): return wrapper -def DoubleConv3x3BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): +def DoubleConv3x3BnReLU(filters, use_batchnorm, name=None): name1, name2 = None, None if name is not None: name1 = name + 'a' @@ -56,14 +55,12 @@ def wrapper(input_tensor): x = Conv3x3BnReLU( filters, use_batchnorm, - name=name1, - activation_dtype=activation_dtype + name=name1 )(input_tensor) x = Conv3x3BnReLU( filters, use_batchnorm, - name=name2, - activation_dtype=activation_dtype + name=name2 )(x) return x @@ -136,14 +133,14 @@ def build_fpn( p2 = 
FPNBlock(pyramid_filters, stage=2)(p3, skips[3]) # add segmentation head to each - s5 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, - name='segm_stage5', activation_dtype=activation_dtype)(p5) - s4 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, - name='segm_stage4', activation_dtype=activation_dtype)(p4) - s3 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, - name='segm_stage3', activation_dtype=activation_dtype)(p3) - s2 = DoubleConv3x3BnReLU(segmentation_filters, use_batchnorm, - name='segm_stage2', activation_dtype=activation_dtype)(p2) + s5 = DoubleConv3x3BnReLU(segmentation_filters, + use_batchnorm, name='segm_stage5')(p5) + s4 = DoubleConv3x3BnReLU(segmentation_filters, + use_batchnorm, name='segm_stage4')(p4) + s3 = DoubleConv3x3BnReLU(segmentation_filters, + use_batchnorm, name='segm_stage3')(p3) + s2 = DoubleConv3x3BnReLU(segmentation_filters, + use_batchnorm, name='segm_stage2')(p2) # upsampling to same resolution s5 = layers.UpSampling2D( @@ -169,7 +166,7 @@ def build_fpn( # final stage x = Conv3x3BnReLU(segmentation_filters, use_batchnorm, - name='final_stage', activation_dtype=activation_dtype)(x) + name='final_stage')(x) x = layers.UpSampling2D( size=(2, 2), interpolation='bilinear', name='final_upsampling')(x) diff --git a/segmentation_models/models/linknet.py b/segmentation_models/models/linknet.py index 53821e18..b6db1a8d 100644 --- a/segmentation_models/models/linknet.py +++ b/segmentation_models/models/linknet.py @@ -27,7 +27,7 @@ def get_submodules(): # Blocks # --------------------------------------------------------------------- -def Conv3x3BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): +def Conv3x3BnReLU(filters, use_batchnorm, name=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -35,7 +35,6 @@ def wrapper(input_tensor): filters, kernel_size=3, activation='relu', - activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -46,7 +45,7 @@ def wrapper(input_tensor): return wrapper -def Conv1x1BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): +def Conv1x1BnReLU(filters, use_batchnorm, name=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -54,7 +53,6 @@ def wrapper(input_tensor): filters, kernel_size=1, activation='relu', - activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -79,13 +77,10 @@ def wrapper(input_tensor, skip=None): output_filters = backend.int_shape( skip)[channels_axis] if skip is not None else filters - x = Conv1x1BnReLU(input_filters // 4, use_batchnorm, name=conv_block1_name, - activation_dtype=activation_dtype)(input_tensor) + x = Conv1x1BnReLU(input_filters // 4, use_batchnorm, name=conv_block1_name)(input_tensor) x = layers.UpSampling2D((2, 2), name=up_name)(x) - x = Conv3x3BnReLU(input_filters // 4, use_batchnorm, - name=conv_block2_name, activation_dtype=activation_dtype)(x) - x = Conv1x1BnReLU(output_filters, use_batchnorm, - name=conv_block3_name, activation_dtype=activation_dtype)(x) + x = Conv3x3BnReLU(input_filters // 4, use_batchnorm, name=conv_block2_name)(x) + x = Conv1x1BnReLU(output_filters, use_batchnorm, name=conv_block3_name)(x) if skip is not None: x = layers.Add(name=add_name)([x, skip]) @@ -110,7 +105,7 @@ def wrapper(input_tensor, skip=None): skip)[channels_axis] if skip is not None else filters x = Conv1x1BnReLU(input_filters // 4, use_batchnorm, - name=conv_block1_name, 
activation_dtype=activation_dtype)(input_tensor) + name=conv_block1_name)(input_tensor) x = layers.Conv2DTranspose( filters=input_filters // 4, kernel_size=(4, 4), @@ -122,11 +117,7 @@ def wrapper(input_tensor, skip=None): if use_batchnorm: x = layers.BatchNormalization(axis=bn_axis, name=bn_name)(x) - if activation_dtype is None or activation != 'softmax': - x = layers.Activation('relu', name=relu_name)(x) - else: - x = layers.Activation('relu', name=relu_name, - dtype=activation_dtype)(x) + x = layers.Activation('relu', name=relu_name)(x) x = Conv1x1BnReLU(output_filters, use_batchnorm, name=conv_block3_name)(x) @@ -162,10 +153,8 @@ def build_linknet( # add center block if previous operation was maxpooling (for vgg models) if isinstance(backbone.layers[-1], layers.MaxPooling2D): - x = Conv3x3BnReLU(512, use_batchnorm, name='center_block1', - activation_dtype=activation_dtype)(x) - x = Conv3x3BnReLU(512, use_batchnorm, name='center_block2', - activation_dtype=activation_dtype)(x) + x = Conv3x3BnReLU(512, use_batchnorm, name='center_block1')(x) + x = Conv3x3BnReLU(512, use_batchnorm, name='center_block2')(x) # building decoder blocks for i in range(n_upsample_blocks): diff --git a/segmentation_models/models/pspnet.py b/segmentation_models/models/pspnet.py index c05d0b03..60d6b782 100644 --- a/segmentation_models/models/pspnet.py +++ b/segmentation_models/models/pspnet.py @@ -46,7 +46,7 @@ def check_input_shape(input_shape, factor): # Blocks # --------------------------------------------------------------------- -def Conv1x1BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): +def Conv1x1BnReLU(filters, use_batchnorm, name=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -54,7 +54,6 @@ def wrapper(input_tensor): filters, kernel_size=1, activation='relu', - activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -69,8 +68,7 @@ def SpatialContextBlock( level, conv_filters=512, pooling_type='avg', - use_batchnorm=True, - activation_dtype=None + use_batchnorm=True ): if pooling_type not in ('max', 'avg'): raise ValueError('Unsupported pooling type - `{}`.'.format(pooling_type) + @@ -97,8 +95,7 @@ def wrapper(input_tensor): x = Pooling2D(pool_size, strides=pool_size, padding='same', name=pooling_name)(input_tensor) - x = Conv1x1BnReLU(conv_filters, use_batchnorm, - name=conv_block_name, activation_dtype=activation_dtype)(x) + x = Conv1x1BnReLU(conv_filters, use_batchnorm, name=conv_block_name)(x) x = layers.UpSampling2D( up_size, interpolation='bilinear', name=upsampling_name)(x) return x @@ -127,21 +124,16 @@ def build_psp( else backbone.get_layer(index=psp_layer_idx).output) # build spatial pyramid - x1 = SpatialContextBlock(1, conv_filters, pooling_type, - use_batchnorm, activation_dtype=activation_dtype)(x) - x2 = SpatialContextBlock(2, conv_filters, pooling_type, - use_batchnorm, activation_dtype=activation_dtype)(x) - x3 = SpatialContextBlock(3, conv_filters, pooling_type, - use_batchnorm, activation_dtype=activation_dtype)(x) - x6 = SpatialContextBlock(6, conv_filters, pooling_type, - use_batchnorm, activation_dtype=activation_dtype)(x) + x1 = SpatialContextBlock(1, conv_filters, pooling_type, use_batchnorm)(x) + x2 = SpatialContextBlock(2, conv_filters, pooling_type, use_batchnorm)(x) + x3 = SpatialContextBlock(3, conv_filters, pooling_type, use_batchnorm)(x) + x6 = SpatialContextBlock(6, conv_filters, pooling_type, use_batchnorm)(x) # aggregate spatial pyramid concat_axis = 3 if 
backend.image_data_format() == 'channels_last' else 1 x = layers.Concatenate(axis=concat_axis, name='psp_concat')( [x, x1, x2, x3, x6]) - x = Conv1x1BnReLU(conv_filters, use_batchnorm, - name='aggregation', activation_dtype=activation_dtype)(x) + x = Conv1x1BnReLU(conv_filters, use_batchnorm, name='aggregation')(x) # model regularization if dropout is not None: diff --git a/segmentation_models/models/unet.py b/segmentation_models/models/unet.py index faa759b2..69f1c8b9 100644 --- a/segmentation_models/models/unet.py +++ b/segmentation_models/models/unet.py @@ -27,7 +27,7 @@ def get_submodules(): # Blocks # --------------------------------------------------------------------- -def Conv3x3BnReLU(filters, use_batchnorm, name=None, activation_dtype=None): +def Conv3x3BnReLU(filters, use_batchnorm, name=None): kwargs = get_submodules() def wrapper(input_tensor): @@ -35,7 +35,6 @@ def wrapper(input_tensor): filters, kernel_size=3, activation='relu', - activation_dtype=activation_dtype, kernel_initializer='he_uniform', padding='same', use_batchnorm=use_batchnorm, @@ -46,7 +45,7 @@ def wrapper(input_tensor): return wrapper -def DecoderUpsamplingX2Block(filters, stage, use_batchnorm=False, activation_dtype=None): +def DecoderUpsamplingX2Block(filters, stage, use_batchnorm=False): up_name = 'decoder_stage{}_upsampling'.format(stage) conv1_name = 'decoder_stage{}a'.format(stage) conv2_name = 'decoder_stage{}b'.format(stage) @@ -61,17 +60,15 @@ def wrapper(input_tensor, skip=None): x = layers.Concatenate( axis=concat_axis, name=concat_name)([x, skip]) - x = Conv3x3BnReLU(filters, use_batchnorm, name=conv1_name, - activation_dtype=activation_dtype)(x) - x = Conv3x3BnReLU(filters, use_batchnorm, name=conv2_name, - activation_dtype=activation_dtype)(x) + x = Conv3x3BnReLU(filters, use_batchnorm, name=conv1_name)(x) + x = Conv3x3BnReLU(filters, use_batchnorm, name=conv2_name)(x) return x return wrapper -def DecoderTransposeX2Block(filters, stage, use_batchnorm=False, activation_dtype=None): +def DecoderTransposeX2Block(filters, stage, use_batchnorm=False): transp_name = 'decoder_stage{}a_transpose'.format(stage) bn_name = 'decoder_stage{}a_bn'.format(stage) relu_name = 'decoder_stage{}a_relu'.format(stage) @@ -94,18 +91,13 @@ def layer(input_tensor, skip=None): if use_batchnorm: x = layers.BatchNormalization(axis=bn_axis, name=bn_name)(x) - if activation_dtype is None or activation != 'softmax': - x = layers.Activation('relu', name=relu_name)(x) - else: - x = layers.Activation('relu', name=relu_name, - dtype=activation_dtype)(x) - + x = layers.Activation('relu', name=relu_name)(x) + if skip is not None: x = layers.Concatenate( axis=concat_axis, name=concat_name)([x, skip]) - x = Conv3x3BnReLU(filters, use_batchnorm, name=conv_block_name, - activation_dtype=activation_dtype)(x) + x = Conv3x3BnReLU(filters, use_batchnorm, name=conv_block_name)(x) return x @@ -136,10 +128,8 @@ def build_unet( # add center block if previous operation was maxpooling (for vgg models) if isinstance(backbone.layers[-1], layers.MaxPooling2D): - x = Conv3x3BnReLU(512, use_batchnorm, name='center_block1', - activation_dtype=activation_dtype)(x) - x = Conv3x3BnReLU(512, use_batchnorm, name='center_block2', - activation_dtype=activation_dtype)(x) + x = Conv3x3BnReLU(512, use_batchnorm, name='center_block1')(x) + x = Conv3x3BnReLU(512, use_batchnorm, name='center_block2')(x) # building decoder blocks for i in range(n_upsample_blocks): From f0b0727a112cb4b16a3552591335b1a1f4e7a6eb Mon Sep 17 00:00:00 2001 From: phborba Date: Sat, 18 Apr 
2020 21:18:25 -0300 Subject: [PATCH 7/7] bug fix --- segmentation_models/backbones/inception_resnet_v2.py | 4 ++-- segmentation_models/models/_common_blocks.py | 2 +- segmentation_models/models/fpn.py | 2 +- segmentation_models/models/linknet.py | 2 +- segmentation_models/models/pspnet.py | 2 +- segmentation_models/models/unet.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/segmentation_models/backbones/inception_resnet_v2.py b/segmentation_models/backbones/inception_resnet_v2.py index 035571ba..499c7279 100644 --- a/segmentation_models/backbones/inception_resnet_v2.py +++ b/segmentation_models/backbones/inception_resnet_v2.py @@ -77,7 +77,7 @@ def conv2d_bn(x, name=bn_name)(x) if activation is not None: ac_name = None if name is None else name + '_ac' - if activation_dtype is None or activation != 'softmax': + if activation_dtype is None: x = layers.Activation(activation, name=ac_name)(x) else: x = layers.Activation(activation, name=ac_name, dtype=activation_dtype)(x) @@ -164,7 +164,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu', a arguments={'scale': scale}, name=block_name)([x, up]) if activation is not None: - if activation_dtype is None or activation != 'softmax': + if activation_dtype is None: x = layers.Activation(activation, name=block_name + '_ac')(x) else: x = layers.Activation(activation, name=block_name + '_ac', dtype=activation_dtype)(x) diff --git a/segmentation_models/models/_common_blocks.py b/segmentation_models/models/_common_blocks.py index 9c379d4e..1a2eb3e9 100644 --- a/segmentation_models/models/_common_blocks.py +++ b/segmentation_models/models/_common_blocks.py @@ -63,7 +63,7 @@ def wrapper(input_tensor): x = layers.BatchNormalization(axis=bn_axis, name=bn_name)(x) if activation: - if activation_dtype is None or activation != 'softmax': + if activation_dtype is None: x = layers.Activation(activation, name=act_name)(x) else: x = layers.Activation(activation, name=act_name, dtype=activation_dtype)(x) diff --git a/segmentation_models/models/fpn.py b/segmentation_models/models/fpn.py index 2f687237..3f74efdd 100644 --- a/segmentation_models/models/fpn.py +++ b/segmentation_models/models/fpn.py @@ -179,7 +179,7 @@ def build_fpn( kernel_initializer='glorot_uniform', name='head_conv', )(x) - if activation_dtype is None or activation != 'softmax': + if activation_dtype is None: x = layers.Activation(activation, name=activation)(x) else: x = layers.Activation(activation, name=activation, diff --git a/segmentation_models/models/linknet.py b/segmentation_models/models/linknet.py index b6db1a8d..faba695d 100644 --- a/segmentation_models/models/linknet.py +++ b/segmentation_models/models/linknet.py @@ -175,7 +175,7 @@ def build_linknet( use_bias=True, kernel_initializer='glorot_uniform' )(x) - if activation_dtype is None or activation != 'softmax': + if activation_dtype is None: x = layers.Activation(activation, name=activation)(x) else: x = layers.Activation(activation, name=activation, dtype=activation_dtype)(x) diff --git a/segmentation_models/models/pspnet.py b/segmentation_models/models/pspnet.py index 60d6b782..d4d89895 100644 --- a/segmentation_models/models/pspnet.py +++ b/segmentation_models/models/pspnet.py @@ -150,7 +150,7 @@ def build_psp( x = layers.UpSampling2D(final_upsampling_factor, name='final_upsampling', interpolation='bilinear')(x) - if activation_dtype is None or activation != 'softmax': + if activation_dtype is None: x = layers.Activation(activation, name=activation)(x) else: x = 
layers.Activation(activation, name=activation, diff --git a/segmentation_models/models/unet.py b/segmentation_models/models/unet.py index 69f1c8b9..f524c4a8 100644 --- a/segmentation_models/models/unet.py +++ b/segmentation_models/models/unet.py @@ -151,7 +151,7 @@ def build_unet( kernel_initializer='glorot_uniform', name='final_conv', )(x) - if activation_dtype is None or activation != 'softmax': + if activation_dtype is None: x = layers.Activation(activation, name=activation)(x) else: x = layers.Activation(activation, name=activation,
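
A minimal usage sketch of what this patch series enables, added for reviewers. It is not part of the commits above: the backbone name, class count, and compile call are purely illustrative, and it assumes TF 2.1–2.3 where the mixed-precision policy still lives under the experimental namespace (the call differs in later TF versions):

    import tensorflow as tf
    import segmentation_models as sm

    # Run most of the network in float16 (experimental API in TF 2.1-2.3).
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

    # Keep the final softmax in float32 for numerical stability; this is
    # what the new activation_dtype argument added by these patches is for.
    model = sm.Unet(
        'resnet34',              # illustrative backbone
        classes=4,               # illustrative number of classes
        activation='softmax',
        activation_dtype='float32',
        encoder_weights=None,    # skip the ImageNet download for this sketch
    )
    model.compile('adam', loss='categorical_crossentropy')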