This repository was archived by the owner on Nov 29, 2022. It is now read-only.

Commit 7d6faa2

Allow higher rank inputs to SparseCategoricalFocalLoss (#6)
* Make SparseCategoricalFocalLoss accept higher-rank tensors (#5)
* Update the NumPy sparse categorical focal loss reference implementation
* Add test cases for higher-rank and unknown-rank inputs
Parent: a92bb39
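
In practice, the change means the loss now applies position-wise to sequence or segmentation outputs, where integer labels have shape (batch, d1, ..., dN) and predictions carry one extra channel axis. A minimal sketch of the new behavior, assuming the package's top-level export and arbitrary toy shapes:

    import numpy as np
    import tensorflow as tf

    from focal_loss import sparse_categorical_focal_loss

    # Toy segmentation-style data (shapes are illustrative assumptions):
    # integer labels of shape (batch, height, width) and probabilities
    # with one extra channel axis, shape (batch, height, width, classes).
    rng = np.random.default_rng(0)
    y_true = rng.integers(0, 4, size=(2, 8, 8))
    y_pred = tf.nn.softmax(tf.constant(rng.normal(size=(2, 8, 8, 4)),
                                       dtype=tf.float32), axis=-1)

    # With this commit, higher-rank inputs are accepted and the loss is
    # returned per position, here with shape (2, 8, 8).
    loss = sparse_categorical_focal_loss(y_true, y_pred, gamma=2)
    print(loss.shape)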

2 files changed: +168 -22 lines

src/focal_loss/_categorical_focal_loss.py

Lines changed: 58 additions & 18 deletions
@@ -6,11 +6,16 @@
 # | | | (_) | | (__ | (_| | | | | | | (_) | \__ \ \__ \
 # |_| \___/ \___| \__,_| |_| |_| \___/ |___/ |___/
 
+import itertools
+
 import tensorflow as tf
 
+_EPSILON = tf.keras.backend.epsilon()
+
 
 def sparse_categorical_focal_loss(y_true, y_pred, gamma, *,
-                                  from_logits: bool = False) -> tf.Tensor:
+                                  from_logits: bool = False, axis: int = -1
+                                  ) -> tf.Tensor:
     r"""Focal loss function for multiclass classification with integer labels.
 
     This loss function generalizes multiclass softmax cross-entropy by
@@ -46,10 +51,10 @@ def sparse_categorical_focal_loss(y_true, y_pred, gamma, *,
 
     Parameters
     ----------
-    y_true : tensor-like, shape (N,)
+    y_true : tensor-like
         Integer class labels.
 
-    y_pred : tensor-like, shape (N, K)
+    y_pred : tensor-like
         Either probabilities or logits, depending on the `from_logits`
         parameter.
 
@@ -63,6 +68,9 @@ def sparse_categorical_focal_loss(y_true, y_pred, gamma, *,
     from_logits : bool, optional
         Whether `y_pred` contains logits or probabilities.
 
+    axis : int, optional
+        Channel axis in the `y_pred` tensor.
+
     Returns
     -------
     :class:`tf.Tensor`
@@ -103,33 +111,64 @@ def sparse_categorical_focal_loss(y_true, y_pred, gamma, *,
         A wrapper around this function that makes it a
         :class:`tf.keras.losses.Loss`.
     """
+    # Process focusing parameter
     gamma = tf.convert_to_tensor(gamma, dtype=tf.dtypes.float32)
-    scalar_gamma = gamma.shape == []
+    gamma_rank = gamma.shape.rank
+    scalar_gamma = gamma_rank == 0
 
+    # Process prediction tensor
     y_pred = tf.convert_to_tensor(y_pred)
-    y_true = tf.dtypes.cast(y_true, dtype=tf.dtypes.int32)
-    base_loss = tf.keras.backend.sparse_categorical_crossentropy(
-        target=y_true, output=y_pred, from_logits=from_logits)
+    y_pred_rank = y_pred.shape.rank
+    if y_pred_rank is not None:
+        axis %= y_pred_rank
+        if axis != y_pred_rank - 1:
+            # Put channel axis last for sparse_softmax_cross_entropy_with_logits
+            perm = list(itertools.chain(range(axis),
+                                        range(axis + 1, y_pred_rank), [axis]))
+            y_pred = tf.transpose(y_pred, perm=perm)
+    elif axis != -1:
+        raise ValueError(
+            f'Cannot compute sparse categorical focal loss with axis={axis} on '
+            'a prediction tensor with statically unknown rank.')
+    y_pred_shape = tf.shape(y_pred)
+
+    # Process ground truth tensor
+    y_true = tf.dtypes.cast(y_true, dtype=tf.dtypes.int64)
+    y_true_rank = y_true.shape.rank
+
+    if y_true_rank is None:
+        raise NotImplementedError('Sparse categorical focal loss not supported '
+                                  'for target/label tensors of unknown rank')
+
+    reshape_needed = (y_true_rank is not None and y_pred_rank is not None and
+                      y_pred_rank != y_true_rank + 1)
+    if reshape_needed:
+        y_true = tf.reshape(y_true, [-1])
+        y_pred = tf.reshape(y_pred, [-1, y_pred_shape[-1]])
 
     if from_logits:
+        logits = y_pred
         probs = tf.nn.softmax(y_pred, axis=-1)
     else:
         probs = y_pred
-    batch_size = tf.shape(y_true)[0]
+        logits = tf.math.log(tf.clip_by_value(y_pred, _EPSILON, 1 - _EPSILON))
 
-    # For some reason y_true becomes shaped like (batch, 1) during training, so
-    # the next line is a hack to ensure it's always rank 1 (needed for stacking)
-    y_true = tf.cond(tf.rank(y_true) == 1, lambda: y_true, lambda: y_true[:, 0])
+    xent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=y_true,
+        logits=logits,
+    )
 
-    indices = tf.stack([tf.range(batch_size), y_true], axis=1)
-    probs = tf.gather_nd(probs, indices)
+    y_true_rank = y_true.shape.rank
+    probs = tf.gather(probs, y_true, axis=-1, batch_dims=y_true_rank)
+    if not scalar_gamma:
+        gamma = tf.gather(gamma, y_true, axis=0, batch_dims=y_true_rank)
+    focal_modulation = (1 - probs) ** gamma
+    loss = focal_modulation * xent_loss
 
-    if scalar_gamma:
-        focal_modulation = (1 - probs) ** gamma
-    else:
-        focal_modulation = (1 - probs) ** tf.gather(gamma, y_true)
+    if reshape_needed:
+        loss = tf.reshape(loss, y_pred_shape[:-1])
 
-    return focal_modulation * base_loss
+    return loss
 
 
 @tf.keras.utils.register_keras_serializable()
@@ -198,6 +237,7 @@ class SparseCategoricalFocalLoss(tf.keras.losses.Loss):
     The function that performs the focal loss computation, taking a label
     tensor and a prediction tensor and outputting a loss.
     """
+
     def __init__(self, gamma, from_logits: bool = False, **kwargs):
         super().__init__(**kwargs)
         self.gamma = gamma
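
The heart of the refactor is replacing the old tf.stack + tf.gather_nd indexing, which assumed rank-1 labels, with tf.gather using batch_dims, and computing the cross-entropy term via tf.nn.sparse_softmax_cross_entropy_with_logits rather than the Keras backend helper. A small sketch with assumed toy values, checking that the two indexing styles agree in the original rank-1 case:

    import tensorflow as tf

    # Toy per-class probabilities and integer labels (values assumed).
    probs = tf.constant([[0.7, 0.2, 0.1],
                         [0.1, 0.8, 0.1]])
    y_true = tf.constant([0, 1])

    # Old approach: build (row, label) index pairs, then gather_nd.
    indices = tf.stack([tf.range(tf.shape(y_true)[0]), y_true], axis=1)
    old = tf.gather_nd(probs, indices)

    # New approach: batched gather along the channel axis; this also
    # generalizes to labels of any rank.
    new = tf.gather(probs, y_true, axis=-1, batch_dims=y_true.shape.rank)

    print(old.numpy(), new.numpy())  # both [0.7 0.8]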

src/focal_loss/tests/test_sparse_categorical_focal_loss.py

Lines changed: 110 additions & 4 deletions
@@ -53,20 +53,34 @@
 
 
 def numpy_sparse_categorical_focal_loss(y_true, y_pred, gamma,
-                                        from_logits=False):
+                                        from_logits=False, axis=-1):
     """Simple sparse categorical focal loss implementation using NumPy."""
-    # Convert to arrays
     y_true = np.asarray(y_true)
     y_pred = np.asarray(y_pred)
 
+    if axis != -1:
+        pred_dim = np.ndim(y_pred)
+        axes = list(range(axis)) + list(range(axis + 1, pred_dim)) + [axis]
+        y_pred = np.transpose(y_pred, axes)
+
+    y_pred_shape_original = y_pred.shape
+    n_classes = y_pred_shape_original[-1]
+    y_true = np.reshape(y_true, newshape=[-1])
+    y_pred = np.reshape(y_pred, newshape=[-1, n_classes])
+
     # One-hot encoding of integer labels
-    y_true_one_hot = np.eye(y_pred.shape[-1])[y_true]
+    y_true_one_hot = np.eye(n_classes)[y_true]
 
     if from_logits:
         y_pred = softmax(y_pred, axis=-1)
+    else:
+        y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
 
     loss = -y_true_one_hot * (1 - y_pred) ** gamma * np.log(y_pred)
-    return loss.sum(axis=-1)
+    loss = np.sum(loss, axis=-1)
+    loss = np.reshape(loss, y_pred_shape_original[:-1])
+
+    return loss
 
 
 def get_dummy_sparse_multiclass_classifier(n_features, n_classes, gamma,
@@ -250,3 +264,95 @@ def test_save_and_restore(self, gamma, from_logits):
 
         # Delete the created SavedModel directory
         shutil.rmtree(sm_filepath, ignore_errors=True)
+
+    def test_with_higher_rank_inputs(self):
+        """Addresses https://github.com/artemmavrin/focal-loss/issues/5"""
+
+        def build_model():
+            return tf.keras.Sequential([
+                tf.keras.layers.Input((100, 10)),
+                tf.keras.layers.GRU(13, return_sequences=True),
+                tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(13)),
+            ])
+
+        x = np.zeros((20, 100, 10))
+        y = np.ones((20, 100, 1))
+
+        model = build_model()
+        loss = SparseCategoricalFocalLoss(gamma=2)
+        model.compile(loss=loss, optimizer='adam')
+        model.fit(x, y)
+
+    @named_parameters_with_testcase_names(axis=[0, 1, 2],
+                                          from_logits=[False, True])
+    def test_reduce_to_keras_with_higher_rank_and_axis(self, axis, from_logits):
+        labels = tf.convert_to_tensor([[0, 1, 2], [0, 0, 0], [1, 1, 1]],
+                                      dtype=tf.dtypes.int64)
+        logits = tf.reshape(tf.range(27, dtype=tf.dtypes.float32),
+                            shape=[3, 3, 3])
+        probs = tf.nn.softmax(logits, axis=axis)
+
+        y_pred = logits if from_logits else probs
+        keras_loss = tf.keras.losses.sparse_categorical_crossentropy(
+            labels, y_pred, from_logits=from_logits, axis=axis)
+        focal_loss = sparse_categorical_focal_loss(
+            labels, y_pred, gamma=0, from_logits=from_logits, axis=axis)
+        self.assertAllClose(focal_loss, keras_loss)
+
+    @named_parameters_with_testcase_names(gamma=[0, 1, 2], axis=[0, 1, 2],
+                                          from_logits=[False, True])
+    def test_higher_rank_sanity_checks(self, gamma, axis, from_logits):
+        labels = tf.convert_to_tensor([[0, 1, 2], [0, 0, 0], [1, 1, 1]],
+                                      dtype=tf.dtypes.int64)
+        logits = tf.reshape(tf.range(27, dtype=tf.dtypes.float32),
+                            shape=[3, 3, 3])
+        probs = tf.nn.softmax(logits, axis=axis)
+
+        y_pred = logits if from_logits else probs
+        numpy_loss = numpy_sparse_categorical_focal_loss(
+            labels, y_pred, gamma=gamma, from_logits=from_logits, axis=axis)
+        focal_loss = sparse_categorical_focal_loss(
+            labels, y_pred, gamma=gamma, from_logits=from_logits, axis=axis)
+        self.assertAllClose(focal_loss, numpy_loss)
+
+    @named_parameters_with_testcase_names(gamma=[0, 1, 2],
+                                          from_logits=[False, True])
+    def test_with_dynamic_ranks(self, gamma, from_logits):
+        # y_true must have defined rank
+        y_true = tf.keras.backend.placeholder(None, dtype=tf.int64)
+        y_pred = tf.keras.backend.placeholder((None, 2), dtype=tf.float32)
+        with self.assertRaises(NotImplementedError):
+            sparse_categorical_focal_loss(y_true, y_pred, gamma=gamma,
+                                          from_logits=from_logits)
+
+        # If axis is specified, y_pred must have a defined rank
+        y_true = tf.keras.backend.placeholder((None,), dtype=tf.int64)
+        y_pred = tf.keras.backend.placeholder(None, dtype=tf.float32)
+        with self.assertRaises(ValueError):
+            sparse_categorical_focal_loss(y_true, y_pred, gamma=gamma,
+                                          from_logits=from_logits, axis=0)
+
+        # It's fine if y_pred has undefined rank when axis=-1
+        graph = tf.Graph()
+        with graph.as_default():
+            y_true = tf.keras.backend.placeholder((None,), dtype=tf.int64)
+            y_pred = tf.keras.backend.placeholder(None, dtype=tf.float32)
+            focal_loss = sparse_categorical_focal_loss(y_true, y_pred,
                                                        gamma=gamma,
+                                                       from_logits=from_logits)
+
+        labels = [0, 0, 1]
+        logits = [[10., 0.], [5., -5.], [0., 10.]]
+        probs = softmax(logits, axis=-1)
+
+        pred = logits if from_logits else probs
+        loss_numpy = numpy_sparse_categorical_focal_loss(
+            labels, pred, gamma=gamma, from_logits=from_logits)
+
+        with tf.compat.v1.Session(graph=graph) as sess:
+            loss = sess.run(focal_loss,
+                            feed_dict={y_true: labels, y_pred: pred})
+
+        self.assertAllClose(loss, loss_numpy)
+
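
Two properties these tests pin down are worth restating: with gamma=0 the focal modulation (1 - p)**0 is identically 1, so the loss must reduce to ordinary sparse categorical cross-entropy, and the result must match the NumPy reference for every gamma and axis. A quick eager-mode sketch of the first property, with assumed toy shapes:

    import tensorflow as tf

    from focal_loss import sparse_categorical_focal_loss

    # Rank-2 labels and rank-3 logits with the channel axis last.
    labels = tf.constant([[0, 1, 2], [1, 1, 1]], dtype=tf.int64)
    logits = tf.random.normal([2, 3, 4])

    # gamma=0 turns the modulating factor into 1, so focal loss and plain
    # sparse categorical cross-entropy should coincide elementwise.
    focal = sparse_categorical_focal_loss(labels, logits, gamma=0,
                                          from_logits=True)
    keras = tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    tf.debugging.assert_near(focal, keras)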
