Skip to content

Commit 79fefa2

Browse files
authored
Perplexity: add clipping and from_logits (#47)
It was pointed out that Perplexity returns NaNs for negative values. This is because our implementation did not clip logit values to [0, 1], whereas the Keras implementation does. [1] Even with that fix, the tests were failing because Keras defaults to the TensorFlow version of the metric, which applies softmax to the outputs unconditionally [2], unlike the JAX implementation which does not. [3] I also added a `from_logits` arg, similar to Keras, for users who want to pass raw logits and have us apply softmax internally. [1] https://github.com/keras-team/keras/blob/3f8b065e82b17884bd43fcfbd4bd79f18a7019fe/keras/src/backend/jax/nn.py#L582 [2] https://www.tensorflow.org/api_docs/python/tf/nn/sparse_softmax_cross_entropy_with_logits [3] https://github.com/keras-team/keras/blob/3f8b065e82b17884bd43fcfbd4bd79f18a7019fe/keras/src/backend/jax/nn.py#L578-L579
1 parent c698d95 commit 79fefa2

File tree

5 files changed

+46
-9
lines changed

5 files changed

+46
-9
lines changed

src/metrax/base_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414

1515
"""Tests for metrax base utilities."""
1616

17+
import os
18+
os.environ['KERAS_BACKEND'] = 'jax'
19+
1720
from absl.testing import absltest
1821
from absl.testing import parameterized
1922
import jax.numpy as jnp

src/metrax/nlp_metrics.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def from_model_output(
6868
predictions: jax.Array,
6969
labels: jax.Array,
7070
sample_weights: jax.Array | None = None,
71+
from_logits: bool = False,
7172
) -> 'Perplexity':
7273
"""Updates the metric.
7374
@@ -78,6 +79,9 @@ def from_model_output(
7879
labels: True value. The shape should be (batch_size, seq_len).
7980
sample_weights: An optional tensor representing the
8081
weight of each token. The shape should be (batch_size, seq_len).
82+
from_logits: Whether the predictions are logits. If True, the predictions
83+
are converted to probabilities using a softmax. If False, all values
84+
outside of [0, 1] are clipped to 0 or 1.
8185
8286
Returns:
8387
Updated Perplexity metric.
@@ -86,11 +90,17 @@ def from_model_output(
8690
ValueError: If type of `labels` is wrong or the shapes of `predictions`
8791
and `labels` are incompatible.
8892
"""
89-
predictions = base.divide_no_nan(
90-
predictions, jnp.sum(predictions, axis=-1, keepdims=True)
91-
)
93+
if from_logits:
94+
log_prob = jax.nn.log_softmax(predictions, axis=-1)
95+
else:
96+
predictions = base.divide_no_nan(
97+
predictions, jnp.sum(predictions, axis=-1, keepdims=True)
98+
)
99+
epsilon = 1e-7
100+
predictions = jnp.clip(predictions, epsilon, 1.0 - epsilon)
101+
log_prob = jnp.log(predictions)
102+
92103
labels_one_hot = jax.nn.one_hot(labels, predictions.shape[-1], axis=-1)
93-
log_prob = jnp.log(predictions)
94104
crossentropy = -jnp.sum(labels_one_hot * log_prob, axis=-1)
95105

96106
# Sum across sequence length dimension first.
@@ -227,4 +237,4 @@ def _levenshtein_distance(prediction: list, reference: list) -> int:
227237
distance_matrix[i - 1][j - 1] + cost, # substitution
228238
)
229239

230-
return distance_matrix[m][n]
240+
return distance_matrix[m][n]

src/metrax/nlp_metrics_test.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,18 @@
1414

1515
"""Tests for metrax nlp metrics."""
1616

17+
import os
18+
os.environ['KERAS_BACKEND'] = 'jax'
19+
1720
from absl.testing import absltest
1821
from absl.testing import parameterized
1922
import jax.numpy as jnp
2023
import keras_hub
2124
import metrax
2225
import numpy as np
2326

27+
np.random.seed(42)
28+
2429

2530
class NlpMetricsTest(parameterized.TestCase):
2631

@@ -42,17 +47,33 @@ def test_wer_empty(self):
4247
np.random.randint(10, size=[2, 5, 10]),
4348
np.random.uniform(size=(2, 5, 10, 20)),
4449
None,
50+
False,
4551
),
4652
(
4753
'weighted',
4854
np.random.randint(10, size=[2, 5, 10]),
4955
np.random.uniform(size=(2, 5, 10, 20)),
5056
np.random.randint(2, size=(2, 5, 10)).astype(np.float32),
57+
False,
58+
),
59+
(
60+
'negative_values',
61+
np.random.randint(10, size=[2, 5, 10]),
62+
np.random.uniform(size=(2, 5, 10, 20), low=-2, high=2),
63+
None,
64+
False,
65+
),
66+
(
67+
'from_logits',
68+
np.random.randint(10, size=[2, 5, 10]),
69+
np.random.uniform(size=(2, 5, 10, 20), low=-2, high=2),
70+
None,
71+
True,
5172
),
5273
)
53-
def test_perplexity(self, y_true, y_pred, sample_weights):
74+
def test_perplexity(self, y_true, y_pred, sample_weights, from_logits):
5475
"""Test that `Perplexity` Metric computes correct values."""
55-
keras_metric = keras_hub.metrics.Perplexity()
76+
keras_metric = keras_hub.metrics.Perplexity(from_logits=from_logits)
5677
metrax_metric = None
5778
for index, (labels, logits) in enumerate(zip(y_true, y_pred)):
5879
weights = sample_weights[index] if sample_weights is not None else None
@@ -61,6 +82,7 @@ def test_perplexity(self, y_true, y_pred, sample_weights):
6182
predictions=logits,
6283
labels=labels,
6384
sample_weights=weights,
85+
from_logits=from_logits,
6486
)
6587
metrax_metric = update if metrax_metric is None else metrax_metric.merge(
6688
update
@@ -119,4 +141,4 @@ def test_wer(self):
119141

120142

121143
if __name__ == '__main__':
122-
absltest.main()
144+
absltest.main()

src/metrax/regression_metrics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def merge(self, other: 'RSQUARED') -> 'RSQUARED':
203203
)
204204

205205
def compute(self) -> jax.Array:
206-
"""Computes the r-squared score.
206+
r"""Computes the r-squared score.
207207
208208
Since we don't know the mean of the labels before we aggregate all of the
209209
data, we will manipulate the formula to be:

src/metrax/regression_metrics_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
"""Tests for metrax regression metrics."""
1616

1717
import os
18+
os.environ['KERAS_BACKEND'] = 'jax'
19+
1820
from absl.testing import absltest
1921
from absl.testing import parameterized
2022
import jax

0 commit comments

Comments (0)