@@ -536,4 +536,76 @@ def compute(self) -> jax.Array:
536536 self .false_positives , self .false_positives + self .true_negatives
537537 )
538538 # Threshold goes from 0 to 1, so trapezoid is negative.
539- return jnp .trapezoid (tp_rate , fp_rate ) * - 1
539+ return jnp .trapezoid (tp_rate , fp_rate ) * - 1
540+
541+
@flax.struct.dataclass
class Perplexity(clu_metrics.Metric):
  """Perplexity metric for sequence-generation models.

  Perplexity is the exponentiated cross-entropy between the model's
  predicted token distributions and the true tokens. Lower is better:
  it can be read as the weighted average branching factor of the model,
  i.e. how many equally likely tokens the model is choosing between at
  each step.

  State is kept as a running sum of per-batch mean cross-entropies
  (weighted by batch size) plus a running sample count, so shards can be
  merged and the final value computed once at the end.
  """

  # Sum over batches of (batch_size * mean cross-entropy of that batch).
  aggregate_crossentropy: jax.Array
  # Total number of sequences accumulated so far.
  num_samples: jax.Array

  @classmethod
  def from_model_output(
      cls,
      predictions: jax.Array,
      labels: jax.Array,
      sample_weights: jax.Array | None = None,
  ) -> 'Perplexity':
    """Builds a metric update from one batch of model output.

    Args:
      predictions: Predicted token distributions of shape
        (batch_size, seq_len, vocab_size). They are renormalized along the
        vocab axis, so they need not sum to exactly 1.
        NOTE(review): a zero probability at the true token yields
        log(0) = -inf here — assumed not to occur; confirm upstream.
      labels: Integer token ids of shape (batch_size, seq_len).
      sample_weights: Optional per-token weights of shape
        (batch_size, seq_len). When given, the cross-entropy is the
        weighted mean over all tokens in the batch.

    Returns:
      A Perplexity update carrying this batch's aggregate cross-entropy
      and sample count.
    """
    # Renormalize so every vocab-axis slice is a proper distribution.
    probs = predictions / jnp.sum(predictions, axis=-1, keepdims=True)
    # Select the log-probability of each true token with a one-hot mask.
    one_hot = jax.nn.one_hot(labels, probs.shape[-1], axis=-1)
    token_nll = -jnp.sum(one_hot * jnp.log(probs), axis=-1)

    if sample_weights is None:
      # Unweighted: plain mean over every token in the batch.
      mean_nll = jnp.mean(token_nll)
    else:
      # Weighted mean over tokens, normalized by the total weight.
      # NOTE(review): all-zero weights would divide by zero — confirm
      # callers guarantee a positive weight sum.
      weighted = token_nll * sample_weights
      mean_nll = jnp.sum(weighted) / jnp.sum(sample_weights)

    count = jnp.array(labels.shape[0])
    return cls(
        aggregate_crossentropy=count * mean_nll,
        num_samples=count,
    )

  def merge(self, other: 'Perplexity') -> 'Perplexity':
    """Combines two accumulated states by summing both running totals."""
    combined_ce = self.aggregate_crossentropy + other.aggregate_crossentropy
    combined_n = self.num_samples + other.num_samples
    return type(self)(
        aggregate_crossentropy=combined_ce,
        num_samples=combined_n,
    )

  def compute(self) -> jax.Array:
    """Returns exp of the average per-sequence cross-entropy."""
    return jnp.exp(self.aggregate_crossentropy / self.num_samples)
0 commit comments