Skip to content

Commit a767587

Browse files
committed
Add WER metric
1 parent 61e5dc2 commit a767587

File tree

3 files changed

+244
-91
lines changed

3 files changed

+244
-91
lines changed

src/metrax/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
)
2121
from metrax.nlp_metrics import (
2222
Perplexity,
23+
WER
2324
)
2425
from metrax.ranking_metrics import (
2526
AveragePrecisionAtK,
@@ -40,4 +41,5 @@
4041
"Recall",
4142
"RMSE",
4243
"RSQUARED",
44+
"WER",
4345
]

src/metrax/nlp_metrics.py

Lines changed: 193 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -22,95 +22,197 @@
2222

2323
@flax.struct.dataclass
class Perplexity(clu_metrics.Metric):
  r"""Computes perplexity for sequence generation.

  Perplexity is a measurement of how well a probability distribution predicts a
  sample. It is defined as the exponentiation of the cross-entropy. A low
  perplexity indicates the probability distribution is good at predicting the
  sample.

  For language models, it can be interpreted as the weighted average branching
  factor of the model - how many equally likely words can be selected at each
  step.

  Given a sequence of :math:`N` tokens, perplexity is calculated as:

  .. math::
      Perplexity = \exp\left(-\frac{1}{N}\sum_{i=1}^{N} \log P(x_i|x_{<i})\right)

  When sample weights :math:`w_i` are provided:

  .. math::
      Perplexity = \exp\left(-\frac{\sum_{i=1}^{N} w_i\log P(x_i|x_{<i})}{\sum_{i=1}^{N} w_i}\right)

  where:
    - :math:`P(x_i|x_{<i})` is the predicted probability of token :math:`x_i`
      given previous tokens
    - :math:`w_i` are sample weights
    - :math:`N` is the sequence length

  Lower perplexity indicates better prediction - the model is less "perplexed"
  by the data.
  """

  # Sum over batches of (batch_size * mean cross-entropy of the batch).
  aggregate_crossentropy: jax.Array
  # Total number of sequences accumulated across merged updates.
  num_samples: jax.Array

  @classmethod
  def empty(cls) -> "Perplexity":
    """Returns a zero-valued metric suitable as a merge identity."""
    return cls(
        aggregate_crossentropy=jnp.array(0, jnp.float32),
        num_samples=jnp.array(0, jnp.float32),
    )

  @classmethod
  def from_model_output(
      cls,
      predictions: jax.Array,
      labels: jax.Array,
      sample_weights: jax.Array | None = None,
  ) -> "Perplexity":
    """Computes a Perplexity update from one batch of model outputs.

    Args:
      predictions: A floating point tensor of per-token scores generated from
        the model; it is renormalized over the vocabulary axis here, so the
        scores need not sum to one. The shape should be (batch_size, seq_len,
        vocab_size).
      labels: True token ids. The shape should be (batch_size, seq_len).
      sample_weights: An optional tensor representing the weight of each
        token. The shape should be (batch_size, seq_len).

    Returns:
      New Perplexity metric for this batch; combine batches with `merge`.
    """
    # Renormalize so each vocabulary slice is a probability distribution.
    predictions = predictions / jnp.sum(predictions, axis=-1, keepdims=True)
    labels_one_hot = jax.nn.one_hot(labels, predictions.shape[-1], axis=-1)
    log_prob = jnp.log(predictions)
    # Per-token negative log-likelihood of the true token.
    crossentropy = -jnp.sum(labels_one_hot * log_prob, axis=-1)

    if sample_weights is not None:
      crossentropy = crossentropy * sample_weights
      # Normalize by the total weight instead of the token count.
      crossentropy = jnp.sum(crossentropy) / jnp.sum(sample_weights)
    else:
      crossentropy = jnp.mean(crossentropy)

    # Scale by batch size so `merge` + `compute` yield a per-sequence mean.
    batch_size = jnp.array(labels.shape[0])
    return cls(
        aggregate_crossentropy=(batch_size * crossentropy),
        num_samples=batch_size,
    )

  def merge(self, other: "Perplexity") -> "Perplexity":
    """Accumulates another Perplexity update into this one."""
    return type(self)(
        aggregate_crossentropy=(
            self.aggregate_crossentropy + other.aggregate_crossentropy
        ),
        num_samples=self.num_samples + other.num_samples,
    )

  def compute(self) -> jax.Array:
    """Returns exp(mean cross-entropy) over all merged batches."""
    return jnp.exp(self.aggregate_crossentropy / self.num_samples)
118+
119+
120+
@flax.struct.dataclass
class WER(clu_metrics.Average):
  r"""Computes Word Error Rate (WER) for speech recognition or text generation tasks.

  Word Error Rate measures the edit distance between reference texts and
  predictions, normalized by the length of the reference texts. It is
  calculated as:

  .. math::
      WER = \frac{S + D + I}{N}

  where:
    - S is the number of substitutions
    - D is the number of deletions
    - I is the number of insertions
    - N is the number of words in the reference

  A lower WER indicates better performance, with 0 being perfect.

  This implementation accepts both pre-tokenized inputs (lists of tokens) and
  untokenized strings. When strings are provided, they are tokenized by
  splitting on whitespace.
  """

  @classmethod
  def from_model_output(
      cls,
      predictions: str | list[str],
      references: str | list[str],
  ) -> "WER":
    """Computes a WER update for one prediction/reference pair.

    Args:
      predictions: Either a string or a list of tokens in the predicted
        sequence. A string is tokenized by splitting on whitespace.
      references: Either a string or a list of tokens in the reference
        sequence. A string is tokenized by splitting on whitespace.

    Returns:
      New WER metric instance.

    Raises:
      ValueError: If `predictions` or `references` is empty.
    """
    if not predictions or not references:
      raise ValueError("predictions and references must not be empty")

    if isinstance(predictions, str):
      predictions = predictions.split()
    if isinstance(references, str):
      references = references.split()

    edit_distance = cls._levenshtein_distance(predictions, references)
    reference_length = len(references)

    # `compute()` (inherited from Average) divides total by count, so merging
    # per-pair updates yields a WER weighted by each reference's length.
    return cls(
        total=jnp.array(edit_distance, dtype=jnp.float32),
        count=jnp.array(reference_length, dtype=jnp.float32),
    )

  @staticmethod
  def _levenshtein_distance(prediction: list, reference: list) -> int:
    """Computes the Levenshtein (edit) distance between two token sequences.

    Uses a two-row dynamic program, so memory is O(len(reference)) rather
    than O(len(prediction) * len(reference)).

    Args:
      prediction: List of tokens in the predicted sequence.
      reference: List of tokens in the reference sequence.

    Returns:
      The minimum number of edits needed to transform prediction into
      reference.
    """
    m, n = len(prediction), len(reference)

    # Handle edge cases: transforming to/from an empty sequence costs the
    # length of the other sequence.
    if m == 0:
      return n
    if n == 0:
      return m

    # previous_row[j] is the distance between prediction[:i-1] and
    # reference[:j]; row 0 is the cost of inserting j reference tokens.
    previous_row = list(range(n + 1))
    for i in range(1, m + 1):
      current_row = [i] + [0] * n
      for j in range(1, n + 1):
        cost = 0 if prediction[i - 1] == reference[j - 1] else 1
        current_row[j] = min(
            previous_row[j] + 1,  # deletion
            current_row[j - 1] + 1,  # insertion
            previous_row[j - 1] + cost,  # substitution
        )
      previous_row = current_row

    return previous_row[n]

src/metrax/nlp_metrics_test.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ def test_perplexity_empty(self):
3030
self.assertEqual(m.aggregate_crossentropy, jnp.array(0, jnp.float32))
3131
self.assertEqual(m.num_samples, jnp.array(0, jnp.float32))
3232

33+
def test_wer_empty(self):
  """Tests the `empty` method of the `WER` class."""
  metric = metrax.WER.empty()
  zero = jnp.array(0, jnp.float32)
  self.assertEqual(metric.total, zero)
  self.assertEqual(metric.count, zero)
38+
3339
@parameterized.named_parameters(
3440
(
3541
'basic',
@@ -68,6 +74,49 @@ def test_perplexity(self, y_true, y_pred, sample_weights):
6874
atol=1e-05,
6975
)
7076

77+
def test_wer(self):
  """Tests that WER metric computes correct values with tokenized inputs."""
  string_preds = [
      "the cat sat on the mat",
      "a quick brown fox jumps over the lazy dog",
      "hello world",
  ]
  string_refs = [
      "the cat sat on the hat",
      "the quick brown fox jumps over the lazy dog",
      "hello beautiful world",
  ]
  tokenized_preds = [sentence.split() for sentence in string_preds]
  tokenized_refs = [sentence.split() for sentence in string_refs]

  # Accumulate per-pair metrax updates and mirror them into the keras_hub
  # reference metric.
  metrax_token_metric = None
  keras_metric = keras_hub.metrics.EditDistance(normalize=True)
  for pred, ref in zip(tokenized_preds, tokenized_refs):
    update = metrax.WER.from_model_output(pred, ref)
    keras_metric.update_state(ref, pred)
    if metrax_token_metric is None:
      metrax_token_metric = update
    else:
      metrax_token_metric = metrax_token_metric.merge(update)

  np.testing.assert_allclose(
      metrax_token_metric.compute(),
      keras_metric.result(),
      rtol=1e-05,
      atol=1e-05,
      err_msg="String-based WER should match keras_hub EditDistance",
  )

  # Feed the raw (untokenized) strings and check they agree with the
  # tokenized result.
  metrax_string_metric = None
  for pred, ref in zip(string_preds, string_refs):
    update = metrax.WER.from_model_output(predictions=pred, references=ref)
    if metrax_string_metric is None:
      metrax_string_metric = update
    else:
      metrax_string_metric = metrax_string_metric.merge(update)

  np.testing.assert_allclose(
      metrax_string_metric.compute(),
      metrax_token_metric.compute(),
      rtol=1e-05,
      atol=1e-05,
      err_msg="String input and tokenized input should produce the same WER",
  )
119+
71120

72121
if __name__ == '__main__':
73122
absltest.main()

0 commit comments

Comments
 (0)