Skip to content

Commit 8876108

Browse files
committed
Add WER metric
1 parent 61e5dc2 commit 8876108

File tree

3 files changed

+152
-0
lines changed

3 files changed

+152
-0
lines changed

src/metrax/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
)
2121
from metrax.nlp_metrics import (
2222
Perplexity,
23+
WER
2324
)
2425
from metrax.ranking_metrics import (
2526
AveragePrecisionAtK,
@@ -40,4 +41,5 @@
4041
"Recall",
4142
"RMSE",
4243
"RSQUARED",
44+
"WER",
4345
]

src/metrax/nlp_metrics.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,104 @@ def merge(self, other: 'Perplexity') -> 'Perplexity':
114114

115115
def compute(self) -> jax.Array:
  """Returns the perplexity: exp of the mean per-sample cross-entropy."""
  mean_crossentropy = self.aggregate_crossentropy / self.num_samples
  return jnp.exp(mean_crossentropy)
117+
118+
119+
@flax.struct.dataclass
class WER(clu_metrics.Average):
  r"""Computes Word Error Rate (WER) for speech recognition or text generation tasks.

  Word Error Rate measures the edit distance between reference texts and
  predictions, normalized by the length of the reference texts. It is
  calculated as:

  .. math::
      WER = \frac{S + D + I}{N}

  where:
      - S is the number of substitutions
      - D is the number of deletions
      - I is the number of insertions
      - N is the number of words in the reference

  A lower WER indicates better performance, with 0 being perfect.

  This implementation accepts both pre-tokenized inputs (lists of tokens) and
  untokenized strings. When strings are provided, they are tokenized by
  splitting on whitespace.
  """

  @classmethod
  def from_model_output(
      cls,
      predictions: str | list[str],
      references: str | list[str],
  ) -> "WER":
    """Creates a WER metric update for one prediction/reference pair.

    Accumulation happens through `clu_metrics.Average`: `total` carries the
    edit distance and `count` the reference length, so merged instances
    compute the corpus-level WER (total edits / total reference words).

    Args:
      predictions: Either a string or a list of tokens in the predicted
        sequence. A string is tokenized by splitting on whitespace.
      references: Either a string or a list of tokens in the reference
        sequence. A string is tokenized by splitting on whitespace.

    Returns:
      New WER metric instance.

    Raises:
      ValueError: If the reference is empty after tokenization — WER is
        undefined for an empty reference (it is the normalizing denominator).
        An empty prediction is valid: every reference word then counts as a
        deletion, giving a WER of 1.0.
    """
    # Normalize to token lists BEFORE validating, so a whitespace-only
    # reference string is correctly rejected instead of producing count=0.
    if isinstance(predictions, str):
      predictions = predictions.split()
    if isinstance(references, str):
      references = references.split()

    if not references:
      raise ValueError("references must not be empty")

    edit_distance = cls._levenshtein_distance(predictions, references)

    return cls(
        total=jnp.array(edit_distance, dtype=jnp.float32),
        count=jnp.array(len(references), dtype=jnp.float32),
    )

  @staticmethod
  def _levenshtein_distance(prediction: list, reference: list) -> int:
    """Computes the Levenshtein (edit) distance between two token sequences.

    Classic dynamic program kept to two rows at a time, so memory is
    O(len(reference)) rather than O(len(prediction) * len(reference)).

    Args:
      prediction: List of tokens in the predicted sequence.
      reference: List of tokens in the reference sequence.

    Returns:
      The minimum number of single-token edits (insertions, deletions,
      substitutions) needed to transform prediction into reference.
    """
    m, n = len(prediction), len(reference)

    # Edge cases: against an empty sequence the distance is the other's length.
    if m == 0:
      return n
    if n == 0:
      return m

    # previous_row[j] = distance between prediction[:i-1] and reference[:j].
    previous_row = list(range(n + 1))
    for i in range(1, m + 1):
      current_row = [i] + [0] * n
      for j in range(1, n + 1):
        cost = 0 if prediction[i - 1] == reference[j - 1] else 1
        current_row[j] = min(
            previous_row[j] + 1,  # deletion
            current_row[j - 1] + 1,  # insertion
            previous_row[j - 1] + cost,  # substitution
        )
      previous_row = current_row

    return previous_row[n]

src/metrax/nlp_metrics_test.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ def test_perplexity_empty(self):
3030
self.assertEqual(m.aggregate_crossentropy, jnp.array(0, jnp.float32))
3131
self.assertEqual(m.num_samples, jnp.array(0, jnp.float32))
3232

33+
def test_wer_empty(self):
  """Verifies that a freshly-created empty `WER` metric has zeroed state."""
  metric = metrax.WER.empty()
  for accumulator in (metric.total, metric.count):
    self.assertEqual(accumulator, jnp.array(0, jnp.float32))
38+
3339
@parameterized.named_parameters(
3440
(
3541
'basic',
@@ -68,6 +74,49 @@ def test_perplexity(self, y_true, y_pred, sample_weights):
6874
atol=1e-05,
6975
)
7076

77+
def test_wer(self):
  """Tests that WER metric computes correct values with tokenized and untokenized inputs."""
  string_preds = [
      "the cat sat on the mat",
      "a quick brown fox jumps over the lazy dog",
      "hello world",
  ]
  string_refs = [
      "the cat sat on the hat",
      "the quick brown fox jumps over the lazy dog",
      "hello beautiful world",
  ]

  # Tokenized pass: accumulate metrax updates while mirroring each update
  # into the keras_hub reference implementation.
  keras_metric = keras_hub.metrics.EditDistance(normalize=True)
  metrax_token_metric = None
  for hyp, ref in zip(string_preds, string_refs):
    hyp_tokens = hyp.split()
    ref_tokens = ref.split()
    keras_metric.update_state(ref_tokens, hyp_tokens)
    step = metrax.WER.from_model_output(hyp_tokens, ref_tokens)
    if metrax_token_metric is None:
      metrax_token_metric = step
    else:
      metrax_token_metric = metrax_token_metric.merge(step)

  np.testing.assert_allclose(
      metrax_token_metric.compute(),
      keras_metric.result(),
      rtol=1e-05,
      atol=1e-05,
      err_msg="String-based WER should match keras_hub EditDistance",
  )

  # Untokenized pass: raw sentences must yield the identical metric value.
  metrax_string_metric = None
  for hyp, ref in zip(string_preds, string_refs):
    step = metrax.WER.from_model_output(predictions=hyp, references=ref)
    if metrax_string_metric is None:
      metrax_string_metric = step
    else:
      metrax_string_metric = metrax_string_metric.merge(step)

  np.testing.assert_allclose(
      metrax_string_metric.compute(),
      metrax_token_metric.compute(),
      rtol=1e-05,
      atol=1e-05,
      err_msg="String input and tokenized input should produce the same WER",
  )
119+
71120

72121
if __name__ == '__main__':
73122
absltest.main()

0 commit comments

Comments
 (0)