Skip to content

Commit 5cbea54

Browse files
committed
Add WER metric (#28)
1 parent 44644ee commit 5cbea54

File tree

3 files changed

+245
-0
lines changed

3 files changed

+245
-0
lines changed

src/metrax/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
RMSE,
2222
RSQUARED,
2323
Recall,
24+
WER,
2425
)
2526

2627
__all__ = [
@@ -32,4 +33,5 @@
3233
"Recall",
3334
"AUCPR",
3435
"AUCROC",
36+
"WER",
3537
]

src/metrax/metrics.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -795,3 +795,140 @@ def merge(self, other: 'Perplexity') -> 'Perplexity':
795795

796796
def compute(self) -> jax.Array:
797797
return jnp.exp(self.aggregate_crossentropy / self.num_samples)
798+
799+
800+
@flax.struct.dataclass
class WER(clu_metrics.Metric):
  r"""Computes Word Error Rate (WER) for speech recognition or text generation tasks.

  Word Error Rate measures the edit distance between reference texts and
  predictions, normalized by the length of the reference texts. It is
  calculated as:

  .. math::
      WER = \frac{S + D + I}{N}

  where:
    - S is the number of substitutions
    - D is the number of deletions
    - I is the number of insertions
    - N is the number of words in the reference

  A lower WER indicates better performance, with 0 being perfect.

  Attributes:
    total_edit_distance: Sum of edit distances across all samples.
    total_reference_length: Sum of reference lengths across all samples.
  """

  total_edit_distance: jax.Array
  total_reference_length: jax.Array

  @classmethod
  def empty(cls) -> 'WER':
    """Returns a WER metric with zeroed accumulators."""
    return cls(
        total_edit_distance=jnp.array(0, jnp.float32),
        total_reference_length=jnp.array(0, jnp.float32))

  @classmethod
  def from_model_output(
      cls,
      predictions: list[str] | list[list],
      references: list[str] | list[list],
  ) -> 'WER':
    """Updates the metric.

    Args:
      predictions: A list of predicted texts/transcriptions or tokenized
        sequences. Strings are tokenized on whitespace; lists are used as-is.
      references: A list of reference texts/transcriptions or tokenized
        sequences, aligned index-by-index with ``predictions``.

    Returns:
      Updated WER metric.

    Raises:
      ValueError: If inputs are empty or their lengths do not match.
    """
    if not predictions or not references:
      raise ValueError('predictions and references must not be empty')

    if len(predictions) != len(references):
      raise ValueError(
          f'Length mismatch: predictions has {len(predictions)} items, '
          f'but references has {len(references)} items'
      )

    total_edit_distance = 0
    total_reference_length = 0

    for pred, ref in zip(predictions, references):
      # Strings need whitespace tokenization; token lists are used directly.
      pred_tokens = pred.split() if isinstance(pred, str) else pred
      ref_tokens = ref.split() if isinstance(ref, str) else ref

      total_edit_distance += cls._levenshtein_distance(pred_tokens, ref_tokens)
      total_reference_length += len(ref_tokens)

    return cls(
        total_edit_distance=jnp.array(total_edit_distance, dtype=jnp.float32),
        total_reference_length=jnp.array(
            total_reference_length, dtype=jnp.float32),
    )

  @staticmethod
  def _levenshtein_distance(prediction: list, reference: list) -> int:
    """Computes the Levenshtein (edit) distance between two token sequences.

    Uses the two-row dynamic program, which needs only O(min(m, n)) memory
    instead of the full O(m * n) distance matrix while producing the same
    result.

    Args:
      prediction: List of tokens in the predicted sequence.
      reference: List of tokens in the reference sequence.

    Returns:
      The minimum number of edits needed to transform prediction into
      reference.
    """
    # Levenshtein distance is symmetric, so keep the DP row on the shorter
    # sequence to minimize memory.
    if len(prediction) < len(reference):
      prediction, reference = reference, prediction

    # Edge case: an empty sequence costs one insertion per remaining token.
    if not reference:
      return len(prediction)

    # previous_row[j] = distance between prediction[:i-1] and reference[:j].
    previous_row = list(range(len(reference) + 1))
    for i, pred_token in enumerate(prediction, start=1):
      current_row = [i]  # distance from prediction[:i] to empty reference
      for j, ref_token in enumerate(reference, start=1):
        cost = 0 if pred_token == ref_token else 1
        current_row.append(min(
            previous_row[j] + 1,        # deletion
            current_row[j - 1] + 1,     # insertion
            previous_row[j - 1] + cost  # substitution
        ))
      previous_row = current_row

    return previous_row[-1]

  def merge(self, other: 'WER') -> 'WER':
    """Accumulates statistics from another WER instance of the same metric."""
    return type(self)(
        total_edit_distance=(
            self.total_edit_distance + other.total_edit_distance),
        total_reference_length=(
            self.total_reference_length + other.total_reference_length),
    )

  def compute(self) -> jax.Array:
    """Returns aggregate WER; _divide_no_nan guards the zero-length case."""
    return _divide_no_nan(
        self.total_edit_distance, self.total_reference_length
    )

src/metrax/metrics_test.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,5 +414,111 @@ def test_perplexity(self, y_true, y_pred, sample_weights):
414414
)
415415

416416

417+
def test_wer_empty(self):
418+
"""Tests the `empty` method of the `WER` class."""
419+
m = metrax.WER.empty()
420+
self.assertEqual(m.total_edit_distance, jnp.array(0, jnp.float32))
421+
self.assertEqual(m.total_reference_length, jnp.array(0, jnp.float32))
422+
423+
def test_wer(self):
424+
"""Tests that WER metric computes correct values."""
425+
# Test with string inputs
426+
predictions = [
427+
"the cat sat on the mat",
428+
"a quick brown fox jumps over the lazy dog",
429+
"hello world"
430+
]
431+
references = [
432+
"the cat sat on the hat", # 1 substitution (mat->hat), 6 total words
433+
"the quick brown fox jumps over the lazy dog", # 1 substitution (a->the), 9 total words
434+
"hello beautiful world" # 1 insertion (beautiful), 3 total words
435+
]
436+
437+
# Expected individual WERs: 1/6, 1/9, 1/3
438+
# Total edit distance: 1 + 1 + 1 = 3
439+
# Total reference length: 6 + 9 + 3 = 18
440+
# Expected WER: 3/18 = 0.1667
441+
442+
metric = None
443+
for pred, ref in zip(predictions, references):
444+
update = metrax.WER.from_model_output(
445+
predictions=[pred],
446+
references=[ref],
447+
)
448+
metric = update if metric is None else metric.merge(update)
449+
450+
np.testing.assert_allclose(
451+
metric.compute(),
452+
jnp.array(3/18, dtype=jnp.float32),
453+
rtol=1e-05,
454+
atol=1e-05,
455+
)
456+
457+
def test_wer_with_tokens(self):
458+
"""Tests that WER metric computes correct values with tokenized inputs."""
459+
# Test with token inputs (lists instead of strings)
460+
tokenized_preds = [
461+
["the", "cat", "sat", "on", "the", "mat"],
462+
["a", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"],
463+
["hello", "world"]
464+
]
465+
tokenized_refs = [
466+
["the", "cat", "sat", "on", "the", "hat"],
467+
["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"],
468+
["hello", "beautiful", "world"]
469+
]
470+
471+
metric = None
472+
for pred, ref in zip(tokenized_preds, tokenized_refs):
473+
update = metrax.WER.from_model_output(
474+
predictions=[pred],
475+
references=[ref],
476+
)
477+
metric = update if metric is None else metric.merge(update)
478+
479+
np.testing.assert_allclose(
480+
metric.compute(),
481+
jnp.array(3/18, dtype=jnp.float32),
482+
rtol=1e-05,
483+
atol=1e-05,
484+
)
485+
486+
def test_wer_merge(self):
487+
"""Tests the merge functionality of the WER metric."""
488+
predictions1 = ["the cat sat on the mat"]
489+
references1 = ["the cat sat on the hat"] # 1/6 WER
490+
491+
predictions2 = [
492+
"a quick brown fox jumps over the lazy dog",
493+
"hello world"
494+
]
495+
references2 = [
496+
"the quick brown fox jumps over the lazy dog",
497+
"hello beautiful world"
498+
] # (1+1)/(9+3) = 2/12 WER
499+
500+
# Create and compute first metric
501+
metric1 = metrax.WER.from_model_output(
502+
predictions=predictions1,
503+
references=references1,
504+
)
505+
506+
# Create and compute second metric
507+
metric2 = metrax.WER.from_model_output(
508+
predictions=predictions2,
509+
references=references2,
510+
)
511+
512+
# Merge and compute
513+
merged_metric = metric1.merge(metric2)
514+
515+
np.testing.assert_allclose(
516+
merged_metric.compute(),
517+
jnp.array(3/18, dtype=jnp.float32),
518+
rtol=1e-05,
519+
atol=1e-05,
520+
)
521+
522+
417523
if __name__ == '__main__':
418524
absltest.main()

0 commit comments

Comments
 (0)