Skip to content

Commit d51b93b

Browse files
committed
add BLEU
1 parent e57f189 commit d51b93b

File tree

7 files changed

+253
-5
lines changed

7 files changed

+253
-5
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@ absl-py
22
clu
33
jax[cpu]
44
keras-hub
5+
keras-nlp
56
pytest
67
scikit-learn

src/metrax/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
AUCROC = classification_metrics.AUCROC
2323
Average = base.Average
2424
AveragePrecisionAtK = ranking_metrics.AveragePrecisionAtK
25+
BLEU = nlp_metrics.BLEU
2526
MSE = regression_metrics.MSE
2627
Perplexity = nlp_metrics.Perplexity
2728
Precision = classification_metrics.Precision
@@ -36,7 +37,7 @@
3637
"AUCROC",
3738
"Average",
3839
"AveragePrecisionAtK",
39-
"MSE",
40+
"BLEUMSE",
4041
"Perplexity",
4142
"Precision",
4243
"RMSE",

src/metrax/metrax_test.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@
4040
'the quick brown fox jumps over the lazy dog',
4141
'hello beautiful world',
4242
]
43-
TOKENIZED_PREDS = [sentence.split() for sentence in STRING_PREDS]
44-
TOKENIZED_REFS = [sentence.split() for sentence in STRING_REFS]
4543

4644

4745
class MetraxTest(parameterized.TestCase):
@@ -114,7 +112,12 @@ def test_metrics_jittable(self, metric, kwargs):
114112
(
115113
'wer',
116114
metrax.WER,
117-
{'predictions': TOKENIZED_PREDS, 'references': TOKENIZED_REFS},
115+
{'predictions': STRING_PREDS, 'references': STRING_REFS},
116+
),
117+
(
118+
'bleu',
119+
metrax.BLEU,
120+
{'predictions': STRING_PREDS, 'references': STRING_REFS},
118121
),
119122
)
120123
def test_metrics_not_jittable(self, metric, kwargs):

src/metrax/nlp_metrics.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,164 @@
1515
"""A collection of different metrics for NLP models."""
1616

1717
from clu import metrics as clu_metrics
18+
import collections
19+
import math
1820
import flax
1921
import jax
2022
import jax.numpy as jnp
2123
from metrax import base
2224

2325

26+
def get_ngrams(segment: list[str], max_order: int):
  """Counts every n-gram of order 1..max_order occurring in a token segment.

  Args:
    segment: list. Tokens from which n-grams are collected.
    max_order: int. Largest n-gram length, in tokens, to include in the
      returned counts.

  Returns:
    A `collections.Counter` mapping each n-gram tuple to its occurrence count.
  """
  return collections.Counter(
      tuple(segment[start : start + order])
      for order in range(1, max_order + 1)
      for start in range(len(segment) - order + 1)
  )
40+
41+
42+
@flax.struct.dataclass
class BLEU(clu_metrics.Metric):
  r"""Computes the BLEU score for sequence generation.

  BLEU measures the similarity between a machine-generated candidate translation
  and one or more human reference translations, focusing on matching n-grams.

  It's calculated as:
  .. math::
      \text{BLEU} = \text{BP} \times \exp\left( \sum_{n=1}^{N} w_n \log p_n
      \right)

  Where:
      - :math:`p_n` is the modified n-gram precision for n-grams of order n.
      - :math:`N` is the maximum n-gram order considered (typically 4).
      - :math:`w_n` are weights for each order (typically uniform, 1/N).
      - :math:`\text{BP}` is the Brevity Penalty.

  This implementation uses uniform weights and calculates statistics
  incrementally.

  Attributes:
    max_order: Maximum n-gram order to consider.
    matches_by_order: Accumulated sum of clipped n-gram matches for each order.
    possible_matches_by_order: Accumulated sum of total n-grams in predictions
      for each order.
    translation_length: Accumulated total length of predictions.
    reference_length: Accumulated total effective reference length. For each
      prediction this is the length of the *shortest* reference.
  """

  max_order: int
  matches_by_order: jax.Array
  possible_matches_by_order: jax.Array
  translation_length: jax.Array
  reference_length: jax.Array

  @classmethod
  def empty(cls) -> 'BLEU':
    """Returns an all-zero BLEU accumulator with the default max_order of 4."""
    # NOTE(review): these scalar zeros broadcast against the
    # (max_order,)-shaped statistics from `from_model_output` when merged.
    return cls(
        max_order=4,
        matches_by_order=jnp.array(0, jnp.float32),
        possible_matches_by_order=jnp.array(0, jnp.float32),
        translation_length=jnp.array(0, jnp.float32),
        reference_length=jnp.array(0, jnp.float32),
    )

  @classmethod
  def from_model_output(
      cls,
      predictions: list[str],
      references: list[list[str]],
      max_order: int = 4,
  ) -> 'BLEU':
    """Computes BLEU statistics for a batch of predictions and references.

    Args:
      predictions: A list of predicted strings. The shape should be
        (batch_size,).
      references: A list of lists of reference strings. The shape should be
        (batch_size, num_references).
      max_order: The maximum order of n-grams to consider.

    Returns:
      A BLEU metric instance containing the statistics for this batch.

    Raises:
      ValueError: If `predictions` and `references` have different lengths.
    """
    # `zip` silently truncates mismatched batches; fail loudly instead, as
    # the docstring promises.
    if len(predictions) != len(references):
      raise ValueError(
          'predictions and references must have the same length, got '
          f'{len(predictions)} and {len(references)}.'
      )

    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    pred_length = 0
    ref_length = 0

    for pred, ref_list in zip(predictions, references):
      # Whitespace tokenization for both the prediction and the references.
      pred = pred.split()
      ref_list = [r.split() for r in ref_list]
      pred_length += len(pred)
      # Effective reference length: the shortest reference for this prediction.
      ref_length += min(len(r) for r in ref_list)
      prediction_ngram_counts = get_ngrams(pred, max_order)
      # Counter union (`|=`) keeps the per-ngram maximum across references;
      # the intersection (`&`) below clips prediction counts to that maximum,
      # giving the "modified" (clipped) n-gram matches.
      reference_ngram_counts = collections.Counter()
      for ref in ref_list:
        reference_ngram_counts |= get_ngrams(ref, max_order)
      overlap = prediction_ngram_counts & reference_ngram_counts
      for ngram in overlap:
        matches_by_order[len(ngram) - 1] += overlap[ngram]
      # Total n-grams the prediction could have matched at each order.
      for order in range(1, max_order + 1):
        possible_matches = len(pred) - order + 1
        if possible_matches > 0:
          possible_matches_by_order[order - 1] += possible_matches

    return cls(
        max_order=max_order,
        matches_by_order=jnp.array(matches_by_order, dtype=jnp.float32),
        possible_matches_by_order=jnp.array(
            possible_matches_by_order, dtype=jnp.float32
        ),
        translation_length=jnp.array(pred_length, dtype=jnp.float32),
        reference_length=jnp.array(ref_length, dtype=jnp.float32),
    )

  def merge(self, other: 'BLEU') -> 'BLEU':
    """Accumulates statistics from another BLEU instance of the same order."""
    if self.max_order != other.max_order:
      raise ValueError(
          'BLEU metrics with different max_order cannot be merged.'
      )
    return type(self)(
        max_order=self.max_order,
        matches_by_order=(self.matches_by_order + other.matches_by_order),
        possible_matches_by_order=(
            self.possible_matches_by_order + other.possible_matches_by_order
        ),
        translation_length=(self.translation_length + other.translation_length),
        reference_length=(self.reference_length + other.reference_length),
    )

  def compute(self) -> jax.Array:
    """Returns the accumulated corpus-level BLEU score as a scalar array."""
    # Modified n-gram precision per order; divide_no_nan yields 0 where no
    # n-grams of that order were possible.
    precisions = [
        base.divide_no_nan(
            self.matches_by_order[i], self.possible_matches_by_order[i]
        )
        for i in range(self.max_order)
    ]
    # Geometric mean with uniform weights 1/max_order; any zero precision
    # zeroes the whole score (its log would be undefined).
    geo_mean = (
        math.exp(sum((1.0 / self.max_order) * math.log(p) for p in precisions))
        if precisions and min(precisions) > 0
        else 0
    )
    ratio = base.divide_no_nan(self.translation_length, self.reference_length)
    # Brevity penalty: no penalty when the candidate is longer than the
    # effective reference length, exponential decay otherwise.
    bp = 1.0 if ratio > 1.0 else math.exp(1 - 1.0 / ratio)
    return jnp.array(geo_mean * bp)
174+
175+
24176
@flax.struct.dataclass
25177
class Perplexity(clu_metrics.Metric):
26178
r"""Computes perplexity for sequence generation.

src/metrax/nlp_metrics_test.py

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from absl.testing import parameterized
2222
import jax.numpy as jnp
2323
import keras_hub
24+
import keras_nlp
2425
import metrax
2526
import numpy as np
2627

@@ -29,6 +30,15 @@
2930

3031
class NlpMetricsTest(parameterized.TestCase):
3132

33+
def test_bleu_empty(self):
34+
"""Tests the `empty` method of the `BLEU` class."""
35+
m = metrax.BLEU.empty()
36+
self.assertEqual(m.max_order, 4)
37+
self.assertEqual(m.matches_by_order, jnp.array(0, jnp.float32))
38+
self.assertEqual(m.possible_matches_by_order, jnp.array(0, jnp.float32))
39+
self.assertEqual(m.translation_length, jnp.array(0, jnp.float32))
40+
self.assertEqual(m.reference_length, jnp.array(0, jnp.float32))
41+
3242
def test_perplexity_empty(self):
3343
"""Tests the `empty` method of the `Perplexity` class."""
3444
m = metrax.Perplexity.empty()
@@ -41,6 +51,78 @@ def test_wer_empty(self):
4151
self.assertEqual(m.total, jnp.array(0, jnp.float32))
4252
self.assertEqual(m.count, jnp.array(0, jnp.float32))
4353

54+
def test_bleu(self):
55+
"""Tests that BLEU metric computes correct values."""
56+
references = [
57+
["He eats a sweet apple", "He is eating a tasty apple, isn't he"],
58+
[
59+
"Silicon Valley is one of my favourite shows",
60+
"Silicon Valley is the best show ever",
61+
],
62+
]
63+
predictions = [
64+
"He He He eats sweet apple which is a fruit",
65+
"I love Silicon Valley it's one of my favourite shows",
66+
]
67+
keras_metric = keras_nlp.metrics.Bleu()
68+
keras_metric.update_state(references, predictions)
69+
metrax_metric = metrax.BLEU.from_model_output(predictions, references)
70+
71+
np.testing.assert_allclose(
72+
metrax_metric.compute(),
73+
keras_metric.result(),
74+
rtol=1e-05,
75+
atol=1e-05,
76+
)
77+
78+
def test_bleu_merge(self):
79+
"""Tests that BLEU metric computes correct values using merge."""
80+
references = [
81+
["He eats a sweet apple", "He is eating a tasty apple, isn't he"],
82+
[
83+
"Silicon Valley is one of my favourite shows",
84+
"Silicon Valley is the best show ever",
85+
],
86+
]
87+
predictions = [
88+
"He He He eats sweet apple which is a fruit",
89+
"I love Silicon Valley it's one of my favourite shows",
90+
]
91+
keras_metric = keras_nlp.metrics.Bleu()
92+
keras_metric.update_state(references, predictions)
93+
metrax_metric = None
94+
for ref_list, pred in zip(references, predictions):
95+
update = metrax.BLEU.from_model_output([pred], [ref_list])
96+
metrax_metric = (
97+
update if metrax_metric is None else metrax_metric.merge(update)
98+
)
99+
100+
np.testing.assert_allclose(
101+
metrax_metric.compute(),
102+
keras_metric.result(),
103+
rtol=1e-05,
104+
atol=1e-05,
105+
)
106+
107+
def test_bleu_merge_fails_on_different_max_order(self):
108+
"""Tests that error is raised when BLEU metrics with different max_order are merged."""
109+
references = [
110+
["He eats a sweet apple", "He is eating a tasty apple, isn't he"],
111+
]
112+
predictions = [
113+
"He He He eats sweet apple which is a fruit",
114+
]
115+
order_3_metric = metrax.BLEU.from_model_output(
116+
predictions, references, max_order=3
117+
)
118+
order_4_metric = metrax.BLEU.from_model_output(
119+
predictions, references, max_order=4
120+
)
121+
122+
np.testing.assert_raises(
123+
ValueError, lambda: order_3_metric.merge(order_4_metric)
124+
)
125+
44126
@parameterized.named_parameters(
45127
(
46128
'basic',
@@ -141,4 +223,4 @@ def test_wer(self):
141223

142224

143225
if __name__ == '__main__':
144-
absltest.main()
226+
absltest.main()

src/metrax/nnx/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
AUCROC = nnx_metrics.AUCROC
1919
Average = nnx_metrics.Average
2020
AveragePrecisionAtK = nnx_metrics.AveragePrecisionAtK
21+
BLEU = nnx_metrics.BLEU
2122
MSE = nnx_metrics.MSE
2223
Perplexity = nnx_metrics.Perplexity
2324
Precision = nnx_metrics.Precision
@@ -32,6 +33,7 @@
3233
"AUCROC",
3334
"Average",
3435
"AveragePrecisionAtK",
36+
"BLEU",
3537
"MSE",
3638
"Perplexity",
3739
"Precision",

src/metrax/nnx/nnx_metrics.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,13 @@ def __init__(self):
4646
super().__init__(metrax.AveragePrecisionAtK)
4747

4848

49+
class BLEU(NnxWrapper):
  """An NNX class for the Metrax metric BLEU.

  Exposes `metrax.BLEU` through the common `NnxWrapper` adapter, matching the
  other metric wrappers defined in this module.
  """

  def __init__(self):
    # The wrapper receives the metric class itself, not an instance.
    super().__init__(metrax.BLEU)
54+
55+
4956
class MSE(NnxWrapper):
5057
"""An NNX class for the Metrax metric MSE."""
5158

0 commit comments

Comments
 (0)