1414
1515"""A collection of different metrics for NLP models."""
1616
17- from clu import metrics as clu_metrics
1817import collections
1918import math
19+ from clu import metrics as clu_metrics
2020import flax
2121import jax
2222import jax .numpy as jnp
2323from metrax import base
2424
2525
26+ def _get_single_n_grams (segment : list [str ], order : int ):
27+ """Generates a counter of n-grams from a list of tokens for a specific n.
28+
29+ Args:
30+ segment: list. Text segment from which n-grams will be extracted.
31+ order: The order of n-grams.
32+
33+ Returns:
34+ A collections.Counter mapping n-gram tuples to their counts.
35+ """
36+ return collections .Counter (zip (* [segment [i :] for i in range (order )]))
37+
38+
def _get_ngrams(segment: list[str], max_order: int):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: list. Text segment from which n-grams will be extracted.
    max_order: int. Maximum length in tokens of the n-grams returned by this
      method.

  Returns:
    A collections.Counter mapping n-gram tuples to their counts for all orders.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    # Delegate each order to the single-order helper; the old inline counting
    # loop is removed so every n-gram is counted exactly once.
    ngram_counts.update(_get_single_n_grams(segment, order))
  return ngram_counts
4054
4155
@@ -285,6 +299,159 @@ def compute(self) -> jax.Array:
285299 )
286300
287301
@flax.struct.dataclass
class RougeN(clu_metrics.Metric):
  r"""Computes macro-averaged ROUGE-N recall, precision, and F1-score.

  This metric first calculates ROUGE-N precision, recall, and F1-score for each
  individual prediction compared against its single corresponding reference.
  These per-instance precision, recall and F1-scores are then averaged across
  all instances in the dataset/batch.

  Accumulation for Macro-Average:
    - total_precision = sum of all precision values.
    - total_recall = sum of all instance recall values.
    - total_f1 = sum of all f1 values.
    - num_examples = count of prediction-reference pairs.

  Final Macro-Averaged Metrics:
  .. math::
      \text{MacroAvgPrecision} =
        \frac{\text{total_precision}}{\text{num_examples}}
  .. math::
      \text{MacroAvgRecall} = \frac{\text{total_recall}}{\text{num_examples}}
  .. math::
      \text{MacroAvgF1} = \frac{\text{total_f1}}{\text{num_examples}}

  Attributes:
    order: The specific 'N' in ROUGE-N (e.g., 1 for ROUGE-1, 2 for ROUGE-2).
    total_precision: Accumulated sum of precision scores from each instance.
    total_recall: Accumulated sum of recall scores from each instance.
    total_f1: Accumulated sum of f1 scores from each instance.
    num_examples: The number of instances (prediction-reference pairs)
      processed.
  """

  order: int
  total_precision: jax.Array
  total_recall: jax.Array
  total_f1: jax.Array
  num_examples: jax.Array

  @classmethod
  def empty(cls, order: int = 2) -> 'RougeN':
    """Creates an empty ROUGE-N metric for macro-averaging.

    Args:
      order: The order 'N' of n-grams (e.g., 2 for ROUGE-2). Must be a positive
        integer.

    Returns:
      An empty RougeN metric.
    """
    return cls(
        order=order,
        total_precision=jnp.array(0, jnp.float32),
        total_recall=jnp.array(0, jnp.float32),
        total_f1=jnp.array(0, jnp.float32),
        num_examples=jnp.array(0, jnp.float32),
    )

  @classmethod
  def from_model_output(
      cls,
      predictions: list[str],
      references: list[str],
      order: int = 2,
  ) -> 'RougeN':
    """Computes sums of per-instance ROUGE-N scores for a batch.

    Args:
      predictions: A list of predicted strings. The shape should be
        (batch_size, ).
      references: A list of reference strings. Each prediction must have one
        corresponding reference string. The shape should be (batch_size, ).
      order: The order 'N' of n-grams to consider. Must be positive.

    Returns:
      A RougeN metric instance with accumulated per-instance scores.

    Raises:
      ValueError: If `predictions` and `references` differ in length.
    """
    # A silent zip() truncation would quietly drop trailing pairs; fail loudly
    # on mismatched batches instead.
    if len(predictions) != len(references):
      raise ValueError(
          'predictions and references must have the same length. '
          f'Got {len(predictions)} and {len(references)}.'
      )

    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0
    num_examples = 0.0

    for pred_str, ref_str in zip(predictions, references):
      pred_ngrams_counts = _get_single_n_grams(pred_str.split(), order)
      ref_ngrams_counts = _get_single_n_grams(ref_str.split(), order)
      # Counter & Counter keeps min(count) per n-gram, i.e. clipped overlap.
      overlapping_ngrams = sum(
          (pred_ngrams_counts & ref_ngrams_counts).values()
      )
      prediction_ngrams = sum(pred_ngrams_counts.values())
      reference_ngrams = sum(ref_ngrams_counts.values())

      # Plain-Python arithmetic keeps this host-side loop cheap (no per-item
      # jax dispatch); results are wrapped in jax arrays once, below. The
      # guarded divisions mirror base.divide_no_nan: zero denominator -> 0.0.
      precision = (
          overlapping_ngrams / prediction_ngrams if prediction_ngrams else 0.0
      )
      recall = (
          overlapping_ngrams / reference_ngrams if reference_ngrams else 0.0
      )
      denominator = precision + recall
      f1 = 2.0 * precision * recall / denominator if denominator else 0.0

      total_precision += precision
      total_recall += recall
      total_f1 += f1
      num_examples += 1

    return cls(
        order=order,
        total_precision=jnp.array(total_precision, dtype=jnp.float32),
        total_recall=jnp.array(total_recall, dtype=jnp.float32),
        total_f1=jnp.array(total_f1, dtype=jnp.float32),
        num_examples=jnp.array(num_examples, dtype=jnp.float32),
    )

  def merge(self, other: 'RougeN') -> 'RougeN':
    """Merges this RougeN metric with another.

    Args:
      other: Another RougeN metric instance.

    Returns:
      A new RougeN metric instance with combined statistics.

    Raises:
      ValueError: If the two metrics were created with different orders.
    """
    if self.order != other.order:
      raise ValueError(
          'RougeN metrics with different orders cannot be merged. '
          f'Got {self.order} and {other.order}.'
      )
    return RougeN(
        order=self.order,
        total_precision=(self.total_precision + other.total_precision),
        total_recall=(self.total_recall + other.total_recall),
        total_f1=(self.total_f1 + other.total_f1),
        num_examples=(self.num_examples + other.num_examples),
    )

  def compute(self) -> jax.Array:
    """Computes macro-averaged ROUGE-N precision, recall, and F1-score.

    Returns:
      A JAX array where:
        - index 0: macro-averaged precision
        - index 1: macro-averaged recall
        - index 2: macro-averaged f1score (mean of the per-instance F1
          scores, not the F1 of the averaged precision/recall)
      Scores are 0.0 if num_examples is zero.
    """
    macro_avg_precision = base.divide_no_nan(
        self.total_precision, self.num_examples
    )
    macro_avg_recall = base.divide_no_nan(self.total_recall, self.num_examples)
    macro_avg_f1score = base.divide_no_nan(self.total_f1, self.num_examples)
    return jnp.stack([macro_avg_precision, macro_avg_recall, macro_avg_f1score])
453+
454+
288455@flax .struct .dataclass
289456class WER (base .Average ):
290457 r"""Computes Word Error Rate (WER) for speech recognition or text generation tasks.
@@ -389,4 +556,4 @@ def _levenshtein_distance(prediction: list, reference: list) -> int:
389556 distance_matrix [i - 1 ][j - 1 ] + cost , # substitution
390557 )
391558
392- return distance_matrix [m ][n ]
559+ return distance_matrix [m ][n ]
0 commit comments