Skip to content

Commit cae0c2a

Browse files
authored
add recallAtK to metrax (#73)
* add recallAtK to metrax
* add back test files
* remove "in JAX" from docstrings
1 parent 8da455b commit cae0c2a

File tree

6 files changed

+224
-39
lines changed

6 files changed

+224
-39
lines changed

src/metrax/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
RMSE = regression_metrics.RMSE
3131
RSQUARED = regression_metrics.RSQUARED
3232
Recall = classification_metrics.Recall
33+
RecallAtK = ranking_metrics.RecallAtK
3334
RougeL = nlp_metrics.RougeL
3435
RougeN = nlp_metrics.RougeN
3536
WER = nlp_metrics.WER
@@ -48,6 +49,7 @@
4849
"RMSE",
4950
"RSQUARED",
5051
"Recall",
52+
"RecallAtK",
5153
"RougeL",
5254
"RougeN",
5355
"WER",

src/metrax/metrax_test.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
size=(BATCHES, BATCH_SIZE),
3030
).astype(np.float32)
3131
OUTPUT_PREDS = np.random.uniform(size=(BATCHES, BATCH_SIZE))
32+
KS = np.array([3])
3233

3334
STRING_PREDS = [
3435
'the cat sat on the mat',
@@ -66,7 +67,7 @@ class MetraxTest(parameterized.TestCase):
6667
{
6768
'predictions': OUTPUT_LABELS,
6869
'labels': OUTPUT_PREDS,
69-
'ks': np.array([3]),
70+
'ks': KS,
7071
},
7172
),
7273
(
@@ -90,7 +91,7 @@ class MetraxTest(parameterized.TestCase):
9091
{
9192
'predictions': OUTPUT_LABELS,
9293
'labels': OUTPUT_PREDS,
93-
'ks': np.array([3]),
94+
'ks': KS,
9495
},
9596
),
9697
(
@@ -108,6 +109,15 @@ class MetraxTest(parameterized.TestCase):
108109
metrax.Recall,
109110
{'predictions': OUTPUT_LABELS, 'labels': OUTPUT_PREDS},
110111
),
112+
(
113+
'recallAtK',
114+
metrax.RecallAtK,
115+
{
116+
'predictions': OUTPUT_LABELS,
117+
'labels': OUTPUT_PREDS,
118+
'ks': KS,
119+
},
120+
),
111121
)
112122
def test_metrics_jittable(self, metric, kwargs):
113123
"""Tests that jitted metrax metric yields the same result as non-jitted metric."""

src/metrax/nnx/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
RMSE = nnx_metrics.RMSE
2727
RSQUARED = nnx_metrics.RSQUARED
2828
Recall = nnx_metrics.Recall
29+
RecallAtK = nnx_metrics.RecallAtK
2930
RougeL = nnx_metrics.RougeL
3031
RougeN = nnx_metrics.RougeN
3132
WER = nnx_metrics.WER
@@ -44,6 +45,7 @@
4445
"RMSE",
4546
"RSQUARED",
4647
"Recall",
48+
"RecallAtK",
4749
"RougeL",
4850
"RougeN",
4951
"WER",

src/metrax/nnx/nnx_metrics.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@ def __init__(self):
8888
super().__init__(metrax.Recall)
8989

9090

91+
class RecallAtK(NnxWrapper):
92+
"""An NNX class for the Metrax metric RecallAtK."""
93+
94+
def __init__(self):
95+
super().__init__(metrax.RecallAtK)
96+
97+
9198
class RMSE(NnxWrapper):
9299
"""An NNX class for the Metrax metric RMSE."""
93100

src/metrax/ranking_metrics.py

Lines changed: 167 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
"""A collection of different metrics for ranking models."""
1616

17+
import abc
1718
import flax
1819
import jax
1920
import jax.numpy as jnp
@@ -22,7 +23,7 @@
2223

2324
@flax.struct.dataclass
2425
class AveragePrecisionAtK(base.Average):
25-
r"""Computes AP@k (average precision at k) metrics in JAX.
26+
r"""Computes AP@k (average precision at k) metrics.
2627
2728
Average precision at k (AP@k) is a metric used to evaluate the performance of
2829
ranking models. It measures the sum of precision at k where the item at
@@ -125,71 +126,200 @@ def from_model_output(
125126

126127

127128
@flax.struct.dataclass
128-
class PrecisionAtK(base.Average):
129-
r"""Computes P@k (precision at k) metrics in JAX.
129+
class TopKRankingMetric(base.Average, abc.ABC):
130+
"""Abstract base class for Top-K ranking metrics like Precision@k and Recall@k.
130131
131-
Precision at k (P@k) is a metric that measures the proportion of
132-
relevant items found in the top k recommendations.
133-
134-
Given the top :math:`K` recommendations, P@K is calculated as:
132+
This class provides common functionality for calculating metrics that evaluate
133+
the quality of the top k items in a ranked list. Subclasses must implement
134+
the `_calculate_metric_at_ks` method to define the specific metric
135+
computation (e.g., precision, recall).
135136
136-
.. math::
137-
Precision@K = \frac{\text{Number of relevant items in top K}}{K}
137+
The `from_model_output` method is a factory method that computes the metric
138+
values for a batch of predictions and labels, and aggregates them.
138139
"""
139140

140-
@classmethod
141-
def precision_at_ks(
142-
cls, predictions: jax.Array, labels: jax.Array, ks: jax.Array
141+
@staticmethod
142+
def _get_relevant_at_k(
143+
predictions: jax.Array, labels: jax.Array, ks: jax.Array
143144
) -> jax.Array:
144-
"""Computes P@k (precision at k) metrics for each of k in ks.
145+
"""Computes the number of relevant items at each k.
146+
147+
This static method processes predictions and labels to determine the
148+
number of relevant items at specified k-values.
145149
146150
Args:
147-
predictions: A floating point 2D array representing the prediction
148-
scores from the model. Higher scores indicate higher relevance. The
151+
predictions: A floating point 2D array representing the prediction scores
152+
from the model. Higher scores indicate higher relevance. The shape
153+
should be (batch_size, vocab_size).
154+
labels: A multi-hot encoding (0 or 1, or counts) of the true labels. The
149155
shape should be (batch_size, vocab_size).
150-
labels: A multi-hot encoding (0 or 1) of the true labels. The shape should
151-
be (batch_size, vocab_size).
152-
ks: A 1D array of integers representing the k's to compute the P@k
153-
metrics. The shape should be (|ks|).
156+
ks: A 1D array of integers representing the k's (cut-off points) for which
157+
to compute metrics. The shape should be (|ks|).
154158
155159
Returns:
156-
A rank-2 array of shape (batch_size, |ks|) containing P@k metrics.
160+
relevant_at_k: A 2D array of shape (batch_size, |ks|). Each element [i, j]
161+
is the number of relevant items among the top ks[j] recommendations for
162+
the i-th example in the batch.
157163
"""
158164
labels = jnp.array(labels >= 1, dtype=jnp.float32)
159165
indices_by_rank = jnp.argsort(-predictions, axis=1)
160166
labels_by_rank = jnp.take_along_axis(labels, indices_by_rank, axis=1)
161167
relevant_by_rank = jnp.cumsum(labels_by_rank, axis=1)
162168

163169
vocab_size = predictions.shape[1]
164-
relevant_at_k = relevant_by_rank[:, jnp.minimum(ks - 1, vocab_size - 1)]
165-
total_at_k = jnp.minimum(ks, vocab_size)
166-
return base.divide_no_nan(relevant_at_k, total_at_k)
170+
k_indices = jnp.minimum(ks - 1, vocab_size - 1)
171+
relevant_at_k = relevant_by_rank[:, k_indices]
172+
173+
return relevant_at_k
174+
175+
@classmethod
176+
@abc.abstractmethod
177+
def _calculate_metric_at_ks(
178+
cls, predictions: jax.Array, labels: jax.Array, ks: jax.Array
179+
) -> jax.Array:
180+
"""Computes the specific metric (e.g., P@k, R@k) values per example for each k.
181+
182+
This method must be implemented by concrete subclasses (e.g., PrecisionAtK,
183+
RecallAtK) to define the actual calculation of the metric based on
184+
predictions, labels, and k-values.
185+
186+
Args:
187+
predictions: A floating point 2D array representing the prediction scores
188+
from the model.
189+
labels: A multi-hot encoding of the true labels.
190+
ks: A 1D array of integers representing the k's.
191+
192+
Returns:
193+
A rank-2 array of shape (batch_size, |ks|) containing the metric
194+
values for each example in the batch and each specified k.
195+
"""
196+
raise NotImplementedError('Subclasses must implement this method.')
167197

168198
@classmethod
169199
def from_model_output(
170200
cls,
171201
predictions: jax.Array,
172202
labels: jax.Array,
173203
ks: jax.Array,
174-
) -> 'PrecisionAtK':
175-
"""Creates a PrecisionAtK metric instance from model output.
204+
) -> 'TopKRankingMetric':
205+
"""Creates a metric instance from model output.
176206
177-
This computes the P@k for each example in the batch and then aggregates
178-
them (sum of P@k values and count of examples) to be averaged later by
179-
calling .compute() on the returned metric object.
207+
This class method computes the specific ranking metric (defined by the
208+
subclass's implementation of `_calculate_metric_at_ks`) for each example
209+
in the batch. It then aggregates these values (sum of metric values and
210+
count of examples) into a metric object. This object can later be used
211+
to compute the mean metric value (e.g., Mean Precision@k) by calling
212+
its `.compute()` method (inherited from `base.Average`).
180213
181214
Args:
182-
predictions: A floating point 2D array representing the prediction
183-
scores from the model. The shape should be (batch_size, vocab_size).
184-
labels: A multi-hot encoding (0 or 1) of the true labels. The shape should
185-
be (batch_size, vocab_size).
215+
predictions: A floating point 2D array representing the prediction scores
216+
from the model. The shape should be (batch_size, vocab_size).
217+
labels: A multi-hot encoding (0 or 1, or counts) of the true labels. The
218+
shape should be (batch_size, vocab_size).
219+
ks: A 1D array of integers representing the k's to compute the metrics.
220+
The shape should be (|ks|).
221+
222+
Returns:
223+
An instance of the calling class (e.g., PrecisionAtK, RecallAtK)
224+
with `total` and `count` fields populated. The `total` field will
225+
have shape (|ks|), representing the sum of metric values for each k
226+
across the batch, and `count` will be a scalar representing the
227+
number of examples in the batch.
228+
"""
229+
metric_at_ks = cls._calculate_metric_at_ks(predictions, labels, ks)
230+
num_examples = jnp.array(labels.shape[0], dtype=jnp.float32)
231+
return cls(
232+
total=metric_at_ks.sum(axis=0),
233+
count=num_examples,
234+
)
235+
236+
237+
@flax.struct.dataclass
238+
class PrecisionAtK(TopKRankingMetric):
239+
r"""Computes P@k (precision at k) metrics.
240+
241+
Precision at k (P@k) is a metric that measures the proportion of
242+
relevant items found in the top k recommendations. It answers the question:
243+
"Out of the K items recommended, how many are actually relevant?"
244+
245+
Given the top :math:`K` recommendations, P@K is calculated as:
246+
247+
.. math::
248+
Precision@K = \frac{\text{Number of relevant items in top K}}{K}
249+
"""
250+
251+
@classmethod
252+
def _calculate_metric_at_ks(
253+
cls, predictions: jax.Array, labels: jax.Array, ks: jax.Array
254+
) -> jax.Array:
255+
"""Computes P@k (precision at k) metrics for each of k in ks for each example.
256+
257+
This method implements the core logic for calculating Precision@k.
258+
It utilizes the `_get_relevant_at_k` helper from the base
259+
class to get the number of relevant items at each k, and then divides
260+
by k (clamped by vocabulary size) to get the precision.
261+
262+
Args:
263+
predictions: A floating point 2D array representing the prediction scores
264+
from the model. The shape should be (batch_size, vocab_size).
265+
labels: A multi-hot encoding (0 or 1, or counts) of the true labels. The
266+
shape should be (batch_size, vocab_size).
186267
ks: A 1D array of integers representing the k's to compute the P@k
187268
metrics. The shape should be (|ks|).
188269
189270
Returns:
190-
The PrecisionAtK metric object. The `total` field will have shape (|ks|),
191-
and `count` will be a scalar.
271+
A rank-2 array of shape (batch_size, |ks|) containing P@k metrics
272+
for each example and each k.
192273
"""
193-
p_at_ks = cls.precision_at_ks(predictions, labels, ks)
194-
num_examples = jnp.array(labels.shape[0], dtype=jnp.float32)
195-
return cls(total=p_at_ks.sum(axis=0), count=num_examples)
274+
relevant_at_k = cls._get_relevant_at_k(predictions, labels, ks)
275+
vocab_size = labels.shape[1]
276+
denominator_p_at_k = jnp.minimum(ks.astype(jnp.float32), vocab_size)
277+
return base.divide_no_nan(relevant_at_k, denominator_p_at_k[jnp.newaxis, :])
278+
279+
280+
@flax.struct.dataclass
281+
class RecallAtK(TopKRankingMetric):
282+
r"""Computes R@k (recall at k) metrics.
283+
284+
Recall at k (R@k) is a metric that measures the proportion of
285+
relevant items that are found in the top k recommendations, out of the
286+
total number of relevant items for a given user/query. It answers the
287+
question:
288+
"Out of all the items that are truly relevant, how many did we find in the top
289+
K?"
290+
291+
Given the top :math:`K` recommendations, R@K is calculated as:
292+
293+
.. math::
294+
Recall@K = \frac{\text{Number of relevant items in top K}}{\text{Total
295+
number of relevant items}}
296+
"""
297+
298+
@classmethod
299+
def _calculate_metric_at_ks(
300+
cls, predictions: jax.Array, labels: jax.Array, ks: jax.Array
301+
) -> jax.Array:
302+
"""Computes R@k (recall at k) metrics for each of k in ks for each example.
303+
304+
This method implements the core logic for calculating Recall@k.
305+
It utilizes the `_get_relevant_at_k` helper from the base
306+
class to get the number of relevant items at each k and the binarized
307+
labels.
308+
The number of relevant items at k is then divided by the total number of
309+
relevant items for that example to get the recall.
310+
311+
Args:
312+
predictions: A floating point 2D array representing the prediction scores
313+
from the model. The shape should be (batch_size, vocab_size).
314+
labels: A multi-hot encoding (0 or 1, or counts) of the true labels. The
315+
shape should be (batch_size, vocab_size).
316+
ks: A 1D array of integers representing the k's to compute the R@k
317+
metrics. The shape should be (|ks|).
318+
319+
Returns:
320+
A rank-2 array of shape (batch_size, |ks|) containing R@k metrics
321+
for each example and each k.
322+
"""
323+
relevant_at_k = cls._get_relevant_at_k(predictions, labels, ks)
324+
total_relevant = jnp.sum(jnp.array(labels >= 1, dtype=jnp.float32), axis=1)
325+
return base.divide_no_nan(relevant_at_k, total_relevant[:, jnp.newaxis])

src/metrax/ranking_metrics_test.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,15 @@
5050
MAP_FROM_KERAS_VS1 = np.array([0.75, 0.75, 0.75, 0.75, 0.75, 0.75])
5151
P_FROM_KERAS = np.array([0.75, 0.875, 0.58333337306976320, 0.5625, 0.5, 0.5])
5252
P_FROM_KERAS_VS1 = np.array([0.75, 0.75, 0.75, 0.75, 0.75, 0.75])
53+
R_FROM_KERAS = np.array([
54+
0.2083333432674408,
55+
0.5416666865348816,
56+
0.5416666865348816,
57+
0.625,
58+
0.6666666865348816,
59+
0.75,
60+
])
61+
R_FROM_KERAS_VS1 = np.array([0.75, 0.75, 0.75, 0.75, 0.75, 0.75])
5362

5463

5564
class RankingMetricsTest(parameterized.TestCase):
@@ -116,6 +125,31 @@ def test_precisionatk(self, y_true, y_pred, map_from_keras):
116125
atol=1e-05,
117126
)
118127

128+
@parameterized.named_parameters(
129+
('basic', OUTPUT_LABELS, OUTPUT_PREDS, R_FROM_KERAS),
130+
(
131+
'vocab_size_one',
132+
OUTPUT_LABELS_VS1,
133+
OUTPUT_PREDS_VS1,
134+
R_FROM_KERAS_VS1,
135+
),
136+
)
137+
def test_recallatk(self, y_true, y_pred, map_from_keras):
138+
"""Test that `RecallAtK` Metric computes correct values."""
139+
ks = jnp.array([1, 2, 3, 4, 5, 6])
140+
metric = metrax.RecallAtK.from_model_output(
141+
predictions=y_pred,
142+
labels=y_true,
143+
ks=ks,
144+
)
145+
146+
np.testing.assert_allclose(
147+
metric.compute(),
148+
map_from_keras,
149+
rtol=1e-05,
150+
atol=1e-05,
151+
)
152+
119153

120154
if __name__ == '__main__':
121155
absltest.main()

0 commit comments

Comments
 (0)