Add base retrieval class (#49)

abheesht17 · web-flow · commit 0fa4e13a6d02 · 2025-04-14T14:35:12.000-07:00
* Add base retrieval class

* Restore doc-string

* doc-string edit

* Add unit tests

* Typo

* Address comments
diff --git a/keras_rs/api/layers/__init__.py b/keras_rs/api/layers/__init__.py
@@ -17,6 +17,7 @@
 from keras_rs.src.layers.retrieval.remove_accidental_hits import (
     RemoveAccidentalHits,
 )
+from keras_rs.src.layers.retrieval.retrieval import Retrieval
 from keras_rs.src.layers.retrieval.sampling_probability_correction import (
     SamplingProbabilityCorrection,
 )
diff --git a/keras_rs/src/layers/retrieval/brute_force_retrieval.py b/keras_rs/src/layers/retrieval/brute_force_retrieval.py
@@ -4,10 +4,11 @@
 
 from keras_rs.src import types
 from keras_rs.src.api_export import keras_rs_export
+from keras_rs.src.layers.retrieval.retrieval import Retrieval
 
 
 @keras_rs_export("keras_rs.layers.BruteForceRetrieval")
-class BruteForceRetrieval(keras.layers.Layer):
+class BruteForceRetrieval(Retrieval):
     """Brute force top-k retrieval.
 
     This layer maintains a set of candidates and is able to exactly retrieve the
@@ -60,11 +61,13 @@ def __init__(
         return_scores: bool = True,
         **kwargs: Any,
     ) -> None:
-        super().__init__(**kwargs)
+        # Keep `k`, `return_scores` as separately passed args instead of keeping
+        # them in `kwargs`. This is to ensure the user does not have to hop
+        # to the base class to check which other args can be passed.
+        super().__init__(k=k, return_scores=return_scores, **kwargs)
+
         self.candidate_embeddings = None
         self.candidate_ids = None
-        self.k = k
-        self.return_scores = return_scores
 
         if candidate_embeddings is None:
             if candidate_ids is not None:
@@ -84,36 +87,12 @@ def update_candidates(
 
         Args:
             candidate_embeddings: The candidate embeddings.
-            candidate_ids: The identifiers for the candidates. If `None` the
+            candidate_ids: The identifiers for the candidates. If `None`, the
                 indices of the candidates are returned instead.
         """
-        if candidate_embeddings is None:
-            raise ValueError("`candidate_embeddings` is required")
-
-        if len(candidate_embeddings.shape) != 2:
-            raise ValueError(
-                "`candidate_embeddings` must be a tensor of rank 2 "
-                "(num_candidates, embedding_size), received "
-                "`candidate_embeddings` with shape "
-                f"{candidate_embeddings.shape}"
-            )
-
-        if candidate_embeddings.shape[0] < self.k:
-            raise ValueError(
-                "The number of candidates provided "
-                f"({candidate_embeddings.shape[0]}) is less than the number of "
-                f"candidates to retrieve (k={self.k})."
-            )
-
-        if (
-            candidate_ids is not None
-            and candidate_ids.shape[0] != candidate_embeddings.shape[0]
-        ):
-            raise ValueError(
-                "The `candidate_embeddings` and `candidate_is` tensors must "
-                "have the same number of rows, got tensors of shape "
-                f"{candidate_embeddings.shape} and {candidate_ids.shape}."
-            )
+        self._validate_candidate_embeddings_and_ids(
+            candidate_embeddings, candidate_ids
+        )
 
         if self.candidate_embeddings is not None:
             # Update of existing variables.
@@ -167,31 +146,3 @@ def call(
             return top_scores, top_ids
         else:
             return top_ids
-
-    def compute_score(
-        self, query_embedding: types.Tensor, candidate_embedding: types.Tensor
-    ) -> types.Tensor:
-        """Computes the standard dot product score from queries and candidates.
-
-        Args:
-            query_embedding: Tensor of query embedding corresponding to the
-                queries for which to retrieve top candidates.
-            candidate_embedding: Tensor of candidate embeddings.
-
-        Returns:
-            The dot product of queries and candidates.
-        """
-
-        return keras.ops.matmul(
-            query_embedding, keras.ops.transpose(candidate_embedding)
-        )
-
-    def get_config(self) -> dict[str, Any]:
-        config: dict[str, Any] = super().get_config()
-        config.update(
-            {
-                "k": self.k,
-                "return_scores": self.compute_score,
-            }
-        )
-        return config
diff --git a/keras_rs/src/layers/retrieval/retrieval.py b/keras_rs/src/layers/retrieval/retrieval.py
@@ -0,0 +1,127 @@
+import abc
+from typing import Any, Optional, Union
+
+import keras
+
+from keras_rs.src import types
+from keras_rs.src.api_export import keras_rs_export
+
+
+@keras_rs_export("keras_rs.layers.Retrieval")
+class Retrieval(keras.layers.Layer, abc.ABC):
+    """Retrieval base abstract class.
+
+    This layer provides a common interface for all retrieval layers. In order
+    to implement a custom retrieval layer, this abstract class should be
+    subclassed.
+
+    Args:
+        k: int. Number of candidates to retrieve.
+        return_scores: bool. When `True`, this layer returns a tuple with the
+            top scores and the top identifiers. When `False`, this layer returns
+            a single tensor with the top identifiers.
+    """
+
+    def __init__(
+        self,
+        k: int = 10,
+        return_scores: bool = True,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.k = k
+        self.return_scores = return_scores
+
+    def _validate_candidate_embeddings_and_ids(
+        self,
+        candidate_embeddings: types.Tensor,
+        candidate_ids: Optional[types.Tensor] = None,
+    ) -> None:
+        """Validates `update_candidates()` inputs; raises `ValueError`."""
+        if candidate_embeddings is None:
+            raise ValueError("`candidate_embeddings` is required.")
+
+        if len(candidate_embeddings.shape) != 2:
+            raise ValueError(
+                "`candidate_embeddings` must be a tensor of rank 2 "
+                "(num_candidates, embedding_size), received "
+                "`candidate_embeddings` with shape "
+                f"{candidate_embeddings.shape}"
+            )
+
+        if candidate_embeddings.shape[0] < self.k:
+            raise ValueError(
+                "The number of candidates provided "
+                f"({candidate_embeddings.shape[0]}) is less than the number of "
+                f"candidates to retrieve (k={self.k})."
+            )
+
+        if (
+            candidate_ids is not None
+            and candidate_ids.shape[0] != candidate_embeddings.shape[0]
+        ):
+            raise ValueError(
+                "The `candidate_embeddings` and `candidate_is` tensors must "
+                "have the same number of rows, got tensors of shape "
+                f"{candidate_embeddings.shape} and {candidate_ids.shape}."
+            )
+
+    @abc.abstractmethod
+    def update_candidates(
+        self,
+        candidate_embeddings: types.Tensor,
+        candidate_ids: Optional[types.Tensor] = None,
+    ) -> None:
+        """Update the set of candidates and optionally their candidate IDs.
+
+        Args:
+            candidate_embeddings: The candidate embeddings.
+            candidate_ids: The identifiers for the candidates. If `None`, the
+                indices of the candidates are returned instead.
+        """
+        pass
+
+    @abc.abstractmethod
+    def call(
+        self, inputs: types.Tensor
+    ) -> Union[types.Tensor, tuple[types.Tensor, types.Tensor]]:
+        """Returns the top candidates for the query passed as input.
+
+        Args:
+            inputs: the query for which to return top candidates.
+
+        Returns:
+            A tuple with the top scores and the top identifiers if
+            `return_scores` is True, otherwise a tensor with the top
+            identifiers.
+        """
+        pass
+
+    def compute_score(
+        self, query_embedding: types.Tensor, candidate_embedding: types.Tensor
+    ) -> types.Tensor:
+        """Computes the standard dot product score from queries and candidates.
+
+        Args:
+            query_embedding: Tensor of query embedding corresponding to the
+                queries for which to retrieve top candidates.
+            candidate_embedding: Tensor of candidate embeddings.
+
+        Returns:
+            The dot product of queries and candidates.
+        """
+        return keras.ops.matmul(
+            query_embedding, keras.ops.transpose(candidate_embedding)
+        )
+
+    def get_config(self) -> dict[str, Any]:
+        """Returns the layer config extended with `k` and `return_scores`."""
+        config: dict[str, Any] = super().get_config()
+        # Store the boolean flag itself, not the `compute_score` bound method.
+        config.update(
+            {
+                "k": self.k,
+                "return_scores": self.return_scores,
+            }
+        )
+        return config
diff --git a/keras_rs/src/layers/retrieval/retrieval_test.py b/keras_rs/src/layers/retrieval/retrieval_test.py
@@ -0,0 +1,66 @@
+import keras
+from absl.testing import parameterized
+
+from keras_rs.src import testing
+from keras_rs.src.layers.retrieval.retrieval import Retrieval
+
+
+class DummyRetrieval(Retrieval):
+    def update_candidates(self, candidate_embeddings, candidate_ids=None):
+        """No-op override; only needed to make the class concrete."""
+
+    def call(self, inputs):
+        """No-op override; only needed to make the class concrete."""
+
+
+class RetrievalTest(testing.TestCase, parameterized.TestCase):
+    def setUp(self):
+        self.layer = DummyRetrieval(k=5)
+
+    @parameterized.named_parameters(
+        ("embeddings_none", None, None, "`candidate_embeddings` is required."),
+        (
+            "embeddings_rank_1",
+            keras.random.normal(shape=(10,)),
+            None,
+            "`candidate_embeddings` must be a tensor of rank 2",
+        ),
+        (
+            "embeddings_smaller_than_k",
+            keras.random.normal(shape=(3, 10)),
+            None,
+            r"The number of candidates provided \(3\) is less than",
+        ),
+        (
+            "embeddings_ids_shape",
+            keras.random.normal(shape=(6, 10)),
+            keras.random.randint(shape=(4,), minval=0, maxval=3),
+            "The `candidate_embeddings` and `candidate_is` tensors must have "
+            "the same number of rows",
+        ),
+    )
+    def test_validate_candidate_embeddings_and_ids(
+        self, candidate_embeddings, candidate_ids, error_msg
+    ):
+        """Each invalid input raises the matching `ValueError`."""
+        validate = self.layer._validate_candidate_embeddings_and_ids
+        with self.assertRaisesRegex(ValueError, error_msg):
+            validate(candidate_embeddings, candidate_ids)
+
+    def test_call_not_overridden(self):
+        """Omitting `call()` blocks instantiation."""
+        class MissingCall(Retrieval):
+            def update_candidates(self, embeddings, ids=None):
+                pass
+
+        with self.assertRaises(TypeError):
+            MissingCall(k=5)
+
+    def test_update_candidates_not_overridden(self):
+        """Omitting `update_candidates()` blocks instantiation."""
+        class MissingUpdateCandidates(Retrieval):
+            def call(self, inputs):
+                pass
+
+        with self.assertRaises(TypeError):
+            MissingUpdateCandidates(k=5)

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@`
`17`	`17`	`from keras_rs.src.layers.retrieval.remove_accidental_hits import (`
`18`	`18`	`RemoveAccidentalHits,`
`19`	`19`	`)`
	`20`	`+from keras_rs.src.layers.retrieval.retrieval import Retrieval`
`20`	`21`	`from keras_rs.src.layers.retrieval.sampling_probability_correction import (`
`21`	`22`	`SamplingProbabilityCorrection,`
`22`	`23`	`)`