Skip to content

Commit fb9ee1f

Browse files
authored
feat: add vector store and integer indexing (#46)
* feat: add vector store and integer indexing
* add saving and loading of vector store
* fix: bug in loading
* Add tests
* tests: add additional tests
1 parent 71a9c29 commit fb9ee1f

5 files changed

Lines changed: 179 additions & 66 deletions

File tree

tests/conftest.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,7 @@ def query_vector() -> np.ndarray:
5050

5151

5252
# Create human-readable ids for each backend type
53-
BACKEND_IDS = [
54-
f"{backend.name}-{index_type}" if index_type else backend.name
55-
for backend, index_type in BACKEND_PARAMS
56-
]
53+
BACKEND_IDS = [f"{backend.name}-{index_type}" if index_type else backend.name for backend, index_type in BACKEND_PARAMS]
5754

5855

5956
@pytest.fixture(params=BACKEND_PARAMS)
@@ -63,9 +60,7 @@ def backend_type(request: pytest.FixtureRequest) -> Backend:
6360

6461

6562
@pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
66-
def vicinity_instance(
67-
request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray
68-
) -> Vicinity:
63+
def vicinity_instance(request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray) -> Vicinity:
6964
"""Fixture providing a Vicinity instance for each backend type."""
7065
backend_type, index_type = request.param
7166
# Handle FAISS backend with specific FAISS index types
@@ -91,3 +86,30 @@ def vicinity_instance(
9186
)
9287

9388
return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
89+
90+
91+
@pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
92+
def vicinity_instance_with_stored_vectors(
93+
request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray
94+
) -> Vicinity:
95+
"""Fixture providing a Vicinity instance for each backend type."""
96+
backend_type, index_type = request.param
97+
# Handle FAISS backend with specific FAISS index types
98+
if backend_type == Backend.FAISS:
99+
if index_type in ("pq", "ivfpq", "ivfpqr"):
100+
# Use smaller values for pq indexes since the dataset is small
101+
return Vicinity.from_vectors_and_items(
102+
vectors, items, backend_type=backend_type, index_type=index_type, m=2, nbits=4, store_vectors=True
103+
)
104+
else:
105+
return Vicinity.from_vectors_and_items(
106+
vectors, items, backend_type=backend_type, index_type=index_type, nlist=2, nbits=32, store_vectors=True
107+
)
108+
109+
return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type, store_vectors=True)
110+
111+
112+
@pytest.fixture()
113+
def vicinity_with_basic_backend(vectors: np.ndarray, items: list[str]) -> Vicinity:
114+
"""Fixture providing a BasicBackend instance."""
115+
return Vicinity.from_vectors_and_items(vectors, items, backend_type=Backend.BASIC, store_vectors=True)

tests/test_vicinity.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,48 @@ def test_vicinity_save_and_load(tmp_path: Path, vicinity_instance: Vicinity) ->
131131
"""
132132
save_path = tmp_path / "vicinity_data"
133133
vicinity_instance.save(save_path)
134+
assert vicinity_instance.vector_store is None
134135

135-
Vicinity.load(save_path)
136+
v = Vicinity.load(save_path)
137+
assert v.vector_store is None
138+
139+
140+
def test_vicinity_save_and_load_vector_store(tmp_path: Path, vicinity_instance_with_stored_vectors: Vicinity) -> None:
141+
"""
142+
Test Vicinity.save and Vicinity.load.
143+
144+
:param tmp_path: Temporary directory provided by pytest.
145+
:param vicinity_instance: A Vicinity instance.
146+
"""
147+
save_path = tmp_path / "vicinity_data"
148+
vicinity_instance_with_stored_vectors.save(save_path)
149+
150+
assert (save_path / "store").exists()
151+
assert (save_path / "store" / "vectors.npy").exists()
152+
153+
v = Vicinity.load(save_path)
154+
assert v.vector_store is not None
155+
156+
157+
def test_index_vector_store(vicinity_with_basic_backend: Vicinity, vectors: np.ndarray) -> None:
158+
"""
159+
Index vectors in the Vicinity instance.
160+
161+
:param vicinity_instance: A Vicinity instance.
162+
:param vectors: Array of vectors to index.
163+
"""
164+
v = vicinity_with_basic_backend.get_vector_by_index(0)
165+
assert np.allclose(v, vectors[0])
166+
167+
idx = [0, 1, 2, 3, 4, 10]
168+
v = vicinity_with_basic_backend.get_vector_by_index(idx)
169+
assert np.allclose(v, vectors[idx])
170+
171+
with pytest.raises(ValueError):
172+
vicinity_with_basic_backend.get_vector_by_index([10_000])
173+
174+
with pytest.raises(ValueError):
175+
vicinity_with_basic_backend.get_vector_by_index([-1])
136176

137177

138178
def test_vicinity_insert_duplicate(vicinity_instance: Vicinity, query_vector: np.ndarray) -> None:

vicinity/backends/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import Union
22

33
from vicinity.backends.base import AbstractBackend
4-
from vicinity.backends.basic import BasicBackend
4+
from vicinity.backends.basic import BasicBackend, BasicVectorStore
55
from vicinity.datatypes import Backend
66

77

@@ -39,4 +39,4 @@ def get_backend_class(backend: Union[Backend, str]) -> type[AbstractBackend]:
3939
return VoyagerBackend
4040

4141

42-
__all__ = ["get_backend_class", "AbstractBackend"]
42+
__all__ = ["get_backend_class", "AbstractBackend", "BasicVectorStore"]

vicinity/backends/basic.py

Lines changed: 74 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,58 @@ class BasicArgs(BaseArgs):
1818
metric: Metric = Metric.COSINE
1919

2020

21-
class BasicBackend(AbstractBackend[BasicArgs], ABC):
22-
argument_class = BasicArgs
23-
_vectors: npt.NDArray
24-
supported_metrics = {Metric.COSINE, Metric.EUCLIDEAN}
21+
class BasicVectorStore:
22+
def __init__(self, *, vectors: npt.NDArray, **kwargs: Any) -> None:
23+
"""
24+
A basic vector store that just stores vectors.
2525
26-
def __init__(self, arguments: BasicArgs) -> None:
27-
"""Initialize the backend."""
28-
super().__init__(arguments)
26+
Note that we use kwargs in order to use this class as a mixin.
2927
30-
def __len__(self) -> int:
31-
"""Get the number of vectors."""
32-
return self.vectors.shape[0]
28+
:param vectors: The vectors to store.
29+
:param **kwargs: Additional arguments. These are passed on to the super class.
30+
"""
31+
super().__init__(**kwargs)
32+
self._vectors = vectors
3333

34-
@property
35-
def backend_type(self) -> Backend:
36-
"""The type of the backend."""
37-
return Backend.BASIC
34+
def _update_precomputed_data(self) -> None:
35+
"""Update precomputed data based on the metric."""
36+
# NOTE: this is a no-op in the base implementation.
37+
return
38+
39+
def get_by_index(self, indices: list[int]) -> npt.NDArray:
40+
"""Get vectors by index."""
41+
return self._vectors[indices]
42+
43+
def insert(self, vectors: npt.NDArray) -> None:
44+
"""Insert vectors into the vector space."""
45+
self._vectors = np.vstack([self._vectors, vectors])
46+
self._update_precomputed_data()
47+
48+
def delete(self, indices: list[int]) -> None:
49+
"""Deletes specific indices from the vector space."""
50+
self._vectors = np.delete(self._vectors, indices, axis=0)
51+
self._update_precomputed_data()
52+
53+
def save(self, folder: Path) -> None:
54+
"""Save the vectors to a path."""
55+
path = folder / "vectors.npy"
56+
with open(path, "wb") as f:
57+
np.save(f, self._vectors)
58+
59+
@staticmethod
60+
def _load_vectors(folder: Path) -> npt.NDArray:
61+
"""Load the vectors from a path."""
62+
path = folder / "vectors.npy"
63+
with open(path, "rb") as f:
64+
vectors = np.load(f)
65+
66+
return vectors
67+
68+
@classmethod
69+
def load(cls, folder: Path) -> BasicVectorStore:
70+
"""Load the vectors from a path."""
71+
vectors = cls._load_vectors(folder)
72+
return cls(vectors=vectors)
3873

3974
@property
4075
def dim(self) -> int:
@@ -55,10 +90,24 @@ def vectors(self, x: Matrix) -> None:
5590
self._vectors = matrix
5691
self._update_precomputed_data()
5792

58-
@abstractmethod
59-
def _update_precomputed_data(self) -> None:
60-
"""Update precomputed data based on the metric."""
61-
raise NotImplementedError()
93+
94+
class BasicBackend(BasicVectorStore, AbstractBackend[BasicArgs], ABC):
95+
argument_class = BasicArgs
96+
_vectors: npt.NDArray
97+
supported_metrics = {Metric.COSINE, Metric.EUCLIDEAN}
98+
99+
def __init__(self, vectors: npt.NDArray, arguments: BasicArgs) -> None:
100+
"""Initialize the backend."""
101+
super().__init__(vectors=vectors, arguments=arguments)
102+
103+
def __len__(self) -> int:
104+
"""Get the number of vectors."""
105+
return self.vectors.shape[0]
106+
107+
@property
108+
def backend_type(self) -> Backend:
109+
"""The type of the backend."""
110+
return Backend.BASIC
62111

63112
@abstractmethod
64113
def _dist(self, x: npt.NDArray) -> npt.NDArray:
@@ -83,10 +132,8 @@ def from_vectors(cls, vectors: npt.NDArray, metric: Union[str, Metric] = "cosine
83132
@classmethod
84133
def load(cls, folder: Path) -> BasicBackend:
85134
"""Load the vectors from a path."""
86-
path = folder / "vectors.npy"
87135
arguments = BasicArgs.load(folder / "arguments.json")
88-
with open(path, "rb") as f:
89-
vectors = np.load(f)
136+
vectors = cls._load_vectors(folder)
90137
if arguments.metric == Metric.COSINE:
91138
return CosineBasicBackend(vectors, arguments)
92139
elif arguments.metric == Metric.EUCLIDEAN:
@@ -96,10 +143,8 @@ def load(cls, folder: Path) -> BasicBackend:
96143

97144
def save(self, folder: Path) -> None:
98145
"""Save the vectors to a path."""
99-
path = folder / "vectors.npy"
146+
super().save(folder)
100147
self.arguments.dump(folder / "arguments.json")
101-
with open(path, "wb") as f:
102-
np.save(f, self._vectors)
103148

104149
def threshold(
105150
self,
@@ -160,26 +205,12 @@ def query(
160205

161206
return out
162207

163-
def insert(self, vectors: npt.NDArray) -> None:
164-
"""Insert vectors into the vector space."""
165-
self._vectors = np.vstack([self._vectors, vectors])
166-
self._update_precomputed_data()
167-
168-
def delete(self, indices: list[int]) -> None:
169-
"""Deletes specific indices from the vector space."""
170-
self._vectors = np.delete(self._vectors, indices, axis=0)
171-
self._update_precomputed_data()
172-
173208

174209
class CosineBasicBackend(BasicBackend):
175210
def __init__(self, vectors: npt.NDArray, arguments: BasicArgs) -> None:
176211
"""Initialize the cosine basic backend."""
177-
super().__init__(arguments)
178-
self._vectors = normalize_or_copy(vectors)
179-
180-
def _update_precomputed_data(self) -> None:
181-
"""Update precomputed data for cosine similarity."""
182-
pass
212+
super().__init__(vectors=vectors, arguments=arguments)
213+
self._vectors = normalize_or_copy(self._vectors)
183214

184215
def _dist(self, x: npt.NDArray) -> npt.NDArray:
185216
"""Compute cosine distance."""
@@ -197,21 +228,12 @@ def insert(self, vectors: npt.NDArray) -> None:
197228
class EuclideanBasicBackend(BasicBackend):
198229
def __init__(self, vectors: npt.NDArray, arguments: BasicArgs) -> None:
199230
"""Initialize the Euclidean basic backend."""
200-
super().__init__(arguments)
201-
self._vectors = vectors
202-
self._squared_norm_vectors: npt.NDArray | None = None
203-
self._update_precomputed_data()
231+
super().__init__(vectors=vectors, arguments=arguments)
232+
self.squared_norm_vectors = (self._vectors**2).sum(1)
204233

205234
def _update_precomputed_data(self) -> None:
206235
"""Update precomputed data for Euclidean distance."""
207-
self._squared_norm_vectors = (self._vectors**2).sum(1)
208-
209-
@property
210-
def squared_norm_vectors(self) -> npt.NDArray:
211-
"""Return squared norms of vectors."""
212-
if self._squared_norm_vectors is None:
213-
self._squared_norm_vectors = (self._vectors**2).sum(1)
214-
return self._squared_norm_vectors
236+
self.squared_norm_vectors = (self._vectors**2).sum(1)
215237

216238
def _dist(self, x: npt.NDArray) -> npt.NDArray:
217239
"""Compute Euclidean distance."""

0 commit comments

Comments
 (0)