Skip to content

Commit 582a9e9

Browse files
Fix Pinecone re-indexing and Postgres data-deletion issues in test_connection (#49)
* Support run_id * Refactoring changes * Update version
1 parent a3660c8 commit 582a9e9

File tree

10 files changed

+261
-182
lines changed

10 files changed

+261
-182
lines changed

src/unstract/adapters/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.15.1"
1+
__version__ = "0.16.0"
22

33

44
import logging

src/unstract/adapters/base.py

-25
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,5 @@
11
import logging
22
from abc import ABC, abstractmethod
3-
from typing import Any, Union
4-
5-
from llama_index.core import MockEmbedding
6-
from llama_index.core.embeddings import BaseEmbedding
7-
from llama_index.core.llms import LLM, MockLLM
8-
from llama_index.core.vector_stores import SimpleVectorStore
9-
from llama_index.core.vector_stores.types import (
10-
BasePydanticVectorStore,
11-
VectorStore,
12-
)
133

144
from unstract.adapters.enums import AdapterTypes
155

@@ -50,21 +40,6 @@ def get_json_schema() -> str:
5040
def get_adapter_type() -> AdapterTypes:
5141
return ""
5242

53-
def get_llm_instance(self, llm_config: dict[str, Any]) -> LLM:
54-
# Overriding implementations use llm_config
55-
return MockLLM()
56-
57-
def get_vector_db_instance(
58-
self, vector_db_config: dict[str, Any]
59-
) -> Union[BasePydanticVectorStore, VectorStore]:
60-
# Overriding implementations use vector_db_config
61-
return SimpleVectorStore()
62-
63-
def get_embedding_instance(
64-
self, embed_config: dict[str, Any]
65-
) -> BaseEmbedding:
66-
return MockEmbedding(embed_dim=1)
67-
6843
@abstractmethod
6944
def test_connection(self) -> bool:
7045
"""Override to test connection for an adapter.

src/unstract/adapters/vectordb/constants.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@ class VectorDbConstants:
22
VECTOR_DB_NAME = "collection_name"
33
EMBEDDING_DIMENSION = "embedding_dimension"
44
DEFAULT_VECTOR_DB_NAME = "unstract"
5-
DEFAULT_EMBEDDING_SIZE = 1536
5+
DEFAULT_EMBEDDING_SIZE = 1
66
TEST_CONNECTION_EMBEDDING_SIZE = 1

src/unstract/adapters/vectordb/milvus/src/milvus.py

+23-15
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@ class Constants:
1919

2020
class Milvus(VectorDBAdapter):
2121
def __init__(self, settings: dict[str, Any]):
22-
super().__init__("Milvus")
23-
self.config = settings
24-
self.client: Optional[MilvusClient] = None
25-
self.collection_name: str = VectorDbConstants.DEFAULT_VECTOR_DB_NAME
22+
self._config = settings
23+
self._client: Optional[MilvusClient] = None
24+
self._collection_name: str = VectorDbConstants.DEFAULT_VECTOR_DB_NAME
25+
self._vector_db_instance = self._get_vector_db_instance()
26+
super().__init__("Milvus", self._vector_db_instance)
2627

2728
@staticmethod
2829
def get_id() -> str:
@@ -48,36 +49,43 @@ def get_json_schema() -> str:
4849
return schema
4950

5051
def get_vector_db_instance(self) -> VectorStore:
52+
return self._vector_db_instance
53+
54+
def _get_vector_db_instance(self) -> VectorStore:
5155
try:
52-
self.collection_name = VectorDBHelper.get_collection_name(
53-
self.config.get(VectorDbConstants.VECTOR_DB_NAME),
54-
self.config.get(VectorDbConstants.EMBEDDING_DIMENSION),
56+
self._collection_name = VectorDBHelper.get_collection_name(
57+
self._config.get(VectorDbConstants.VECTOR_DB_NAME),
58+
self._config.get(VectorDbConstants.EMBEDDING_DIMENSION),
5559
)
56-
dimension = self.config.get(
60+
dimension = self._config.get(
5761
VectorDbConstants.EMBEDDING_DIMENSION,
5862
VectorDbConstants.DEFAULT_EMBEDDING_SIZE,
5963
)
6064
vector_db: VectorStore = MilvusVectorStore(
61-
uri=self.config.get(Constants.URI, ""),
62-
collection_name=self.collection_name,
63-
token=self.config.get(Constants.TOKEN, ""),
65+
uri=self._config.get(Constants.URI, ""),
66+
collection_name=self._collection_name,
67+
token=self._config.get(Constants.TOKEN, ""),
6468
dim=dimension,
6569
)
6670
if vector_db is not None:
67-
self.client = vector_db.client
71+
self._client = vector_db.client
6872
return vector_db
6973
except Exception as e:
7074
raise AdapterError(str(e))
7175

7276
def test_connection(self) -> bool:
73-
self.config[VectorDbConstants.EMBEDDING_DIMENSION] = (
77+
self._config[VectorDbConstants.EMBEDDING_DIMENSION] = (
7478
VectorDbConstants.TEST_CONNECTION_EMBEDDING_SIZE
7579
)
7680
vector_db = self.get_vector_db_instance()
7781
test_result: bool = VectorDBHelper.test_vector_db_instance(
7882
vector_store=vector_db
7983
)
8084
# Delete the collection that was created for testing
81-
if self.client is not None:
82-
self.client.drop_collection(self.collection_name)
85+
if self._client is not None:
86+
self._client.drop_collection(self._collection_name)
8387
return test_result
88+
89+
def close(self, **kwargs: Any) -> None:
90+
if self._client:
91+
self._client.close()

src/unstract/adapters/vectordb/pinecone/src/pinecone.py

+92-54
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
from typing import Any, Optional
44

5+
from llama_index.core.schema import BaseNode
56
from llama_index.core.vector_stores.types import BasePydanticVectorStore
67
from llama_index.vector_stores.pinecone import PineconeVectorStore
78
from pinecone import NotFoundException
@@ -33,10 +34,11 @@ class Constants:
3334

3435
class Pinecone(VectorDBAdapter):
3536
def __init__(self, settings: dict[str, Any]):
36-
super().__init__("Pinecone")
37-
self.config = settings
38-
self.client: Optional[LLamaIndexPinecone] = None
39-
self.collection_name: str = VectorDbConstants.DEFAULT_VECTOR_DB_NAME
37+
self._config = settings
38+
self._client: Optional[LLamaIndexPinecone] = None
39+
self._collection_name: str = VectorDbConstants.DEFAULT_VECTOR_DB_NAME
40+
self._vector_db_instance = self._get_vector_db_instance()
41+
super().__init__("Pinecone", self._vector_db_instance)
4042

4143
@staticmethod
4244
def get_id() -> str:
@@ -62,65 +64,101 @@ def get_json_schema() -> str:
6264
return schema
6365

6466
def get_vector_db_instance(self) -> BasePydanticVectorStore:
65-
try:
66-
self.client = LLamaIndexPinecone(
67-
api_key=str(self.config.get(Constants.API_KEY))
68-
)
69-
collection_name = VectorDBHelper.get_collection_name(
70-
self.config.get(VectorDbConstants.VECTOR_DB_NAME),
71-
self.config.get(VectorDbConstants.EMBEDDING_DIMENSION),
72-
)
73-
self.collection_name = collection_name.replace("_", "-").lower()
74-
dimension = self.config.get(
75-
VectorDbConstants.EMBEDDING_DIMENSION,
76-
VectorDbConstants.DEFAULT_EMBEDDING_SIZE,
77-
)
67+
return self._vector_db_instance
7868

79-
specification = self.config.get(Constants.SPECIFICATION)
80-
if specification == Constants.SPEC_POD:
81-
environment = self.config.get(Constants.ENVIRONMENT)
82-
spec = PodSpec(
83-
environment=environment,
84-
replicas=Constants.DEFAULT_SPEC_COUNT_VALUE,
85-
shards=Constants.DEFAULT_SPEC_COUNT_VALUE,
86-
pods=Constants.DEFAULT_SPEC_COUNT_VALUE,
87-
pod_type=Constants.DEFAULT_POD_TYPE,
88-
)
89-
elif specification == Constants.SPEC_SERVERLESS:
90-
cloud = self.config.get(Constants.CLOUD)
91-
region = self.config.get(Constants.REGION)
92-
spec = ServerlessSpec(cloud=cloud, region=region)
93-
logger.info(f"Setting up Pinecone spec for {spec}")
94-
try:
95-
self.client.describe_index(name=self.collection_name)
96-
except NotFoundException:
97-
logger.info(
98-
f"Index:{self.collection_name} does not exist. Creating it."
99-
)
100-
self.client.create_index(
101-
name=self.collection_name,
102-
dimension=dimension,
103-
metric=Constants.METRIC,
104-
spec=spec,
105-
)
106-
vector_db: BasePydanticVectorStore = PineconeVectorStore(
107-
index_name=self.collection_name,
108-
api_key=str(self.config.get(Constants.API_KEY)),
109-
environment=str(self.config.get(Constants.ENVIRONMENT)),
69+
def _get_vector_db_instance(self) -> BasePydanticVectorStore:
70+
71+
self._client = LLamaIndexPinecone(
72+
api_key=str(self._config.get(Constants.API_KEY))
73+
)
74+
collection_name = VectorDBHelper.get_collection_name(
75+
self._config.get(VectorDbConstants.VECTOR_DB_NAME),
76+
self._config.get(VectorDbConstants.EMBEDDING_DIMENSION),
77+
)
78+
self._collection_name = collection_name.replace("_", "-").lower()
79+
dimension = self._config.get(
80+
VectorDbConstants.EMBEDDING_DIMENSION,
81+
VectorDbConstants.DEFAULT_EMBEDDING_SIZE,
82+
)
83+
84+
specification = self._config.get(Constants.SPECIFICATION)
85+
if specification == Constants.SPEC_POD:
86+
environment = self._config.get(Constants.ENVIRONMENT)
87+
spec = PodSpec(
88+
environment=environment,
89+
replicas=Constants.DEFAULT_SPEC_COUNT_VALUE,
90+
shards=Constants.DEFAULT_SPEC_COUNT_VALUE,
91+
pods=Constants.DEFAULT_SPEC_COUNT_VALUE,
92+
pod_type=Constants.DEFAULT_POD_TYPE,
11093
)
111-
return vector_db
112-
except Exception as e:
113-
raise AdapterError(str(e))
94+
elif specification == Constants.SPEC_SERVERLESS:
95+
cloud = self._config.get(Constants.CLOUD)
96+
region = self._config.get(Constants.REGION)
97+
spec = ServerlessSpec(cloud=cloud, region=region)
98+
logger.info(f"Setting up Pinecone spec for {spec}")
99+
try:
100+
self._client.describe_index(name=self._collection_name)
101+
except NotFoundException:
102+
logger.info(f"Index:{self._collection_name} does not exist. Creating it.")
103+
self._client.create_index(
104+
name=self._collection_name,
105+
dimension=dimension,
106+
metric=Constants.METRIC,
107+
spec=spec,
108+
)
109+
self.vector_db: BasePydanticVectorStore = PineconeVectorStore(
110+
index_name=self._collection_name,
111+
api_key=str(self._config.get(Constants.API_KEY)),
112+
environment=str(self._config.get(Constants.ENVIRONMENT)),
113+
)
114+
return self.vector_db
114115

115116
def test_connection(self) -> bool:
116-
self.config[VectorDbConstants.EMBEDDING_DIMENSION] = (
117+
self._config[VectorDbConstants.EMBEDDING_DIMENSION] = (
117118
VectorDbConstants.TEST_CONNECTION_EMBEDDING_SIZE
118119
)
119120
vector_db = self.get_vector_db_instance()
120121
test_result: bool = VectorDBHelper.test_vector_db_instance(
121122
vector_store=vector_db
122123
)
123124
# Delete the collection that was created for testing
124-
if self.client:
125-
self.client.delete_index(self.collection_name)
125+
if self._client:
126+
self._client.delete_index(self._collection_name)
126127
return test_result
128+
129+
def close(self, **kwargs: Any) -> None:
130+
# Closing the connection is not defined for this client
131+
pass
132+
133+
def delete(self, ref_doc_id: str, **delete_kwargs: dict[Any, Any]) -> None:
134+
specification = self._config.get(Constants.SPECIFICATION)
135+
if specification == Constants.SPEC_SERVERLESS:
136+
# To delete all records representing chunks of a single document,
137+
# first list the record IDs based on their common ID prefix,
138+
# and then delete the records by ID:
139+
try:
140+
index = self._client.Index(self._collection_name) # type: ignore
141+
# Get all record having the ref_doc_id and delete them
142+
for ids in index.list(prefix=ref_doc_id):
143+
logger.info(ids)
144+
index.delete(ids=ids)
145+
except Exception as e:
146+
raise AdapterError(str(e))
147+
elif specification == Constants.SPEC_POD:
148+
if self.vector_db.environment == "gcp-starter": # type: ignore
149+
raise AdapterError(
150+
"Re-indexing is not supported on Starter indexes. "
151+
"Use Serverless or paid plan for Pod spec"
152+
)
153+
else:
154+
super().delete(ref_doc_id=ref_doc_id, **delete_kwargs)
155+
156+
def add(
157+
self,
158+
ref_doc_id: str,
159+
nodes: list[BaseNode],
160+
) -> list[str]:
161+
for i, node in enumerate(nodes):
162+
node_id = ref_doc_id + "-" + node.node_id
163+
nodes[i].id_ = node_id
164+
return self.vector_db.add(nodes=nodes)

0 commit comments

Comments
 (0)