@@ -33,6 +33,8 @@
 
 DEFAULT_BATCH_SIZE = 200
 
+SEM_MAX_CONCURRENT = 10
+
 _logger = logging.getLogger(__name__)
 
 
@@ -98,6 +100,14 @@ def _to_pinecone_filter(standard_filters: MetadataFilters) -> dict:
     return filters
 
 
+async def async_upload(index, vectors, batch_size, semaphore):
+    async def send_batch(batch):
+        async with semaphore:
+            return await asyncio.to_thread(index.upsert, batch, async_req=True)
+
+    await asyncio.gather(*[send_batch(chunk) for chunk in iter_batch(vectors, size=batch_size)])
+
+
 import_err_msg = (
     "`pinecone` package not found, please run `pip install pinecone-client`"
 )
@@ -224,39 +234,42 @@ def from_params(
     def class_name(cls) -> str:
         return "PinconeVectorStore"
 
-    def add(
-        self,
-        nodes: List[BaseNode],
-    ) -> List[str]:
-        """Add nodes to index.
-
-        Args:
-            nodes: List[BaseNode]: list of nodes with embeddings
-
-        """
-        ids = []
+    def _prepare_entries_for_upsert(self, nodes: List[BaseNode]) -> List[Dict]:
         entries = []
         for node in nodes:
-            node_id = node.node_id
-
             metadata = node_to_metadata_dict(
                 node, remove_text=False, flat_metadata=self.flat_metadata
             )
 
             entry = {
-                ID_KEY: node_id,
+                ID_KEY: node.node_id,
                 VECTOR_KEY: node.get_embedding(),
                 METADATA_KEY: metadata,
             }
-            if self.add_sparse_vector and self._tokenizer is not None:
+
+            if self.add_sparse_vector:
                 sparse_vector = generate_sparse_vectors(
                     [node.get_content(metadata_mode=MetadataMode.EMBED)],
                     self._tokenizer,
                 )[0]
                 entry[SPARSE_VECTOR_KEY] = sparse_vector
 
-            ids.append(node_id)
             entries.append(entry)
+
+        return entries
+
+    def add(
+        self,
+        nodes: List[BaseNode],
+    ) -> List[str]:
+        """Add nodes to index.
+
+        Args:
+            nodes: List[BaseNode]: list of nodes with embeddings
+
+        """
+
+        entries = self._prepare_entries_for_upsert(nodes)
 
         [
             self._pinecone_index.upsert(
@@ -266,7 +279,7 @@ def add(
             for batch in iter_batch(entries, self.batch_size)
         ]
 
-        return ids
+        return [entry[ID_KEY] for entry in entries]
 
     async def async_add(
         self,
@@ -280,7 +293,14 @@ async def async_add(
         Returns:
             List[str]: List of IDs of the added documents.
         """
-        return await asyncio.to_thread(self.add, nodes)  # type: ignore
+
+        entries = self._prepare_entries_for_upsert(nodes)
+
+        semaphore = asyncio.Semaphore(SEM_MAX_CONCURRENT)
+        await async_upload(self._pinecone_index, entries, DEFAULT_BATCH_SIZE, semaphore)
+
+        return [entry[ID_KEY] for entry in entries]
+
 
     def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """