
Commit b2878e5

Author: Dorin POMIAN (committed)

fix: Insert documents in batches for Azure AI Search vector store

Parent: 181c42c

2 files changed: 64 additions & 16 deletions
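The fix replaces a single upload_documents call with fixed-size batches of at most 50 documents, since Azure AI Search rejects upload requests that exceed its per-batch limits. As a rough standalone sketch of the same pattern (the endpoint, key, index name, and documents below are hypothetical placeholders, not values from this repository):

    from azure.core.credentials import AzureKeyCredential
    from azure.search.documents import SearchClient

    # Hypothetical connection details, for illustration only.
    client = SearchClient(
        endpoint="https://<service>.search.windows.net",
        index_name="<index>",
        credential=AzureKeyCredential("<api-key>"),
    )

    documents = [{"id": str(i), "content": f"doc {i}"} for i in range(120)]

    batch_size = 50  # mirrors the cap introduced by this commit
    results = []
    for start in range(0, len(documents), batch_size):
        batch = documents[start : start + batch_size]
        # upload_documents returns one IndexingResult per document
        results.extend(client.upload_documents(documents=batch))

    print(sum(r.succeeded for r in results), "of", len(results), "succeeded")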


src/docs2vecs/subcommands/indexer/skills/ada002_embedding_skill.py (16 additions & 5 deletions)
@@ -1,5 +1,4 @@
-from typing import List
-from typing import Optional
+from typing import List, Optional
 
 from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
 
@@ -22,13 +21,25 @@ def az_ada002_embeddings(self, content: str):
         return embed_model.get_query_embedding(content)
 
     def run(self, input: Optional[List[Document]] = None) -> Optional[List[Document]]:
-        self.logger.info("Running AzureAda002EmbeddingSkill...")
-        self.logger.info(f"Number of documents: {len(input)}")
+        self.logger.info(
+            f"Running Azure Embedding Skill with deployment name: {self._config['deployment_name']}..."
+        )
+
+        docs_count = len(input)
+        chunks_count = sum(len(doc.chunks) for doc in input)
+
+        self.logger.info(
+            f"Processing a total of documents: {docs_count}. Total number of chunks: {chunks_count}"
+        )
 
         for doc in input:
             self.logger.debug(f"Processing document: {doc.filename}")
             for chunk in doc.chunks:
                 self.logger.debug(f"Creating embedding for chunk: {chunk.chunk_id}")
-                chunk.embedding = "" if not chunk.content else self.az_ada002_embeddings(chunk.content)
+                chunk.embedding = (
+                    ""
+                    if not chunk.content
+                    else self.az_ada002_embeddings(chunk.content)
+                )
 
         return input
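For reference, a minimal sketch of the embedding call this skill wraps, using llama_index's AzureOpenAIEmbedding; the deployment and connection values below are hypothetical placeholders:

    from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

    # Hypothetical Azure OpenAI connection values, for illustration only.
    embed_model = AzureOpenAIEmbedding(
        model="text-embedding-ada-002",
        deployment_name="<deployment-name>",
        api_key="<api-key>",
        azure_endpoint="https://<resource>.openai.azure.com/",
        api_version="2024-02-01",
    )

    embedding = embed_model.get_query_embedding("hello world")
    print(len(embedding))  # ada-002 embeddings have 1536 dimensions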

src/docs2vecs/subcommands/indexer/skills/azure_vector_store_skill.py (48 additions & 11 deletions)
@@ -1,5 +1,4 @@
-from typing import List
-from typing import Optional
+from typing import List, Optional
 
 from azure.core.credentials import AzureKeyCredential
 from azure.identity import DefaultAzureCredential
@@ -15,18 +14,34 @@
 
 
 class AzureVectorStoreSkill(IndexerSkill):
-    def __init__(self, config: dict, global_config: Config, vector_store_tracker: VectorStoreTracker = None):
+    def __init__(
+        self,
+        config: dict,
+        global_config: Config,
+        vector_store_tracker: VectorStoreTracker = None,
+    ):
         super().__init__(config, global_config)
         self._vector_store_tracker = vector_store_tracker
         self._overwrite_index = self._config.get("overwrite_index", False)
 
-        az_credential = AzureKeyCredential(self._config.get("api_key", "")) if self._config.get("api_key", "") else DefaultAzureCredential()
+        az_credential = (
+            AzureKeyCredential(self._config.get("api_key", ""))
+            if self._config.get("api_key", "")
+            else DefaultAzureCredential()
+        )
         self._search_client = SearchClient(
             endpoint=self._config.get("endpoint"),
             index_name=self._config.get("index_name"),
             credential=az_credential,
         )
-        self._index_client = SearchIndexClient(endpoint=self._config.get("endpoint"), credential=az_credential)
+        self._index_client = SearchIndexClient(
+            endpoint=self._config.get("endpoint"), credential=az_credential
+        )
+
+        max_batch_size = 50
+        self._config["batch_size"] = min(
+            max(1, self._config.get("batch_size", max_batch_size)), max_batch_size
+        )
 
     def _upload_embeddings(self, chunks: List[Chunk]):
         field_mapping = self._config.get("field_mapping", {})
@@ -38,21 +53,38 @@ def _upload_embeddings(self, chunks: List[Chunk]):
         results = []
         if chunks:
             az_ai_search_documents = [
-                {field_mapping[key]: getattr(chunk, key) for key in field_mapping if hasattr(chunk, key)} for chunk in chunks
+                {
+                    field_mapping[key]: getattr(chunk, key)
+                    for key in field_mapping
+                    if hasattr(chunk, key)
+                }
+                for chunk in chunks
             ]
 
-            results = self._search_client.upload_documents(documents=az_ai_search_documents)
+            start_idx = 0
+            batch_size = self._config.get("batch_size")
+
+            while start_idx < len(az_ai_search_documents):
+                batch = az_ai_search_documents[start_idx : start_idx + batch_size]
+                results.extend(self._search_client.upload_documents(documents=batch))
+                start_idx += batch_size
 
         return results
 
     def _update_tracker(self, chunks: List[Chunk], results: List[IndexingResult]):
         if self._vector_store_tracker:
             self._vector_store_tracker.update_documents(chunks, results)
 
-    def _log_upload_results(self, chunk_id_list: List[str], results: List[IndexingResult]):
+    def _log_upload_results(
+        self, chunk_id_list: List[str], results: List[IndexingResult]
+    ):
         if self.logger:
             res = [
-                {"chunk_id": chunk_id, "succeeded": result.succeeded, "status_code": result.status_code}
+                {
+                    "chunk_id": chunk_id,
+                    "succeeded": result.succeeded,
+                    "status_code": result.status_code,
+                }
                 for chunk_id, result in zip(chunk_id_list, results)
             ]
             self.logger.debug(f"Azure AI Search upload results: {res}")
@@ -65,7 +97,9 @@ def _cleanup_index(self):
 
         # First search for all documents
         results = self._search_client.search(
-            search_text="*", select=[key_field], include_total_count=True  # Only get the key field as that's all we need for deletion
+            search_text="*",
+            select=[key_field],
+            include_total_count=True,  # Only get the key field as that's all we need for deletion
         )
 
         # Get all document IDs using the correct key field
@@ -84,7 +118,10 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
         chunks = {}
 
         if self._vector_store_tracker:
-            chunks = {chunk.document_id: chunk for chunk in self._vector_store_tracker.retrieve_failed_documents()}
+            chunks = {
+                chunk.document_id: chunk
+                for chunk in self._vector_store_tracker.retrieve_failed_documents()
+            }
 
         self.logger.debug(f"Going to process {len(input)} documents")
         for doc in input:
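The batch_size clamp in __init__ forces any configured value into the range [1, 50] and falls back to 50 when the key is absent. A quick illustration of the expression's behavior:

    max_batch_size = 50
    for requested in (0, 1, 25, 50, 500):
        clamped = min(max(1, requested), max_batch_size)
        print(requested, "->", clamped)
    # 0 -> 1, 1 -> 1, 25 -> 25, 50 -> 50, 500 -> 50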
