
Commit f017f8a

Askir and cevian authored
chore: reduce default batch_size for loading_uri (#651)
* chore: reduce default batch_size for loading_uri

* chore: better batch_size docs.

---------

Co-authored-by: Matvey Arye <[email protected]>
1 parent 8bc2676 commit f017f8a

3 files changed: +19 −4 lines changed

docs/vectorizer/api-reference.md

+1-1
@@ -1284,7 +1284,7 @@ You use `ai.processing_default` to specify the concurrency and batch size for th
 
 |Name| Type | Default | Required | Description |
 |-|------|------------------------------|-|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-|batch_size| int | Determined by the vectorizer |✖| The number of items to process in each batch. The optimal batch size depends on your data and cloud function configuration, larger batch sizes can improve efficiency but may increase memory usage. |
+|batch_size| int | Determined by the vectorizer |✖| The number of items to process in each batch. The optimal batch size depends on your data and cloud function configuration, larger batch sizes can improve efficiency but may increase memory usage. The default is 1 for vectorizers that use document loading (`ai.loading_uri`) and 50 otherwise. |
 |concurrency| int | Determined by the vectorizer |✖| The number of concurrent processing tasks to run. The optimal concurrency depends on your cloud infrastructure and rate limits, higher concurrency can speed up processing but may increase costs and resource usage. |
 
 #### Returns

projects/pgai/pgai/vectorizer/processing.py

+1-1
@@ -26,7 +26,7 @@ class ProcessingDefault(BaseModel):
     """
 
     implementation: Literal["default"]
-    batch_size: Annotated[int, Gt(gt=0), Le(le=2048)] = 50
+    batch_size: int | None = None
     concurrency: Annotated[int, Gt(gt=0), Le(le=10)] = 1
     log_level: Literal[
         "CRITICAL",

projects/pgai/pgai/vectorizer/vectorizer.py

+17-2
@@ -905,6 +905,21 @@ async def _do_batch(self, conn: AsyncConnection) -> int:
 
         return len(items)
 
+    @cached_property
+    def _batch_size(self) -> int:
+        """Returns the batch size for processing.
+        Documents take way longer to process than simple text rows,
+        due to download and parsing overhead.
+        So when the vectorizer is processing documents
+        we use a smaller default batch size."""
+        if self.vectorizer.config.processing.batch_size is not None:
+            return max(1, min(self.vectorizer.config.processing.batch_size, 2048))
+        else:
+            if isinstance(self.vectorizer.config.loading, UriLoading):
+                return 1
+            else:
+                return 50
+
     async def _fetch_work(self, conn: AsyncConnection) -> list[SourceRow]:
         """
         Fetches a batch of tasks from the work queue table. Safe for concurrent use.
@@ -924,15 +939,15 @@ async def _fetch_work(self, conn: AsyncConnection) -> list[SourceRow]:
                 await cursor.execute(
                     self.queries.fetch_work_query_with_retries,
                     (
-                        self.vectorizer.config.processing.batch_size,
+                        self._batch_size,
                         queue_table_oid,
                     ),
                 )
             else:
                 await cursor.execute(
                     self.queries.fetch_work_query,
                     (
-                        self.vectorizer.config.processing.batch_size,
+                        self._batch_size,
                         queue_table_oid,
                     ),
                 )
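
Taken together, the new `_batch_size` property gives this resolution rule: an explicitly configured `batch_size` is clamped into [1, 2048]; when it is unset, the default is 1 for document loading (`UriLoading`, i.e. `ai.loading_uri`) and 50 for everything else. A standalone sketch of that rule, using hypothetical stand-in config classes rather than pgai's real ones:

```python
# A sketch of the resolution rule introduced by _batch_size, not the repo's code.
# UriLoading/RowLoading below are hypothetical stand-ins for pgai's loading configs.
from dataclasses import dataclass


@dataclass
class UriLoading:
    """Stand-in for document loading (ai.loading_uri)."""
    column_name: str


@dataclass
class RowLoading:
    """Stand-in for plain column loading (ai.loading_column)."""
    column_name: str


def effective_batch_size(configured: int | None, loading: object) -> int:
    """Resolve the batch size the worker requests from the work queue."""
    if configured is not None:
        # Explicit values are honored but clamped into [1, 2048].
        return max(1, min(configured, 2048))
    # Unset: documents carry download and parsing overhead, so default to a batch of 1;
    # plain text rows keep the previous default of 50.
    return 1 if isinstance(loading, UriLoading) else 50


assert effective_batch_size(None, UriLoading("uri")) == 1
assert effective_batch_size(None, RowLoading("body")) == 50
assert effective_batch_size(8, UriLoading("uri")) == 8
assert effective_batch_size(5000, RowLoading("body")) == 2048
```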
