fix: add language to OCRAgentGoogleVision constructor (#3696)

DavidBlore · christinestraub · web-flow · commit ecf0267b850a · 2024-10-14T05:35:05.000Z
This PR addresses issue #3659 by adding an optional `language` parameter to the `OCRAgentGoogleVision` class constructor. This parameter serves as a "language hint" for the `document_text_detection` method in the `ImageAnnotatorClient`. For more information on language hints, refer to the [Google Cloud Vision documentation](https://cloud.google.com/vision/docs/languages).

**Default Behavior:** The `language` parameter defaults to `None`, allowing Google Cloud Vision to auto-detect the language, as recommended in their documentation.

**Purpose:** This change is necessary because the `OCRAgent`'s `get_instance` method expects all `OCRAgent`s to include a `language` parameter in their constructors.

**Context on Issue:** When trying to parse a PDF with `OCR_AGENT=unstructured.partition.utils.ocr_models.google_vision_ocr.OCRAgentGoogleVision`, an error occurs in the `get_instance` method. The method expects a `language` parameter, which the current `OCRAgentGoogleVision` constructor does not support, leading to a positional argument error.

---------

Co-authored-by: Christine Straub <christinemstraub@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.15.15-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* **Add language parameter to `OCRAgentGoogleVision`.**  Introduces an optional language parameter in the `OCRAgentGoogleVision` constructor to serve as a language hint for `document_text_detection`. This ensures compatibility with the OCRAgent's `get_instance` method and resolves errors when parsing PDFs with Google Cloud Vision as the OCR agent.
+
 ## 0.15.14
 
 ### Enhancements
diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -1,4 +1,5 @@
 from collections import namedtuple
+from typing import Optional
 from unittest.mock import patch
 
 import numpy as np
@@ -226,12 +227,13 @@ def google_vision_client(google_vision_text_annotation):
     Response = namedtuple("Response", "full_text_annotation")
 
     class FakeGoogleVisionClient:
-        def document_text_detection(self, image):
+        def document_text_detection(self, image, image_context):
             return Response(full_text_annotation=google_vision_text_annotation)
 
     class OCRAgentFakeGoogleVision(OCRAgentGoogleVision):
-        def __init__(self):
+        def __init__(self, language: Optional[str] = None):
             self.client = FakeGoogleVisionClient()
+            self.language = language
 
     return OCRAgentFakeGoogleVision()
 
@@ -249,7 +251,7 @@ def test_get_layout_from_image_google_vision(google_vision_client):
     image = Image.new("RGB", (100, 100))
 
     ocr_agent = google_vision_client
-    regions = ocr_agent.get_layout_from_image(image, ocr_languages="eng")
+    regions = ocr_agent.get_layout_from_image(image)
     assert len(regions) == 1
     assert regions[0].text == "Hello World!"
     assert regions[0].source == Source.OCR_GOOGLEVISION
@@ -263,7 +265,7 @@ def test_get_layout_elements_from_image_google_vision(google_vision_client):
     image = Image.new("RGB", (100, 100))
 
     ocr_agent = google_vision_client
-    layout_elements = ocr_agent.get_layout_elements_from_image(image, ocr_languages="eng")
+    layout_elements = ocr_agent.get_layout_elements_from_image(image)
     assert len(layout_elements) == 1
 
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.14"  # pragma: no cover
+__version__ = "0.15.15-dev0"  # pragma: no cover
diff --git a/unstructured/partition/utils/ocr_models/google_vision_ocr.py b/unstructured/partition/utils/ocr_models/google_vision_ocr.py
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
 from io import BytesIO
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
-from google.cloud.vision import Image, ImageAnnotatorClient, Paragraph, TextAnnotation
+from google.cloud.vision import Image, ImageAnnotatorClient, ImageContext, Paragraph, TextAnnotation
 
 from unstructured.logger import logger, trace_logger
 from unstructured.partition.utils.config import env_config
@@ -19,7 +19,8 @@
 class OCRAgentGoogleVision(OCRAgent):
     """OCR service implementation for Google Vision API."""
 
-    def __init__(self) -> None:
+    def __init__(self, language: Optional[str] = None) -> None:
+        self.language = language
         client_options = {}
         api_endpoint = env_config.GOOGLEVISION_API_ENDPOINT
         if api_endpoint:
@@ -32,40 +33,40 @@ def __init__(self) -> None:
     def is_text_sorted(self) -> bool:
         return True
 
-    def get_text_from_image(self, image: PILImage.Image, ocr_languages: str = "eng") -> str:
+    def get_text_from_image(self, image: PILImage.Image) -> str:
+        image_context = ImageContext(language_hints=[self.language]) if self.language else None
         with BytesIO() as buffer:
             image.save(buffer, format="PNG")
-            response = self.client.document_text_detection(image=Image(content=buffer.getvalue()))
+            response = self.client.document_text_detection(
+                image=Image(content=buffer.getvalue()), image_context=image_context
+            )
         document = response.full_text_annotation
         assert isinstance(document, TextAnnotation)
         return document.text
 
-    def get_layout_from_image(
-        self, image: PILImage.Image, ocr_languages: str = "eng"
-    ) -> list[TextRegion]:
+    def get_layout_from_image(self, image: PILImage.Image) -> list[TextRegion]:
         trace_logger.detail("Processing entire page OCR with Google Vision API...")
+        image_context = ImageContext(language_hints=[self.language]) if self.language else None
         with BytesIO() as buffer:
             image.save(buffer, format="PNG")
-            response = self.client.document_text_detection(image=Image(content=buffer.getvalue()))
+            response = self.client.document_text_detection(
+                image=Image(content=buffer.getvalue()), image_context=image_context
+            )
         document = response.full_text_annotation
         assert isinstance(document, TextAnnotation)
         regions = self._parse_regions(document)
         return regions
 
-    def get_layout_elements_from_image(
-        self, image: PILImage.Image, ocr_languages: str = "eng"
-    ) -> list[LayoutElement]:
+    def get_layout_elements_from_image(self, image: PILImage.Image) -> list[LayoutElement]:
         from unstructured.partition.pdf_image.inference_utils import (
             build_layout_elements_from_ocr_regions,
         )
 
         ocr_regions = self.get_layout_from_image(
             image,
-            ocr_languages=ocr_languages,
         )
         ocr_text = self.get_text_from_image(
             image,
-            ocr_languages=ocr_languages,
         )
         layout_elements = build_layout_elements_from_ocr_regions(
             ocr_regions=ocr_regions,

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = "0.15.14" # pragma: no cover` |
|  | `1` | `+__version__ = "0.15.15-dev0" # pragma: no cover` |