feat: replace pytesseract with unstructured.pytesseract fork (#3528)

christinestraub · web-flow · commit fc26426310e4 · 2024-08-16T10:34:22.000-04:00
This PR reverts the `pytesseract` dependency to the `unstructured.pytesseract`
fork due to the unavailability of some recent release versions of
`pytesseract` on PyPI.

This PR also addresses an issue encountered during the publication of
`unstructured==0.15.4` to PyPI. The upload failed because PyPI does not
allow direct dependencies on Version Control System URLs (such as GitHub
repositories) in the `install_requires` or `extras_require` sections of the
`setup.py` file.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,12 @@
-## 0.15.5-dev1
+## 0.15.5
 
 ### Enhancements
 
 ### Features
 
 ### Fixes
 
+* **Revert to using `unstructured.pytesseract` fork**. Due to the unavailability of some recent release versions of `pytesseract` on PyPI, the project now uses the `unstructured.pytesseract` fork to ensure stability and continued support.
 * **Bump `libreoffice` version in image.** Bumps the `libreoffice` version to `25.2.5.2` to address CVEs.
 * **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility.
 
diff --git a/Makefile b/Makefile
@@ -45,7 +45,7 @@ install-test:
 	python3 -m pip install -r requirements/test.txt
 	# NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require
 	# pytesseract installation into the virtual env for testing
-	python3 -m pip install pytesseract -c requirements/deps/constraints.txt
+	python3 -m pip install unstructured_pytesseract
 	# python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
 	# NOTE(robinson) - Installing weaviate-client separately here because the requests
 	# version conflicts with label_studio_sdk
diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
@@ -12,6 +12,4 @@ effdet
 # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
 unstructured-inference==0.7.36
-# NOTE(christine): Pinned to a specific version of pytesseract from the GitHub repository.
-# Remove this pin and switch to the latest version from PyPI once version 0.3.13 or newer is officially released.
-pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13
+unstructured.pytesseract>=0.3.12
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
@@ -135,8 +135,8 @@ packaging==23.2
     #   matplotlib
     #   onnxruntime
     #   pikepdf
-    #   pytesseract
     #   transformers
+    #   unstructured-pytesseract
 pandas==2.2.2
     # via layoutparser
 pdf2image==1.17.0
@@ -159,8 +159,8 @@ pillow==10.4.0
     #   pdfplumber
     #   pikepdf
     #   pillow-heif
-    #   pytesseract
     #   torchvision
+    #   unstructured-pytesseract
 pillow-heif==0.18.0
     # via -r ./extra-pdf-image.in
 portalocker==2.10.1
@@ -201,8 +201,6 @@ pypdf==4.3.1
     #   -r ./extra-pdf-image.in
 pypdfium2==4.30.0
     # via pdfplumber
-pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13
-    # via -r ./extra-pdf-image.in
 python-dateutil==2.9.0.post0
     # via
     #   -c ./base.txt
@@ -289,6 +287,8 @@ tzdata==2024.1
     # via pandas
 unstructured-inference==0.7.36
     # via -r ./extra-pdf-image.in
+unstructured-pytesseract==0.3.13
+    # via -r ./extra-pdf-image.in
 urllib3==1.26.19
     # via
     #   -c ././deps/constraints.txt
diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py
@@ -7,8 +7,8 @@
 
 import pytest
 from PIL import Image
-from pytesseract import TesseractError
 from unstructured_inference.inference import layout
+from unstructured_pytesseract import TesseractError
 
 from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -3,8 +3,8 @@
 
 import numpy as np
 import pandas as pd
-import pytesseract
 import pytest
+import unstructured_pytesseract
 from pdf2image.exceptions import PDFPageCountError
 from PIL import Image, UnidentifiedImageError
 from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@@ -70,7 +70,7 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
 
 def test_get_ocr_layout_from_image_tesseract(monkeypatch):
     monkeypatch.setattr(
-        pytesseract,
+        unstructured_pytesseract,
         "image_to_data",
         lambda *args, **kwargs: pd.DataFrame(
             {
@@ -156,7 +156,7 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch):
 
 def test_get_ocr_text_from_image_tesseract(monkeypatch):
     monkeypatch.setattr(
-        pytesseract,
+        unstructured_pytesseract,
         "image_to_string",
         lambda *args, **kwargs: "Hello World",
     )
@@ -443,7 +443,7 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
     monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
     monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
     monkeypatch.setattr(
-        pytesseract,
+        unstructured_pytesseract,
         "image_to_data",
         lambda *args, **kwargs: pd.DataFrame(
             {
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -384,7 +384,7 @@ def test_partition_pdf_falls_back_to_fast(
     filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
 ):
     def mock_exists(dep):
-        return dep not in ["unstructured_inference", "pytesseract"]
+        return dep not in ["unstructured_inference", "unstructured_pytesseract"]
 
     monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
 
@@ -406,7 +406,7 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
     filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
 ):
     def mock_exists(dep):
-        return dep not in ["pytesseract"]
+        return dep not in ["unstructured_pytesseract"]
 
     monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
 
@@ -432,7 +432,7 @@ def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
     filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
 ):
     def mock_exists(dep):
-        return dep not in ["pytesseract"]
+        return dep not in ["unstructured_pytesseract"]
 
     monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
     monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
@@ -584,7 +584,7 @@ def test_partition_pdf_fails_if_pdf_not_processable(
     filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
 ):
     def mock_exists(dep):
-        return dep not in ["unstructured_inference", "pytesseract"]
+        return dep not in ["unstructured_inference", "unstructured_pytesseract"]
 
     monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
     monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
@@ -978,15 +978,15 @@ def test_partition_hi_res_model_name_default_to_None():
     [
         (
             PartitionStrategy.HI_RES,
-            "pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_data",
         ),
         (
             PartitionStrategy.OCR_ONLY,
-            "pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_data",
         ),
         (
             PartitionStrategy.OCR_ONLY,
-            "pytesseract.image_to_string",
+            "unstructured_pytesseract.image_to_string",
         ),
     ],
 )
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.5-dev1"  # pragma: no cover
+__version__ = "0.15.5"  # pragma: no cover
diff --git a/unstructured/partition/strategies.py b/unstructured/partition/strategies.py
@@ -31,7 +31,7 @@ def determine_pdf_or_image_strategy(
 ):
     """Determines what strategy to use for processing PDFs or images, accounting for fallback
     logic if some dependencies are not available."""
-    pytesseract_installed = dependency_exists("pytesseract")
+    pytesseract_installed = dependency_exists("unstructured_pytesseract")
     unstructured_inference_installed = dependency_exists("unstructured_inference")
 
     if strategy == PartitionStrategy.AUTO:
diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py
@@ -43,7 +43,7 @@ class PartitionStrategy:
 
 UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
 
-# this field is defined by pytesseract
+# this field is defined by unstructured_pytesseract
 TESSERACT_TEXT_HEIGHT = "height"
 
 TESSERACT_LANGUAGES_SPLITTER = "+"
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -6,9 +6,9 @@
 import cv2
 import numpy as np
 import pandas as pd
-import pytesseract
+import unstructured_pytesseract
 from PIL import Image as PILImage
-from pytesseract import Output
+from unstructured_pytesseract import Output
 
 from unstructured.logger import trace_logger
 from unstructured.partition.utils.config import env_config
@@ -40,14 +40,14 @@ def is_text_sorted(self):
         return True
 
     def get_text_from_image(self, image: PILImage.Image) -> str:
-        return pytesseract.image_to_string(np.array(image), lang=self.language)
+        return unstructured_pytesseract.image_to_string(np.array(image), lang=self.language)
 
     def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
         """Get the OCR regions from image as a list of text regions with tesseract."""
 
         trace_logger.detail("Processing entire page OCR with tesseract...")
         zoom = 1
-        ocr_df: pd.DataFrame = pytesseract.image_to_data(
+        ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
             np.array(image),
             lang=self.language,
             output_type=Output.DATAFRAME,
@@ -76,7 +76,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
                 np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
                 max_zoom,
             )
-            ocr_df = pytesseract.image_to_data(
+            ocr_df = unstructured_pytesseract.image_to_data(
                 np.array(zoom_image(image, zoom)),
                 lang=self.language,
                 output_type=Output.DATAFRAME,
@@ -96,9 +96,9 @@ def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutE
         ocr_regions = self.get_layout_from_image(image)
 
         # NOTE(christine): For tesseract, the ocr_text returned by
-        # `pytesseract.image_to_string()` doesn't contain bounding box data but is
+        # `unstructured_pytesseract.image_to_string()` doesn't contain bounding box data but is
         # well grouped. Conversely, the ocr_layout returned by parsing
-        # `pytesseract.image_to_data()` contains bounding box data but is not well
+        # `unstructured_pytesseract.image_to_data()` contains bounding box data but is not well
         # grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge
         # the text regions in each group to create a list of layout elements.
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.15.5-dev1" # pragma: no cover`
	`1`	`+__version__ = "0.15.5" # pragma: no cover`