Skip to content

Commit fc26426

Browse files
feat: replace pytesseract with unstructured.pytesseract fork (#3528)
This PR reverts the `pytesseract` dependency to the `unstructured.pytesseract` fork due to the unavailability of some recent release versions of `pytesseract` on PyPI. This PR also addresses an issue encountered during the publication of `unstructured==0.15.4` to PyPI. The error occurred because PyPI does not allow direct dependencies on Version Control System URLs (such as GitHub) in the `install_requires` or `extras_require` sections of the `setup.py` file.
1 parent e64e095 commit fc26426

File tree

11 files changed

+30
-31
lines changed

11 files changed

+30
-31
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
## 0.15.5-dev1
1+
## 0.15.5
22

33
### Enhancements
44

55
### Features
66

77
### Fixes
88

9+
* **Revert to using `unstructured.pytesseract` fork**. Due to the unavailability of some recent release versions of `pytesseract` on PyPI, the project now uses the `unstructured.pytesseract` fork to ensure stability and continued support.
910
* **Bump `libreoffice` version in image.** Bumps the `libreoffice` version to `25.2.5.2` to address CVEs.
1011
* **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility.
1112

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ install-test:
4545
python3 -m pip install -r requirements/test.txt
4646
# NOTE(yao) - CI seems to always install tesseract to test so it would make sense to also require
4747
# pytesseract installation into the virtual env for testing
48-
python3 -m pip install pytesseract -c requirements/deps/constraints.txt
48+
python3 -m pip install unstructured_pytesseract
4949
# python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
5050
# NOTE(robinson) - Installing weaviate-client separately here because the requests
5151
# version conflicts with label_studio_sdk

requirements/extra-pdf-image.in

+1-3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,4 @@ effdet
1212
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
1313
# when unstructured library is.
1414
unstructured-inference==0.7.36
15-
# NOTE(christine): Pinned to a specific version of pytesseract from the GitHub repository.
16-
# Remove this pin and switch to the latest version from PyPI once version 0.3.13 or newer is officially released.
17-
pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13
15+
unstructured.pytesseract>=0.3.12

requirements/extra-pdf-image.txt

+4-4
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,8 @@ packaging==23.2
135135
# matplotlib
136136
# onnxruntime
137137
# pikepdf
138-
# pytesseract
139138
# transformers
139+
# unstructured-pytesseract
140140
pandas==2.2.2
141141
# via layoutparser
142142
pdf2image==1.17.0
@@ -159,8 +159,8 @@ pillow==10.4.0
159159
# pdfplumber
160160
# pikepdf
161161
# pillow-heif
162-
# pytesseract
163162
# torchvision
163+
# unstructured-pytesseract
164164
pillow-heif==0.18.0
165165
# via -r ./extra-pdf-image.in
166166
portalocker==2.10.1
@@ -201,8 +201,6 @@ pypdf==4.3.1
201201
# -r ./extra-pdf-image.in
202202
pypdfium2==4.30.0
203203
# via pdfplumber
204-
pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13
205-
# via -r ./extra-pdf-image.in
206204
python-dateutil==2.9.0.post0
207205
# via
208206
# -c ./base.txt
@@ -289,6 +287,8 @@ tzdata==2024.1
289287
# via pandas
290288
unstructured-inference==0.7.36
291289
# via -r ./extra-pdf-image.in
290+
unstructured-pytesseract==0.3.13
291+
# via -r ./extra-pdf-image.in
292292
urllib3==1.26.19
293293
# via
294294
# -c ././deps/constraints.txt

test_unstructured/partition/pdf_image/test_image.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77

88
import pytest
99
from PIL import Image
10-
from pytesseract import TesseractError
1110
from unstructured_inference.inference import layout
11+
from unstructured_pytesseract import TesseractError
1212

1313
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
1414
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path

test_unstructured/partition/pdf_image/test_ocr.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33

44
import numpy as np
55
import pandas as pd
6-
import pytesseract
76
import pytest
7+
import unstructured_pytesseract
88
from pdf2image.exceptions import PDFPageCountError
99
from PIL import Image, UnidentifiedImageError
1010
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@@ -70,7 +70,7 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
7070

7171
def test_get_ocr_layout_from_image_tesseract(monkeypatch):
7272
monkeypatch.setattr(
73-
pytesseract,
73+
unstructured_pytesseract,
7474
"image_to_data",
7575
lambda *args, **kwargs: pd.DataFrame(
7676
{
@@ -156,7 +156,7 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch):
156156

157157
def test_get_ocr_text_from_image_tesseract(monkeypatch):
158158
monkeypatch.setattr(
159-
pytesseract,
159+
unstructured_pytesseract,
160160
"image_to_string",
161161
lambda *args, **kwargs: "Hello World",
162162
)
@@ -443,7 +443,7 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
443443
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
444444
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
445445
monkeypatch.setattr(
446-
pytesseract,
446+
unstructured_pytesseract,
447447
"image_to_data",
448448
lambda *args, **kwargs: pd.DataFrame(
449449
{

test_unstructured/partition/pdf_image/test_pdf.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,7 @@ def test_partition_pdf_falls_back_to_fast(
384384
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
385385
):
386386
def mock_exists(dep):
387-
return dep not in ["unstructured_inference", "pytesseract"]
387+
return dep not in ["unstructured_inference", "unstructured_pytesseract"]
388388

389389
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
390390

@@ -406,7 +406,7 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
406406
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
407407
):
408408
def mock_exists(dep):
409-
return dep not in ["pytesseract"]
409+
return dep not in ["unstructured_pytesseract"]
410410

411411
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
412412

@@ -432,7 +432,7 @@ def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
432432
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
433433
):
434434
def mock_exists(dep):
435-
return dep not in ["pytesseract"]
435+
return dep not in ["unstructured_pytesseract"]
436436

437437
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
438438
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
@@ -584,7 +584,7 @@ def test_partition_pdf_fails_if_pdf_not_processable(
584584
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
585585
):
586586
def mock_exists(dep):
587-
return dep not in ["unstructured_inference", "pytesseract"]
587+
return dep not in ["unstructured_inference", "unstructured_pytesseract"]
588588

589589
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
590590
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
@@ -978,15 +978,15 @@ def test_partition_hi_res_model_name_default_to_None():
978978
[
979979
(
980980
PartitionStrategy.HI_RES,
981-
"pytesseract.image_to_data",
981+
"unstructured_pytesseract.image_to_data",
982982
),
983983
(
984984
PartitionStrategy.OCR_ONLY,
985-
"pytesseract.image_to_data",
985+
"unstructured_pytesseract.image_to_data",
986986
),
987987
(
988988
PartitionStrategy.OCR_ONLY,
989-
"pytesseract.image_to_string",
989+
"unstructured_pytesseract.image_to_string",
990990
),
991991
],
992992
)

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.15.5-dev1" # pragma: no cover
1+
__version__ = "0.15.5" # pragma: no cover

unstructured/partition/strategies.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def determine_pdf_or_image_strategy(
3131
):
3232
"""Determines what strategy to use for processing PDFs or images, accounting for fallback
3333
logic if some dependencies are not available."""
34-
pytesseract_installed = dependency_exists("pytesseract")
34+
pytesseract_installed = dependency_exists("unstructured_pytesseract")
3535
unstructured_inference_installed = dependency_exists("unstructured_inference")
3636

3737
if strategy == PartitionStrategy.AUTO:

unstructured/partition/utils/constants.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class PartitionStrategy:
4343

4444
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
4545

46-
# this field is defined by pytesseract
46+
# this field is defined by unstructured_pytesseract
4747
TESSERACT_TEXT_HEIGHT = "height"
4848

4949
TESSERACT_LANGUAGES_SPLITTER = "+"

unstructured/partition/utils/ocr_models/tesseract_ocr.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
import cv2
77
import numpy as np
88
import pandas as pd
9-
import pytesseract
9+
import unstructured_pytesseract
1010
from PIL import Image as PILImage
11-
from pytesseract import Output
11+
from unstructured_pytesseract import Output
1212

1313
from unstructured.logger import trace_logger
1414
from unstructured.partition.utils.config import env_config
@@ -40,14 +40,14 @@ def is_text_sorted(self):
4040
return True
4141

4242
def get_text_from_image(self, image: PILImage.Image) -> str:
43-
return pytesseract.image_to_string(np.array(image), lang=self.language)
43+
return unstructured_pytesseract.image_to_string(np.array(image), lang=self.language)
4444

4545
def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
4646
"""Get the OCR regions from image as a list of text regions with tesseract."""
4747

4848
trace_logger.detail("Processing entire page OCR with tesseract...")
4949
zoom = 1
50-
ocr_df: pd.DataFrame = pytesseract.image_to_data(
50+
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
5151
np.array(image),
5252
lang=self.language,
5353
output_type=Output.DATAFRAME,
@@ -76,7 +76,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
7676
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
7777
max_zoom,
7878
)
79-
ocr_df = pytesseract.image_to_data(
79+
ocr_df = unstructured_pytesseract.image_to_data(
8080
np.array(zoom_image(image, zoom)),
8181
lang=self.language,
8282
output_type=Output.DATAFRAME,
@@ -96,9 +96,9 @@ def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutE
9696
ocr_regions = self.get_layout_from_image(image)
9797

9898
# NOTE(christine): For tesseract, the ocr_text returned by
99-
# `pytesseract.image_to_string()` doesn't contain bounding box data but is
99+
# `unstructured_pytesseract.image_to_string()` doesn't contain bounding box data but is
100100
# well grouped. Conversely, the ocr_layout returned by parsing
101-
# `pytesseract.image_to_data()` contains bounding box data but is not well
101+
# `unstructured_pytesseract.image_to_data()` contains bounding box data but is not well
102102
# grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge
103103
# the text regions in each group to create a list of layout elements.
104104

0 commit comments

Comments
 (0)