Skip to content

Commit 392cccd

Browse files
MthwRobinson and qued authored
enhancement: add ocr_only strategy for partition_image (#540)
* spike for ocr-only strategy for images * fix for file processing * extra space * add korean to ci * added test for ocr_only strategy * added docs for ocr_only * changelog and version * added test for bad strategy * skip korean test if in docker * bump version * version bump * document valid strategies * bump version for release --------- Co-authored-by: qued <[email protected]>
1 parent fae5f8f commit 392cccd

File tree

7 files changed

+103
-15
lines changed

7 files changed

+103
-15
lines changed

Diff for: .github/workflows/ci.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ jobs:
107107
sudo apt-get update
108108
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
109109
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
110-
sudo apt-get install -y tesseract-ocr
110+
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
111111
tesseract --version
112112
make test
113113
make check-coverage

Diff for: CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
## 0.6.3-dev3
1+
## 0.6.3
22

33
### Enhancements
44

5+
* Add an "ocr_only" strategy for `partition_image`.
6+
57
### Features
68

79
* Added `partition_multiple_via_api` for partitioning multiple documents in a single REST

Diff for: docs/source/bricks.rst

+14
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,20 @@ Examples:
430430
elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe")
431431
432432
433+
The default partitioning strategy for ``partition_image`` is ``"hi_res"``, which segments the document using
434+
``detectron2`` and then OCRs the document. You can also choose ``"ocr_only"`` as the partitioning strategy,
435+
which OCRs the document and then runs the output through ``partition_text``. This can be helpful
436+
if ``detectron2`` does not detect a text element in the image. To run the example below, ensure you
437+
have the Korean language pack for Tesseract installed on your system.
438+
439+
440+
.. code:: python
441+
442+
from unstructured.partition.image import partition_image
443+
444+
filename = "example-docs/english-and-korean.png"
445+
elements = partition_image(filename=filename, ocr_languages="eng+kor", strategy="ocr_only")
446+
433447
434448
``partition_email``
435449
---------------------

Diff for: example-docs/english-and-korean.png

298 KB
Loading

Diff for: test_unstructured/partition/test_image.py

+41
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
1+
import os
2+
import pathlib
13
from unittest import mock
24

35
import pytest
46
import requests
57
from pytesseract import TesseractError
68
from unstructured_inference.inference import layout
79

10+
from unstructured.documents.elements import Title
811
from unstructured.partition import image, pdf
912

13+
DIRECTORY = pathlib.Path(__file__).parent.resolve()
14+
15+
is_in_docker = os.path.exists("/.dockerenv")
16+
1017

1118
class MockResponse:
1219
def __init__(self, status_code, response):
@@ -178,3 +185,37 @@ def test_partition_image_from_file_with_language_passed(filename="example-docs/e
178185
def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"):
179186
with pytest.raises(TesseractError):
180187
image.partition_image(filename=filename, ocr_languages="fakeroo")
188+
189+
190+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
191+
def test_partition_image_with_ocr_detects_korean():
192+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
193+
elements = image.partition_image(
194+
filename=filename,
195+
ocr_languages="eng+kor",
196+
strategy="ocr_only",
197+
)
198+
199+
assert elements[0] == Title("RULES AND INSTRUCTIONS")
200+
assert elements[3].text.startswith("안녕하세요")
201+
202+
203+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
204+
def test_partition_image_with_ocr_detects_korean_from_file():
205+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
206+
207+
with open(filename, "rb") as f:
208+
elements = image.partition_image(
209+
file=f,
210+
ocr_languages="eng+kor",
211+
strategy="ocr_only",
212+
)
213+
214+
assert elements[0] == Title("RULES AND INSTRUCTIONS")
215+
assert elements[3].text.startswith("안녕하세요")
216+
217+
218+
def test_partition_image_raises_with_bad_strategy():
219+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
220+
with pytest.raises(ValueError):
221+
image.partition_image(filename=filename, strategy="fakeroo")

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.3-dev3" # pragma: no cover
1+
__version__ = "0.6.3" # pragma: no cover

Diff for: unstructured/partition/image.py

+43-12
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
from typing import List, Optional
22

3+
import pytesseract
4+
from PIL import Image
5+
36
from unstructured.documents.elements import Element
7+
from unstructured.partition.common import exactly_one
48
from unstructured.partition.pdf import partition_pdf_or_image
9+
from unstructured.partition.text import partition_text
10+
11+
VALID_STRATEGIES = ["hi_res", "ocr_only"]
512

613

714
def partition_image(
@@ -12,8 +19,10 @@ def partition_image(
1219
token: Optional[str] = None,
1320
include_page_breaks: bool = False,
1421
ocr_languages: str = "eng",
22+
strategy: str = "hi_res",
1523
) -> List[Element]:
1624
"""Parses an image into a list of interpreted elements.
25+
1726
Parameters
1827
----------
1928
filename
@@ -30,16 +39,38 @@ def partition_image(
3039
A string defining the authentication token for a self-host url, if applicable.
3140
ocr_languages
3241
The languages to use for the Tesseract agent. To use a language, you'll first need
33-
to isntall the appropriate Tesseract language pack.
42+
to install the appropriate Tesseract language pack.
43+
strategy
44+
The strategy to use for partitioning the image. Valid strategies are "hi_res" and
45+
"ocr_only". When using the "hi_res" strategy, the function uses a layout detection
46+
model to identify document elements. When using the "ocr_only" strategy,
47+
partition_image simply extracts the text from the document and processes it.
3448
"""
35-
if template is None:
36-
template = "layout/image"
37-
return partition_pdf_or_image(
38-
filename=filename,
39-
file=file,
40-
url=url,
41-
template=template,
42-
token=token,
43-
include_page_breaks=include_page_breaks,
44-
ocr_languages=ocr_languages,
45-
)
49+
exactly_one(filename=filename, file=file)
50+
51+
if strategy == "hi_res":
52+
if template is None:
53+
template = "layout/image"
54+
return partition_pdf_or_image(
55+
filename=filename,
56+
file=file,
57+
url=url,
58+
template=template,
59+
token=token,
60+
include_page_breaks=include_page_breaks,
61+
ocr_languages=ocr_languages,
62+
)
63+
64+
elif strategy == "ocr_only":
65+
if file is not None:
66+
image = Image.open(file)
67+
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
68+
else:
69+
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
70+
return partition_text(text=text)
71+
72+
else:
73+
raise ValueError(
74+
f"{strategy} is not a valid strategy for partition_image. "
75+
f"Choose one of {VALID_STRATEGIES}.",
76+
)

0 commit comments

Comments
 (0)