Skip to content

Commit 6874df9

Browse files
authored
feat: allow users to pass OCR language into partition (#509)
* pip-compile new reqs * bump inference version * add language to pdf and image calls * tests for passing in language * version bump and changelog * update docs * pass ocr_languages in auto * updated test fixtures * typo in doc string
1 parent db2f70d commit 6874df9

File tree

13 files changed

+91
-21
lines changed

13 files changed

+91
-21
lines changed

Diff for: CHANGELOG.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1-
## 0.5.14-dev0
1+
## 0.5.14-dev1
22

33
### Enhancements
44

55
* Adds an `ssl_verify` kwarg to `partition` and `partition_html` to enable turning off
66
SSL verification for HTTP requests. SSL verification is on by default.
7+
* Allows users to pass in ocr language to `partition_pdf` and `partition_image` through
8+
the `ocr_languages` kwarg. `ocr_languages` corresponds to the code for the language pack
9+
in Tesseract. You will need to install the relevant Tesseract language pack to use a
10+
given language.
711

812
### Features
913

Diff for: docs/source/bricks.rst

+16
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,10 @@ The ``strategy`` kwarg controls the method that will be used to process the PDF.
283283
will identify the layout of the document using ``detectron2``. The ``"fast"`` strategy will extract the
284284
text using ``pdfminer`` and process the raw text with ``partition_text``. If ``detectron2`` is not available,
285285
and the ``"hi_res"`` strategy is set, ``partition_pdf`` will fall back to the ``"fast"`` strategy.
286+
You can also specify what languages to use for OCR with the ``ocr_languages`` kwarg. For example,
287+
use ``ocr_languages="eng+deu"`` to use the English and German language packs. See the
288+
`Tesseract documentation <https://github.com/tesseract-ocr/tessdata>`_ for a full list of languages and
289+
install instructions. OCR is only applied if the text is not already available in the PDF document.
286290

287291
Examples:
288292

@@ -293,13 +297,22 @@ Examples:
293297
# Returns a List[Element] present in the pages of the parsed pdf document
294298
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
295299
300+
# Applies the English and Swedish language pack for ocr. OCR is only applied
301+
# if the text is not available in the PDF.
302+
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", ocr_languages="eng+swe")
303+
296304
297305
``partition_image``
298306
---------------------
299307

300308
The ``partition_image`` function has the same API as ``partition_pdf``, which is documented above.
301309
The only difference is that ``partition_image`` does not need to convert a PDF to an image
302310
prior to processing. The ``partition_image`` function supports ``.png`` and ``.jpg`` files.
311+
You can also specify what languages to use for OCR with the ``ocr_languages`` kwarg. For example,
312+
use ``ocr_languages="eng+deu"`` to use the English and German language packs. See the
313+
`Tesseract documentation <https://github.com/tesseract-ocr/tessdata>`_ for a full list of languages and
314+
install instructions.
315+
303316

304317
Examples:
305318

@@ -310,6 +323,9 @@ Examples:
310323
# Returns a List[Element] present in the pages of the parsed image document
311324
elements = partition_image("example-docs/layout-parser-paper-fast.jpg")
312325
326+
# Applies the English and Swedish language pack for ocr
327+
elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe")
328+
313329
314330
315331
``partition_email``

Diff for: requirements/local-inference.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ typing-extensions==4.5.0
268268
# rich
269269
# starlette
270270
# torch
271-
unstructured-inference==0.3.2
271+
unstructured-inference==0.4.1
272272
# via unstructured (setup.py)
273273
urllib3==1.26.15
274274
# via requests

Diff for: setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
"transformers",
7777
],
7878
"local-inference": [
79-
"unstructured-inference==0.3.2",
79+
"unstructured-inference>=0.4.1",
8080
],
8181
"s3": ["s3fs", "fsspec"],
8282
"azure": ["adlfs", "fsspec"],

Diff for: test_unstructured/partition/test_auto.py

+1
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ def test_auto_partition_pdf_with_fast_strategy():
282282
include_page_breaks=False,
283283
encoding="utf-8",
284284
strategy="fast",
285+
ocr_languages="eng",
285286
)
286287

287288

Diff for: test_unstructured/partition/test_image.py

+21
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import pytest
44
import requests
5+
from pytesseract import TesseractError
56
from unstructured_inference.inference import layout
67

78
from unstructured.partition import image, pdf
@@ -157,3 +158,23 @@ def test_partition_image(url, api_called, local_called):
157158
image.partition_image(filename="fake.pdf", url=url)
158159
assert pdf._partition_via_api.called == api_called
159160
assert pdf._partition_pdf_or_image_local.called == local_called
161+
162+
163+
def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
164+
with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_partition:
165+
image.partition_image(filename=filename, ocr_languages="eng+swe")
166+
167+
assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
168+
169+
170+
def test_partition_image_from_file_with_language_passed(filename="example-docs/example.jpg"):
171+
with mock.patch.object(layout, "process_data_with_model", mock.MagicMock()) as mock_partition:
172+
with open(filename, "rb") as f:
173+
image.partition_image(file=f, ocr_languages="eng+swe")
174+
175+
assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
176+
177+
178+
def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"):
179+
with pytest.raises(TesseractError):
180+
image.partition_image(filename=filename, ocr_languages="fakeroo")

Diff for: test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/2023-Jan-economic-outlook.pdf.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,8 @@
224224
}
225225
},
226226
{
227-
"element_id": "0953470500eb215048fd49263b8829a4",
228-
"text": "Forces Shaping the Outlook",
227+
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
228+
"text": "",
229229
"type": "Title",
230230
"metadata": {
231231
"page_number": 2

Diff for: test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/Silent-Giant-(1).pdf.json

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[
22
{
3-
"element_id": "ea216492b46010685b4a036fe66de211",
4-
"text": "WORLD NUCLEARASSOCIATION",
3+
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
4+
"text": "",
55
"type": "Title",
66
"metadata": {
77
"page_number": 1
@@ -24,8 +24,8 @@
2424
}
2525
},
2626
{
27-
"element_id": "53d548aa01fc3eb72da15a5be7f235e2",
28-
"text": "Executive Summary",
27+
"element_id": "8e76a94ac8320d515375e625bef18292",
28+
"text": "Summary",
2929
"type": "Title",
3030
"metadata": {
3131
"page_number": 3
@@ -248,8 +248,8 @@
248248
}
249249
},
250250
{
251-
"element_id": "3655eec20e80973efc46cc09db7a04ba",
252-
"text": "Moving to a sustainable future",
251+
"element_id": "ff4a9b34d6cdebbc9b8afbf9767f6e1c",
252+
"text": "to a sustainable future",
253253
"type": "Title",
254254
"metadata": {
255255
"page_number": 6

Diff for: test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/recalibrating-risk-report.pdf.json

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[
22
{
3-
"element_id": "ea216492b46010685b4a036fe66de211",
4-
"text": "WORLD NUCLEARASSOCIATION",
3+
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
4+
"text": "",
55
"type": "Title",
66
"metadata": {
77
"page_number": 1
@@ -24,8 +24,8 @@
2424
}
2525
},
2626
{
27-
"element_id": "53d548aa01fc3eb72da15a5be7f235e2",
28-
"text": "Executive Summary",
27+
"element_id": "8e76a94ac8320d515375e625bef18292",
28+
"text": "Summary",
2929
"type": "Title",
3030
"metadata": {
3131
"page_number": 3

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.14-dev0" # pragma: no cover
1+
__version__ = "0.5.14-dev1" # pragma: no cover

Diff for: unstructured/partition/auto.py

+6
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def partition(
3434
paragraph_grouper: Optional[Callable[[str], str]] = None,
3535
headers: Dict[str, str] = {},
3636
ssl_verify: bool = True,
37+
ocr_languages: str = "eng",
3738
):
3839
"""Partitions a document into its constituent elements. Will use libmagic to determine
3940
the file's type and route it to the appropriate partitioning function. Applies the default
@@ -66,6 +67,9 @@ def partition(
6667
ssl_verify
6768
If the URL parameter is set, determines whether or not partition uses SSL verification
6869
in the HTTP request.
70+
ocr_languages
71+
The languages to use for the Tesseract agent. To use a language, you'll first need
72+
to install the appropriate Tesseract language pack.
6973
"""
7074
exactly_one(file=file, filename=filename, url=url)
7175

@@ -127,13 +131,15 @@ def partition(
127131
include_page_breaks=include_page_breaks,
128132
encoding=encoding,
129133
strategy=strategy,
134+
ocr_languages=ocr_languages,
130135
)
131136
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
132137
elements = partition_image(
133138
filename=filename, # type: ignore
134139
file=file, # type: ignore
135140
url=None,
136141
include_page_breaks=include_page_breaks,
142+
ocr_languages=ocr_languages,
137143
)
138144
elif filetype == FileType.TXT:
139145
elements = partition_text(

Diff for: unstructured/partition/image.py

+5
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ def partition_image(
1111
template: Optional[str] = None,
1212
token: Optional[str] = None,
1313
include_page_breaks: bool = False,
14+
ocr_languages: str = "eng",
1415
) -> List[Element]:
1516
"""Parses an image into a list of interpreted elements.
1617
Parameters
@@ -27,6 +28,9 @@ def partition_image(
2728
be used.
2829
token
2930
A string defining the authentication token for a self-host url, if applicable.
31+
ocr_languages
32+
The languages to use for the Tesseract agent. To use a language, you'll first need
33+
to install the appropriate Tesseract language pack.
3034
"""
3135
if template is None:
3236
template = "layout/image"
@@ -37,4 +41,5 @@ def partition_image(
3741
template=template,
3842
token=token,
3943
include_page_breaks=include_page_breaks,
44+
ocr_languages=ocr_languages,
4045
)

Diff for: unstructured/partition/pdf.py

+22-5
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def partition_pdf(
2323
include_page_breaks: bool = False,
2424
strategy: str = "hi_res",
2525
encoding: str = "utf-8",
26+
ocr_languages: str = "eng",
2627
) -> List[Element]:
2728
"""Parses a pdf document into a list of interpreted elements.
2829
Parameters
@@ -45,6 +46,9 @@ def partition_pdf(
4546
and processes it.
4647
encoding
4748
The encoding method used to decode the text input. If None, utf-8 will be used.
49+
ocr_languages
50+
The languages to use for the Tesseract agent. To use a language, you'll first need
51+
to install the appropriate Tesseract language pack.
4852
"""
4953
exactly_one(filename=filename, file=file)
5054
return partition_pdf_or_image(
@@ -56,6 +60,7 @@ def partition_pdf(
5660
include_page_breaks=include_page_breaks,
5761
strategy=strategy,
5862
encoding=encoding,
63+
ocr_languages=ocr_languages,
5964
)
6065

6166

@@ -69,6 +74,7 @@ def partition_pdf_or_image(
6974
include_page_breaks: bool = False,
7075
strategy: str = "hi_res",
7176
encoding: str = "utf-8",
77+
ocr_languages: str = "eng",
7278
) -> List[Element]:
7379
"""Parses a pdf or image document into a list of interpreted elements."""
7480
if url is None:
@@ -103,6 +109,7 @@ def partition_pdf_or_image(
103109
template=out_template,
104110
is_image=is_image,
105111
include_page_breaks=True,
112+
ocr_languages=ocr_languages,
106113
)
107114

108115
elif strategy == "fast" or fallback_to_fast:
@@ -152,6 +159,7 @@ def _partition_pdf_or_image_local(
152159
template: Optional[str] = None,
153160
is_image: bool = False,
154161
include_page_breaks: bool = False,
162+
ocr_languages: str = "eng",
155163
) -> List[Element]:
156164
"""Partition using package installed locally."""
157165
try:
@@ -174,11 +182,20 @@ def _partition_pdf_or_image_local(
174182
"running make install-local-inference from the root directory of the repository.",
175183
) from e
176184

177-
layout = (
178-
process_file_with_model(filename, template, is_image=is_image)
179-
if file is None
180-
else process_data_with_model(file, template, is_image=is_image)
181-
)
185+
if file is None:
186+
layout = process_file_with_model(
187+
filename,
188+
template,
189+
is_image=is_image,
190+
ocr_languages=ocr_languages,
191+
)
192+
else:
193+
layout = process_data_with_model(
194+
file,
195+
template,
196+
is_image=is_image,
197+
ocr_languages=ocr_languages,
198+
)
182199

183200
return document_to_element_list(layout, include_page_breaks=include_page_breaks)
184201

0 commit comments

Comments
 (0)