feat: allow users to pass OCR language into partition (#509)

MthwRobinson · web-flow · commit 6874df91ef20 · 2023-04-21T13:41:26.000Z
* pip-compile new reqs

* bump inference version

* add language to pdf and image calls

* tests for passing in language

* version bump and changelog

* update docs

* pass ocr_languages in auto

* updated test fixtures

* typo in doc string
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,13 @@
-## 0.5.14-dev0
+## 0.5.14-dev1
 
 ### Enhancements
 
 * Adds an `ssl_verify` kwarg to `partition` and `partition_html` to enable turning off
   SSL verification for HTTP requests. SSL verification is on by default.
+* Allows users to pass in OCR languages to `partition_pdf` and `partition_image` through
+  the `ocr_languages` kwarg. `ocr_languages` corresponds to the code for the language pack
+  in Tesseract. You will need to install the relevant Tesseract language pack to use a
+  given language.
 
 ### Features
 
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -283,6 +283,10 @@ The ``strategy`` kwarg controls the method that will be used to process the PDF.
 will identify the layout of the document using ``detectron2``. The ``"fast"`` strategy will extract the
 text using ``pdfminer`` and process the raw text with ``partition_text``. If ``detectron2`` is not available,
 and the ``"hi_res"`` strategy is set, ``partition_pdf`` will fallback to the ``"fast"`` strategy.
+You can also specify what languages to use for OCR with the ``ocr_languages`` kwarg. For example,
+use ``ocr_languages="eng+deu"`` to use the English and German language packs. See the
+`Tesseract documentation <https://github.com/tesseract-ocr/tessdata>`_ for a full list of languages and
+install instructions. OCR is only applied if the text is not already available in the PDF document.
 
 Examples:
 
@@ -293,13 +297,22 @@ Examples:
   # Returns a List[Element] present in the pages of the parsed pdf document
   elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
 
+  # Applies the English and Swedish language packs for OCR. OCR is only applied
+  # if the text is not available in the PDF.
+  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", ocr_languages="eng+swe")
+
 
 ``partition_image``
 ---------------------
 
 The ``partition_image`` function has the same API as ``partition_pdf``, which is documented above.
 The only difference is that ``partition_image`` does not need to convert a PDF to an image
 prior to processing. The ``partition_image`` function supports ``.png`` and ``.jpg`` files.
+You can also specify what languages to use for OCR with the ``ocr_languages`` kwarg. For example,
+use ``ocr_languages="eng+deu"`` to use the English and German language packs. See the
+`Tesseract documentation <https://github.com/tesseract-ocr/tessdata>`_ for a full list of languages and
+install instructions.
+
 
 Examples:
 
@@ -310,6 +323,9 @@ Examples:
   # Returns a List[Element] present in the pages of the parsed image document
   elements = partition_image("example-docs/layout-parser-paper-fast.jpg")
 
+  # Applies the English and Swedish language packs for OCR
+  elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe")
+
 
 
 ``partition_email``
diff --git a/requirements/local-inference.txt b/requirements/local-inference.txt
@@ -268,7 +268,7 @@ typing-extensions==4.5.0
     #   rich
     #   starlette
     #   torch
-unstructured-inference==0.3.2
+unstructured-inference==0.4.1
     # via unstructured (setup.py)
 urllib3==1.26.15
     # via requests
diff --git a/setup.py b/setup.py
@@ -76,7 +76,7 @@
             "transformers",
         ],
         "local-inference": [
-            "unstructured-inference==0.3.2",
+            "unstructured-inference>=0.4.1",
         ],
         "s3": ["s3fs", "fsspec"],
         "azure": ["adlfs", "fsspec"],
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -282,6 +282,7 @@ def test_auto_partition_pdf_with_fast_strategy():
         include_page_breaks=False,
         encoding="utf-8",
         strategy="fast",
+        ocr_languages="eng",
     )
 
 
diff --git a/test_unstructured/partition/test_image.py b/test_unstructured/partition/test_image.py
@@ -2,6 +2,7 @@
 
 import pytest
 import requests
+from pytesseract import TesseractError
 from unstructured_inference.inference import layout
 
 from unstructured.partition import image, pdf
@@ -157,3 +158,23 @@ def test_partition_image(url, api_called, local_called):
         image.partition_image(filename="fake.pdf", url=url)
         assert pdf._partition_via_api.called == api_called
         assert pdf._partition_pdf_or_image_local.called == local_called
+
+
+def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
+    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_partition:
+        image.partition_image(filename=filename, ocr_languages="eng+swe")
+
+    assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
+
+
+def test_partition_image_from_file_with_language_passed(filename="example-docs/example.jpg"):
+    with mock.patch.object(layout, "process_data_with_model", mock.MagicMock()) as mock_partition:
+        with open(filename, "rb") as f:
+            image.partition_image(file=f, ocr_languages="eng+swe")
+
+    assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
+
+
+def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"):
+    with pytest.raises(TesseractError):
+        image.partition_image(filename=filename, ocr_languages="fakeroo")
diff --git a/test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/2023-Jan-economic-outlook.pdf.json
@@ -224,8 +224,8 @@
     }
   },
   {
-    "element_id": "0953470500eb215048fd49263b8829a4",
-    "text": "Forces Shaping the Outlook",
+    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
+    "text": "",
     "type": "Title",
     "metadata": {
       "page_number": 2
diff --git a/test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/Silent-Giant-(1).pdf.json
@@ -1,7 +1,7 @@
 [
   {
-    "element_id": "ea216492b46010685b4a036fe66de211",
-    "text": "WORLD NUCLEARASSOCIATION",
+    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
+    "text": "",
     "type": "Title",
     "metadata": {
       "page_number": 1
@@ -24,8 +24,8 @@
     }
   },
   {
-    "element_id": "53d548aa01fc3eb72da15a5be7f235e2",
-    "text": "Executive Summary",
+    "element_id": "8e76a94ac8320d515375e625bef18292",
+    "text": "Summary",
     "type": "Title",
     "metadata": {
       "page_number": 3
@@ -248,8 +248,8 @@
     }
   },
   {
-    "element_id": "3655eec20e80973efc46cc09db7a04ba",
-    "text": "Moving to a sustainable future",
+    "element_id": "ff4a9b34d6cdebbc9b8afbf9767f6e1c",
+    "text": "to a sustainable future",
     "type": "Title",
     "metadata": {
       "page_number": 6
diff --git a/test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/recalibrating-risk-report.pdf.json
@@ -1,7 +1,7 @@
 [
   {
-    "element_id": "ea216492b46010685b4a036fe66de211",
-    "text": "WORLD NUCLEARASSOCIATION",
+    "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
+    "text": "",
     "type": "Title",
     "metadata": {
       "page_number": 1
@@ -24,8 +24,8 @@
     }
   },
   {
-    "element_id": "53d548aa01fc3eb72da15a5be7f235e2",
-    "text": "Executive Summary",
+    "element_id": "8e76a94ac8320d515375e625bef18292",
+    "text": "Summary",
     "type": "Title",
     "metadata": {
       "page_number": 3
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.14-dev0"  # pragma: no cover
+__version__ = "0.5.14-dev1"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -34,6 +34,7 @@ def partition(
     paragraph_grouper: Optional[Callable[[str], str]] = None,
     headers: Dict[str, str] = {},
     ssl_verify: bool = True,
+    ocr_languages: str = "eng",
 ):
     """Partitions a document into its constituent elements. Will use libmagic to determine
     the file's type and route it to the appropriate partitioning function. Applies the default
@@ -66,6 +67,9 @@ def partition(
     ssl_verify
         If the URL parameter is set, determines whether or not partition uses SSL verification
         in the HTTP request.
+    ocr_languages
+        The languages to use for the Tesseract agent. To use a language, you'll first need
+        to install the appropriate Tesseract language pack.
     """
     exactly_one(file=file, filename=filename, url=url)
 
@@ -127,13 +131,15 @@ def partition(
             include_page_breaks=include_page_breaks,
             encoding=encoding,
             strategy=strategy,
+            ocr_languages=ocr_languages,
         )
     elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
         elements = partition_image(
             filename=filename,  # type: ignore
             file=file,  # type: ignore
             url=None,
             include_page_breaks=include_page_breaks,
+            ocr_languages=ocr_languages,
         )
     elif filetype == FileType.TXT:
         elements = partition_text(
diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py
@@ -11,6 +11,7 @@ def partition_image(
     template: Optional[str] = None,
     token: Optional[str] = None,
     include_page_breaks: bool = False,
+    ocr_languages: str = "eng",
 ) -> List[Element]:
     """Parses an image into a list of interpreted elements.
     Parameters
@@ -27,6 +28,9 @@ def partition_image(
         be used.
     token
         A string defining the authentication token for a self-host url, if applicable.
+    ocr_languages
+        The languages to use for the Tesseract agent. To use a language, you'll first need
+        to install the appropriate Tesseract language pack.
     """
     if template is None:
         template = "layout/image"
@@ -37,4 +41,5 @@ def partition_image(
         template=template,
         token=token,
         include_page_breaks=include_page_breaks,
+        ocr_languages=ocr_languages,
     )
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -23,6 +23,7 @@ def partition_pdf(
     include_page_breaks: bool = False,
     strategy: str = "hi_res",
     encoding: str = "utf-8",
+    ocr_languages: str = "eng",
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
     Parameters
@@ -45,6 +46,9 @@ def partition_pdf(
         and processes it.
     encoding
         The encoding method used to decode the text input. If None, utf-8 will be used.
+    ocr_languages
+        The languages to use for the Tesseract agent. To use a language, you'll first need
+        to install the appropriate Tesseract language pack.
     """
     exactly_one(filename=filename, file=file)
     return partition_pdf_or_image(
@@ -56,6 +60,7 @@ def partition_pdf(
         include_page_breaks=include_page_breaks,
         strategy=strategy,
         encoding=encoding,
+        ocr_languages=ocr_languages,
     )
 
 
@@ -69,6 +74,7 @@ def partition_pdf_or_image(
     include_page_breaks: bool = False,
     strategy: str = "hi_res",
     encoding: str = "utf-8",
+    ocr_languages: str = "eng",
 ) -> List[Element]:
     """Parses a pdf or image document into a list of interpreted elements."""
     if url is None:
@@ -103,6 +109,7 @@ def partition_pdf_or_image(
                     template=out_template,
                     is_image=is_image,
                     include_page_breaks=True,
+                    ocr_languages=ocr_languages,
                 )
 
         elif strategy == "fast" or fallback_to_fast:
@@ -152,6 +159,7 @@ def _partition_pdf_or_image_local(
     template: Optional[str] = None,
     is_image: bool = False,
     include_page_breaks: bool = False,
+    ocr_languages: str = "eng",
 ) -> List[Element]:
     """Partition using package installed locally."""
     try:
@@ -174,11 +182,20 @@ def _partition_pdf_or_image_local(
             "running make install-local-inference from the root directory of the repository.",
         ) from e
 
-    layout = (
-        process_file_with_model(filename, template, is_image=is_image)
-        if file is None
-        else process_data_with_model(file, template, is_image=is_image)
-    )
+    if file is None:
+        layout = process_file_with_model(
+            filename,
+            template,
+            is_image=is_image,
+            ocr_languages=ocr_languages,
+        )
+    else:
+        layout = process_data_with_model(
+            file,
+            template,
+            is_image=is_image,
+            ocr_languages=ocr_languages,
+        )
 
     return document_to_element_list(layout, include_page_breaks=include_page_breaks)
 

Original file line number	Diff line number	Diff line change
`@@ -282,6 +282,7 @@ def test_auto_partition_pdf_with_fast_strategy():`
`282`	`282`	`include_page_breaks=False,`
`283`	`283`	`encoding="utf-8",`
`284`	`284`	`strategy="fast",`
	`285`	`+ ocr_languages="eng",`
`285`	`286`	`)`
`286`	`287`
`287`	`288`
Original file line number	Diff line number	Diff line change
`@@ -224,8 +224,8 @@`
`224`	`224`	`}`
`225`	`225`	`},`
`226`	`226`	`{`
`227`		`- "element_id": "0953470500eb215048fd49263b8829a4",`
`228`		`- "text": "Forces Shaping the Outlook",`
	`227`	`+ "element_id": "e3b0c44298fc1c149afbf4c8996fb924",`
	`228`	`+ "text": "",`
`229`	`229`	`"type": "Title",`
`230`	`230`	`"metadata": {`
`231`	`231`	`"page_number": 2`
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`[`
`2`	`2`	`{`
`3`		`- "element_id": "ea216492b46010685b4a036fe66de211",`
`4`		`- "text": "WORLD NUCLEARASSOCIATION",`
	`3`	`+ "element_id": "e3b0c44298fc1c149afbf4c8996fb924",`
	`4`	`+ "text": "",`
`5`	`5`	`"type": "Title",`
`6`	`6`	`"metadata": {`
`7`	`7`	`"page_number": 1`
`@@ -24,8 +24,8 @@`
`24`	`24`	`}`
`25`	`25`	`},`
`26`	`26`	`{`
`27`		`- "element_id": "53d548aa01fc3eb72da15a5be7f235e2",`
`28`		`- "text": "Executive Summary",`
	`27`	`+ "element_id": "8e76a94ac8320d515375e625bef18292",`
	`28`	`+ "text": "Summary",`
`29`	`29`	`"type": "Title",`
`30`	`30`	`"metadata": {`
`31`	`31`	`"page_number": 3`
`@@ -248,8 +248,8 @@`
`248`	`248`	`}`
`249`	`249`	`},`
`250`	`250`	`{`
`251`		`- "element_id": "3655eec20e80973efc46cc09db7a04ba",`
`252`		`- "text": "Moving to a sustainable future",`
	`251`	`+ "element_id": "ff4a9b34d6cdebbc9b8afbf9767f6e1c",`
	`252`	`+ "text": "to a sustainable future",`
`253`	`253`	`"type": "Title",`
`254`	`254`	`"metadata": {`
`255`	`255`	`"page_number": 6`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.14-dev0" # pragma: no cover`
	`1`	`+__version__ = "0.5.14-dev1" # pragma: no cover`