enhancement: add ocr_only strategy for partition_image (#540)

MthwRobinson · qued · web-flow · commit 392cccdbf7a8 · 2023-05-04T20:23:51.000Z
* spike for ocr-only strategy for images

* fix for file processing

* extra space

* add korean to ci

* added test for ocr_only strategy

* added docs for ocr_only

* changelog and version

* added test for bad strategy

* skip korean test if in docker

* bump version

* version bump

* document valid strategies

* bump version for release

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -107,7 +107,7 @@ jobs:
         sudo apt-get update
         sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
         sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get install -y tesseract-ocr
+        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
         tesseract --version
         make test
         make check-coverage
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.6.3-dev3
+## 0.6.3
 
 ### Enhancements
 
+* Add an "ocr_only" strategy for `partition_image`.
+
 ### Features
 
 * Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -430,6 +430,20 @@ Examples:
   elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe")
 
 
+The default partitioning strategy for ``partition_image`` is ``"hi_res"``, which segments the document using
+``detectron2`` and then OCRs the document. You can also choose ``"ocr_only"`` as the partitioning strategy,
+which OCRs the document and then runs the output through ``partition_text``. This can be helpful
+if ``detectron2`` does not detect a text element in the image. To run the example below, ensure you
+have the Korean language pack for Tesseract installed on your system.
+
+
+.. code:: python
+
+  from unstructured.partition.image import partition_image
+
+  filename = "example-docs/english-and-korean.png"
+  elements = partition_image(filename=filename, ocr_languages="eng+kor", strategy="ocr_only")
+
 
 ``partition_email``
 ---------------------
diff --git a/example-docs/english-and-korean.png b/example-docs/english-and-korean.png
diff --git a/test_unstructured/partition/test_image.py b/test_unstructured/partition/test_image.py
@@ -1,12 +1,19 @@
+import os
+import pathlib
 from unittest import mock
 
 import pytest
 import requests
 from pytesseract import TesseractError
 from unstructured_inference.inference import layout
 
+from unstructured.documents.elements import Title
 from unstructured.partition import image, pdf
 
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+
+is_in_docker = os.path.exists("/.dockerenv")
+
 
 class MockResponse:
     def __init__(self, status_code, response):
@@ -178,3 +185,37 @@ def test_partition_image_from_file_with_language_passed(filename="example-docs/e
 def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"):
     with pytest.raises(TesseractError):
         image.partition_image(filename=filename, ocr_languages="fakeroo")
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_partition_image_with_ocr_detects_korean():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
+    elements = image.partition_image(
+        filename=filename,
+        ocr_languages="eng+kor",
+        strategy="ocr_only",
+    )
+
+    assert elements[0] == Title("RULES AND INSTRUCTIONS")
+    assert elements[3].text.startswith("안녕하세요")
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_partition_image_with_ocr_detects_korean_from_file():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
+
+    with open(filename, "rb") as f:
+        elements = image.partition_image(
+            file=f,
+            ocr_languages="eng+kor",
+            strategy="ocr_only",
+        )
+
+    assert elements[0] == Title("RULES AND INSTRUCTIONS")
+    assert elements[3].text.startswith("안녕하세요")
+
+
+def test_partition_image_raises_with_bad_strategy():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
+    with pytest.raises(ValueError):
+        image.partition_image(filename=filename, strategy="fakeroo")
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.3-dev3"  # pragma: no cover
+__version__ = "0.6.3"  # pragma: no cover
diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py
@@ -1,7 +1,14 @@
 from typing import List, Optional
 
+import pytesseract
+from PIL import Image
+
 from unstructured.documents.elements import Element
+from unstructured.partition.common import exactly_one
 from unstructured.partition.pdf import partition_pdf_or_image
+from unstructured.partition.text import partition_text
+
+VALID_STRATEGIES = ["hi_res", "ocr_only"]
 
 
 def partition_image(
@@ -12,8 +19,10 @@ def partition_image(
     token: Optional[str] = None,
     include_page_breaks: bool = False,
     ocr_languages: str = "eng",
+    strategy: str = "hi_res",
 ) -> List[Element]:
     """Parses an image into a list of interpreted elements.
+
     Parameters
     ----------
     filename
@@ -30,16 +39,38 @@ def partition_image(
         A string defining the authentication token for a self-host url, if applicable.
     ocr_languages
         The languages to use for the Tesseract agent. To use a language, you'll first need
-        to isntall the appropriate Tesseract language pack.
+        to install the appropriate Tesseract language pack.
+    strategy
+        The strategy to use for partitioning the image. Valid strategies are "hi_res" and
+        "ocr_only". When using the "hi_res" strategy, the function uses a layout detection
+        model to identify document elements. When using the "ocr_only" strategy,
+        partition_image simply extracts the text from the document and processes it.
     """
-    if template is None:
-        template = "layout/image"
-    return partition_pdf_or_image(
-        filename=filename,
-        file=file,
-        url=url,
-        template=template,
-        token=token,
-        include_page_breaks=include_page_breaks,
-        ocr_languages=ocr_languages,
-    )
+    exactly_one(filename=filename, file=file)
+
+    if strategy == "hi_res":
+        if template is None:
+            template = "layout/image"
+        return partition_pdf_or_image(
+            filename=filename,
+            file=file,
+            url=url,
+            template=template,
+            token=token,
+            include_page_breaks=include_page_breaks,
+            ocr_languages=ocr_languages,
+        )
+
+    elif strategy == "ocr_only":
+        if file is not None:
+            image = Image.open(file)
+            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
+        else:
+            text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
+        return partition_text(text=text)
+
+    else:
+        raise ValueError(
+            f"{strategy} is not a valid strategy for partition_image. "
+            f"Choose one of {VALID_STRATEGIES}.",
+        )

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.6.3-dev3" # pragma: no cover`
	`1`	`+__version__ = "0.6.3" # pragma: no cover`