Feat: improve image extraction by supporting all types of image elements detected by detection models (#286)

christinestraub · web-flow · commit f35b83072f42 · 2023-11-16T14:16:15.000-08:00
Closes #285.

### Summary
- support extracting elements with types `Picture` and `Figure`
- add a class `ElementType` for the element type constants and use the constants to replace element type strings

### Testing
PDF: [algebra-graph-level1-1.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/13368976/algebra-graph-level1-1.pdf)

```
from unstructured_inference.inference.layout import DocumentLayout

doc = DocumentLayout.from_file(
    filename="algebra-graph-level1-1.pdf",
    extract_images_in_pdf=True,
)
```
diff --git a/.gitignore b/.gitignore
@@ -143,4 +143,5 @@ dmypy.json
 .vscode/
 
 sample-docs/*_images
-examples/**/output
+examples/**/output
+figures
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,7 @@
-## 0.7.13-dev1
+## 0.7.13
 
+* refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings
+* enhancement: support extracting elements with types `Picture` and `Figure`
 * fix: update logger in table initalization where the logger info was not showing
 * chore: supress UserWarning about specified model providers
 
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.13-dev1"  # pragma: no cover
+__version__ = "0.7.13"  # pragma: no cover
diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py
@@ -18,6 +18,25 @@ class Source(Enum):
     SUPER_GRADIENTS = "super-gradients"
 
 
+class ElementType:
+    IMAGE = "Image"
+    FIGURE = "Figure"
+    PICTURE = "Picture"
+    TABLE = "Table"
+    LIST = "List"
+    LIST_ITEM = "List-item"
+    FORMULA = "Formula"
+    CAPTION = "Caption"
+    PAGE_HEADER = "Page-header"
+    SECTION_HEADER = "Section-header"
+    PAGE_FOOTER = "Page-footer"
+    FOOTNOTE = "Footnote"
+    TITLE = "Title"
+    TEXT = "Text"
+    UNCATEGORIZED_TEXT = "UncategorizedText"
+    PAGE_BREAK = "PageBreak"
+
+
 FULL_PAGE_REGION_THRESHOLD = 0.99
 
 # this field is defined by pytesseract/unstructured.pytesseract
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -11,7 +11,7 @@
 from pdfminer.high_level import extract_pages
 from PIL import Image, ImageSequence
 
-from unstructured_inference.constants import Source
+from unstructured_inference.constants import ElementType, Source
 from unstructured_inference.inference.elements import (
     EmbeddedTextRegion,
     ImageTextRegion,
@@ -296,8 +296,9 @@ def extract_images(self, output_dir_path: Optional[str] = None):
         os.makedirs(output_dir_path, exist_ok=True)
 
         figure_number = 0
+        image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE]
         for el in self.elements:
-            if (el.bbox is None) or (el.type not in ["Image"]):
+            if (el.bbox is None) or (el.type not in image_element_types):
                 continue
 
             figure_number += 1
diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -12,6 +12,7 @@
 from unstructured_inference.config import inference_config
 from unstructured_inference.constants import (
     FULL_PAGE_REGION_THRESHOLD,
+    ElementType,
     Source,
 )
 from unstructured_inference.inference.elements import (
@@ -42,7 +43,7 @@ def extract_text(
             objects=objects,
             extract_tables=extract_tables,
         )
-        if extract_tables and self.type == "Table":
+        if extract_tables and self.type == ElementType.TABLE:
             self.text_as_html = interpret_table_block(self, image)
         return text
 
@@ -139,10 +140,10 @@ def merge_inferred_layout_with_extracted_layout(
                     subregion_threshold=subregion_threshold,
                 )
                 inferred_is_text = inferred_region.type not in (
-                    "Figure",
-                    "Image",
-                    "PageBreak",
-                    "Table",
+                    ElementType.FIGURE,
+                    ElementType.IMAGE,
+                    ElementType.PAGE_BREAK,
+                    ElementType.TABLE,
                 )
                 extracted_is_subregion_of_inferred = extracted_region.bbox.is_almost_subregion_of(
                     inferred_region.bbox,
@@ -169,7 +170,10 @@ def merge_inferred_layout_with_extracted_layout(
                         # keep inferred region, remove extracted region
                         grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox)
                         region_matched = True
-                elif either_region_is_subregion_of_other and inferred_region.type != "Table":
+                elif (
+                    either_region_is_subregion_of_other
+                    and inferred_region.type != ElementType.TABLE
+                ):
                     # keep extracted region, remove inferred region
                     inferred_regions_to_remove.append(inferred_region)
         if not region_matched:
@@ -178,7 +182,9 @@ def merge_inferred_layout_with_extracted_layout(
     categorized_extracted_elements_to_add = [
         LayoutElement(
             text=el.text,
-            type="Image" if isinstance(el, ImageTextRegion) else "UncategorizedText",
+            type=ElementType.IMAGE
+            if isinstance(el, ImageTextRegion)
+            else ElementType.UNCATEGORIZED_TEXT,
             source=el.source,
             bbox=el.bbox,
         )
diff --git a/unstructured_inference/models/detectron2.py b/unstructured_inference/models/detectron2.py
@@ -9,6 +9,7 @@
 from layoutparser.models.model_config import LayoutModelConfig
 from PIL import Image
 
+from unstructured_inference.constants import ElementType
 from unstructured_inference.inference.layoutelement import LayoutElement
 from unstructured_inference.logger import logger
 from unstructured_inference.models.unstructuredmodel import (
@@ -18,11 +19,11 @@
 
 DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
 DEFAULT_LABEL_MAP: Final[Dict[int, str]] = {
-    0: "Text",
-    1: "Title",
-    2: "List",
-    3: "Table",
-    4: "Figure",
+    0: ElementType.TEXT,
+    1: ElementType.TITLE,
+    2: ElementType.LIST,
+    3: ElementType.TABLE,
+    4: ElementType.FIGURE,
 }
 DEFAULT_EXTRA_CONFIG: Final[List[Any]] = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]
 
diff --git a/unstructured_inference/models/unstructuredmodel.py b/unstructured_inference/models/unstructuredmodel.py
@@ -6,6 +6,7 @@
 import numpy as np
 from PIL.Image import Image
 
+from unstructured_inference.constants import ElementType
 from unstructured_inference.inference.elements import (
     grow_region_to_match_region,
     intersections,
@@ -123,7 +124,9 @@ def enhance_regions(
         return elements
 
     @staticmethod
-    def clean_type(elements: List[LayoutElement], type_to_clean="Table") -> List[LayoutElement]:
+    def clean_type(
+        elements: List[LayoutElement], type_to_clean=ElementType.TABLE
+    ) -> List[LayoutElement]:
         """After this function, the list of elements will not contain any element inside
         of the type specified"""
         target_elements = [e for e in elements if e.type == type_to_clean]
diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py
@@ -12,23 +12,23 @@
 from onnxruntime.capi import _pybind_state as C
 from PIL import Image
 
-from unstructured_inference.constants import Source
+from unstructured_inference.constants import ElementType, Source
 from unstructured_inference.inference.layoutelement import LayoutElement
 from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
 from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
 
 YOLOX_LABEL_MAP = {
-    0: "Caption",
-    1: "Footnote",
-    2: "Formula",
-    3: "List-item",
-    4: "Page-footer",
-    5: "Page-header",
-    6: "Picture",
-    7: "Section-header",
-    8: "Table",
-    9: "Text",
-    10: "Title",
+    0: ElementType.CAPTION,
+    1: ElementType.FOOTNOTE,
+    2: ElementType.FORMULA,
+    3: ElementType.LIST_ITEM,
+    4: ElementType.PAGE_FOOTER,
+    5: ElementType.PAGE_HEADER,
+    6: ElementType.PICTURE,
+    7: ElementType.SECTION_HEADER,
+    8: ElementType.TABLE,
+    9: ElementType.TEXT,
+    10: ElementType.TITLE,
 }
 
 MODEL_TYPES = {

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.7.13-dev1" # pragma: no cover`
	`1`	`+__version__ = "0.7.13" # pragma: no cover`