refactor: remove code related to embedded text extraction (#349)

christinestraub · web-flow · commit 81549a7e24b6 · 2024-05-21T12:44:08.000-07:00
This PR removes all code related to filling inferred elements' text from embedded text (`pdfminer`). It is the first part of moving embedded-text-related code from `unstructured-inference` to `unstructured`, and it works together with Unstructured-IO/unstructured#3061.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,6 @@
-## 0.7.32-dev1
+## 0.7.32
 
+* refactor: remove all code related to filling inferred elements text from embedded text (pdfminer). 
 * bug: set the Chipper max_length variable
 
 ## 0.7.31
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -312,16 +312,6 @@ def test_from_image_file_raises_isadirectoryerror_with_dir():
         layout.DocumentLayout.from_image_file(tempdir)
 
 
-@pytest.mark.parametrize("idx", range(2))
-def test_get_elements_from_layout(mock_initial_layout, idx):
-    page = MockPageLayout()
-    block = mock_initial_layout[idx]
-    block.bbox.pad(3)
-    fixed_layout = [block]
-    elements = page.get_elements_from_layout(fixed_layout)
-    assert elements[0].text == block.text
-
-
 def test_page_numbers_in_page_objects():
     with patch(
         "unstructured_inference.inference.layout.PageLayout.get_elements_with_detection_model",
@@ -331,40 +321,6 @@ def test_page_numbers_in_page_objects():
         assert [page.number for page in doc.pages] == list(range(1, len(doc.pages) + 1))
 
 
-@pytest.mark.parametrize(
-    ("fixed_layouts", "called_method", "not_called_method"),
-    [
-        (
-            [MockLayout()],
-            "get_elements_from_layout",
-            "get_elements_with_detection_model",
-        ),
-        (None, "get_elements_with_detection_model", "get_elements_from_layout"),
-    ],
-)
-def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method):
-    with patch.object(
-        layout.PageLayout,
-        "get_elements_with_detection_model",
-        return_value=[],
-    ), patch.object(
-        layout.PageLayout,
-        "get_elements_from_layout",
-        return_value=[],
-    ):
-        layout.DocumentLayout.from_file("sample-docs/loremipsum.pdf", fixed_layouts=fixed_layouts)
-        getattr(layout.PageLayout, called_method).assert_called()
-        getattr(layout.PageLayout, not_called_method).assert_not_called()
-
-
-@pytest.mark.parametrize(
-    ("text", "expected"),
-    [("c\to\x0cn\ftrol\ncharacter\rs\b", "control characters"), ("\"'\\", "\"'\\")],
-)
-def test_remove_control_characters(text, expected):
-    assert elements.remove_control_characters(text) == expected
-
-
 no_text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100)
 text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100, text="test")
 overlapping_rect = ImageTextRegion.from_coords(50, 50, 150, 150)
@@ -417,12 +373,6 @@ def check_annotated_image():
         check_annotated_image()
 
 
-@pytest.mark.parametrize(("text", "expected"), [("asdf", "asdf"), (None, "")])
-def test_embedded_text_region(text, expected):
-    etr = elements.EmbeddedTextRegion.from_coords(0, 0, 24, 24, text=text)
-    assert etr.extract_text(objects=None) == expected
-
-
 class MockDetectionModel(layout.UnstructuredObjectDetectionModel):
     def initialize(self, *args, **kwargs):
         pass
diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py
@@ -5,18 +5,6 @@
 from unstructured_inference.inference.layoutelement import LayoutElement, TextRegion
 
 
-def test_layout_element_extract_text(
-    mock_layout_element,
-    mock_text_region,
-):
-    extracted_text = mock_layout_element.extract_text(
-        objects=[mock_text_region],
-    )
-
-    assert isinstance(extracted_text, str)
-    assert "Sample text" in extracted_text
-
-
 def test_layout_element_do_dict(mock_layout_element):
     expected = {
         "coordinates": ((100, 100), (100, 300), (300, 300), (300, 100)),
diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py
@@ -272,16 +272,3 @@ def test_merge_inferred_layout_with_extracted_layout():
     assert merged_layout[0].text == "Example Section Header"
     assert merged_layout[1].type == ElementType.TEXT
     assert merged_layout[1].text == "Example Title"
-
-
-def test_aggregate_by_block():
-    expected = "Inside region1 Inside region2"
-    embedded_regions = [
-        TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
-        TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
-        TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
-    ]
-    target_region = TextRegion.from_coords(0, 0, 300, 300)
-
-    text = elements.aggregate_by_block(target_region, embedded_regions)
-    assert text == expected
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.32-dev1"  # pragma: no cover
+__version__ = "0.7.32"  # pragma: no cover
diff --git a/unstructured_inference/config.py b/unstructured_inference/config.py
@@ -92,16 +92,6 @@ def LAYOUT_SUBREGION_THRESHOLD(self) -> float:
         """
         return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75)
 
-    @property
-    def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
-        """threshold to determine if an embedded region is a sub-region of a given block
-        when aggregating the text from embedded elements that lie within the given block
-
-        When the intersection region area divided by self area is larger than this threshold self is
-        considered a subregion of the other
-        """
-        return self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.99)
-
     @property
     def ELEMENTS_H_PADDING_COEF(self) -> float:
         """When extending the boundaries of a PDF object for the purpose of determining which other
diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py
@@ -1,13 +1,11 @@
 from __future__ import annotations
 
-import unicodedata
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Collection, Optional, Union
+from typing import Optional, Union
 
 import numpy as np
 
-from unstructured_inference.config import inference_config
 from unstructured_inference.constants import Source
 from unstructured_inference.math import safe_division
 
@@ -184,21 +182,6 @@ class TextRegion:
     def __str__(self) -> str:
         return str(self.text)
 
-    def extract_text(
-        self,
-        objects: Optional[Collection[TextRegion]],
-    ) -> str:
-        """Extracts text contained in region."""
-        if self.text is not None:
-            # If block text is already populated, we'll assume it's correct
-            text = self.text
-        elif objects is not None:
-            text = aggregate_by_block(self, objects)
-        else:
-            text = ""
-        cleaned_text = remove_control_characters(text)
-        return cleaned_text
-
     @classmethod
     def from_coords(
         cls,
@@ -217,54 +200,11 @@ def from_coords(
 
 
 class EmbeddedTextRegion(TextRegion):
-    def extract_text(
-        self,
-        objects: Optional[Collection[TextRegion]],
-    ) -> str:
-        """Extracts text contained in region."""
-        if self.text is None:
-            return ""
-        else:
-            return self.text
+    pass
 
 
 class ImageTextRegion(TextRegion):
-    def extract_text(
-        self,
-        objects: Optional[Collection[TextRegion]],
-    ) -> str:
-        """Extracts text contained in region."""
-        if self.text is None:
-            return ""
-        else:
-            return super().extract_text(objects)
-
-
-def aggregate_by_block(
-    text_region: TextRegion,
-    pdf_objects: Collection[TextRegion],
-) -> str:
-    """Extracts the text aggregated from the elements of the given layout that lie within the given
-    block."""
-
-    subregion_threshold = inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
-    filtered_blocks = [
-        obj
-        for obj in pdf_objects
-        if obj.bbox.is_almost_subregion_of(text_region.bbox, subregion_threshold)
-    ]
-    text = " ".join([x.text for x in filtered_blocks if x.text])
-    return text
-
-
-def remove_control_characters(text: str) -> str:
-    """Removes control characters from text."""
-
-    # Replace newline character with a space
-    text = text.replace("\n", " ")
-    # Remove other control characters
-    out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C")
-    return out_text
+    pass
 
 
 def region_bounding_boxes_are_almost_the_same(
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -15,10 +15,8 @@
 from unstructured_inference.inference.layoutelement import (
     LayoutElement,
 )
-from unstructured_inference.inference.ordering import order_layout
 from unstructured_inference.logger import logger
 from unstructured_inference.models.base import get_model
-from unstructured_inference.models.chipper import UnstructuredChipperModel
 from unstructured_inference.models.unstructuredmodel import (
     UnstructuredElementExtractionModel,
     UnstructuredObjectDetectionModel,
@@ -201,29 +199,6 @@ def get_elements_with_detection_model(
 
         return inferred_layout
 
-    def get_elements_from_layout(
-        self,
-        layout: List[TextRegion],
-        pdf_objects: Optional[List[TextRegion]] = None,
-    ) -> List[LayoutElement]:
-        """Uses the given Layout to separate the page text into elements, either extracting the
-        text from the discovered layout blocks."""
-
-        # If the model is a chipper model, we don't want to order the
-        # elements, as they are already ordered
-        order_elements = not isinstance(self.detection_model, UnstructuredChipperModel)
-        if order_elements:
-            layout = order_layout(layout)
-
-        elements = [
-            get_element_from_block(
-                block=e,
-                pdf_objects=pdf_objects,
-            )
-            for e in layout
-        ]
-        return elements
-
     def _get_image_array(self) -> Union[np.ndarray, None]:
         """Converts the raw image into a numpy array."""
         if self.image_array is None:
@@ -330,7 +305,7 @@ def from_image(
         elif fixed_layout is None:
             page.get_elements_with_detection_model()
         else:
-            page.elements = page.get_elements_from_layout(fixed_layout)
+            page.elements = []
 
         page.image_metadata = {
             "format": page.image.format if page.image else None,
@@ -405,19 +380,6 @@ def process_file_with_model(
     return layout
 
 
-def get_element_from_block(
-    block: TextRegion,
-    pdf_objects: Optional[List[TextRegion]] = None,
-) -> LayoutElement:
-    """Creates a LayoutElement from a given layout or image by finding all the text that lies within
-    a given block."""
-    element = block if isinstance(block, LayoutElement) else LayoutElement.from_region(block)
-    element.text = element.extract_text(
-        objects=pdf_objects,
-    )
-    return element
-
-
 def convert_pdf_to_image(
     filename: str,
     dpi: int = 200,
diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -32,16 +32,6 @@ class LayoutElement(TextRegion):
     image_path: Optional[str] = None
     parent: Optional[LayoutElement] = None
 
-    def extract_text(
-        self,
-        objects: Optional[Collection[TextRegion]],
-    ):
-        """Extracts text contained in region"""
-        text = super().extract_text(
-            objects=objects,
-        )
-        return text
-
     def to_dict(self) -> dict:
         """Converts the class instance to dictionary form."""
         out_dict = {
diff --git a/unstructured_inference/inference/ordering.py b/unstructured_inference/inference/ordering.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = "0.7.32-dev1" # pragma: no cover` |
| | `1` | `+__version__ = "0.7.32" # pragma: no cover` |