fix: typeerror when using chipper (#311)

badGarnet · web-flow · commit d3b298131352 · 2023-12-20T18:06:04.000-06:00
This PR resolves #310.

- Chipper, or any page layout extracted with an element extraction model, does not have key attributes like `image_metadata` populated. This leads to `None` values for the image width and height, which in turn leads to the bug.
- This fix prevents the function from returning early after chipper finds the elements; it continues the logic so that the other key attributes of the page are filled in.
- A bonus of this fix is that we remove the image data from the page (which is not needed downstream) for chipper-generated pages.

## Test

A unit test is modified to test all the routes for page layout, including the route that uses an element extraction model.

Additionally, grab the attached PDF: running partition with chipper on the main branch leads to a type error, but with this fix it runs without error. [005-CISA-AA22-076-Strengthening-Cybersecurity-p1-p4.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/13731533/005-CISA-AA22-076-Strengthening-Cybersecurity-p1-p4.pdf)
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.7.21
+
+* fix: fix a bug where chipper, or any element extraction model based `PageLayout` object, lack `image_metadata` and other attributes that are required for downstream processing; this fix also reduces the memory overhead of using chipper model
+
 ## 0.7.20
 
 * chipper-v3: improved table prediction
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -9,7 +9,10 @@
 
 import unstructured_inference.models.base as models
 from unstructured_inference.inference import elements, layout, layoutelement
-from unstructured_inference.inference.elements import EmbeddedTextRegion, ImageTextRegion
+from unstructured_inference.inference.elements import (
+    EmbeddedTextRegion,
+    ImageTextRegion,
+)
 from unstructured_inference.models.unstructuredmodel import (
     UnstructuredElementExtractionModel,
     UnstructuredObjectDetectionModel,
@@ -271,12 +274,14 @@ def filter_by(self, *args, **kwargs):
         return MockLayout()
 
 
+@pytest.mark.parametrize("element_extraction_model", [None, "foo"])
 @pytest.mark.parametrize("filetype", ["png", "jpg", "tiff"])
-def test_from_image_file(monkeypatch, mock_final_layout, filetype):
+def test_from_image_file(monkeypatch, mock_final_layout, filetype, element_extraction_model):
     def mock_get_elements(self, *args, **kwargs):
         self.elements = [mock_final_layout]
 
     monkeypatch.setattr(layout.PageLayout, "get_elements_with_detection_model", mock_get_elements)
+    monkeypatch.setattr(layout.PageLayout, "get_elements_using_image_extraction", mock_get_elements)
     filename = f"sample-docs/loremipsum.{filetype}"
     image = Image.open(filename)
     image_metadata = {
@@ -285,7 +290,10 @@ def mock_get_elements(self, *args, **kwargs):
         "height": image.height,
     }
 
-    doc = layout.DocumentLayout.from_image_file(filename)
+    doc = layout.DocumentLayout.from_image_file(
+        filename,
+        element_extraction_model=element_extraction_model,
+    )
     page = doc.pages[0]
     assert page.elements[0] == mock_final_layout
     assert page.image is None
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.20"  # pragma: no cover
+__version__ = "0.7.21"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -322,10 +322,10 @@ def from_image(
             detection_model=detection_model,
             element_extraction_model=element_extraction_model,
         )
+        # FIXME (yao): refactor the other methods so they all return elements like the third route
         if page.element_extraction_model is not None:
             page.get_elements_using_image_extraction()
-            return page
-        if fixed_layout is None:
+        elif fixed_layout is None:
             page.get_elements_with_detection_model()
         else:
             page.elements = page.get_elements_from_layout(fixed_layout)

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.7.20" # pragma: no cover`
	`1`	`+__version__ = "0.7.21" # pragma: no cover`