Refactor: remove text extraction (pdfminer) related code (#294)

christinestraub · web-flow · commit 2b292542fb38 · 2023-11-30T21:37:38.000-08:00
### Summary

This PR is the first part of the `pdfminer` refactor, which moves the text extraction code from the `unstructured-inference` repo to the `unstructured` repo. It removes all `pdfminer`-related code from the `unstructured-inference` repo and works together with the companion `unstructured` refactor PR, Unstructured-IO/unstructured#2158.

### Note

The ingest test won't pass until the `unstructured` refactor PR (Unstructured-IO/unstructured#2158) is merged.

### TODO

- Refactor image extraction to move it from the `unstructured-inference` repo to the `unstructured` repo.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,6 @@
-## 0.7.16-dev1
+## 0.7.17
 
+* refactor: remove all `pdfminer` related code
 * enhancement: improved Chipper bounding boxes
 
 ## 0.7.16
diff --git a/examples/layout_analysis/visualization.py b/examples/layout_analysis/visualization.py
@@ -23,7 +23,6 @@ def run(f_path, scope):
     doc = process_file_with_model(
         f_path,
         model_name=None,
-        analysis=True,
     )
 
     for idx, page in enumerate(doc.pages):
diff --git a/examples/ocr_layout_supplement/ocr_layout_supplement.py b/examples/ocr_layout_supplement/ocr_layout_supplement.py
@@ -38,7 +38,6 @@ def run(f_path, file_type):
             is_image=is_image,
             model_name=None,
             supplement_with_ocr_elements=action,
-            analysis=True,
         )
 
         annotate_layout_elements(doc, annotation_data_map, output_dir_path, f_basename, AnnotationResult.IMAGE)
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -1,17 +1,15 @@
 import os
 import os.path
 import tempfile
-from functools import partial
 from unittest.mock import ANY, mock_open, patch
 
 import numpy as np
 import pytest
 from PIL import Image
 
 import unstructured_inference.models.base as models
-from unstructured_inference.constants import Source
 from unstructured_inference.inference import elements, layout, layoutelement
-from unstructured_inference.models import detectron2
+from unstructured_inference.inference.elements import EmbeddedTextRegion, ImageTextRegion
 from unstructured_inference.models.unstructuredmodel import (
     UnstructuredElementExtractionModel,
     UnstructuredObjectDetectionModel,
@@ -27,7 +25,7 @@ def mock_image():
 
 @pytest.fixture()
 def mock_initial_layout():
-    text_block = layout.EmbeddedTextRegion.from_coords(
+    text_block = EmbeddedTextRegion.from_coords(
         2,
         4,
         6,
@@ -36,7 +34,7 @@ def mock_initial_layout():
         source="Mock",
     )
 
-    title_block = layout.EmbeddedTextRegion.from_coords(
+    title_block = EmbeddedTextRegion.from_coords(
         1,
         2,
         3,
@@ -81,7 +79,7 @@ def verify_image_array():
         assert page.image_array.all() == image_array.all()
 
     # Scenario 1: where self.image exists
-    page = layout.PageLayout(number=0, image=mock_image, layout=[])
+    page = layout.PageLayout(number=0, image=mock_image)
     verify_image_array()
 
     # Scenario 2: where self.image is None, but self.image_path exists
@@ -111,15 +109,9 @@ def test_get_page_elements(monkeypatch, mock_final_layout):
     page = layout.PageLayout(
         number=0,
         image=image,
-        layout=mock_final_layout,
         detection_model=MockLayoutModel(mock_final_layout),
     )
-
     elements = page.get_elements_with_detection_model(inplace=False)
-
-    assert str(elements[0]) == "A Catchy Title"
-    assert str(elements[1]).startswith("A very repetitive narrative.")
-
     page.get_elements_with_detection_model(inplace=True)
     assert elements == page.elements
 
@@ -135,35 +127,6 @@ def join(self):
         pass
 
 
-def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout, mock_image):
-    with tempfile.TemporaryDirectory() as tmpdir:
-        image_path1 = os.path.join(tmpdir, "mock1.jpg")
-        image_path2 = os.path.join(tmpdir, "mock2.jpg")
-        mock_image.save(image_path1)
-        mock_image.save(image_path2)
-        image_paths = [image_path1, image_path2]
-
-        layouts = [mock_initial_layout, mock_initial_layout]
-
-        monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
-
-        with patch.object(layout, "load_pdf", return_value=(layouts, image_paths)), patch.dict(
-            models.model_class_map,
-            {"detectron2_lp": partial(MockLayoutModel, layout=mock_final_layout)},
-        ):
-            model = layout.get_model("detectron2_lp")
-            doc = layout.DocumentLayout.from_file("fake-file.pdf", detection_model=model)
-
-            assert str(doc).startswith("A Catchy Title")
-            assert str(doc).count("A Catchy Title") == 2  # Once for each page
-            assert str(doc).endswith("A very repetitive narrative. ")
-
-            assert doc.pages[0].elements[0].to_dict()["text"] == "A Catchy Title"
-
-            pages = doc.pages
-            assert str(doc) == "\n\n".join([str(page) for page in pages])
-
-
 @pytest.mark.parametrize("model_name", [None, "checkbox", "fake"])
 def test_process_data_with_model(monkeypatch, mock_final_layout, model_name):
     monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout))
@@ -236,7 +199,7 @@ def tolist(self):
         return [1, 2, 3, 4]
 
 
-class MockEmbeddedTextRegion(layout.EmbeddedTextRegion):
+class MockEmbeddedTextRegion(EmbeddedTextRegion):
     def __init__(self, type=None, text=None):
         self.type = type
         self.text = text
@@ -251,15 +214,16 @@ def __init__(
         self,
         number=1,
         image=None,
-        layout=None,
         model=None,
         extract_tables=False,
+        detection_model=None,
     ):
         self.image = image
         self.layout = layout
         self.model = model
         self.extract_tables = extract_tables
         self.number = number
+        self.detection_model = detection_model
 
 
 @pytest.mark.parametrize(
@@ -349,8 +313,8 @@ def mock_get_elements(self, *args, **kwargs):
 
         with patch.object(
             layout,
-            "load_pdf",
-            lambda *args, **kwargs: ([[]], [image_path]),
+            "convert_pdf_to_image",
+            lambda *args, **kwargs: ([image_path]),
         ):
             doc = layout.DocumentLayout.from_file("fake-file.pdf")
             page = doc.pages[0]
@@ -369,16 +333,9 @@ def test_from_image_file_raises_isadirectoryerror_with_dir():
         layout.DocumentLayout.from_image_file(tempdir)
 
 
-def test_from_file_raises_on_length_mismatch(monkeypatch):
-    monkeypatch.setattr(layout, "load_pdf", lambda *args, **kwargs: ([None, None], []))
-    with pytest.raises(RuntimeError) as e:
-        layout.DocumentLayout.from_file("fake_file")
-    assert "images" in str(e).lower()
-
-
 @pytest.mark.parametrize("idx", range(2))
 def test_get_elements_from_layout(mock_initial_layout, idx):
-    page = MockPageLayout(layout=mock_initial_layout)
+    page = MockPageLayout()
     block = mock_initial_layout[idx]
     block.bbox.pad(3)
     fixed_layout = [block]
@@ -429,74 +386,19 @@ def test_remove_control_characters(text, expected):
     assert elements.remove_control_characters(text) == expected
 
 
-no_text_region = layout.EmbeddedTextRegion.from_coords(0, 0, 100, 100)
-text_region = layout.EmbeddedTextRegion.from_coords(0, 0, 100, 100, text="test")
-cid_text_region = layout.EmbeddedTextRegion.from_coords(
+no_text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100)
+text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100, text="test")
+cid_text_region = EmbeddedTextRegion.from_coords(
     0,
     0,
     100,
     100,
     text="(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)",
 )
-overlapping_rect = layout.ImageTextRegion.from_coords(50, 50, 150, 150)
-nonoverlapping_rect = layout.ImageTextRegion.from_coords(150, 150, 200, 200)
-populated_text_region = layout.EmbeddedTextRegion.from_coords(50, 50, 60, 60, text="test")
-unpopulated_text_region = layout.EmbeddedTextRegion.from_coords(50, 50, 60, 60, text=None)
-
-
-@pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"])
-def test_load_pdf(filename):
-    layouts, images = layout.load_pdf(f"sample-docs/{filename}")
-    assert Source.PDFMINER in {e.source for e in layouts[0]}
-    assert len(layouts)
-    for lo in layouts:
-        assert len(lo)
-    assert len(images)
-    assert len(layouts) == len(images)
-
-
-def test_load_pdf_with_images():
-    layouts, _ = layout.load_pdf("sample-docs/loremipsum-flat.pdf")
-    first_page_layout = layouts[0]
-    assert any(isinstance(obj, layout.ImageTextRegion) for obj in first_page_layout)
-
-
-def test_load_pdf_image_placement():
-    layouts, images = layout.load_pdf("sample-docs/layout-parser-paper.pdf")
-    page_layout = layouts[5]
-    image_regions = [region for region in page_layout if isinstance(region, layout.ImageTextRegion)]
-    image_region = image_regions[0]
-    # Image is in top half of the page, so that should be reflected in the pixel coordinates
-    assert image_region.bbox.y1 < images[5].height / 2
-    assert image_region.bbox.y2 < images[5].height / 2
-
-
-def test_load_pdf_raises_with_path_only_no_output_folder():
-    with pytest.raises(ValueError):
-        layout.load_pdf(
-            "sample-docs/loremipsum-flat.pdf",
-            path_only=True,
-        )
-
-
-@pytest.mark.skip("Temporarily removed multicolumn to fix ordering")
-def test_load_pdf_with_multicolumn_layout(filename="sample-docs/design-thinking.pdf"):
-    layouts, images = layout.load_pdf(filename)
-    doc = layout.process_file_with_model(filename=filename, model_name=None)
-    test_snippets = [
-        "Key to design thinking",
-        "Design thinking also",
-        "But in recent years",
-    ]
-
-    test_elements = []
-    for element in doc.pages[0].elements:
-        for snippet in test_snippets:
-            if element.text.startswith(snippet):
-                test_elements.append(element)
-
-    for i, element in enumerate(test_elements):
-        assert element.text.startswith(test_snippets[i])
+overlapping_rect = ImageTextRegion.from_coords(50, 50, 150, 150)
+nonoverlapping_rect = ImageTextRegion.from_coords(150, 150, 200, 200)
+populated_text_region = EmbeddedTextRegion.from_coords(50, 50, 60, 60, text="test")
+unpopulated_text_region = EmbeddedTextRegion.from_coords(50, 50, 60, 60, text=None)
 
 
 @pytest.mark.parametrize(
@@ -521,7 +423,7 @@ def check_annotated_image():
 
     test_image_arr = np.ones((100, 100, 3), dtype="uint8")
     image = Image.fromarray(test_image_arr)
-    page = layout.PageLayout(number=1, image=image, layout=None)
+    page = layout.PageLayout(number=1, image=image)
     coords1 = (21, 30, 37, 41)
     rect1 = elements.TextRegion.from_coords(*coords1)
     coords2 = (1, 10, 7, 11)
@@ -571,8 +473,8 @@ def test_layout_order(mock_image):
         mock_image.save(mock_image_path)
         with patch.object(layout, "get_model", lambda: MockDetectionModel()), patch.object(
             layout,
-            "load_pdf",
-            lambda *args, **kwargs: ([[]], [mock_image_path]),
+            "convert_pdf_to_image",
+            lambda *args, **kwargs: ([mock_image_path]),
         ):
             doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
             page = doc.pages[0]
diff --git a/test_unstructured_inference/models/test_model.py b/test_unstructured_inference/models/test_model.py
@@ -49,10 +49,6 @@ def test_model_initializes_once():
     ):
         doc = layout.DocumentLayout.from_file("sample-docs/loremipsum.pdf")
         doc.pages[0].detection_model.initializer.assert_called_once()
-        # NOTE(pravin) New Assertion to Make Sure Elements have probability attribute
-        assert hasattr(doc.pages[0].elements[0], "prob")
-        # NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability
-        assert doc.pages[0].elements[0].prob is None
 
 
 def test_deduplicate_detected_elements():
diff --git a/test_unstructured_inference/models/test_yolox.py b/test_unstructured_inference/models/test_yolox.py
@@ -83,18 +83,13 @@ def test_layout_yolox_local_parsing_image_soft():
 def test_layout_yolox_local_parsing_pdf_soft():
     filename = os.path.join("sample-docs", "loremipsum.pdf")
     document_layout = process_file_with_model(filename, model_name="yolox_tiny")
-    content = str(document_layout)
-    assert "libero fringilla" in content
     assert len(document_layout.pages) == 1
     # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model
     assert len(document_layout.pages[0].elements) > 0
     assert hasattr(
         document_layout.pages[0].elements[0],
         "prob",
     )  # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
-    assert (
-        document_layout.pages[0].elements[0].prob is None
-    )  # NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability
 
 
 def test_layout_yolox_local_parsing_empty_pdf_soft():
diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py
@@ -4,8 +4,15 @@
 
 import pytest
 
+from unstructured_inference.constants import ElementType
 from unstructured_inference.inference import elements
-from unstructured_inference.inference.layoutelement import partition_groups_from_regions, separate
+from unstructured_inference.inference.elements import TextRegion
+from unstructured_inference.inference.layoutelement import (
+    partition_groups_from_regions,
+    separate,
+    merge_inferred_layout_with_extracted_layout,
+    LayoutElement,
+)
 
 skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
 
@@ -228,3 +235,25 @@ def test_separate(rect1, rect2):
     separate(rect1, rect2)
 
     # assert not rect1.intersects(rect2) #TODO: fix this test
+
+
+def test_merge_inferred_layout_with_extracted_layout():
+    inferred_layout = [
+        LayoutElement.from_coords(453, 322, 1258, 408, text=None, type=ElementType.SECTION_HEADER),
+        LayoutElement.from_coords(387, 477, 1320, 537, text=None, type=ElementType.TEXT),
+    ]
+
+    extracted_layout = [
+        TextRegion.from_coords(438, 318, 1272, 407, text="Example Section Header"),
+        TextRegion.from_coords(377, 469, 1335, 535, text="Example Title"),
+    ]
+
+    merged_layout = merge_inferred_layout_with_extracted_layout(
+        inferred_layout=inferred_layout,
+        extracted_layout=extracted_layout,
+        page_image_size=(1700, 2200),
+    )
+    assert merged_layout[0].type == ElementType.SECTION_HEADER
+    assert merged_layout[0].text == "Example Section Header"
+    assert merged_layout[1].type == ElementType.TEXT
+    assert merged_layout[1].text == "Example Title"
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.16-dev1"  # pragma: no cover
+__version__ = "0.7.17"  # pragma: no cover
diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py
@@ -13,7 +13,6 @@ class Source(Enum):
     CHIPPER = "chipper"
     CHIPPERV1 = "chipperv1"
     CHIPPERV2 = "chipperv2"
-    PDFMINER = "pdfminer"
     MERGED = "merged"
     SUPER_GRADIENTS = "super-gradients"
 
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
diff --git a/unstructured_inference/inference/pdf.py b/unstructured_inference/inference/pdf.py
diff --git a/unstructured_inference/patches/__init__.py b/unstructured_inference/patches/__init__.py
diff --git a/unstructured_inference/patches/pdfminer.py b/unstructured_inference/patches/pdfminer.py

Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,6 @@ def run(f_path, scope):`
`23`	`23`	`doc = process_file_with_model(`
`24`	`24`	`f_path,`
`25`	`25`	`model_name=None,`
`26`		`- analysis=True,`
`27`	`26`	`)`
`28`	`27`
`29`	`28`	`for idx, page in enumerate(doc.pages):`
Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,6 @@ def run(f_path, file_type):`
`38`	`38`	`is_image=is_image,`
`39`	`39`	`model_name=None,`
`40`	`40`	`supplement_with_ocr_elements=action,`
`41`		`- analysis=True,`
`42`	`41`	`)`
`43`	`42`
`44`	`43`	`annotate_layout_elements(doc, annotation_data_map, output_dir_path, f_basename, AnnotationResult.IMAGE)`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.7.16-dev1" # pragma: no cover`
	`1`	`+__version__ = "0.7.17" # pragma: no cover`