Skip to content

Commit 2b29254

Browse files
Refactor: remove text extraction (pdfminer) related code (#294)
### Summary
This PR is the first part of the `pdfminer` refactor, which moves that code from the `unstructured-inference` repo to the `unstructured` repo. This PR removes all `pdfminer`-related code from the `unstructured-inference` repo and works together with the unstructured refactor PR, Unstructured-IO/unstructured#2158.

### Note
The ingest test won't pass until the unstructured refactor PR (Unstructured-IO/unstructured#2158) is merged.

### TODO
- Refactor image extraction to move it from the `unstructured-inference` repo to the `unstructured` repo.
1 parent 631e6fb commit 2b29254

File tree

13 files changed

+72
-311
lines changed

13 files changed

+72
-311
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.7.16-dev1
1+
## 0.7.17
22

3+
* refactor: remove all `pdfminer` related code
34
* enhancement: improved Chipper bounding boxes
45

56
## 0.7.16

examples/layout_analysis/visualization.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ def run(f_path, scope):
2323
doc = process_file_with_model(
2424
f_path,
2525
model_name=None,
26-
analysis=True,
2726
)
2827

2928
for idx, page in enumerate(doc.pages):

examples/ocr_layout_supplement/ocr_layout_supplement.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ def run(f_path, file_type):
3838
is_image=is_image,
3939
model_name=None,
4040
supplement_with_ocr_elements=action,
41-
analysis=True,
4241
)
4342

4443
annotate_layout_elements(doc, annotation_data_map, output_dir_path, f_basename, AnnotationResult.IMAGE)

test_unstructured_inference/inference/test_layout.py

Lines changed: 20 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,15 @@
11
import os
22
import os.path
33
import tempfile
4-
from functools import partial
54
from unittest.mock import ANY, mock_open, patch
65

76
import numpy as np
87
import pytest
98
from PIL import Image
109

1110
import unstructured_inference.models.base as models
12-
from unstructured_inference.constants import Source
1311
from unstructured_inference.inference import elements, layout, layoutelement
14-
from unstructured_inference.models import detectron2
12+
from unstructured_inference.inference.elements import EmbeddedTextRegion, ImageTextRegion
1513
from unstructured_inference.models.unstructuredmodel import (
1614
UnstructuredElementExtractionModel,
1715
UnstructuredObjectDetectionModel,
@@ -27,7 +25,7 @@ def mock_image():
2725

2826
@pytest.fixture()
2927
def mock_initial_layout():
30-
text_block = layout.EmbeddedTextRegion.from_coords(
28+
text_block = EmbeddedTextRegion.from_coords(
3129
2,
3230
4,
3331
6,
@@ -36,7 +34,7 @@ def mock_initial_layout():
3634
source="Mock",
3735
)
3836

39-
title_block = layout.EmbeddedTextRegion.from_coords(
37+
title_block = EmbeddedTextRegion.from_coords(
4038
1,
4139
2,
4240
3,
@@ -81,7 +79,7 @@ def verify_image_array():
8179
assert page.image_array.all() == image_array.all()
8280

8381
# Scenario 1: where self.image exists
84-
page = layout.PageLayout(number=0, image=mock_image, layout=[])
82+
page = layout.PageLayout(number=0, image=mock_image)
8583
verify_image_array()
8684

8785
# Scenario 2: where self.image is None, but self.image_path exists
@@ -111,15 +109,9 @@ def test_get_page_elements(monkeypatch, mock_final_layout):
111109
page = layout.PageLayout(
112110
number=0,
113111
image=image,
114-
layout=mock_final_layout,
115112
detection_model=MockLayoutModel(mock_final_layout),
116113
)
117-
118114
elements = page.get_elements_with_detection_model(inplace=False)
119-
120-
assert str(elements[0]) == "A Catchy Title"
121-
assert str(elements[1]).startswith("A very repetitive narrative.")
122-
123115
page.get_elements_with_detection_model(inplace=True)
124116
assert elements == page.elements
125117

@@ -135,35 +127,6 @@ def join(self):
135127
pass
136128

137129

138-
def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout, mock_image):
139-
with tempfile.TemporaryDirectory() as tmpdir:
140-
image_path1 = os.path.join(tmpdir, "mock1.jpg")
141-
image_path2 = os.path.join(tmpdir, "mock2.jpg")
142-
mock_image.save(image_path1)
143-
mock_image.save(image_path2)
144-
image_paths = [image_path1, image_path2]
145-
146-
layouts = [mock_initial_layout, mock_initial_layout]
147-
148-
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
149-
150-
with patch.object(layout, "load_pdf", return_value=(layouts, image_paths)), patch.dict(
151-
models.model_class_map,
152-
{"detectron2_lp": partial(MockLayoutModel, layout=mock_final_layout)},
153-
):
154-
model = layout.get_model("detectron2_lp")
155-
doc = layout.DocumentLayout.from_file("fake-file.pdf", detection_model=model)
156-
157-
assert str(doc).startswith("A Catchy Title")
158-
assert str(doc).count("A Catchy Title") == 2 # Once for each page
159-
assert str(doc).endswith("A very repetitive narrative. ")
160-
161-
assert doc.pages[0].elements[0].to_dict()["text"] == "A Catchy Title"
162-
163-
pages = doc.pages
164-
assert str(doc) == "\n\n".join([str(page) for page in pages])
165-
166-
167130
@pytest.mark.parametrize("model_name", [None, "checkbox", "fake"])
168131
def test_process_data_with_model(monkeypatch, mock_final_layout, model_name):
169132
monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout))
@@ -236,7 +199,7 @@ def tolist(self):
236199
return [1, 2, 3, 4]
237200

238201

239-
class MockEmbeddedTextRegion(layout.EmbeddedTextRegion):
202+
class MockEmbeddedTextRegion(EmbeddedTextRegion):
240203
def __init__(self, type=None, text=None):
241204
self.type = type
242205
self.text = text
@@ -251,15 +214,16 @@ def __init__(
251214
self,
252215
number=1,
253216
image=None,
254-
layout=None,
255217
model=None,
256218
extract_tables=False,
219+
detection_model=None,
257220
):
258221
self.image = image
259222
self.layout = layout
260223
self.model = model
261224
self.extract_tables = extract_tables
262225
self.number = number
226+
self.detection_model = detection_model
263227

264228

265229
@pytest.mark.parametrize(
@@ -349,8 +313,8 @@ def mock_get_elements(self, *args, **kwargs):
349313

350314
with patch.object(
351315
layout,
352-
"load_pdf",
353-
lambda *args, **kwargs: ([[]], [image_path]),
316+
"convert_pdf_to_image",
317+
lambda *args, **kwargs: ([image_path]),
354318
):
355319
doc = layout.DocumentLayout.from_file("fake-file.pdf")
356320
page = doc.pages[0]
@@ -369,16 +333,9 @@ def test_from_image_file_raises_isadirectoryerror_with_dir():
369333
layout.DocumentLayout.from_image_file(tempdir)
370334

371335

372-
def test_from_file_raises_on_length_mismatch(monkeypatch):
373-
monkeypatch.setattr(layout, "load_pdf", lambda *args, **kwargs: ([None, None], []))
374-
with pytest.raises(RuntimeError) as e:
375-
layout.DocumentLayout.from_file("fake_file")
376-
assert "images" in str(e).lower()
377-
378-
379336
@pytest.mark.parametrize("idx", range(2))
380337
def test_get_elements_from_layout(mock_initial_layout, idx):
381-
page = MockPageLayout(layout=mock_initial_layout)
338+
page = MockPageLayout()
382339
block = mock_initial_layout[idx]
383340
block.bbox.pad(3)
384341
fixed_layout = [block]
@@ -429,74 +386,19 @@ def test_remove_control_characters(text, expected):
429386
assert elements.remove_control_characters(text) == expected
430387

431388

432-
no_text_region = layout.EmbeddedTextRegion.from_coords(0, 0, 100, 100)
433-
text_region = layout.EmbeddedTextRegion.from_coords(0, 0, 100, 100, text="test")
434-
cid_text_region = layout.EmbeddedTextRegion.from_coords(
389+
no_text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100)
390+
text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100, text="test")
391+
cid_text_region = EmbeddedTextRegion.from_coords(
435392
0,
436393
0,
437394
100,
438395
100,
439396
text="(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)",
440397
)
441-
overlapping_rect = layout.ImageTextRegion.from_coords(50, 50, 150, 150)
442-
nonoverlapping_rect = layout.ImageTextRegion.from_coords(150, 150, 200, 200)
443-
populated_text_region = layout.EmbeddedTextRegion.from_coords(50, 50, 60, 60, text="test")
444-
unpopulated_text_region = layout.EmbeddedTextRegion.from_coords(50, 50, 60, 60, text=None)
445-
446-
447-
@pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"])
448-
def test_load_pdf(filename):
449-
layouts, images = layout.load_pdf(f"sample-docs/{filename}")
450-
assert Source.PDFMINER in {e.source for e in layouts[0]}
451-
assert len(layouts)
452-
for lo in layouts:
453-
assert len(lo)
454-
assert len(images)
455-
assert len(layouts) == len(images)
456-
457-
458-
def test_load_pdf_with_images():
459-
layouts, _ = layout.load_pdf("sample-docs/loremipsum-flat.pdf")
460-
first_page_layout = layouts[0]
461-
assert any(isinstance(obj, layout.ImageTextRegion) for obj in first_page_layout)
462-
463-
464-
def test_load_pdf_image_placement():
465-
layouts, images = layout.load_pdf("sample-docs/layout-parser-paper.pdf")
466-
page_layout = layouts[5]
467-
image_regions = [region for region in page_layout if isinstance(region, layout.ImageTextRegion)]
468-
image_region = image_regions[0]
469-
# Image is in top half of the page, so that should be reflected in the pixel coordinates
470-
assert image_region.bbox.y1 < images[5].height / 2
471-
assert image_region.bbox.y2 < images[5].height / 2
472-
473-
474-
def test_load_pdf_raises_with_path_only_no_output_folder():
475-
with pytest.raises(ValueError):
476-
layout.load_pdf(
477-
"sample-docs/loremipsum-flat.pdf",
478-
path_only=True,
479-
)
480-
481-
482-
@pytest.mark.skip("Temporarily removed multicolumn to fix ordering")
483-
def test_load_pdf_with_multicolumn_layout(filename="sample-docs/design-thinking.pdf"):
484-
layouts, images = layout.load_pdf(filename)
485-
doc = layout.process_file_with_model(filename=filename, model_name=None)
486-
test_snippets = [
487-
"Key to design thinking",
488-
"Design thinking also",
489-
"But in recent years",
490-
]
491-
492-
test_elements = []
493-
for element in doc.pages[0].elements:
494-
for snippet in test_snippets:
495-
if element.text.startswith(snippet):
496-
test_elements.append(element)
497-
498-
for i, element in enumerate(test_elements):
499-
assert element.text.startswith(test_snippets[i])
398+
overlapping_rect = ImageTextRegion.from_coords(50, 50, 150, 150)
399+
nonoverlapping_rect = ImageTextRegion.from_coords(150, 150, 200, 200)
400+
populated_text_region = EmbeddedTextRegion.from_coords(50, 50, 60, 60, text="test")
401+
unpopulated_text_region = EmbeddedTextRegion.from_coords(50, 50, 60, 60, text=None)
500402

501403

502404
@pytest.mark.parametrize(
@@ -521,7 +423,7 @@ def check_annotated_image():
521423

522424
test_image_arr = np.ones((100, 100, 3), dtype="uint8")
523425
image = Image.fromarray(test_image_arr)
524-
page = layout.PageLayout(number=1, image=image, layout=None)
426+
page = layout.PageLayout(number=1, image=image)
525427
coords1 = (21, 30, 37, 41)
526428
rect1 = elements.TextRegion.from_coords(*coords1)
527429
coords2 = (1, 10, 7, 11)
@@ -571,8 +473,8 @@ def test_layout_order(mock_image):
571473
mock_image.save(mock_image_path)
572474
with patch.object(layout, "get_model", lambda: MockDetectionModel()), patch.object(
573475
layout,
574-
"load_pdf",
575-
lambda *args, **kwargs: ([[]], [mock_image_path]),
476+
"convert_pdf_to_image",
477+
lambda *args, **kwargs: ([mock_image_path]),
576478
):
577479
doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
578480
page = doc.pages[0]

test_unstructured_inference/models/test_model.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,6 @@ def test_model_initializes_once():
4949
):
5050
doc = layout.DocumentLayout.from_file("sample-docs/loremipsum.pdf")
5151
doc.pages[0].detection_model.initializer.assert_called_once()
52-
# NOTE(pravin) New Assertion to Make Sure Elements have probability attribute
53-
assert hasattr(doc.pages[0].elements[0], "prob")
54-
# NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability
55-
assert doc.pages[0].elements[0].prob is None
5652

5753

5854
def test_deduplicate_detected_elements():

test_unstructured_inference/models/test_yolox.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,18 +83,13 @@ def test_layout_yolox_local_parsing_image_soft():
8383
def test_layout_yolox_local_parsing_pdf_soft():
8484
filename = os.path.join("sample-docs", "loremipsum.pdf")
8585
document_layout = process_file_with_model(filename, model_name="yolox_tiny")
86-
content = str(document_layout)
87-
assert "libero fringilla" in content
8886
assert len(document_layout.pages) == 1
8987
# NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model
9088
assert len(document_layout.pages[0].elements) > 0
9189
assert hasattr(
9290
document_layout.pages[0].elements[0],
9391
"prob",
9492
) # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
95-
assert (
96-
document_layout.pages[0].elements[0].prob is None
97-
) # NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability
9893

9994

10095
def test_layout_yolox_local_parsing_empty_pdf_soft():

test_unstructured_inference/test_elements.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,15 @@
44

55
import pytest
66

7+
from unstructured_inference.constants import ElementType
78
from unstructured_inference.inference import elements
8-
from unstructured_inference.inference.layoutelement import partition_groups_from_regions, separate
9+
from unstructured_inference.inference.elements import TextRegion
10+
from unstructured_inference.inference.layoutelement import (
11+
partition_groups_from_regions,
12+
separate,
13+
merge_inferred_layout_with_extracted_layout,
14+
LayoutElement,
15+
)
916

1017
skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
1118

@@ -228,3 +235,25 @@ def test_separate(rect1, rect2):
228235
separate(rect1, rect2)
229236

230237
# assert not rect1.intersects(rect2) #TODO: fix this test
238+
239+
240+
def test_merge_inferred_layout_with_extracted_layout():
241+
inferred_layout = [
242+
LayoutElement.from_coords(453, 322, 1258, 408, text=None, type=ElementType.SECTION_HEADER),
243+
LayoutElement.from_coords(387, 477, 1320, 537, text=None, type=ElementType.TEXT),
244+
]
245+
246+
extracted_layout = [
247+
TextRegion.from_coords(438, 318, 1272, 407, text="Example Section Header"),
248+
TextRegion.from_coords(377, 469, 1335, 535, text="Example Title"),
249+
]
250+
251+
merged_layout = merge_inferred_layout_with_extracted_layout(
252+
inferred_layout=inferred_layout,
253+
extracted_layout=extracted_layout,
254+
page_image_size=(1700, 2200),
255+
)
256+
assert merged_layout[0].type == ElementType.SECTION_HEADER
257+
assert merged_layout[0].text == "Example Section Header"
258+
assert merged_layout[1].type == ElementType.TEXT
259+
assert merged_layout[1].text == "Example Title"
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.16-dev1" # pragma: no cover
1+
__version__ = "0.7.17" # pragma: no cover

unstructured_inference/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ class Source(Enum):
1313
CHIPPER = "chipper"
1414
CHIPPERV1 = "chipperv1"
1515
CHIPPERV2 = "chipperv2"
16-
PDFMINER = "pdfminer"
1716
MERGED = "merged"
1817
SUPER_GRADIENTS = "super-gradients"
1918

0 commit comments

Comments
 (0)