Skip to content

Commit 0ed69a1

Browse files
refactor: pdfminer image cleanup (#3648)
This PR removes the `clean_pdfminer_duplicate_image_elements()` function, because its functionality has already been integrated into the `remove_duplicate_elements()` function in [PR #3630](#3630).
1 parent be88eef commit 0ed69a1

File tree

5 files changed

+2
-60
lines changed

5 files changed

+2
-60
lines changed

Diff for: CHANGELOG.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.15.13-dev2
1+
## 0.15.13-dev3
22

33
### Enhancements
44

Diff for: test_unstructured/partition/pdf_image/test_pdfminer_processing.py

-18
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
aggregate_embedded_text_by_block,
1010
bboxes1_is_almost_subregion_of_bboxes2,
1111
boxes_self_iou,
12-
clean_pdfminer_duplicate_image_elements,
1312
clean_pdfminer_inner_elements,
1413
remove_duplicate_elements,
1514
)
@@ -129,23 +128,6 @@ def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_doc
129128
]
130129

131130

132-
@pytest.mark.parametrize(
133-
("elements", "expected_document_length"),
134-
[
135-
(elements_with_duplicate_images, 2),
136-
(elements_without_duplicate_images, 4),
137-
],
138-
)
139-
def test_clean_pdfminer_duplicate_image_elements(elements, expected_document_length):
140-
page = PageLayout(number=1, image=Image.new("1", (1, 1)))
141-
page.elements = elements
142-
document = DocumentLayout(pages=[page])
143-
144-
cleaned_doc = clean_pdfminer_duplicate_image_elements(document)
145-
146-
assert len(cleaned_doc.pages[0].elements) == expected_document_length
147-
148-
149131
def test_aggregate_by_block():
150132
expected = "Inside region1 Inside region2"
151133
embedded_regions = [

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.15.13-dev2" # pragma: no cover
1+
__version__ = "0.15.13-dev3" # pragma: no cover

Diff for: unstructured/partition/pdf.py

-2
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@
6868
save_elements,
6969
)
7070
from unstructured.partition.pdf_image.pdfminer_processing import (
71-
clean_pdfminer_duplicate_image_elements,
7271
clean_pdfminer_inner_elements,
7372
merge_inferred_with_extracted_layout,
7473
)
@@ -712,7 +711,6 @@ def _partition_pdf_or_image_local(
712711
if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
713712
kwargs["sort_mode"] = SORT_MODE_DONT
714713

715-
final_document_layout = clean_pdfminer_duplicate_image_elements(final_document_layout)
716714
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
717715

718716
for page in final_document_layout.pages:

Diff for: unstructured/partition/pdf_image/pdfminer_processing.py

-38
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import numpy as np
44
from pdfminer.utils import open_filename
55

6-
from unstructured.documents.elements import ElementType
76
from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
87
from unstructured.partition.pdf_image.pdfminer_utils import (
98
extract_image_objects,
@@ -268,43 +267,6 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
268267
return document
269268

270269

271-
def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "DocumentLayout":
272-
"""Removes duplicate image elements extracted by PDFMiner from a document layout."""
273-
274-
for page in document.pages:
275-
image_bboxes = []
276-
texts = []
277-
bbox_to_iou_mapping = {}
278-
current_idx = 0
279-
for i, element in enumerate(page.elements):
280-
if element.source != Source.PDFMINER or element.type != ElementType.IMAGE:
281-
continue
282-
image_bboxes.append(element.bbox)
283-
texts.append(element.text)
284-
bbox_to_iou_mapping[i] = current_idx
285-
current_idx += 1
286-
287-
iou = boxes_self_iou(image_bboxes, env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD)
288-
289-
filtered_elements = []
290-
for i, element in enumerate(page.elements[:-1]):
291-
if element.source != Source.PDFMINER or element.type != ElementType.IMAGE:
292-
filtered_elements.append(element)
293-
continue
294-
text = element.text
295-
this_idx = bbox_to_iou_mapping[i]
296-
if any(
297-
text == texts[potential_match + this_idx + 1]
298-
for potential_match in np.where(iou[this_idx, this_idx + 1 :])[0]
299-
):
300-
continue
301-
else:
302-
filtered_elements.append(element)
303-
page.elements[:-1] = filtered_elements
304-
305-
return document
306-
307-
308270
@requires_dependencies("unstructured_inference")
309271
def remove_duplicate_elements(
310272
elements: list["TextRegion"],

0 commit comments

Comments
 (0)