Skip to content

Commit 794f38b

Browse files
Refactor: remove image extraction related code (#299)
### Summary

This PR is the first part of the "image extraction" refactor, which moves image extraction from the unstructured-inference repo to the unstructured repo. It removes all "image extraction" related code from the unstructured-inference repo and works together with the unstructured refactor PR, Unstructured-IO/unstructured#2201.

### Note

The ingest test won't pass until the unstructured refactor PR (Unstructured-IO/unstructured#2201) is merged.
1 parent 2b29254 commit 794f38b

File tree

7 files changed

+6
-177
lines changed

7 files changed

+6
-177
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.7.18
2+
3+
* refactor: remove all image extraction related code
4+
15
## 0.7.17
26

37
* refactor: remove all `pdfminer` related code

examples/image_extraction/README.md

Lines changed: 0 additions & 21 deletions
This file was deleted.

examples/image_extraction/embedded-image-extraction.py

Lines changed: 0 additions & 94 deletions
This file was deleted.

examples/image_extraction/requirements.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

test_unstructured_inference/inference/test_layout.py

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import os
22
import os.path
33
import tempfile
4-
from unittest.mock import ANY, mock_open, patch
4+
from unittest.mock import mock_open, patch
55

66
import numpy as np
77
import pytest
@@ -557,22 +557,6 @@ def test_from_image(
557557
assert mock_detection.called == detection_model_called
558558

559559

560-
def test_extract_images(mock_pil_image):
561-
page = MockPageLayout(image=mock_pil_image)
562-
page.elements = [
563-
layoutelement.LayoutElement.from_coords(1, 1, 10, 10, text=None, type="Image"),
564-
layoutelement.LayoutElement.from_coords(11, 11, 20, 20, text=None, type="Image"),
565-
]
566-
567-
with tempfile.TemporaryDirectory() as tmpdir:
568-
page.extract_images(output_dir_path=str(tmpdir))
569-
570-
for i, el in enumerate(page.elements):
571-
expected_image_path = os.path.join(str(tmpdir), f"figure-{page.number}-{i + 1}.jpg")
572-
assert os.path.isfile(el.image_path)
573-
assert el.image_path == expected_image_path
574-
575-
576560
class MockUnstructuredElementExtractionModel(UnstructuredElementExtractionModel):
577561
def initialize(self, *args, **kwargs):
578562
return super().initialize(*args, **kwargs)
@@ -614,8 +598,6 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
614598
fixed_layouts=None,
615599
extract_tables=False,
616600
pdf_image_dpi=200,
617-
extract_images_in_pdf=ANY,
618-
image_output_dir_path=ANY,
619601
)
620602

621603

unstructured_inference/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.17" # pragma: no cover
1+
__version__ = "0.7.18" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import pdf2image
1010
from PIL import Image, ImageSequence
1111

12-
from unstructured_inference.constants import ElementType
1312
from unstructured_inference.inference.elements import (
1413
TextRegion,
1514
)
@@ -24,7 +23,6 @@
2423
UnstructuredElementExtractionModel,
2524
UnstructuredObjectDetectionModel,
2625
)
27-
from unstructured_inference.utils import write_image
2826
from unstructured_inference.visualize import draw_bbox
2927

3028

@@ -230,34 +228,6 @@ def get_elements_from_layout(
230228
]
231229
return elements
232230

233-
def extract_images(self, output_dir_path: Optional[str] = None):
234-
"""
235-
Extract and save images from the page. This method iterates through the layout elements
236-
of the page, identifies image regions, and extracts and saves them as separate image files.
237-
"""
238-
239-
if not output_dir_path:
240-
output_dir_path = os.path.join(os.getcwd(), "figures")
241-
os.makedirs(output_dir_path, exist_ok=True)
242-
243-
figure_number = 0
244-
image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE]
245-
for el in self.elements:
246-
if (el.bbox is None) or (el.type not in image_element_types):
247-
continue
248-
249-
figure_number += 1
250-
try:
251-
output_f_path = os.path.join(
252-
output_dir_path,
253-
f"figure-{self.number}-{figure_number}.jpg",
254-
)
255-
cropped_image = self.image.crop((el.bbox.x1, el.bbox.y1, el.bbox.x2, el.bbox.y2))
256-
write_image(cropped_image, output_f_path)
257-
el.image_path = output_f_path
258-
except (ValueError, IOError):
259-
logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
260-
261231
def _get_image_array(self) -> Union[np.ndarray, None]:
262232
"""Converts the raw image into a numpy array."""
263233
if self.image_array is None:
@@ -350,8 +320,6 @@ def from_image(
350320
element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
351321
extract_tables: bool = False,
352322
fixed_layout: Optional[List[TextRegion]] = None,
353-
extract_images_in_pdf: bool = False,
354-
image_output_dir_path: Optional[str] = None,
355323
):
356324
"""Creates a PageLayout from an already-loaded PIL Image."""
357325

@@ -378,9 +346,6 @@ def from_image(
378346
page.image_path = os.path.abspath(image_path) if image_path else None
379347
page.document_filename = os.path.abspath(document_filename) if document_filename else None
380348

381-
if extract_images_in_pdf:
382-
page.extract_images(image_output_dir_path)
383-
384349
# Clear the image to save memory
385350
page.image = None
386351

@@ -413,8 +378,6 @@ def process_file_with_model(
413378
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
414379
extract_tables: bool = False,
415380
pdf_image_dpi: int = 200,
416-
extract_images_in_pdf: bool = False,
417-
image_output_dir_path: Optional[str] = None,
418381
**kwargs,
419382
) -> DocumentLayout:
420383
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
@@ -445,8 +408,6 @@ def process_file_with_model(
445408
fixed_layouts=fixed_layouts,
446409
extract_tables=extract_tables,
447410
pdf_image_dpi=pdf_image_dpi,
448-
extract_images_in_pdf=extract_images_in_pdf,
449-
image_output_dir_path=image_output_dir_path,
450411
**kwargs,
451412
)
452413
)

0 commit comments

Comments
 (0)