From 9f6ab9e472faba2c6cb4aa174a475e7a7244f673 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Tue, 11 Nov 2025 18:02:47 -0600 Subject: [PATCH 1/4] add check crop box padding to save_elements test --- .../pdf_image/test_pdf_image_utils.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index 0c895fbeb3..cdd7608571 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -78,12 +78,21 @@ def test_convert_pdf_to_image_raises_error(): ) @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE]) @pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) +@pytest.mark.parametrize("horizontal_padding", [0, 20]) +@pytest.mark.parametrize("vertical_padding", [0, 10]) def test_save_elements( element_category_to_save, extract_image_block_to_payload, filename, is_image, + horizontal_padding, + vertical_padding, + monkeypatch, ): + if horizontal_padding > 0: + monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD", str(horizontal_padding)) + if vertical_padding > 0: + monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", str(vertical_padding)) with tempfile.TemporaryDirectory() as tmpdir: elements = [ Image( @@ -136,10 +145,27 @@ def test_save_elements( if extract_image_block_to_payload: assert isinstance(el.metadata.image_base64, str) assert isinstance(el.metadata.image_mime_type, str) + import base64 + import io + image_bytes = base64.b64decode(el.metadata.image_base64) + image = PILImg.open(io.BytesIO(image_bytes)) + x1, y1 = el.metadata.coordinates.points[0] + x2, y2 = el.metadata.coordinates.points[2] + width = x2 - x1 + height = y2 - y1 + assert image.width == width + 2 * horizontal_padding + assert image.height == height + 2 * vertical_padding assert not el.metadata.image_path assert not os.path.isfile(expected_image_path) else: assert os.path.isfile(expected_image_path) + image = PILImg.open(expected_image_path) + x1, y1 = el.metadata.coordinates.points[0] + x2, y2 = el.metadata.coordinates.points[2] + width = x2 - x1 + height = y2 - y1 + assert image.width == width + 2 * horizontal_padding + assert image.height == height + 2 * vertical_padding assert el.metadata.image_path == expected_image_path assert not el.metadata.image_base64 assert not el.metadata.image_mime_type From 7957585ba8bfdcd9f4cb695302bb205e54b0b72c Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Tue, 18 Nov 2025 13:49:03 -0600 Subject: [PATCH 2/4] update CHANGELOG and version --- CHANGELOG.md | 3 ++- unstructured/__version__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5b762d7f6..6bf1744e09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.18.19-dev0 +## 0.18.19-dev1 ### Enhancement - Flag extracted elements as such in the metadata for downstream use +- Update save_elements unit test to check crop box padding behavior ### Features diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 15c3dcdee3..0d07804921 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.19-dev0" # pragma: no cover +__version__ = "0.18.19-dev1" # pragma: no cover From d152c60ac6050a2868f61348b67a99d7f947caee Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Tue, 18 Nov 2025 13:51:50 -0600 Subject: [PATCH 3/4] move imports --- test_unstructured/partition/pdf_image/test_pdf_image_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index cdd7608571..d4ed54f1f1 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -1,3 +1,5 @@ +import base64 +import io import os import tempfile from unittest.mock import MagicMock, patch @@ -145,8 +147,6 @@ def test_save_elements( if extract_image_block_to_payload: assert isinstance(el.metadata.image_base64, str) assert isinstance(el.metadata.image_mime_type, str) - import base64 - import io image_bytes = base64.b64decode(el.metadata.image_base64) image = PILImg.open(io.BytesIO(image_bytes)) x1, y1 = el.metadata.coordinates.points[0] From a23bf6a9e7fb67f35c2aa16cc409d2de92886db8 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Tue, 18 Nov 2025 14:01:32 -0600 Subject: [PATCH 4/4] Fix changelog --- CHANGELOG.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69e40d992d..de7bf1e44a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,26 @@ ## 0.18.21-dev0 +### Enhancement +- Update save_elements unit test to check crop box padding behavior + +### Features + +### Fixes + +## 0.18.20 + ### Enhancement - Improve the VoyageAI integration - Add voyage-context-3 support + +### Features + +### Fixes + +## 0.18.19 + +### Enhancement - Flag extracted elements as such in the metadata for downstream use -- Update save_elements unit test to check crop box padding behavior ### Features