Skip to content

Commit 6c1bbb3

Browse files
authored
test: add check crop box padding to save_elements test (#4123)
Updated `save_elements` test to check the behavior of the environment variables `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD` and `EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD` that pad the crop box for image extraction.

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> Enhances save_elements tests to validate crop-box padding via env vars and image dimensions for both payload and file outputs; bumps version and updates changelog.
>
> - **Tests (pdf_image_utils)**:
>   - `test_save_elements` now parametrizes `horizontal_padding`/`vertical_padding`, sets `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD` and `EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD`, and asserts padded image dimensions for both `extract_image_block_to_payload` paths (decoding `image_base64` or reading saved file).
>   - Adds required imports (`base64`, `io`).
> - **Versioning**:
>   - Update `unstructured/__version__.py` to `0.18.21-dev0`.
>   - Add CHANGELOG entry noting the unit test enhancement.
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit a23bf6a. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>

<!-- /CURSOR_SUMMARY -->
1 parent 7c4d0b9 commit 6c1bbb3

File tree

3 files changed

+44
-1
lines changed

3 files changed

+44
-1
lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,25 @@
1+
## 0.18.21-dev0
2+
3+
### Enhancement
4+
- Update save_elements unit test to check crop box padding behavior
5+
6+
### Features
7+
8+
### Fixes
9+
110
## 0.18.20
211

312
### Enhancement
413
- Improve the VoyageAI integration
514
- Add voyage-context-3 support
15+
16+
### Features
17+
18+
### Fixes
19+
20+
## 0.18.19
21+
22+
### Enhancement
623
- Flag extracted elements as such in the metadata for downstream use
724

825
### Features

test_unstructured/partition/pdf_image/test_pdf_image_utils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import base64
2+
import io
13
import os
24
import tempfile
35
from unittest.mock import MagicMock, patch
@@ -78,12 +80,21 @@ def test_convert_pdf_to_image_raises_error():
7880
)
7981
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
8082
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
83+
@pytest.mark.parametrize("horizontal_padding", [0, 20])
84+
@pytest.mark.parametrize("vertical_padding", [0, 10])
8185
def test_save_elements(
8286
element_category_to_save,
8387
extract_image_block_to_payload,
8488
filename,
8589
is_image,
90+
horizontal_padding,
91+
vertical_padding,
92+
monkeypatch,
8693
):
94+
if horizontal_padding > 0:
95+
monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD", str(horizontal_padding))
96+
if vertical_padding > 0:
97+
monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", str(vertical_padding))
8798
with tempfile.TemporaryDirectory() as tmpdir:
8899
elements = [
89100
Image(
@@ -136,10 +147,25 @@ def test_save_elements(
136147
if extract_image_block_to_payload:
137148
assert isinstance(el.metadata.image_base64, str)
138149
assert isinstance(el.metadata.image_mime_type, str)
150+
image_bytes = base64.b64decode(el.metadata.image_base64)
151+
image = PILImg.open(io.BytesIO(image_bytes))
152+
x1, y1 = el.metadata.coordinates.points[0]
153+
x2, y2 = el.metadata.coordinates.points[2]
154+
width = x2 - x1
155+
height = y2 - y1
156+
assert image.width == width + 2 * horizontal_padding
157+
assert image.height == height + 2 * vertical_padding
139158
assert not el.metadata.image_path
140159
assert not os.path.isfile(expected_image_path)
141160
else:
142161
assert os.path.isfile(expected_image_path)
162+
image = PILImg.open(expected_image_path)
163+
x1, y1 = el.metadata.coordinates.points[0]
164+
x2, y2 = el.metadata.coordinates.points[2]
165+
width = x2 - x1
166+
height = y2 - y1
167+
assert image.width == width + 2 * horizontal_padding
168+
assert image.height == height + 2 * vertical_padding
143169
assert el.metadata.image_path == expected_image_path
144170
assert not el.metadata.image_base64
145171
assert not el.metadata.image_mime_type

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.20" # pragma: no cover
1+
__version__ = "0.18.21-dev0" # pragma: no cover

0 commit comments

Comments
 (0)