Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## 0.18.19-dev0
## 0.18.19-dev1

### Enhancement
- Flag extracted elements as such in the metadata for downstream use
- Update save_elements unit test to check crop box padding behavior

### Features

Expand Down
26 changes: 26 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf_image_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import base64
import io
import os
import tempfile
from unittest.mock import MagicMock, patch
Expand Down Expand Up @@ -78,12 +80,21 @@ def test_convert_pdf_to_image_raises_error():
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
@pytest.mark.parametrize("horizontal_padding", [0, 20])
@pytest.mark.parametrize("vertical_padding", [0, 10])
def test_save_elements(
element_category_to_save,
extract_image_block_to_payload,
filename,
is_image,
horizontal_padding,
vertical_padding,
monkeypatch,
):
if horizontal_padding > 0:
monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD", str(horizontal_padding))
if vertical_padding > 0:
monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", str(vertical_padding))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Env Var Leakage Breaks Test Isolation

The environment variables are set only when the corresponding padding value is greater than 0. If EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD or EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD already exists in the OS environment, test cases with padding=0 will pick up that pre-existing value instead of 0, causing the image width/height assertions to fail. The variables need to be set unconditionally (for example, calling monkeypatch.setenv with str(padding) even when the padding is 0) to ensure proper test isolation.

Fix in Cursor Fix in Web

with tempfile.TemporaryDirectory() as tmpdir:
elements = [
Image(
Expand Down Expand Up @@ -136,10 +147,25 @@ def test_save_elements(
if extract_image_block_to_payload:
assert isinstance(el.metadata.image_base64, str)
assert isinstance(el.metadata.image_mime_type, str)
image_bytes = base64.b64decode(el.metadata.image_base64)
image = PILImg.open(io.BytesIO(image_bytes))
x1, y1 = el.metadata.coordinates.points[0]
x2, y2 = el.metadata.coordinates.points[2]
width = x2 - x1
height = y2 - y1
assert image.width == width + 2 * horizontal_padding
assert image.height == height + 2 * vertical_padding
assert not el.metadata.image_path
assert not os.path.isfile(expected_image_path)
else:
assert os.path.isfile(expected_image_path)
image = PILImg.open(expected_image_path)
x1, y1 = el.metadata.coordinates.points[0]
x2, y2 = el.metadata.coordinates.points[2]
width = x2 - x1
height = y2 - y1
assert image.width == width + 2 * horizontal_padding
assert image.height == height + 2 * vertical_padding
assert el.metadata.image_path == expected_image_path
assert not el.metadata.image_base64
assert not el.metadata.image_mime_type
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.18.19-dev0" # pragma: no cover
__version__ = "0.18.19-dev1" # pragma: no cover