Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,25 @@
## 0.18.21-dev0

### Enhancement
- Update save_elements unit test to check crop box padding behavior

### Features

### Fixes

## 0.18.20

### Enhancement
- Improve the VoyageAI integration
- Add voyage-context-3 support

### Features

### Fixes

## 0.18.19

### Enhancement
- Flag extracted elements as such in the metadata for downstream use

### Features
Expand Down
26 changes: 26 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf_image_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import base64
import io
import os
import tempfile
from unittest.mock import MagicMock, patch
Expand Down Expand Up @@ -78,12 +80,21 @@ def test_convert_pdf_to_image_raises_error():
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
@pytest.mark.parametrize("horizontal_padding", [0, 20])
@pytest.mark.parametrize("vertical_padding", [0, 10])
def test_save_elements(
element_category_to_save,
extract_image_block_to_payload,
filename,
is_image,
horizontal_padding,
vertical_padding,
monkeypatch,
):
if horizontal_padding > 0:
monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD", str(horizontal_padding))
if vertical_padding > 0:
monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", str(vertical_padding))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Env Var Leakage Breaks Test Isolation

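The environment variables are set only when the corresponding padding value is greater than 0. If EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD or EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD is already defined in the OS environment, the test cases with padding=0 will pick up that pre-existing value instead of 0, and the image-size assertions will fail. To keep the test isolated, set both variables unconditionally (for example, always call monkeypatch.setenv(..., str(horizontal_padding)) and monkeypatch.setenv(..., str(vertical_padding))), or explicitly remove them with monkeypatch.delenv(..., raising=False) when the padding is 0.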

Fix in Cursor Fix in Web

with tempfile.TemporaryDirectory() as tmpdir:
elements = [
Image(
Expand Down Expand Up @@ -136,10 +147,25 @@ def test_save_elements(
if extract_image_block_to_payload:
assert isinstance(el.metadata.image_base64, str)
assert isinstance(el.metadata.image_mime_type, str)
image_bytes = base64.b64decode(el.metadata.image_base64)
image = PILImg.open(io.BytesIO(image_bytes))
x1, y1 = el.metadata.coordinates.points[0]
x2, y2 = el.metadata.coordinates.points[2]
width = x2 - x1
height = y2 - y1
assert image.width == width + 2 * horizontal_padding
assert image.height == height + 2 * vertical_padding
assert not el.metadata.image_path
assert not os.path.isfile(expected_image_path)
else:
assert os.path.isfile(expected_image_path)
image = PILImg.open(expected_image_path)
x1, y1 = el.metadata.coordinates.points[0]
x2, y2 = el.metadata.coordinates.points[2]
width = x2 - x1
height = y2 - y1
assert image.width == width + 2 * horizontal_padding
assert image.height == height + 2 * vertical_padding
assert el.metadata.image_path == expected_image_path
assert not el.metadata.image_base64
assert not el.metadata.image_mime_type
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.18.20" # pragma: no cover
__version__ = "0.18.21-dev0" # pragma: no cover
Loading