Skip to content

Commit 1158d8f

Browse files
Refactor image block extraction in pdf partitioning (#3514)
Closes [#3503](#3503).

### Summary

This PR prevents creation of the `figures` directory for saving image blocks (`Image`, `Table`) when the `extract_image_block_to_payload` parameter is set to `True`.

### Testing

```
elements = partition_image(
    filename="example-docs/img/embedded-images-tables.jpg",
    strategy="hi_res",
    extract_image_block_types=["Image", "Table"],
    extract_image_block_to_payload=True,
)
```

**Results:**

- `Main` Branch: the `figures` directory is created.
- `PR`: the `figures` directory is not created.
1 parent cbe1b35 commit 1158d8f

File tree

3 files changed

+18
-13
lines changed

3 files changed

+18
-13
lines changed

Diff for: CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
## 0.15.2-dev7
1+
## 0.15.2-dev8
22

33
### Enhancements
44

5+
* **Improve directory handling when extracting image blocks**. The `figures` directory is no longer created when the `extract_image_block_to_payload` parameter is set to `True`.
6+
57
### Features
68

79
* **Added per-class Object Detection metrics in the evaluation**. The metrics include average precision, precision, recall, and f1-score for each class in the dataset.

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.15.2-dev7" # pragma: no cover
1+
__version__ = "0.15.2-dev8" # pragma: no cover

Diff for: unstructured/partition/pdf_image/pdf_image_utils.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,15 @@ def save_elements(
139139
a specified directory or embedded into the element's payload as a base64-encoded string.
140140
"""
141141

142-
if not output_dir_path:
143-
if env_config.GLOBAL_WORKING_DIR_ENABLED:
144-
output_dir_path = str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures")
145-
else:
146-
output_dir_path = str(Path.cwd() / "figures")
147-
os.makedirs(output_dir_path, exist_ok=True)
142+
# Determine the output directory path
143+
if not extract_image_block_to_payload:
144+
output_dir_path = output_dir_path or (
145+
str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures")
146+
if env_config.GLOBAL_WORKING_DIR_ENABLED
147+
else str(Path.cwd() / "figures")
148+
)
149+
150+
os.makedirs(output_dir_path, exist_ok=True)
148151

149152
with tempfile.TemporaryDirectory() as temp_dir:
150153
if is_image:
@@ -193,11 +196,6 @@ def save_elements(
193196

194197
figure_number += 1
195198
try:
196-
basename = "table" if el.category == ElementType.TABLE else "figure"
197-
output_f_path = os.path.join(
198-
output_dir_path,
199-
f"{basename}-{metadata_page_number}-{figure_number}.jpg",
200-
)
201199
image_path = image_paths[page_index]
202200
image = Image.open(image_path)
203201
cropped_image = image.crop(padded_bbox)
@@ -209,6 +207,11 @@ def save_elements(
209207
el.metadata.image_base64 = img_base64_str
210208
el.metadata.image_mime_type = "image/jpeg"
211209
else:
210+
basename = "table" if el.category == ElementType.TABLE else "figure"
211+
output_f_path = os.path.join(
212+
output_dir_path,
213+
f"{basename}-{metadata_page_number}-{figure_number}.jpg",
214+
)
212215
write_image(cropped_image, output_f_path)
213216
# add image path to element metadata
214217
el.metadata.image_path = output_f_path

0 commit comments

Comments
 (0)