Skip to content

Commit 0b73978

Browse files
awalker4MthwRobinsonchristinestraub
authored
fix: fix IndexError when partitioning a pdf with starting_page_number (#3246)
The Issue: When extracting images from PDFs, we use the metadata page number to index into a list of the images. However, the metadata page number can now be changed via `starting_page_number`. To get the true page index, we need to subtract this value.

Testing: Run this snippet in a Python shell. Before the fix, this throws an IndexError. On this branch, it will return the elements.

```
from unstructured.partition.auto import partition

filename = "example-docs/layout-parser-paper-with-table.pdf"
partition(filename, strategy="hi_res", extract_image_block_types=["Image", "Table"], starting_page_number=20)
```

---------

Co-authored-by: Matt Robinson <[email protected]>
Co-authored-by: christinestraub <[email protected]>
1 parent c3af03d commit 0b73978

File tree

6 files changed

+21
-6
lines changed

6 files changed

+21
-6
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.14.7-dev4
1+
## 0.14.7-dev5
22

33
### Enhancements
44

@@ -12,6 +12,7 @@
1212
### Fixes
1313

1414
* **Fix an error publishing docker images.** Update user in docker-smoke-test to reflect changes made by the amd64 image pull from the "unstructured" "wolfi-base" image.
15+
* **Fix an IndexError when partitioning a pdf with values for both `extract_image_block_types` and `starting_page_number`.**
1516

1617
## 0.14.6
1718

Diff for: test_unstructured/partition/pdf_image/test_pdf.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1223,6 +1223,8 @@ def test_partition_pdf_element_extraction(
12231223
if file_mode == "filename":
12241224
elements = pdf.partition_pdf(
12251225
filename=filename,
1226+
# Image extraction shouldn't break by setting this
1227+
starting_page_number=20,
12261228
extract_image_block_types=extract_image_block_types,
12271229
extract_image_block_to_payload=extract_image_block_to_payload,
12281230
extract_image_block_output_dir=tmpdir,
@@ -1231,11 +1233,13 @@ def test_partition_pdf_element_extraction(
12311233
with open(filename, "rb") as f:
12321234
elements = pdf.partition_pdf(
12331235
file=f,
1236+
# Image extraction shouldn't break by setting this
1237+
starting_page_number=20,
12341238
extract_image_block_types=extract_image_block_types,
12351239
extract_image_block_to_payload=extract_image_block_to_payload,
12361240
extract_image_block_output_dir=tmpdir,
12371241
)
1238-
1242+
assert elements[0].metadata.page_number == 20
12391243
assert_element_extraction(
12401244
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
12411245
)

Diff for: test_unstructured/partition/pdf_image/test_pdf_image_utils.py

+2
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ def test_save_elements(
117117

118118
pdf_image_utils.save_elements(
119119
elements=elements,
120+
starting_page_number=1,
120121
element_category_to_save=element_category_to_save,
121122
pdf_image_dpi=200,
122123
filename=filename,
@@ -157,6 +158,7 @@ def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
157158
pdf_image_utils.save_elements(
158159
elements=[],
159160
element_category_to_save="",
161+
starting_page_number=1,
160162
pdf_image_dpi=200,
161163
filename="dummy.pdf",
162164
output_dir_path=None,

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.7-dev4" # pragma: no cover
1+
__version__ = "0.14.7-dev5" # pragma: no cover

Diff for: unstructured/partition/pdf.py

+2
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,7 @@ def _partition_pdf_or_image_local(
660660
if extract_images_in_pdf:
661661
save_elements(
662662
elements=elements,
663+
starting_page_number=starting_page_number,
663664
element_category_to_save=ElementType.IMAGE,
664665
filename=filename,
665666
file=file,
@@ -675,6 +676,7 @@ def _partition_pdf_or_image_local(
675676

676677
save_elements(
677678
elements=elements,
679+
starting_page_number=starting_page_number,
678680
element_category_to_save=el_type,
679681
filename=filename,
680682
file=file,

Diff for: unstructured/partition/pdf_image/pdf_image_utils.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ def pad_bbox(
121121

122122
def save_elements(
123123
elements: List["Element"],
124+
starting_page_number: int,
124125
element_category_to_save: str,
125126
pdf_image_dpi: int,
126127
filename: str = "",
@@ -183,16 +184,21 @@ def save_elements(
183184
padded_bbox = cast(
184185
Tuple[int, int, int, int], pad_bbox((x1, y1, x2, y2), (h_padding, v_padding))
185186
)
186-
page_number = el.metadata.page_number
187+
188+
# The page number in the metadata may have been offset
189+
# by starting_page_number. Make sure we use the right
190+
# value for indexing!
191+
metadata_page_number = el.metadata.page_number
192+
page_index = metadata_page_number - starting_page_number
187193

188194
figure_number += 1
189195
try:
190196
basename = "table" if el.category == ElementType.TABLE else "figure"
191197
output_f_path = os.path.join(
192198
output_dir_path,
193-
f"{basename}-{page_number}-{figure_number}.jpg",
199+
f"{basename}-{metadata_page_number}-{figure_number}.jpg",
194200
)
195-
image_path = image_paths[page_number - 1]
201+
image_path = image_paths[page_index]
196202
image = Image.open(image_path)
197203
cropped_image = image.crop(padded_bbox)
198204
if extract_image_block_to_payload:

0 commit comments

Comments
 (0)