Skip to content

Commit b38338d

Browse files
pprados and Coniferish authored
Add password with PDF files (Unstructured-IO#3721)
Add password support for PDF files. Must be combined with [PR 392 in unstructured-inference](Unstructured-IO/unstructured-inference#392). --------- Co-authored-by: John J <[email protected]>
1 parent 4995fc6 commit b38338d

File tree

12 files changed

+96
-15
lines changed

12 files changed

+96
-15
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.16.21-dev3
1+
## 0.16.21-dev4
22

33
### Enhancements
4+
- **Use password to load PDF**. Adds a `password` argument so that password-protected PDF files can be partitioned with all strategies
45

56
- **use vectorized logic to merge inferred and extracted layouts**. Uses the new `LayoutElements` data structure and the numpy library to refactor the layout merging logic, improving compute performance and making the logic clearer
67

example-docs/pdf/password.pdf

13.8 KB
Binary file not shown.

requirements/extra-pdf-image.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@ google-cloud-vision
1111
effdet
1212
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
1313
# when unstructured library is.
14-
unstructured-inference>=0.8.6
14+
unstructured-inference>=0.8.7
1515
unstructured.pytesseract>=0.3.12

requirements/extra-pdf-image.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ typing-extensions==4.12.2
263263
# torch
264264
tzdata==2025.1
265265
# via pandas
266-
unstructured-inference==0.8.6
266+
unstructured-inference==0.8.7
267267
# via -r ./extra-pdf-image.in
268268
unstructured-pytesseract==0.3.13
269269
# via -r ./extra-pdf-image.in

test_unstructured/partition/pdf_image/test_pdf.py

+41-1
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ def _test(result):
262262
strategy=strategy,
263263
starting_page_number=starting_page_number,
264264
)
265-
_test(result)
265+
_test(result)
266266

267267

268268
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
@@ -1545,3 +1545,43 @@ def test_document_to_element_list_sets_category_depth_titles():
15451545
assert elements[1].metadata.category_depth == 2
15461546
assert elements[2].metadata.category_depth is None
15471547
assert elements[3].metadata.category_depth == 0
1548+
1549+
1550+
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
1551+
@pytest.mark.parametrize(
1552+
"strategy",
1553+
# fast: can't capture the "intentionally left blank page" page
1554+
# others: will ignore the actual blank page
1555+
[
1556+
PartitionStrategy.FAST,
1557+
PartitionStrategy.HI_RES,
1558+
PartitionStrategy.OCR_ONLY,
1559+
],
1560+
)
1561+
def test_partition_pdf_with_password(
1562+
file_mode,
1563+
strategy,
1564+
filename=example_doc_path("pdf/password.pdf"),
1565+
):
1566+
# Test that partition_pdf can open a password-protected PDF passed as a filename, an open file, or a spooled temporary file
1567+
def _test(result):
1568+
# validate that the result is exactly one element carrying the expected text
1569+
assert len(result) == 1
1570+
assert result[0].text == "File with password"
1571+
1572+
if file_mode == "filename":
1573+
result = pdf.partition_pdf(filename=filename, strategy=strategy, password="password")
1574+
_test(result)
1575+
elif file_mode == "rb":
1576+
with open(filename, "rb") as f:
1577+
result = pdf.partition_pdf(file=f, strategy=strategy, password="password")
1578+
_test(result)
1579+
else:
1580+
with open(filename, "rb") as test_file:
1581+
with SpooledTemporaryFile() as spooled_temp_file:
1582+
spooled_temp_file.write(test_file.read())
1583+
spooled_temp_file.seek(0)
1584+
result = pdf.partition_pdf(
1585+
file=spooled_temp_file, strategy=strategy, password="password"
1586+
)
1587+
_test(result)

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.21-dev3" # pragma: no cover
1+
__version__ = "0.16.21-dev4" # pragma: no cover

unstructured/partition/image.py

+4
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def partition_image(
3232
starting_page_number: int = 1,
3333
extract_forms: bool = False,
3434
form_extraction_skip_tables: bool = True,
35+
password: Optional[str] = None,
3536
**kwargs: Any,
3637
) -> list[Element]:
3738
"""Parses an image into a list of interpreted elements.
@@ -91,6 +92,8 @@ def partition_image(
9192
(results in adding FormKeysValues elements to output).
9293
form_extraction_skip_tables
9394
Whether the form extraction logic should ignore regions designated as Tables.
95+
password
96+
The password to decrypt the PDF file.
9497
"""
9598
exactly_one(filename=filename, file=file)
9699

@@ -113,5 +116,6 @@ def partition_image(
113116
starting_page_number=starting_page_number,
114117
extract_forms=extract_forms,
115118
form_extraction_skip_tables=form_extraction_skip_tables,
119+
password=password,
116120
**kwargs,
117121
)

unstructured/partition/pdf.py

+23-4
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ def partition_pdf(
144144
starting_page_number: int = 1,
145145
extract_forms: bool = False,
146146
form_extraction_skip_tables: bool = True,
147+
password: Optional[str] = None,
147148
**kwargs: Any,
148149
) -> list[Element]:
149150
"""Parses a pdf document into a list of interpreted elements.
@@ -224,6 +225,7 @@ def partition_pdf(
224225
starting_page_number=starting_page_number,
225226
extract_forms=extract_forms,
226227
form_extraction_skip_tables=form_extraction_skip_tables,
228+
password=password,
227229
**kwargs,
228230
)
229231

@@ -245,6 +247,7 @@ def partition_pdf_or_image(
245247
starting_page_number: int = 1,
246248
extract_forms: bool = False,
247249
form_extraction_skip_tables: bool = True,
250+
password: Optional[str] = None,
248251
**kwargs: Any,
249252
) -> list[Element]:
250253
"""Parses a pdf or image document into a list of interpreted elements."""
@@ -273,6 +276,7 @@ def partition_pdf_or_image(
273276
languages=languages,
274277
metadata_last_modified=metadata_last_modified or last_modified,
275278
starting_page_number=starting_page_number,
279+
password=password,
276280
**kwargs,
277281
)
278282
pdf_text_extractable = any(
@@ -322,6 +326,7 @@ def partition_pdf_or_image(
322326
starting_page_number=starting_page_number,
323327
extract_forms=extract_forms,
324328
form_extraction_skip_tables=form_extraction_skip_tables,
329+
password=password,
325330
**kwargs,
326331
)
327332
out_elements = _process_uncategorized_text_elements(elements)
@@ -347,6 +352,7 @@ def partition_pdf_or_image(
347352
is_image=is_image,
348353
metadata_last_modified=metadata_last_modified or last_modified,
349354
starting_page_number=starting_page_number,
355+
password=password,
350356
**kwargs,
351357
)
352358
out_elements = _process_uncategorized_text_elements(elements)
@@ -360,6 +366,7 @@ def extractable_elements(
360366
languages: Optional[list[str]] = None,
361367
metadata_last_modified: Optional[str] = None,
362368
starting_page_number: int = 1,
369+
password: Optional[str] = None,
363370
**kwargs: Any,
364371
) -> list[list[Element]]:
365372
if isinstance(file, bytes):
@@ -370,6 +377,7 @@ def extractable_elements(
370377
languages=languages,
371378
metadata_last_modified=metadata_last_modified,
372379
starting_page_number=starting_page_number,
380+
password=password,
373381
**kwargs,
374382
)
375383

@@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer(
380388
languages: list[str],
381389
metadata_last_modified: Optional[str],
382390
starting_page_number: int = 1,
391+
password: Optional[str] = None,
383392
**kwargs: Any,
384393
) -> list[list[Element]]:
385394
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer(
403412
languages=languages,
404413
metadata_last_modified=metadata_last_modified,
405414
starting_page_number=starting_page_number,
415+
password=password,
406416
**kwargs,
407417
)
408418

@@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer(
413423
languages=languages,
414424
metadata_last_modified=metadata_last_modified,
415425
starting_page_number=starting_page_number,
426+
password=password,
416427
**kwargs,
417428
)
418429

@@ -427,14 +438,16 @@ def _process_pdfminer_pages(
427438
metadata_last_modified: Optional[str],
428439
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
429440
starting_page_number: int = 1,
441+
password: Optional[str] = None,
430442
**kwargs,
431443
) -> list[list[Element]]:
432444
"""Uses PDFMiner to split a document into pages and process them."""
433445

434446
elements = []
435447

436448
for page_number, (page, page_layout) in enumerate(
437-
open_pdfminer_pages_generator(fp), start=starting_page_number
449+
open_pdfminer_pages_generator(fp, password=password),
450+
start=starting_page_number,
438451
):
439452
width, height = page_layout.width, page_layout.height
440453

@@ -556,6 +569,7 @@ def _partition_pdf_or_image_local(
556569
extract_forms: bool = False,
557570
form_extraction_skip_tables: bool = True,
558571
pdf_hi_res_max_pages: Optional[int] = None,
572+
password: Optional[str] = None,
559573
**kwargs: Any,
560574
) -> list[Element]:
561575
"""Partition using package installed locally"""
@@ -592,10 +606,11 @@ def _partition_pdf_or_image_local(
592606
is_image=is_image,
593607
model_name=hi_res_model_name,
594608
pdf_image_dpi=pdf_image_dpi,
609+
password=password,
595610
)
596611

597612
extracted_layout, layouts_links = (
598-
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
613+
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password)
599614
if pdf_text_extractable
600615
else ([], [])
601616
)
@@ -635,20 +650,22 @@ def _partition_pdf_or_image_local(
635650
ocr_mode=ocr_mode,
636651
pdf_image_dpi=pdf_image_dpi,
637652
ocr_layout_dumper=ocr_layout_dumper,
653+
password=password,
638654
)
639655
else:
640656
inferred_document_layout = process_data_with_model(
641657
file,
642658
is_image=is_image,
643659
model_name=hi_res_model_name,
644660
pdf_image_dpi=pdf_image_dpi,
661+
password=password,
645662
)
646663

647664
if hasattr(file, "seek"):
648665
file.seek(0)
649666

650667
extracted_layout, layouts_links = (
651-
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
668+
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
652669
if pdf_text_extractable
653670
else ([], [])
654671
)
@@ -690,6 +707,7 @@ def _partition_pdf_or_image_local(
690707
ocr_mode=ocr_mode,
691708
pdf_image_dpi=pdf_image_dpi,
692709
ocr_layout_dumper=ocr_layout_dumper,
710+
password=password,
693711
)
694712

695713
# vectorization of the data structure ends here
@@ -837,6 +855,7 @@ def _partition_pdf_or_image_with_ocr(
837855
is_image: bool = False,
838856
metadata_last_modified: Optional[str] = None,
839857
starting_page_number: int = 1,
858+
password: Optional[str] = None,
840859
**kwargs: Any,
841860
):
842861
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
@@ -861,7 +880,7 @@ def _partition_pdf_or_image_with_ocr(
861880
elements.extend(page_elements)
862881
else:
863882
for page_number, image in enumerate(
864-
convert_pdf_to_images(filename, file), start=starting_page_number
883+
convert_pdf_to_images(filename, file, password=password), start=starting_page_number
865884
):
866885
page_elements = _partition_pdf_or_image_with_ocr_from_image(
867886
image=image,

unstructured/partition/pdf_image/ocr.py

+4
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def process_data_with_ocr(
4242
ocr_mode: str = OCRMode.FULL_PAGE.value,
4343
pdf_image_dpi: int = 200,
4444
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
45+
password: Optional[str] = None,
4546
) -> "DocumentLayout":
4647
"""
4748
Process OCR data from a given data and supplement the output DocumentLayout
@@ -89,6 +90,7 @@ def process_data_with_ocr(
8990
ocr_mode=ocr_mode,
9091
pdf_image_dpi=pdf_image_dpi,
9192
ocr_layout_dumper=ocr_layout_dumper,
93+
password=password,
9294
)
9395

9496
return merged_layouts
@@ -105,6 +107,7 @@ def process_file_with_ocr(
105107
ocr_mode: str = OCRMode.FULL_PAGE.value,
106108
pdf_image_dpi: int = 200,
107109
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
110+
password: Optional[str] = None,
108111
) -> "DocumentLayout":
109112
"""
110113
Process OCR data from a given file and supplement the output DocumentLayout
@@ -165,6 +168,7 @@ def process_file_with_ocr(
165168
dpi=pdf_image_dpi,
166169
output_folder=temp_dir,
167170
paths_only=True,
171+
userpw=password or "",
168172
)
169173
image_paths = cast(List[str], _image_paths)
170174
for i, image_path in enumerate(image_paths):

unstructured/partition/pdf_image/pdf_image_utils.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def convert_pdf_to_image(
5858
dpi: int = 200,
5959
output_folder: Optional[Union[str, PurePath]] = None,
6060
path_only: bool = False,
61+
password: Optional[str] = None,
6162
) -> Union[List[Image.Image], List[str]]:
6263
"""Get the image renderings of the pdf pages using pdf2image"""
6364

@@ -71,6 +72,7 @@ def convert_pdf_to_image(
7172
dpi=dpi,
7273
output_folder=output_folder,
7374
paths_only=path_only,
75+
userpw=password,
7476
)
7577
else:
7678
images = pdf2image.convert_from_path(
@@ -125,6 +127,7 @@ def save_elements(
125127
is_image: bool = False,
126128
extract_image_block_to_payload: bool = False,
127129
output_dir_path: str | None = None,
130+
password: Optional[str] = None,
128131
):
129132
"""
130133
Saves specific elements from a PDF as images either to a directory or embeds them in the
@@ -167,6 +170,7 @@ def save_elements(
167170
pdf_image_dpi,
168171
output_folder=temp_dir,
169172
path_only=True,
173+
password=password,
170174
)
171175
image_paths = cast(List[str], _image_paths)
172176

@@ -389,15 +393,16 @@ def convert_pdf_to_images(
389393
filename: str = "",
390394
file: Optional[bytes | IO[bytes]] = None,
391395
chunk_size: int = 10,
396+
password: Optional[str] = None,
392397
) -> Iterator[Image.Image]:
393398
# Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on)
394399
exactly_one(filename=filename, file=file)
395400
if file is not None:
396401
f_bytes = convert_to_bytes(file)
397-
info = pdf2image.pdfinfo_from_bytes(f_bytes)
402+
info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password)
398403
else:
399404
f_bytes = None
400-
info = pdf2image.pdfinfo_from_path(filename)
405+
info = pdf2image.pdfinfo_from_path(filename, userpw=password)
401406

402407
total_pages = info["Pages"]
403408
for start_page in range(1, total_pages + 1, chunk_size):
@@ -407,12 +412,14 @@ def convert_pdf_to_images(
407412
f_bytes,
408413
first_page=start_page,
409414
last_page=end_page,
415+
userpw=password,
410416
)
411417
else:
412418
chunk_images = pdf2image.convert_from_path(
413419
filename,
414420
first_page=start_page,
415421
last_page=end_page,
422+
userpw=password,
416423
)
417424

418425
for image in chunk_images:

0 commit comments

Comments
 (0)