Skip to content

Commit 734240b

Browse files
committed
Add password for pdf
1 parent 451ad97 commit 734240b

File tree

10 files changed

+140
-60
lines changed

10 files changed

+140
-60
lines changed

CHANGELOG.md

+34-28
Large diffs are not rendered by default.

example-docs/pdf/password.pdf

13.8 KB
Binary file not shown.

test_unstructured/partition/pdf_image/test_pdf.py

+55-22
Original file line numberDiff line numberDiff line change
@@ -208,48 +208,34 @@ def test_partition_pdf_local_raises_with_no_filename():
208208

209209
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
210210
@pytest.mark.parametrize(
211-
("strategy", "starting_page_number", "expected_page_numbers", "origin"),
211+
"strategy",
212212
# fast: can't capture the "intentionally left blank page" page
213213
# others: will ignore the actual blank page
214214
[
215-
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
216-
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
217-
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer", "ocr_tesseract"}),
218-
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
215+
PartitionStrategy.FAST,
216+
PartitionStrategy.HI_RES,
217+
PartitionStrategy.OCR_ONLY,
219218
],
220219
)
221220
def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
222221
file_mode,
223222
strategy,
224-
starting_page_number,
225-
expected_page_numbers,
226-
origin,
227223
filename=example_doc_path("pdf/layout-parser-paper-with-empty-pages.pdf"),
228224
):
229225
# Test that the partition_pdf function can handle filename
230226
def _test(result):
231227
# validate that the result is a non-empty list of dicts
232228
assert len(result) > 10
233-
# check that the pdf has multiple different page numbers
234-
assert {element.metadata.page_number for element in result} == expected_page_numbers
235-
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
236-
print(
237-
[
238-
(element.metadata.detection_origin, element.category, element.text)
239-
for element in result
240-
]
241-
)
242-
assert {element.metadata.detection_origin for element in result} == origin
243229

244230
if file_mode == "filename":
245231
result = pdf.partition_pdf(
246-
filename=filename, strategy=strategy, starting_page_number=starting_page_number
232+
filename=filename, strategy=strategy,
247233
)
248234
_test(result)
249235
elif file_mode == "rb":
250236
with open(filename, "rb") as f:
251237
result = pdf.partition_pdf(
252-
file=f, strategy=strategy, starting_page_number=starting_page_number
238+
file=f, strategy=strategy,
253239
)
254240
_test(result)
255241
else:
@@ -260,9 +246,8 @@ def _test(result):
260246
result = pdf.partition_pdf(
261247
file=spooled_temp_file,
262248
strategy=strategy,
263-
starting_page_number=starting_page_number,
264249
)
265-
_test(result)
250+
_test(result)
266251

267252

268253
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
@@ -1545,3 +1530,51 @@ def test_document_to_element_list_sets_category_depth_titles():
15451530
assert elements[1].metadata.category_depth == 2
15461531
assert elements[2].metadata.category_depth is None
15471532
assert elements[3].metadata.category_depth == 0
1533+
1534+
1535+
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
1536+
@pytest.mark.parametrize(
1537+
"strategy",
1538+
# fast: can't capture the "intentionally left blank page" page
1539+
# others: will ignore the actual blank page
1540+
[
1541+
PartitionStrategy.FAST,
1542+
PartitionStrategy.HI_RES,
1543+
PartitionStrategy.OCR_ONLY,
1544+
],
1545+
)
1546+
def test_partition_pdf_with_password(
1547+
file_mode,
1548+
strategy,
1549+
filename=example_doc_path("pdf/password.pdf"),
1550+
):
1551+
# Test that the partition_pdf function can handle filename
1552+
def _test(result):
1553+
# validate that the result is a non-empty list of dicts
1554+
assert len(result) == 1
1555+
assert result[0].text == 'File with password'
1556+
1557+
if file_mode == "filename":
1558+
result = pdf.partition_pdf(
1559+
filename=filename, strategy=strategy,
1560+
password="password"
1561+
)
1562+
_test(result)
1563+
elif file_mode == "rb":
1564+
with open(filename, "rb") as f:
1565+
result = pdf.partition_pdf(
1566+
file=f, strategy=strategy,
1567+
password="password"
1568+
)
1569+
_test(result)
1570+
else:
1571+
with open(filename, "rb") as test_file:
1572+
with SpooledTemporaryFile() as spooled_temp_file:
1573+
spooled_temp_file.write(test_file.read())
1574+
spooled_temp_file.seek(0)
1575+
result = pdf.partition_pdf(
1576+
file=spooled_temp_file,
1577+
strategy=strategy,
1578+
password="password"
1579+
)
1580+
_test(result)

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.19-dev2" # pragma: no cover
1+
__version__ = "0.16.19-dev3" # pragma: no cover

unstructured/partition/image.py

+4
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def partition_image(
3232
starting_page_number: int = 1,
3333
extract_forms: bool = False,
3434
form_extraction_skip_tables: bool = True,
35+
password: Optional[str] = None,
3536
**kwargs: Any,
3637
) -> list[Element]:
3738
"""Parses an image into a list of interpreted elements.
@@ -91,6 +92,8 @@ def partition_image(
9192
(results in adding FormKeysValues elements to output).
9293
form_extraction_skip_tables
9394
Whether the form extraction logic should ignore regions designated as Tables.
95+
password
96+
The password to decrypt the PDF file.
9497
"""
9598
exactly_one(filename=filename, file=file)
9699

@@ -113,5 +116,6 @@ def partition_image(
113116
starting_page_number=starting_page_number,
114117
extract_forms=extract_forms,
115118
form_extraction_skip_tables=form_extraction_skip_tables,
119+
password=password,
116120
**kwargs,
117121
)

unstructured/partition/pdf.py

+25-4
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ def partition_pdf(
144144
starting_page_number: int = 1,
145145
extract_forms: bool = False,
146146
form_extraction_skip_tables: bool = True,
147+
password: Optional[str] = None,
147148
**kwargs: Any,
148149
) -> list[Element]:
149150
"""Parses a pdf document into a list of interpreted elements.
@@ -224,6 +225,7 @@ def partition_pdf(
224225
starting_page_number=starting_page_number,
225226
extract_forms=extract_forms,
226227
form_extraction_skip_tables=form_extraction_skip_tables,
228+
password=password,
227229
**kwargs,
228230
)
229231

@@ -245,6 +247,7 @@ def partition_pdf_or_image(
245247
starting_page_number: int = 1,
246248
extract_forms: bool = False,
247249
form_extraction_skip_tables: bool = True,
250+
password: Optional[str] = None,
248251
**kwargs: Any,
249252
) -> list[Element]:
250253
"""Parses a pdf or image document into a list of interpreted elements."""
@@ -273,6 +276,7 @@ def partition_pdf_or_image(
273276
languages=languages,
274277
metadata_last_modified=metadata_last_modified or last_modified,
275278
starting_page_number=starting_page_number,
279+
password=password,
276280
**kwargs,
277281
)
278282
pdf_text_extractable = any(
@@ -322,6 +326,7 @@ def partition_pdf_or_image(
322326
starting_page_number=starting_page_number,
323327
extract_forms=extract_forms,
324328
form_extraction_skip_tables=form_extraction_skip_tables,
329+
password=password,
325330
**kwargs,
326331
)
327332
out_elements = _process_uncategorized_text_elements(elements)
@@ -347,6 +352,7 @@ def partition_pdf_or_image(
347352
is_image=is_image,
348353
metadata_last_modified=metadata_last_modified or last_modified,
349354
starting_page_number=starting_page_number,
355+
password=password,
350356
**kwargs,
351357
)
352358
out_elements = _process_uncategorized_text_elements(elements)
@@ -360,6 +366,7 @@ def extractable_elements(
360366
languages: Optional[list[str]] = None,
361367
metadata_last_modified: Optional[str] = None,
362368
starting_page_number: int = 1,
369+
password:Optional[str] = None,
363370
**kwargs: Any,
364371
) -> list[list[Element]]:
365372
if isinstance(file, bytes):
@@ -370,6 +377,7 @@ def extractable_elements(
370377
languages=languages,
371378
metadata_last_modified=metadata_last_modified,
372379
starting_page_number=starting_page_number,
380+
password=password,
373381
**kwargs,
374382
)
375383

@@ -380,6 +388,7 @@ def _partition_pdf_with_pdfminer(
380388
languages: list[str],
381389
metadata_last_modified: Optional[str],
382390
starting_page_number: int = 1,
391+
password:Optional[str] = None,
383392
**kwargs: Any,
384393
) -> list[list[Element]]:
385394
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@@ -403,6 +412,7 @@ def _partition_pdf_with_pdfminer(
403412
languages=languages,
404413
metadata_last_modified=metadata_last_modified,
405414
starting_page_number=starting_page_number,
415+
password=password,
406416
**kwargs,
407417
)
408418

@@ -413,6 +423,7 @@ def _partition_pdf_with_pdfminer(
413423
languages=languages,
414424
metadata_last_modified=metadata_last_modified,
415425
starting_page_number=starting_page_number,
426+
password=password,
416427
**kwargs,
417428
)
418429

@@ -427,14 +438,16 @@ def _process_pdfminer_pages(
427438
metadata_last_modified: Optional[str],
428439
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
429440
starting_page_number: int = 1,
441+
password: Optional[str] = None,
430442
**kwargs,
431443
) -> list[list[Element]]:
432444
"""Uses PDFMiner to split a document into pages and process them."""
433445

434446
elements = []
435447

436448
for page_number, (page, page_layout) in enumerate(
437-
open_pdfminer_pages_generator(fp), start=starting_page_number
449+
open_pdfminer_pages_generator(fp, password=password),
450+
start=starting_page_number,
438451
):
439452
width, height = page_layout.width, page_layout.height
440453

@@ -556,6 +569,7 @@ def _partition_pdf_or_image_local(
556569
extract_forms: bool = False,
557570
form_extraction_skip_tables: bool = True,
558571
pdf_hi_res_max_pages: Optional[int] = None,
572+
password:Optional[str] = None,
559573
**kwargs: Any,
560574
) -> list[Element]:
561575
"""Partition using package installed locally"""
@@ -592,10 +606,12 @@ def _partition_pdf_or_image_local(
592606
is_image=is_image,
593607
model_name=hi_res_model_name,
594608
pdf_image_dpi=pdf_image_dpi,
609+
password=password,
595610
)
596611

597612
extracted_layout, layouts_links = (
598-
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
613+
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi,
614+
password=password)
599615
if pdf_text_extractable
600616
else ([], [])
601617
)
@@ -635,20 +651,22 @@ def _partition_pdf_or_image_local(
635651
ocr_mode=ocr_mode,
636652
pdf_image_dpi=pdf_image_dpi,
637653
ocr_layout_dumper=ocr_layout_dumper,
654+
password=password,
638655
)
639656
else:
640657
inferred_document_layout = process_data_with_model(
641658
file,
642659
is_image=is_image,
643660
model_name=hi_res_model_name,
644661
pdf_image_dpi=pdf_image_dpi,
662+
password=password,
645663
)
646664

647665
if hasattr(file, "seek"):
648666
file.seek(0)
649667

650668
extracted_layout, layouts_links = (
651-
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
669+
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
652670
if pdf_text_extractable
653671
else ([], [])
654672
)
@@ -690,6 +708,7 @@ def _partition_pdf_or_image_local(
690708
ocr_mode=ocr_mode,
691709
pdf_image_dpi=pdf_image_dpi,
692710
ocr_layout_dumper=ocr_layout_dumper,
711+
password=password,
693712
)
694713

695714
# vectorization of the data structure ends here
@@ -837,6 +856,7 @@ def _partition_pdf_or_image_with_ocr(
837856
is_image: bool = False,
838857
metadata_last_modified: Optional[str] = None,
839858
starting_page_number: int = 1,
859+
password: Optional[str] = None,
840860
**kwargs: Any,
841861
):
842862
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
@@ -861,7 +881,8 @@ def _partition_pdf_or_image_with_ocr(
861881
elements.extend(page_elements)
862882
else:
863883
for page_number, image in enumerate(
864-
convert_pdf_to_images(filename, file), start=starting_page_number
884+
convert_pdf_to_images(filename, file, password=password),
885+
start=starting_page_number
865886
):
866887
page_elements = _partition_pdf_or_image_with_ocr_from_image(
867888
image=image,

unstructured/partition/pdf_image/ocr.py

+4
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def process_data_with_ocr(
4242
ocr_mode: str = OCRMode.FULL_PAGE.value,
4343
pdf_image_dpi: int = 200,
4444
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
45+
password:Optional[str] = None,
4546
) -> "DocumentLayout":
4647
"""
4748
Process OCR data from a given data and supplement the output DocumentLayout
@@ -89,6 +90,7 @@ def process_data_with_ocr(
8990
ocr_mode=ocr_mode,
9091
pdf_image_dpi=pdf_image_dpi,
9192
ocr_layout_dumper=ocr_layout_dumper,
93+
password=password,
9294
)
9395

9496
return merged_layouts
@@ -105,6 +107,7 @@ def process_file_with_ocr(
105107
ocr_mode: str = OCRMode.FULL_PAGE.value,
106108
pdf_image_dpi: int = 200,
107109
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
110+
password:Optional[str] = None,
108111
) -> "DocumentLayout":
109112
"""
110113
Process OCR data from a given file and supplement the output DocumentLayout
@@ -165,6 +168,7 @@ def process_file_with_ocr(
165168
dpi=pdf_image_dpi,
166169
output_folder=temp_dir,
167170
paths_only=True,
171+
userpw=password or ""
168172
)
169173
image_paths = cast(List[str], _image_paths)
170174
for i, image_path in enumerate(image_paths):

0 commit comments

Comments
 (0)