Skip to content

Commit cd9ea8b

Browse files
pprados and Coniferish authored
Add password support for PDF files (#392)
Add password support for PDF files. Must be combined with [PR 3721 in unstructured](https://github.com/Unstructured-IO/unstructured/pull/3721).

Co-authored-by: John J <[email protected]>
1 parent 85bcdc1 commit cd9ea8b

File tree

6 files changed

+36
-2
lines changed

6 files changed

+36
-2
lines changed

CHANGELOG.md

+4
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,7 @@
1+
## 0.8.7
2+
3+
* fix: add `password` for PDF
4+
15
## 0.8.6
26

37
* feat: add back `source` to `TextRegions` and `LayoutElements` for backward compatibility

sample-docs/password.pdf

13.8 KB
Binary file not shown.

test_unstructured_inference/inference/test_layout.py

+16
Original file line number | Diff line number | Diff line change
@@ -302,6 +302,21 @@ def mock_get_elements(self, *args, **kwargs):
302302
assert page.image is None
303303

304304

305+
@pytest.mark.slow()
306+
def test_from_file_with_password(monkeypatch, mock_final_layout):
307+
308+
doc = layout.DocumentLayout.from_file("sample-docs/password.pdf", password="password")
309+
assert doc
310+
311+
monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout))
312+
with patch(
313+
"unstructured_inference.inference.layout.UnstructuredObjectDetectionModel",
314+
MockLayoutModel,
315+
), open("sample-docs/password.pdf", mode="rb") as fp:
316+
doc = layout.process_data_with_model(fp, model_name="fake", password="password")
317+
assert doc
318+
319+
305320
def test_from_image_file_raises_with_empty_fn():
306321
with pytest.raises(FileNotFoundError):
307322
layout.DocumentLayout.from_image_file("")
@@ -544,6 +559,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
544559
detection_model=detection_model,
545560
element_extraction_model=element_extraction_model,
546561
fixed_layouts=None,
562+
password=None,
547563
pdf_image_dpi=200,
548564
)
549565

test_unstructured_inference/models/test_tables.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,10 @@
1111

1212
import unstructured_inference.models.table_postprocess as postprocess
1313
from unstructured_inference.models import tables
14-
from unstructured_inference.models.tables import apply_thresholds_on_objects, structure_to_cells
14+
from unstructured_inference.models.tables import (
15+
apply_thresholds_on_objects,
16+
structure_to_cells,
17+
)
1518

1619
skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
1720

unstructured_inference/__version__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.8.6" # pragma: no cover
1+
__version__ = "0.8.7" # pragma: no cover

unstructured_inference/inference/layout.py

+11
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ def from_file(
5151
filename: str,
5252
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
5353
pdf_image_dpi: int = 200,
54+
password: Optional[str] = None,
5455
**kwargs,
5556
) -> DocumentLayout:
5657
"""Creates a DocumentLayout from a pdf file."""
@@ -62,6 +63,7 @@ def from_file(
6263
pdf_image_dpi,
6364
output_folder=temp_dir,
6465
path_only=True,
66+
password=password,
6567
)
6668
image_paths = cast(List[str], _image_paths)
6769
number_of_pages = len(image_paths)
@@ -133,6 +135,7 @@ def __init__(
133135
document_filename: Optional[Union[str, PurePath]] = None,
134136
detection_model: Optional[UnstructuredObjectDetectionModel] = None,
135137
element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
138+
password: Optional[str] = None,
136139
):
137140
if detection_model is not None and element_extraction_model is not None:
138141
raise ValueError("Only one of detection_model and extraction_model should be passed.")
@@ -148,6 +151,7 @@ def __init__(
148151
self.element_extraction_model = element_extraction_model
149152
self.elements: Collection[LayoutElement] = []
150153
self.elements_array: LayoutElements | None = None
154+
self.password = password
151155
# NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
152156
# locations now and if we need to support LayoutElements without bounding boxes we can make
153157
# the bbox property optional
@@ -325,6 +329,7 @@ def from_image(
325329
def process_data_with_model(
326330
data: BinaryIO,
327331
model_name: Optional[str],
332+
password: Optional[str] = None,
328333
**kwargs: Any,
329334
) -> DocumentLayout:
330335
"""Process PDF as file-like object `data` into a `DocumentLayout`.
@@ -339,6 +344,7 @@ def process_data_with_model(
339344
layout = process_file_with_model(
340345
file_path,
341346
model_name,
347+
password=password,
342348
**kwargs,
343349
)
344350

@@ -351,6 +357,7 @@ def process_file_with_model(
351357
is_image: bool = False,
352358
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
353359
pdf_image_dpi: int = 200,
360+
password: Optional[str] = None,
354361
**kwargs: Any,
355362
) -> DocumentLayout:
356363
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
@@ -379,6 +386,7 @@ def process_file_with_model(
379386
element_extraction_model=element_extraction_model,
380387
fixed_layouts=fixed_layouts,
381388
pdf_image_dpi=pdf_image_dpi,
389+
password=password,
382390
**kwargs,
383391
)
384392
)
@@ -390,6 +398,7 @@ def convert_pdf_to_image(
390398
dpi: int = 200,
391399
output_folder: Optional[Union[str, PurePath]] = None,
392400
path_only: bool = False,
401+
password: Optional[str] = None,
393402
) -> Union[List[Image.Image], List[str]]:
394403
"""Get the image renderings of the pdf pages using pdf2image"""
395404

@@ -402,12 +411,14 @@ def convert_pdf_to_image(
402411
dpi=dpi,
403412
output_folder=output_folder,
404413
paths_only=path_only,
414+
userpw=password or "",
405415
)
406416
else:
407417
images = pdf2image.convert_from_path(
408418
filename,
409419
dpi=dpi,
410420
paths_only=path_only,
421+
userpw=password or "",
411422
)
412423

413424
return images

0 commit comments

Comments
 (0)