Add password support for PDF files (#392)

pprados · Coniferish · web-flow · commit cd9ea8b846cd · 2025-01-30T14:37:59.000-05:00
Add password support for PDF files, so that password-protected PDFs can be processed. Must be combined with [PR 3721 in unstructured](Unstructured-IO/unstructured#3721). --------- Co-authored-by: John J <43506685+Coniferish@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.8.7
+
+* fix: add `password` for PDF
+
 ## 0.8.6
 
 * feat: add back `source` to `TextRegions` and `LayoutElements` for backward compatibility
diff --git a/sample-docs/password.pdf b/sample-docs/password.pdf
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -302,6 +302,21 @@ def mock_get_elements(self, *args, **kwargs):
             assert page.image is None
 
 
+@pytest.mark.slow()
+def test_from_file_with_password(monkeypatch, mock_final_layout):
+
+    doc = layout.DocumentLayout.from_file("sample-docs/password.pdf", password="password")
+    assert doc
+
+    monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout))
+    with patch(
+        "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel",
+        MockLayoutModel,
+    ), open("sample-docs/password.pdf", mode="rb") as fp:
+        doc = layout.process_data_with_model(fp, model_name="fake", password="password")
+        assert doc
+
+
 def test_from_image_file_raises_with_empty_fn():
     with pytest.raises(FileNotFoundError):
         layout.DocumentLayout.from_image_file("")
@@ -544,6 +559,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
             detection_model=detection_model,
             element_extraction_model=element_extraction_model,
             fixed_layouts=None,
+            password=None,
             pdf_image_dpi=200,
         )
 
diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py
@@ -11,7 +11,10 @@
 
 import unstructured_inference.models.table_postprocess as postprocess
 from unstructured_inference.models import tables
-from unstructured_inference.models.tables import apply_thresholds_on_objects, structure_to_cells
+from unstructured_inference.models.tables import (
+    apply_thresholds_on_objects,
+    structure_to_cells,
+)
 
 skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
 
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.8.6"  # pragma: no cover
+__version__ = "0.8.7"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -51,6 +51,7 @@ def from_file(
         filename: str,
         fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
         pdf_image_dpi: int = 200,
+        password: Optional[str] = None,
         **kwargs,
     ) -> DocumentLayout:
         """Creates a DocumentLayout from a pdf file."""
@@ -62,6 +63,7 @@ def from_file(
                 pdf_image_dpi,
                 output_folder=temp_dir,
                 path_only=True,
+                password=password,
             )
             image_paths = cast(List[str], _image_paths)
             number_of_pages = len(image_paths)
@@ -133,6 +135,7 @@ def __init__(
         document_filename: Optional[Union[str, PurePath]] = None,
         detection_model: Optional[UnstructuredObjectDetectionModel] = None,
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
+        password: Optional[str] = None,
     ):
         if detection_model is not None and element_extraction_model is not None:
             raise ValueError("Only one of detection_model and extraction_model should be passed.")
@@ -148,6 +151,7 @@ def __init__(
         self.element_extraction_model = element_extraction_model
         self.elements: Collection[LayoutElement] = []
         self.elements_array: LayoutElements | None = None
+        self.password = password
         # NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
         # locations now and if we need to support LayoutElements without bounding boxes we can make
         # the bbox property optional
@@ -325,6 +329,7 @@ def from_image(
 def process_data_with_model(
     data: BinaryIO,
     model_name: Optional[str],
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> DocumentLayout:
     """Process PDF as file-like object `data` into a `DocumentLayout`.
@@ -339,6 +344,7 @@ def process_data_with_model(
         layout = process_file_with_model(
             file_path,
             model_name,
+            password=password,
             **kwargs,
         )
 
@@ -351,6 +357,7 @@ def process_file_with_model(
     is_image: bool = False,
     fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
     pdf_image_dpi: int = 200,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> DocumentLayout:
     """Processes pdf file with name filename into a DocumentLayout by using a model identified by
@@ -379,6 +386,7 @@ def process_file_with_model(
             element_extraction_model=element_extraction_model,
             fixed_layouts=fixed_layouts,
             pdf_image_dpi=pdf_image_dpi,
+            password=password,
             **kwargs,
         )
     )
@@ -390,6 +398,7 @@ def convert_pdf_to_image(
     dpi: int = 200,
     output_folder: Optional[Union[str, PurePath]] = None,
     path_only: bool = False,
+    password: Optional[str] = None,
 ) -> Union[List[Image.Image], List[str]]:
     """Get the image renderings of the pdf pages using pdf2image"""
 
@@ -402,12 +411,14 @@ def convert_pdf_to_image(
             dpi=dpi,
             output_folder=output_folder,
             paths_only=path_only,
+            userpw=password or "",
         )
     else:
         images = pdf2image.convert_from_path(
             filename,
             dpi=dpi,
             paths_only=path_only,
+            userpw=password or "",
         )
 
     return images

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.8.6" # pragma: no cover`
	`1`	`+__version__ = "0.8.7" # pragma: no cover`