diff --git a/CHANGELOG.md b/CHANGELOG.md index 7eb1a95393..4544bab72b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ### Enhancements ### Features +- **Add `password` parameter to `partition_pdf`** so password-protected PDFs can be loaded with all partitioning strategies ### Fixes - **Fix an issue with multiple values for `infer_table_structure`** when paritioning email with image attachements the kwarg calls into `partition` to partition the image already contains `infer_table_structure`. Now `partition` function checks if the `kwarg` has `infer_table_structure` already diff --git a/example-docs/pdf/password.pdf b/example-docs/pdf/password.pdf new file mode 100644 index 0000000000..21bd55d500 Binary files /dev/null and b/example-docs/pdf/password.pdf differ diff --git a/test_unstructured/chunking/test_basic.py b/test_unstructured/chunking/test_basic.py index 88e01563fe..2318bfeb60 100644 --- a/test_unstructured/chunking/test_basic.py +++ b/test_unstructured/chunking/test_basic.py @@ -164,5 +164,5 @@ def it_supports_the_include_orig_elements_option( # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture() - def _chunk_elements_(self, request: FixtureRequest): + def _chunk_elements_(self, request: FixtureRequest): # noqa: PT005 return function_mock(request, "unstructured.chunking.basic._chunk_elements") diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 443b073755..050989182d 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -468,7 +468,7 @@ def it_supports_the_include_orig_elements_option( # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture() - def _chunk_by_title_(self, request: FixtureRequest): + def _chunk_by_title_(self, request: FixtureRequest): # noqa: PT005 return function_mock(request, "unstructured.chunking.title._chunk_by_title") diff --git a/test_unstructured/partition/pdf_image/test_pdf.py 
b/test_unstructured/partition/pdf_image/test_pdf.py index 200edf3e2a..a600de7f46 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1517,3 +1517,43 @@ def test_document_to_element_list_sets_category_depth_titles(): assert elements[1].metadata.category_depth == 2 assert elements[2].metadata.category_depth is None assert elements[3].metadata.category_depth == 0 + + +@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"]) +@pytest.mark.parametrize( + "strategy", + # fast: can't capture the "intentionally left blank page" page + # others: will ignore the actual blank page + [ + PartitionStrategy.FAST, + PartitionStrategy.HI_RES, + PartitionStrategy.OCR_ONLY, + ], +) +def test_partition_pdf_with_password( + file_mode: str, + strategy: str, + filename: str = example_doc_path("pdf/password.pdf"), +): + # Test that partition_pdf can open a password-protected PDF given as a filename, an open file, or a spooled temp file + def _test(result: list[Element]): + # validate that the result is exactly one element carrying the expected text + assert len(result) == 1 + assert result[0].text == "File with password" + + if file_mode == "filename": + result = pdf.partition_pdf(filename=filename, strategy=strategy, password="password") + _test(result) + elif file_mode == "rb": + with open(filename, "rb") as f: + result = pdf.partition_pdf(file=f, strategy=strategy, password="password") + _test(result) + else: + with open(filename, "rb") as test_file: + with SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + result = pdf.partition_pdf( + file=spooled_temp_file, strategy=strategy, password="password" + ) + _test(result) diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index d1d66876ed..9ecc593089 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -443,7 +443,7 @@ def 
it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_funct # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture - def _last_modified_prop_(self, request: FixtureRequest): + def _last_modified_prop_(self, request: FixtureRequest): # noqa: PT005 return property_mock(request, MsgPartitionerOptions, "_last_modified") @pytest.fixture diff --git a/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py b/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py index 28623372a2..cb1d471b45 100644 --- a/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py +++ b/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py @@ -99,7 +99,7 @@ def get_instance_(self, request: FixtureRequest): return method_mock(request, OCRAgent, "get_instance") @pytest.fixture() - def _get_ocr_agent_cls_qname_(self, request: FixtureRequest): + def _get_ocr_agent_cls_qname_(self, request: FixtureRequest): # noqa: PT005 return method_mock(request, OCRAgent, "_get_ocr_agent_cls_qname") @pytest.fixture() diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index 50ceaa1187..712384e0d5 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -32,6 +32,7 @@ def partition_image( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + password: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Parses an image into a list of interpreted elements. @@ -91,6 +92,8 @@ def partition_image( (results in adding FormKeysValues elements to output). form_extraction_skip_tables Whether the form extraction logic should ignore regions designated as Tables. + password + The password to decrypt the PDF file. 
""" exactly_one(filename=filename, file=file) @@ -113,5 +116,6 @@ def partition_image( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + password=password, **kwargs, ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index f87812d40b..4643b38c0a 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -142,6 +142,7 @@ def partition_pdf( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + password: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Parses a pdf document into a list of interpreted elements. @@ -222,6 +223,7 @@ def partition_pdf( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + password=password, **kwargs, ) @@ -243,6 +245,7 @@ def partition_pdf_or_image( starting_page_number: int = 1, extract_forms: bool = False, form_extraction_skip_tables: bool = True, + password: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Parses a pdf or image document into a list of interpreted elements.""" @@ -270,6 +273,7 @@ def partition_pdf_or_image( file=spooled_to_bytes_io_if_needed(file), languages=languages, metadata_last_modified=metadata_last_modified or last_modified, + password=password, starting_page_number=starting_page_number, **kwargs, ) @@ -320,6 +324,7 @@ def partition_pdf_or_image( starting_page_number=starting_page_number, extract_forms=extract_forms, form_extraction_skip_tables=form_extraction_skip_tables, + password=password, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) @@ -344,6 +349,7 @@ def partition_pdf_or_image( ocr_languages=ocr_languages, is_image=is_image, metadata_last_modified=metadata_last_modified or last_modified, + password=password, starting_page_number=starting_page_number, **kwargs, ) @@ -358,6 +364,7 @@ def 
extractable_elements( languages: Optional[list[str]] = None, metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, + password: Optional[str] = None, **kwargs: Any, ) -> list[list[Element]]: if isinstance(file, bytes): @@ -367,6 +374,7 @@ def extractable_elements( file=file, languages=languages, metadata_last_modified=metadata_last_modified, + password=password, starting_page_number=starting_page_number, **kwargs, ) @@ -378,6 +386,7 @@ def _partition_pdf_with_pdfminer( languages: list[str], metadata_last_modified: Optional[str], starting_page_number: int = 1, + password: Optional[str] = None, **kwargs: Any, ) -> list[list[Element]]: """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster @@ -401,6 +410,7 @@ def _partition_pdf_with_pdfminer( languages=languages, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + password=password, **kwargs, ) @@ -410,6 +420,7 @@ def _partition_pdf_with_pdfminer( filename=filename, languages=languages, metadata_last_modified=metadata_last_modified, + password=password, starting_page_number=starting_page_number, **kwargs, ) @@ -425,6 +436,7 @@ def _process_pdfminer_pages( metadata_last_modified: Optional[str], annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, starting_page_number: int = 1, + password: Optional[str] = None, **kwargs, ) -> list[list[Element]]: """Uses PDFMiner to split a document into pages and process them.""" @@ -432,7 +444,8 @@ def _process_pdfminer_pages( elements = [] for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp), start=starting_page_number + open_pdfminer_pages_generator(fp, password=password), + start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -554,6 +567,7 @@ def _partition_pdf_or_image_local( extract_forms: bool = False, form_extraction_skip_tables: bool = True, pdf_hi_res_max_pages: Optional[int] = None, + password: 
Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partition using package installed locally""" @@ -589,11 +603,12 @@ def _partition_pdf_or_image_local( filename, is_image=is_image, model_name=hi_res_model_name, + password=password, pdf_image_dpi=pdf_image_dpi, ) extracted_layout, layouts_links = ( - process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) + process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password) if pdf_text_extractable else ([], []) ) @@ -631,6 +646,7 @@ def _partition_pdf_or_image_local( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + password=password, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, ) @@ -639,6 +655,7 @@ def _partition_pdf_or_image_local( file, is_image=is_image, model_name=hi_res_model_name, + password=password, pdf_image_dpi=pdf_image_dpi, ) @@ -646,7 +663,7 @@ def _partition_pdf_or_image_local( file.seek(0) extracted_layout, layouts_links = ( - process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) + process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password) if pdf_text_extractable else ([], []) ) @@ -686,6 +703,7 @@ def _partition_pdf_or_image_local( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + password=password, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, ) @@ -725,6 +743,7 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, extract_image_block_to_payload=extract_image_block_to_payload, output_dir_path=extract_image_block_output_dir, + password=password, ) for el_type in extract_image_block_types: @@ -796,6 +815,7 @@ def _partition_pdf_or_image_local( draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION, resize=env_config.ANALYSIS_BBOX_RESIZE, format=env_config.ANALYSIS_BBOX_FORMAT, + password=password, ) return out_elements @@ -834,6 +854,7 @@ def _partition_pdf_or_image_with_ocr( is_image: bool = False, 
metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, + password: Optional[str] = None, **kwargs: Any, ): """Partitions an image or PDF using OCR. For PDFs, each page is converted @@ -858,7 +879,7 @@ def _partition_pdf_or_image_with_ocr( elements.extend(page_elements) else: for page_number, image in enumerate( - convert_pdf_to_images(filename, file), start=starting_page_number + convert_pdf_to_images(filename, file, password=password), start=starting_page_number ): page_elements = _partition_pdf_or_image_with_ocr_from_image( image=image, diff --git a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py index ecd7f722bf..b3752b0f22 100644 --- a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py +++ b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py @@ -546,6 +546,7 @@ def __init__( draw_grid: bool = False, resize: Optional[float] = None, format: str = "png", + password: Optional[str] = None, ): self.draw_caption = draw_caption self.draw_grid = draw_grid @@ -554,6 +555,7 @@ def __init__( self.format = format self.drawers = [] self.file = file + self.password = password super().__init__(filename, save_dir) @@ -678,6 +680,7 @@ def load_source_image(self) -> Generator[Image.Image, None, None]: file=self.file, output_folder=temp_dir, path_only=True, + password=self.password, ) except Exception as ex: # noqa: E722 print( diff --git a/unstructured/partition/pdf_image/analysis/tools.py b/unstructured/partition/pdf_image/analysis/tools.py index 3000f08db6..ba42a432a9 100644 --- a/unstructured/partition/pdf_image/analysis/tools.py +++ b/unstructured/partition/pdf_image/analysis/tools.py @@ -66,6 +66,7 @@ def save_analysis_artifiacts( draw_caption: bool = True, resize: Optional[float] = None, format: str = "png", + password: Optional[str] = None, ): """Save the analysis artifacts for a given file. 
Loads some settings from the environment configuration. @@ -82,6 +83,7 @@ def save_analysis_artifiacts( draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source) resize: Output image resize value. If not provided, the image will not be resized. format: The format for analyzed pages with bboxes drawn on them. Default is 'png'. + password (optional): The password to decrypt the PDF file. """ if not filename: filename = _generate_filename(is_image) @@ -109,6 +111,7 @@ def save_analysis_artifiacts( draw_caption=draw_caption, resize=resize, format=format, + password=password, ) for layout_dumper in layout_dumpers: @@ -125,6 +128,7 @@ def render_bboxes_for_file( draw_caption: bool = True, resize: Optional[float] = None, format: str = "png", + password: Optional[str] = None, ): """Render the bounding boxes for a given layout dimp file. To be used for analysis after the partition is performed for @@ -144,6 +148,7 @@ def render_bboxes_for_file( draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source) resize: Output image resize value. If not provided, the image will not be resized. format: The format for analyzed pages with bboxes drawn on them. Default is 'png'. + password (optional): The password to decrypt the PDF file. 
""" filename_stem = Path(filename).stem is_image = not Path(filename).suffix.endswith("pdf") @@ -183,6 +188,7 @@ def render_bboxes_for_file( draw_caption=draw_caption, resize=resize, format=format, + password=password, ) for drawer in layout_drawers: diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index f6b81dd2e4..92df9a701f 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -37,6 +37,7 @@ def process_data_with_ocr( ocr_mode: str = OCRMode.FULL_PAGE.value, pdf_image_dpi: int = 200, ocr_layout_dumper: Optional[OCRLayoutDumper] = None, + password: Optional[str] = None, ) -> "DocumentLayout": """ Process OCR data from a given data and supplement the output DocumentLayout @@ -64,6 +65,8 @@ def process_data_with_ocr( - ocr_layout_dumper (OCRLayoutDumper, optional): The OCR layout dumper to save the OCR layout. + - password (optional): The password to decrypt the PDF file. + Returns: DocumentLayout: The merged layout information obtained after OCR processing. """ @@ -84,6 +87,7 @@ def process_data_with_ocr( ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, + password=password, ) return merged_layouts @@ -100,6 +104,7 @@ def process_file_with_ocr( ocr_mode: str = OCRMode.FULL_PAGE.value, pdf_image_dpi: int = 200, ocr_layout_dumper: Optional[OCRLayoutDumper] = None, + password: Optional[str] = None, ) -> "DocumentLayout": """ Process OCR data from a given file and supplement the output DocumentLayout @@ -124,6 +129,8 @@ def process_file_with_ocr( - pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200. + - password (optional): The password to decrypt the PDF file. + Returns: DocumentLayout: The merged layout information obtained after OCR processing. 
""" @@ -157,6 +164,7 @@ def process_file_with_ocr( dpi=pdf_image_dpi, output_folder=temp_dir, paths_only=True, + userpw=password, ) image_paths = cast(List[str], _image_paths) for i, image_path in enumerate(image_paths): diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index a809c7f76d..d57af9d532 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -58,6 +58,7 @@ def convert_pdf_to_image( dpi: int = 200, output_folder: Optional[Union[str, PurePath]] = None, path_only: bool = False, + password: Optional[str] = None, ) -> Union[List[Image.Image], List[str]]: """Get the image renderings of the pdf pages using pdf2image""" @@ -71,6 +72,7 @@ def convert_pdf_to_image( dpi=dpi, output_folder=output_folder, paths_only=path_only, + userpw=password, ) else: images = pdf2image.convert_from_path( @@ -125,6 +127,7 @@ def save_elements( is_image: bool = False, extract_image_block_to_payload: bool = False, output_dir_path: str | None = None, + password: Optional[str] = None, ): """ Saves specific elements from a PDF as images either to a directory or embeds them in the @@ -167,6 +170,7 @@ def save_elements( pdf_image_dpi, output_folder=temp_dir, path_only=True, + password=password, ) image_paths = cast(List[str], _image_paths) @@ -389,15 +393,16 @@ def convert_pdf_to_images( filename: str = "", file: Optional[bytes | IO[bytes]] = None, chunk_size: int = 10, + password: Optional[str] = None, ) -> Iterator[Image.Image]: # Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... 
and so on) exactly_one(filename=filename, file=file) if file is not None: f_bytes = convert_to_bytes(file) - info = pdf2image.pdfinfo_from_bytes(f_bytes) + info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password) else: f_bytes = None - info = pdf2image.pdfinfo_from_path(filename) + info = pdf2image.pdfinfo_from_path(filename, userpw=password) total_pages = info["Pages"] for start_page in range(1, total_pages + 1, chunk_size): @@ -407,12 +412,14 @@ def convert_pdf_to_images( f_bytes, first_page=start_page, last_page=end_page, + userpw=password, ) else: chunk_images = pdf2image.convert_from_path( filename, first_page=start_page, last_page=end_page, + userpw=password, ) for image in chunk_images: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 91a3e689f2..810b577ef0 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -35,12 +35,14 @@ def process_file_with_pdfminer( filename: str = "", dpi: int = 200, + password: Optional[str] = None, ) -> tuple[List[List["TextRegion"]], List[List]]: with open_filename(filename, "rb") as fp: fp = cast(BinaryIO, fp) extracted_layout, layouts_links = process_data_with_pdfminer( file=fp, dpi=dpi, + password=password, ) return extracted_layout, layouts_links @@ -49,6 +51,7 @@ def process_file_with_pdfminer( def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, dpi: int = 200, + password: Optional[str] = None, ) -> tuple[List[List["TextRegion"]], List[List]]: """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the pdf pages using pdf2image""" @@ -62,7 +65,9 @@ def process_data_with_pdfminer( layouts_links = [] # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 - for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)): + for page_number, (page, 
page_layout) in enumerate( + open_pdfminer_pages_generator(file, password=password) + ): width, height = page_layout.width, page_layout.height text_layout = [] diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 23332745e6..fb91de18f4 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -73,6 +73,7 @@ def rect_to_bbox( @requires_dependencies(["pikepdf", "pypdf"]) def open_pdfminer_pages_generator( fp: BinaryIO, + password: str = "", ): """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" @@ -84,7 +85,7 @@ def open_pdfminer_pages_generator( with tempfile.TemporaryDirectory() as tmp_dir_path: tmp_file_path = os.path.join(tmp_dir_path, "tmp_file") try: - pages = PDFPage.get_pages(fp) + pages = PDFPage.get_pages(fp, password=password) # Detect invalid dictionary construct for entire PDF for i, page in enumerate(pages): try: