Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add password #3876

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
### Enhancements

### Features
- **Use password** to load password-protected PDFs with all modes

### Fixes
- **Fix an issue with multiple values for `infer_table_structure`** when partitioning email with image attachments: the kwargs passed into `partition` to partition the image already contain `infer_table_structure`. Now the `partition` function checks whether the `kwargs` already have `infer_table_structure`
Expand Down
Binary file added example-docs/pdf/password.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion test_unstructured/chunking/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,5 +164,5 @@ def it_supports_the_include_orig_elements_option(
# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture()
def _chunk_elements_(self, request: FixtureRequest):
def _chunk_elements_(self, request: FixtureRequest): # noqa: PT005
return function_mock(request, "unstructured.chunking.basic._chunk_elements")
2 changes: 1 addition & 1 deletion test_unstructured/chunking/test_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ def it_supports_the_include_orig_elements_option(
# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture()
def _chunk_by_title_(self, request: FixtureRequest):
def _chunk_by_title_(self, request: FixtureRequest): # noqa: PT005
return function_mock(request, "unstructured.chunking.title._chunk_by_title")


Expand Down
40 changes: 40 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1517,3 +1517,43 @@ def test_document_to_element_list_sets_category_depth_titles():
assert elements[1].metadata.category_depth == 2
assert elements[2].metadata.category_depth is None
assert elements[3].metadata.category_depth == 0


@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    "strategy",
    # -- run every strategy so the password is exercised with each of them --
    [
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_pdf_with_password(
    file_mode: str,
    strategy: str,
    filename: str = example_doc_path("pdf/password.pdf"),
):
    """A password-protected PDF partitions from a path, an open file, or a spooled file."""
    if file_mode == "filename":
        # -- source given as a path on disk --
        elements = pdf.partition_pdf(filename=filename, strategy=strategy, password="password")
    elif file_mode == "rb":
        # -- source given as a regular binary file object --
        with open(filename, "rb") as pdf_file:
            elements = pdf.partition_pdf(file=pdf_file, strategy=strategy, password="password")
    else:
        # -- source given as a spooled temporary file holding a copy of the PDF bytes --
        with open(filename, "rb") as pdf_file, SpooledTemporaryFile() as spooled:
            spooled.write(pdf_file.read())
            spooled.seek(0)
            elements = pdf.partition_pdf(file=spooled, strategy=strategy, password="password")

    # -- the document is expected to yield exactly one element with the known text --
    assert len(elements) == 1
    assert elements[0].text == "File with password"
2 changes: 1 addition & 1 deletion test_unstructured/partition/test_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_funct
# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture
def _last_modified_prop_(self, request: FixtureRequest):
def _last_modified_prop_(self, request: FixtureRequest): # noqa: PT005
return property_mock(request, MsgPartitionerOptions, "_last_modified")

@pytest.fixture
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def get_instance_(self, request: FixtureRequest):
return method_mock(request, OCRAgent, "get_instance")

@pytest.fixture()
def _get_ocr_agent_cls_qname_(self, request: FixtureRequest):
def _get_ocr_agent_cls_qname_(self, request: FixtureRequest): # noqa: PT005
return method_mock(request, OCRAgent, "_get_ocr_agent_cls_qname")

@pytest.fixture()
Expand Down
4 changes: 4 additions & 0 deletions unstructured/partition/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def partition_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses an image into a list of interpreted elements.
Expand Down Expand Up @@ -91,6 +92,8 @@ def partition_image(
(results in adding FormKeysValues elements to output).
form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables.
password
The password to decrypt the PDF file.
"""
exactly_one(filename=filename, file=file)

Expand All @@ -113,5 +116,6 @@ def partition_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
29 changes: 25 additions & 4 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def partition_pdf(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf document into a list of interpreted elements.
Expand Down Expand Up @@ -222,6 +223,7 @@ def partition_pdf(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)

Expand All @@ -243,6 +245,7 @@ def partition_pdf_or_image(
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
Expand Down Expand Up @@ -270,6 +273,7 @@ def partition_pdf_or_image(
file=spooled_to_bytes_io_if_needed(file),
languages=languages,
metadata_last_modified=metadata_last_modified or last_modified,
password=password,
starting_page_number=starting_page_number,
**kwargs,
)
Expand Down Expand Up @@ -320,6 +324,7 @@ def partition_pdf_or_image(
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
password=password,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
Expand All @@ -344,6 +349,7 @@ def partition_pdf_or_image(
ocr_languages=ocr_languages,
is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modified,
password=password,
starting_page_number=starting_page_number,
**kwargs,
)
Expand All @@ -358,6 +364,7 @@ def extractable_elements(
languages: Optional[list[str]] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
if isinstance(file, bytes):
Expand All @@ -367,6 +374,7 @@ def extractable_elements(
file=file,
languages=languages,
metadata_last_modified=metadata_last_modified,
password=password,
starting_page_number=starting_page_number,
**kwargs,
)
Expand All @@ -378,6 +386,7 @@ def _partition_pdf_with_pdfminer(
languages: list[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
Expand All @@ -401,6 +410,7 @@ def _partition_pdf_with_pdfminer(
languages=languages,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
password=password,
**kwargs,
)

Expand All @@ -410,6 +420,7 @@ def _partition_pdf_with_pdfminer(
filename=filename,
languages=languages,
metadata_last_modified=metadata_last_modified,
password=password,
starting_page_number=starting_page_number,
**kwargs,
)
Expand All @@ -425,14 +436,16 @@ def _process_pdfminer_pages(
metadata_last_modified: Optional[str],
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs,
) -> list[list[Element]]:
"""Uses PDFMiner to split a document into pages and process them."""

elements = []

for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp), start=starting_page_number
open_pdfminer_pages_generator(fp, password=password),
start=starting_page_number,
):
width, height = page_layout.width, page_layout.height

Expand Down Expand Up @@ -554,6 +567,7 @@ def _partition_pdf_or_image_local(
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
pdf_hi_res_max_pages: Optional[int] = None,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partition using package installed locally"""
Expand Down Expand Up @@ -589,11 +603,12 @@ def _partition_pdf_or_image_local(
filename,
is_image=is_image,
model_name=hi_res_model_name,
password=password,
pdf_image_dpi=pdf_image_dpi,
)

extracted_layout, layouts_links = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable
else ([], [])
)
Expand Down Expand Up @@ -631,6 +646,7 @@ def _partition_pdf_or_image_local(
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
password=password,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
Expand All @@ -639,14 +655,15 @@ def _partition_pdf_or_image_local(
file,
is_image=is_image,
model_name=hi_res_model_name,
password=password,
pdf_image_dpi=pdf_image_dpi,
)

if hasattr(file, "seek"):
file.seek(0)

extracted_layout, layouts_links = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi, password=password)
if pdf_text_extractable
else ([], [])
)
Expand Down Expand Up @@ -686,6 +703,7 @@ def _partition_pdf_or_image_local(
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
password=password,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
Expand Down Expand Up @@ -725,6 +743,7 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
extract_image_block_to_payload=extract_image_block_to_payload,
output_dir_path=extract_image_block_output_dir,
password=password,
)

for el_type in extract_image_block_types:
Expand Down Expand Up @@ -796,6 +815,7 @@ def _partition_pdf_or_image_local(
draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
resize=env_config.ANALYSIS_BBOX_RESIZE,
format=env_config.ANALYSIS_BBOX_FORMAT,
password=password,
)

return out_elements
Expand Down Expand Up @@ -834,6 +854,7 @@ def _partition_pdf_or_image_with_ocr(
is_image: bool = False,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password: Optional[str] = None,
**kwargs: Any,
):
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
Expand All @@ -858,7 +879,7 @@ def _partition_pdf_or_image_with_ocr(
elements.extend(page_elements)
else:
for page_number, image in enumerate(
convert_pdf_to_images(filename, file), start=starting_page_number
convert_pdf_to_images(filename, file, password=password), start=starting_page_number
):
page_elements = _partition_pdf_or_image_with_ocr_from_image(
image=image,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,7 @@ def __init__(
draw_grid: bool = False,
resize: Optional[float] = None,
format: str = "png",
password: Optional[str] = None,
):
self.draw_caption = draw_caption
self.draw_grid = draw_grid
Expand All @@ -554,6 +555,7 @@ def __init__(
self.format = format
self.drawers = []
self.file = file
self.password = password

super().__init__(filename, save_dir)

Expand Down Expand Up @@ -678,6 +680,7 @@ def load_source_image(self) -> Generator[Image.Image, None, None]:
file=self.file,
output_folder=temp_dir,
path_only=True,
password=self.password,
)
except Exception as ex: # noqa: E722
print(
Expand Down
6 changes: 6 additions & 0 deletions unstructured/partition/pdf_image/analysis/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def save_analysis_artifiacts(
draw_caption: bool = True,
resize: Optional[float] = None,
format: str = "png",
password: Optional[str] = None,
):
"""Save the analysis artifacts for a given file. Loads some settings from
the environment configuration.
Expand All @@ -82,6 +83,7 @@ def save_analysis_artifiacts(
draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
resize: Output image resize value. If not provided, the image will not be resized.
format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
password (optional): The password to decrypt the PDF file.
"""
if not filename:
filename = _generate_filename(is_image)
Expand Down Expand Up @@ -109,6 +111,7 @@ def save_analysis_artifiacts(
draw_caption=draw_caption,
resize=resize,
format=format,
password=password,
)

for layout_dumper in layout_dumpers:
Expand All @@ -125,6 +128,7 @@ def render_bboxes_for_file(
draw_caption: bool = True,
resize: Optional[float] = None,
format: str = "png",
password: Optional[str] = None,
):
"""Render the bounding boxes for a given layout dimp file.
To be used for analysis after the partition is performed for
Expand All @@ -144,6 +148,7 @@ def render_bboxes_for_file(
draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
resize: Output image resize value. If not provided, the image will not be resized.
format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
password (optional): The password to decrypt the PDF file.
"""
filename_stem = Path(filename).stem
is_image = not Path(filename).suffix.endswith("pdf")
Expand Down Expand Up @@ -183,6 +188,7 @@ def render_bboxes_for_file(
draw_caption=draw_caption,
resize=resize,
format=format,
password=password,
)

for drawer in layout_drawers:
Expand Down
Loading
Loading