Skip to content

Commit 087adb2

Browse files
authored
feat(docx): differentiate no-file from not-ZIP (#3306)
**Summary** The `python-docx` error `docx.opc.exceptions.PackageNotFoundError` arises both when no file exists at the given path and when the file exists but is not a ZIP archive (and so is not a DOCX file). This ambiguity hampers diagnosis because the two conditions generally call for different courses of action to resolve the error. Add detailed validation to `DocxPartitionerOptions` to distinguish these two cases and provide more precise exception messages. **Additional Context** - `python-pptx` shares the same OPC-Package (file) loading code used by `python-docx`, so the same ambiguity will be present in `python-pptx`. - It would be preferable for this distinction between the two exceptions to be made upstream in `python-docx` and `python-pptx`. If we're willing to take the version bump, it might be worth considering doing that instead.
1 parent 54ec311 commit 087adb2

File tree

4 files changed

+66
-16
lines changed

4 files changed

+66
-16
lines changed

Diff for: CHANGELOG.md

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
## 0.14.9-dev6
1+
## 0.14.9-dev7
22

33
### Enhancements
44

5-
* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used
6-
to visualize the result of the OD model and dump the result to a file.
7-
Additionally, the visualization of bounding boxes of each layout source is rendered and saved
8-
for each page.
5+
* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used to visualize the result of the OD model and dump the result to a file. Additionally, the visualization of bounding boxes of each layout source is rendered and saved for each page.
6+
* **`partition_docx()` distinguishes "file not found" from "not a ZIP archive" errors.** `partition_docx()` now provides different error messages for "file not found" and "file is not a ZIP archive (and therefore not a DOCX file)". This aids diagnosis since these two conditions generally point in different directions as to the cause and fix.
97

108
### Features
119

Diff for: test_unstructured/partition/test_docx.py

+35-4
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,19 @@ def opts_args() -> dict[str, Any]:
770770
class DescribeDocxPartitionerOptions:
771771
"""Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""
772772

773+
# -- .load() ---------------------------------
774+
775+
def it_provides_a_validating_constructor(self, opts_args: dict[str, Any]):
776+
opts_args["file_path"] = example_doc_path("simple.docx")
777+
778+
opts = DocxPartitionerOptions.load(**opts_args)
779+
780+
assert isinstance(opts, DocxPartitionerOptions)
781+
782+
def and_it_raises_when_options_are_not_valid(self, opts_args: dict[str, Any]):
783+
with pytest.raises(ValueError, match="no DOCX document specified, "):
784+
DocxPartitionerOptions.load(**opts_args)
785+
773786
# -- .document -------------------------------
774787

775788
def it_loads_the_docx_document(
@@ -1024,13 +1037,31 @@ def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
10241037
assert isinstance(docx_file, io.BytesIO)
10251038
assert docx_file.getvalue() == b"abcdefg"
10261039

1027-
def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
1040+
# -- ._validate() ----------------------------
1041+
1042+
def it_raises_when_no_file_exists_at_file_path(self, opts_args: dict[str, Any]):
1043+
opts_args["file_path"] = "l/m/n.docx"
1044+
with pytest.raises(FileNotFoundError, match="no such file or directory: 'l/m/n.docx'"):
1045+
DocxPartitionerOptions.load(**opts_args)
1046+
1047+
def and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archive(
10281048
self, opts_args: dict[str, Any]
10291049
):
1030-
opts = DocxPartitionerOptions(**opts_args)
1050+
opts_args["file_path"] = example_doc_path("simple.doc")
1051+
with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
1052+
DocxPartitionerOptions.load(**opts_args)
10311053

1032-
with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "):
1033-
opts._docx_file
1054+
def and_it_raises_when_the_file_like_object_is_not_a_ZIP_archive(
1055+
self, opts_args: dict[str, Any]
1056+
):
1057+
with open(example_doc_path("simple.doc"), "rb") as f:
1058+
opts_args["file"] = f
1059+
with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
1060+
DocxPartitionerOptions.load(**opts_args)
1061+
1062+
def and_it_raises_when_neither_a_file_path_or_file_is_provided(self, opts_args: dict[str, Any]):
1063+
with pytest.raises(ValueError, match="no DOCX document specified, either `filename` or "):
1064+
DocxPartitionerOptions.load(**opts_args)
10341065

10351066
# -- fixtures --------------------------------------------------------------------------------
10361067

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.9-dev6" # pragma: no cover
1+
__version__ = "0.14.9-dev7" # pragma: no cover

Diff for: unstructured/partition/docx.py

+27-6
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
import html
66
import io
77
import itertools
8+
import os
89
import tempfile
10+
import zipfile
911
from typing import IO, Any, Iterator, Optional, Protocol, Type
1012

1113
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
@@ -155,7 +157,7 @@ def partition_docx(
155157
Assign this number to the first page of this document and increment the page number from
156158
there.
157159
"""
158-
opts = DocxPartitionerOptions(
160+
opts = DocxPartitionerOptions.load(
159161
date_from_file_object=date_from_file_object,
160162
file=file,
161163
file_path=filename,
@@ -214,6 +216,11 @@ def __init__(
214216
# -- options object maintains page-number state --
215217
self._page_counter = starting_page_number
216218

219+
@classmethod
220+
def load(cls, **kwargs: Any) -> DocxPartitionerOptions:
221+
"""Construct and validate an instance."""
222+
return cls(**kwargs)._validate()
223+
217224
@classmethod
218225
def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT):
219226
"""Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
@@ -358,12 +365,26 @@ def _docx_file(self) -> str | IO[bytes]:
358365
self._file.seek(0)
359366
return io.BytesIO(self._file.read())
360367

361-
if self._file:
362-
return self._file
368+
assert self._file is not None # -- assured by `._validate()` --
369+
return self._file
363370

364-
raise ValueError(
365-
"No DOCX document specified, either `filename` or `file` argument must be provided"
366-
)
371+
def _validate(self) -> DocxPartitionerOptions:
372+
"""Raise on first invalid option, return self otherwise."""
373+
# -- provide distinguished error between "file-not-found" and "not-a-DOCX-file" --
374+
if self._file_path:
375+
if not os.path.isfile(self._file_path):
376+
raise FileNotFoundError(f"no such file or directory: {repr(self._file_path)}")
377+
if not zipfile.is_zipfile(self._file_path):
378+
raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file_path)}")
379+
elif self._file:
380+
if not zipfile.is_zipfile(self._file):
381+
raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file)}")
382+
else:
383+
raise ValueError(
384+
"no DOCX document specified, either `filename` or `file` argument must be provided"
385+
)
386+
387+
return self
367388

368389

369390
class _DocxPartitioner:

0 commit comments

Comments
 (0)