feat(docx): differentiate no-file from not-ZIP (#3306)

scanny · web-flow · commit 087adb218fa2 · 2024-06-27T00:18:56.000Z
**Summary**
The `python-docx` error `docx.opc.exceptions.PackageNotFoundError`
arises both when no file exists at the given path and when the file
exists but is not a ZIP archive (and so is not a DOCX file).

This ambiguity is unwelcome when diagnosing the error as the two
possible conditions generally indicate a different course of action to
resolve the error.

Add detailed validation to `DocxPartitionerOptions` to distinguish these
two and provide more precise exception messages.

**Additional Context**
- `python-pptx` shares the same OPC-Package (file) loading code used by
`python-docx`, so the same ambiguity will be present in `python-pptx`.
- It would be preferable for this distinguished exception behavior to be
upstream in `python-docx` and `python-pptx`. If we're willing to take
the version bump it might be worth considering doing that instead.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,9 @@
-## 0.14.9-dev6
+## 0.14.9-dev7
 
 ### Enhancements
 
-* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used
-  to visualize the result of the OD model and dump the result to a file.
-  Additionally, the visualization of bounding boxes of each layout source is rendered and saved
-  for each page.
+* **Added visualization and OD model result dump for PDF** In PDF `hi_res` strategy the `analysis` parameter can be used to visualize the result of the OD model and dump the result to a file. Additionally, the visualization of bounding boxes of each layout source is rendered and saved for each page.
+* **`partition_docx()` distinguishes "file not found" from "not a ZIP archive" error.** `partition_docx()` now provides different error messages for "file not found" and "file is not a ZIP archive (and therefore not a DOCX file)". This aids diagnosis since these two conditions generally point in different directions as to the cause and fix.
 
 ### Features
 
diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py
@@ -770,6 +770,19 @@ def opts_args() -> dict[str, Any]:
 class DescribeDocxPartitionerOptions:
     """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""
 
+    # -- .load() ---------------------------------
+
+    def it_provides_a_validating_constructor(self, opts_args: dict[str, Any]):
+        opts_args["file_path"] = example_doc_path("simple.docx")
+
+        opts = DocxPartitionerOptions.load(**opts_args)
+
+        assert isinstance(opts, DocxPartitionerOptions)
+
+    def and_it_raises_when_options_are_not_valid(self, opts_args: dict[str, Any]):
+        with pytest.raises(ValueError, match="no DOCX document specified, "):
+            DocxPartitionerOptions.load(**opts_args)
+
     # -- .document -------------------------------
 
     def it_loads_the_docx_document(
@@ -1024,13 +1037,31 @@ def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
         assert isinstance(docx_file, io.BytesIO)
         assert docx_file.getvalue() == b"abcdefg"
 
-    def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
+    # -- ._validate() ----------------------------
+
+    def it_raises_when_no_file_exists_at_file_path(self, opts_args: dict[str, Any]):
+        opts_args["file_path"] = "l/m/n.docx"
+        with pytest.raises(FileNotFoundError, match="no such file or directory: 'l/m/n.docx'"):
+            DocxPartitionerOptions.load(**opts_args)
+
+    def and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archive(
         self, opts_args: dict[str, Any]
     ):
-        opts = DocxPartitionerOptions(**opts_args)
+        opts_args["file_path"] = example_doc_path("simple.doc")
+        with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
+            DocxPartitionerOptions.load(**opts_args)
 
-        with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "):
-            opts._docx_file
+    def and_it_raises_when_the_file_like_object_is_not_a_ZIP_archive(
+        self, opts_args: dict[str, Any]
+    ):
+        with open(example_doc_path("simple.doc"), "rb") as f:
+            opts_args["file"] = f
+            with pytest.raises(ValueError, match=r"not a ZIP archive \(so not a DOCX file\): "):
+                DocxPartitionerOptions.load(**opts_args)
+
+    def and_it_raises_when_neither_a_file_path_or_file_is_provided(self, opts_args: dict[str, Any]):
+        with pytest.raises(ValueError, match="no DOCX document specified, either `filename` or "):
+            DocxPartitionerOptions.load(**opts_args)
 
     # -- fixtures --------------------------------------------------------------------------------
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.9-dev6"  # pragma: no cover
+__version__ = "0.14.9-dev7"  # pragma: no cover
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -5,7 +5,9 @@
 import html
 import io
 import itertools
+import os
 import tempfile
+import zipfile
 from typing import IO, Any, Iterator, Optional, Protocol, Type
 
 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
@@ -155,7 +157,7 @@ def partition_docx(
         Assign this number to the first page of this document and increment the page number from
         there.
     """
-    opts = DocxPartitionerOptions(
+    opts = DocxPartitionerOptions.load(
         date_from_file_object=date_from_file_object,
         file=file,
         file_path=filename,
@@ -214,6 +216,11 @@ def __init__(
         # -- options object maintains page-number state --
         self._page_counter = starting_page_number
 
+    @classmethod
+    def load(cls, **kwargs: Any) -> DocxPartitionerOptions:
+        """Construct and validate an instance."""
+        return cls(**kwargs)._validate()
+
     @classmethod
     def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT):
         """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
@@ -358,12 +365,26 @@ def _docx_file(self) -> str | IO[bytes]:
             self._file.seek(0)
             return io.BytesIO(self._file.read())
 
-        if self._file:
-            return self._file
+        assert self._file is not None  # -- assured by `._validate()` --
+        return self._file
 
-        raise ValueError(
-            "No DOCX document specified, either `filename` or `file` argument must be provided"
-        )
+    def _validate(self) -> DocxPartitionerOptions:
+        """Raise on first invalide option, return self otherwise."""
+        # -- provide distinguished error between "file-not-found" and "not-a-DOCX-file" --
+        if self._file_path:
+            if not os.path.isfile(self._file_path):
+                raise FileNotFoundError(f"no such file or directory: {repr(self._file_path)}")
+            if not zipfile.is_zipfile(self._file_path):
+                raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file_path)}")
+        elif self._file:
+            if not zipfile.is_zipfile(self._file):
+                raise ValueError(f"not a ZIP archive (so not a DOCX file): {repr(self._file)}")
+        else:
+            raise ValueError(
+                "no DOCX document specified, either `filename` or `file` argument must be provided"
+            )
+
+        return self
 
 
 class _DocxPartitioner:

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.14.9-dev6" # pragma: no cover`
	`1`	`+__version__ = "0.14.9-dev7" # pragma: no cover`