Skip to content

Commit 30e5a0c

Browse files
authored
rfctr(docx): organize docx tests (#3070)
**Summary** In preparation for adding DOCX pluggable image extraction, organize a few of the DOCX tests to be parallel to very similar tests for the DOC and ODT partitioners.
1 parent 7832dfc commit 30e5a0c

File tree

4 files changed

+32
-28
lines changed

4 files changed

+32
-28
lines changed

Diff for: CHANGELOG.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.14.2-dev0
1+
## 0.14.2-dev1
22

33
### Enhancements
44

Diff for: pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ line-length = 100
33

44
[tool.pyright]
55
pythonPlatform = "Linux"
6-
pythonVersion = "3.8"
6+
pythonVersion = "3.9"
77
reportUnnecessaryCast = true
88
reportUnnecessaryTypeIgnoreComment = true
99
stubPath = "./typings"

Diff for: test_unstructured/partition/docx/test_docx.py

+29-25
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,6 @@ def test_partition_docx_from_filename(
6161
assert {element.metadata.detection_origin for element in elements} == {"docx"}
6262

6363

64-
def test_partition_docx_from_filename_with_metadata_filename(mock_document_file_path: str):
65-
elements = partition_docx(mock_document_file_path, metadata_filename="test")
66-
assert all(element.metadata.filename == "test" for element in elements)
67-
68-
6964
def test_partition_docx_with_spooled_file(
7065
mock_document_file_path: str, expected_elements: list[Text]
7166
):
@@ -92,16 +87,6 @@ def test_partition_docx_from_file(mock_document_file_path: str, expected_element
9287
assert element.metadata.filename is None
9388

9489

95-
def test_partition_docx_from_file_with_metadata_filename(
96-
mock_document_file_path: str, expected_elements: list[Text]
97-
):
98-
with open(mock_document_file_path, "rb") as f:
99-
elements = partition_docx(file=f, metadata_filename="test")
100-
assert elements == expected_elements
101-
for element in elements:
102-
assert element.metadata.filename == "test"
103-
104-
10590
def test_partition_docx_uses_file_path_when_both_are_specified(
10691
mock_document_file_path: str, expected_elements: list[Text]
10792
):
@@ -221,21 +206,37 @@ def test_partition_docx_detects_lists():
221206
assert sum(1 for e in elements if isinstance(e, ListItem)) == 10
222207

223208

224-
def test_partition_docx_from_filename_exclude_metadata():
209+
# -- `include_metadata` arg ----------------------------------------------------------------------
210+
211+
212+
def test_partition_docx_from_filename_excludes_metadata_when_so_instructed():
225213
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_metadata=False)
214+
assert all(e.metadata.to_dict() == {} for e in elements)
226215

227-
assert elements[0].metadata.filetype is None
228-
assert elements[0].metadata.page_name is None
229-
assert elements[0].metadata.filename is None
230216

217+
def test_partition_docx_from_file_excludes_metadata_when_so_instructed():
218+
with open(example_doc_path("simple.docx"), "rb") as f:
219+
assert all(
220+
element.metadata.to_dict() == {}
221+
for element in partition_docx(file=f, include_metadata=False)
222+
)
231223

232-
def test_partition_docx_from_file_exclude_metadata(mock_document_file_path: str):
233-
with open(mock_document_file_path, "rb") as f:
234-
elements = partition_docx(file=f, include_metadata=False)
235224

236-
assert elements[0].metadata.filetype is None
237-
assert elements[0].metadata.page_name is None
238-
assert elements[0].metadata.filename is None
225+
# -- .metadata.filename --------------------------------------------------------------------------
226+
227+
228+
def test_partition_docx_from_filename_prefers_metadata_filename_when_provided():
229+
elements = partition_docx(example_doc_path("simple.docx"), metadata_filename="test")
230+
assert all(element.metadata.filename == "test" for element in elements)
231+
232+
233+
def test_partition_docx_from_file_prefers_metadata_filename_when_provided():
234+
with open(example_doc_path("simple.docx"), "rb") as f:
235+
elements = partition_docx(file=f, metadata_filename="test")
236+
assert all(element.metadata.filename == "test" for element in elements)
237+
238+
239+
# -- .metadata.last_modified ---------------------------------------------------------------------
239240

240241

241242
def test_partition_docx_metadata_date(mocker: MockFixture):
@@ -307,6 +308,9 @@ def test_partition_docx_from_file_without_metadata_date():
307308
assert elements[0].metadata.last_modified is None
308309

309310

311+
# ------------------------------------------------------------------------------------------------
312+
313+
310314
def test_get_emphasized_texts_from_paragraph(
311315
opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]]
312316
):

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.2-dev0" # pragma: no cover
1+
__version__ = "0.14.2-dev1" # pragma: no cover

0 commit comments

Comments
 (0)