Skip to content

Commit edddf9f

Browse files
authored
Feat/pass down strategy to partition ppt as well (#3274)
Following the same pattern as #3273, this passes the `strategy` parameter down to `partition_ppt` as well.
1 parent 16df694 commit edddf9f

File tree

5 files changed

+35
-3
lines changed

5 files changed

+35
-3
lines changed

Diff for: CHANGELOG.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
## 0.14.8-dev2
1+
## 0.14.8-dev3
22

33
### Enhancements
44

55
### Features
66

77
### Fixes
88

9-
* **`partition()` now forwards `strategy` arg to `partition_docx()` and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()` and `partition_pptx()` when those filetypes are detected.
9+
* **`partition()` now forwards `strategy` arg to `partition_docx()`, `partition_ppt()`, and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()`, `partition_ppt()`, and `partition_pptx()` when those filetypes are detected.
1010

1111
## 0.14.7
1212

Diff for: test_unstructured/partition/test_auto.py

+28
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,34 @@ def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]
617617
assert element.text == f"strategy=={strategy}"
618618

619619

620+
@pytest.mark.parametrize(
621+
"strategy",
622+
[
623+
PartitionStrategy.AUTO,
624+
PartitionStrategy.FAST,
625+
PartitionStrategy.HI_RES,
626+
PartitionStrategy.OCR_ONLY,
627+
],
628+
)
629+
def test_partition_forwards_strategy_arg_to_partition_ppt(request: FixtureRequest, strategy: str):
630+
from unstructured.partition.pptx import _PptxPartitioner
631+
632+
def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
633+
yield Text(f"strategy=={self._opts.strategy}")
634+
635+
_iter_elements_ = method_mock(
636+
request,
637+
_PptxPartitioner,
638+
"_iter_presentation_elements",
639+
side_effect=fake_iter_presentation_elements,
640+
)
641+
642+
(element,) = partition(example_doc_path("fake-power-point.ppt"), strategy=strategy)
643+
644+
_iter_elements_.assert_called_once_with(ANY)
645+
assert element.text == f"strategy=={strategy}"
646+
647+
620648
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
621649
def test_auto_partition_ppt_from_filename():
622650
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.8-dev2" # pragma: no cover
1+
__version__ = "0.14.8-dev3" # pragma: no cover

Diff for: unstructured/partition/auto.py

+1
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,7 @@ def partition(
488488
infer_table_structure=infer_table_structure,
489489
languages=languages,
490490
detect_language_per_element=detect_language_per_element,
491+
strategy=strategy,
491492
**kwargs,
492493
)
493494
elif filetype == FileType.PPTX:

Diff for: unstructured/partition/ppt.py

+3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
get_last_modified_date_from_file,
1515
)
1616
from unstructured.partition.pptx import partition_pptx
17+
from unstructured.partition.utils.constants import PartitionStrategy
1718

1819

1920
@process_metadata()
@@ -33,6 +34,7 @@ def partition_ppt(
3334
detect_language_per_element: bool = False,
3435
date_from_file_object: bool = False,
3536
starting_page_number: int = 1,
37+
strategy: str = PartitionStrategy.FAST,
3638
**kwargs: Any,
3739
) -> list[Element]:
3840
"""Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
@@ -113,6 +115,7 @@ def partition_ppt(
113115
metadata_filename=metadata_filename,
114116
metadata_last_modified=metadata_last_modified or last_modification_date,
115117
starting_page_number=starting_page_number,
118+
strategy=strategy,
116119
)
117120

118121
# remove tmp.name from filename if parsing file

0 commit comments

Comments
 (0)