Skip to content

Commit 3b718ec

Browse files
authored
rfctr: prep for pluggable partitioners (#3806)
**Summary** Prepare auto-partitioning for pluggable partitioners. Move toward a uniform partitioner call signature in `auto/partition()` such that a custom or override partitioner can be registered without requiring code changes. **Additional Context** The central job of `auto/partition()` is to detect the file-type of the given file and use that to dispatch partitioning to the corresponding partitioner function, e.g. `partition_pdf()` or `partition_docx()`. In the existing code, each partitioner function is called with parameters "hand-picked" from the available parameters passed to the `partition()` function. This is unnecessary and couples those partitioners tightly with the dispatch function. The desired state is that all available arguments are passed as `kwargs` and the partitioner function "self-selects" the arguments it will be sensitive to, applies its own appropriate default values when the argument is omitted, and simply ignores any arguments it doesn't use. Note that achieving this requires no changes to partitioner functions because they already do precisely this. So the job is to pass all arguments (other than `filename` and `file`) to the partitioner as `kwargs`. This will allow additional or alternate partitioners to be registered at runtime and dispatched to: as long as they have the signature `partition_x(filename, file, kwargs) -> list[Element]`, they can be dispatched to without customization.
1 parent b981d71 commit 3b718ec

File tree

11 files changed

+104
-328
lines changed

11 files changed

+104
-328
lines changed

CHANGELOG.md

+10-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
1-
## 0.16.11
1+
## 0.16.12-dev0
2+
3+
### Enhancements
4+
5+
- **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes.
6+
7+
### Features
28

39
### Fixes
410

5-
- Fix ipv4 regex to correctly include up to three digit octets.
11+
## 0.16.11
612

713
### Enhancements
814

@@ -14,6 +20,8 @@
1420

1521
### Fixes
1622

23+
- Fix ipv4 regex to correctly include up to three digit octets.
24+
1725
## 0.16.10
1826

1927
### Enhancements

test_unstructured/metrics/test_element_type.py

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
("Title", 0): 4,
3030
("Title", 1): 1,
3131
("NarrativeText", 0): 3,
32+
("PageBreak", None): 3,
3233
("ListItem", 0): 6,
3334
("ListItem", 1): 6,
3435
("ListItem", 2): 3,

test_unstructured/partition/html/test_partition.py

-11
Original file line numberDiff line numberDiff line change
@@ -1232,17 +1232,6 @@ def it_knows_the_caller_provided_detection_origin(
12321232

12331233
assert opts.detection_origin == detection_origin
12341234

1235-
# -- .encoding -------------------------------
1236-
1237-
@pytest.mark.parametrize("encoding", ["utf-8", None])
1238-
def it_knows_the_caller_provided_encoding(
1239-
self, encoding: str | None, opts_args: dict[str, Any]
1240-
):
1241-
opts_args["encoding"] = encoding
1242-
opts = HtmlPartitionerOptions(**opts_args)
1243-
1244-
assert opts.encoding == encoding
1245-
12461235
# -- .html_text ------------------------------
12471236

12481237
def it_gets_the_HTML_from_the_file_path_when_one_is_provided(self, opts_args: dict[str, Any]):

test_unstructured/partition/test_auto.py

+1-23
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
from __future__ import annotations
44

5-
import io
65
import json
76
import os
87
import pathlib
@@ -561,7 +560,6 @@ def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
561560
strategy=PartitionStrategy.FAST,
562561
languages=None,
563562
metadata_filename=None,
564-
include_page_breaks=False,
565563
infer_table_structure=False,
566564
extract_images_in_pdf=False,
567565
extract_image_block_types=None,
@@ -897,7 +895,7 @@ def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
897895

898896
with pytest.raises(
899897
UnsupportedFileFormatError,
900-
match="Invalid file made-up.fake. The FileType.UNK file type is not supported in partiti",
898+
match="Partitioning is not supported for the FileType.UNK file type.",
901899
):
902900
partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
903901

@@ -1037,26 +1035,6 @@ def test_auto_partition_forwards_metadata_filename_via_kwargs():
10371035
assert all(e.metadata.filename == "much-more-interesting-name.txt" for e in elements)
10381036

10391037

1040-
def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture):
1041-
file_path = example_doc_path("fake-text.txt")
1042-
1043-
with open(file_path, "rb") as f:
1044-
elements = partition(file=f, file_filename=file_path)
1045-
1046-
assert all(e.metadata.filename == "fake-text.txt" for e in elements)
1047-
assert caplog.records[0].levelname == "WARNING"
1048-
assert "The file_filename kwarg will be deprecated" in caplog.text
1049-
1050-
1051-
def test_auto_partition_raises_when_both_file_filename_and_metadata_filename_args_are_used():
1052-
file_path = example_doc_path("fake-text.txt")
1053-
with open(file_path, "rb") as f:
1054-
file = io.BytesIO(f.read())
1055-
1056-
with pytest.raises(ValueError, match="Only one of metadata_filename and file_filename is spe"):
1057-
partition(file=file, file_filename=file_path, metadata_filename=file_path)
1058-
1059-
10601038
# -- ocr_languages --------------------------------------------------------
10611039

10621040

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.11" # pragma: no cover
1+
__version__ = "0.16.12-dev0" # pragma: no cover

0 commit comments

Comments
 (0)