Skip to content

Commit 19373de

Browse files
vangheemrbiseck3
andauthored
Enable dynamic file type registration (#3946)
The purpose of this PR is to enable registering new file types dynamically. The PR enables this through 2 primary functions: 1. `unstructured.file_utils.model.create_file_type` This registers the new `FileType` enum which enables the rest of unstructured to understand a new type of file 2. `unstructured.file_utils.model.register_partitioner` Decorator that enables registering a partitioner function to run for a file type. --------- Co-authored-by: Roman Isecke <[email protected]>
1 parent 061462d commit 19373de

File tree

6 files changed

+145
-16
lines changed

6 files changed

+145
-16
lines changed

CHANGELOG.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1-
## 0.16.24-dev4
1+
## 0.16.24-dev5
22

33
### Enhancements
44

5+
- **Support dynamic partitioner file type registration**. Use `create_file_type` to create a new file type that can be handled
6+
in unstructured, and `register_partitioner` to register your own partitioner for any file type.
7+
58
- **`extract_image_block_types` now also works for CamelCase element type names**. Previously, `NarrativeText` and similar CamelCase element types could not be extracted using the mentioned parameter in `partition`. Now figures for those elements can be extracted like `Image` and `Table` elements.
69

710
### Features

test_unstructured/file_utils/test_filetype.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
detect_filetype,
2626
is_json_processable,
2727
)
28-
from unstructured.file_utils.model import FileType
28+
from unstructured.file_utils.model import FileType, create_file_type
2929

3030
is_in_docker = os.path.exists("/.dockerenv")
3131

@@ -467,6 +467,13 @@ def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas()
467467
assert detect_filetype(file=f) == FileType.CSV
468468

469469

470+
def test_it_detects_correct_file_type_for_custom_types(tmp_path):
    """A file matching a dynamically created file type is detected as that type."""
    custom_type = create_file_type(
        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
    )
    # -- arbitrary bytes; the file is paired with the custom extension and MIME-type --
    sample_path = tmp_path / "dumb.foo"
    sample_path.write_bytes(b"38v8df889qw8sdfj")

    detected = detect_filetype(file_path=str(sample_path), content_type="application/foo")

    assert detected is custom_type
475+
476+
470477
# ================================================================================================
471478
# Describe `is_json_processable()`
472479
# ================================================================================================

test_unstructured/file_utils/test_model.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import pytest
66

7-
from unstructured.file_utils.model import FileType
7+
from unstructured.file_utils.model import FileType, create_file_type, register_partitioner
88

99

1010
class DescribeFileType:
@@ -225,3 +225,21 @@ def it_provides_access_to_the_partitioner_shortname(
225225
self, file_type: FileType, expected_value: str
226226
):
227227
assert file_type.partitioner_shortname == expected_value
228+
229+
230+
def test_create_file_type():
    """A dynamically created file type can be looked up by extension and by MIME-type."""
    foo_type = create_file_type(
        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
    )

    found_by_extension = FileType.from_extension(".foo")
    assert found_by_extension is foo_type

    found_by_mime_type = FileType.from_mime_type("application/foo")
    assert found_by_mime_type is foo_type
235+
236+
237+
def test_register_partitioner():
    """Registering a partitioner records its function name and defining module."""
    foo_type = create_file_type(
        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
    )

    def partition_foo():
        pass

    # -- apply the decorator explicitly rather than with `@` syntax --
    partition_foo = register_partitioner(foo_type)(partition_foo)

    function_name = foo_type.partitioner_function_name
    module_qname = foo_type.partitioner_module_qname
    assert function_name == "partition_foo"
    assert module_qname == "test_unstructured.file_utils.test_model"

test_unstructured/partition/test_auto.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
Title,
4444
)
4545
from unstructured.file_utils.filetype import detect_filetype
46-
from unstructured.file_utils.model import FileType
46+
from unstructured.file_utils.model import FileType, create_file_type, register_partitioner
4747
from unstructured.partition.auto import _PartitionerLoader, partition
4848
from unstructured.partition.common import UnsupportedFileFormatError
4949
from unstructured.partition.utils.constants import PartitionStrategy
@@ -1331,3 +1331,17 @@ def expected_docx_elements():
13311331
Text("2023"),
13321332
Address("DOYLESTOWN, PA 18901"),
13331333
]
1334+
1335+
1336+
def _test_partition_foo():
1337+
pass
1338+
1339+
1340+
def test_auto_partition_works_with_custom_types(
    request: FixtureRequest,
):
    """The partitioner loader resolves the function registered for a custom file type."""
    custom_type = create_file_type(
        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
    )
    bind_partitioner = register_partitioner(custom_type)
    bind_partitioner(_test_partition_foo)

    resolved = _PartitionerLoader().get(custom_type)

    assert resolved is _test_partition_foo

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.24-dev4" # pragma: no cover
1+
__version__ = "0.16.24-dev5" # pragma: no cover

unstructured/file_utils/model.py

+98-11
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,43 @@
33
from __future__ import annotations
44

55
import enum
6-
from typing import Iterable, cast
6+
from typing import TYPE_CHECKING, Callable, Iterable, Type, cast
7+
8+
from typing_extensions import ParamSpec
9+
10+
if TYPE_CHECKING:
11+
from unstructured.documents.elements import Element
12+
else:
13+
Element = None
14+
15+
16+
def _create_file_type_enum(
17+
cls: Type["FileType"],
18+
value: str,
19+
partitioner_shortname: str | None,
20+
importable_package_dependencies: Iterable[str],
21+
extra_name: str | None,
22+
extensions: Iterable[str],
23+
canonical_mime_type: str,
24+
alias_mime_types: Iterable[str],
25+
partitioner_full_module_path: str | None = None,
26+
) -> "FileType":
27+
"""
28+
Moving here instead of directly in the FileType.__new__ allows us
29+
to dynamically create new enum properties.
30+
31+
FileType.__new__ does not work with dynamic properties.
32+
"""
33+
val = object.__new__(cls)
34+
val._value_ = value
35+
val._partitioner_shortname = partitioner_shortname
36+
val._importable_package_dependencies = tuple(importable_package_dependencies)
37+
val._extra_name = extra_name
38+
val._extensions = tuple(extensions)
39+
val._canonical_mime_type = canonical_mime_type
40+
val._alias_mime_types = tuple(alias_mime_types)
41+
val._partitioner_full_module_path = partitioner_full_module_path
42+
return val
743

844

945
class FileType(enum.Enum):
@@ -30,6 +66,9 @@ class FileType(enum.Enum):
3066
_alias_mime_types: tuple[str, ...]
3167
"""MIME-types accepted as identifying this file-type."""
3268

69+
_partitioner_full_module_path: str | None
70+
"""Fully-qualified name of module providing partitioner for this file-type."""
71+
3372
def __new__(
3473
cls,
3574
value: str,
@@ -39,16 +78,19 @@ def __new__(
3978
extensions: Iterable[str],
4079
canonical_mime_type: str,
4180
alias_mime_types: Iterable[str],
81+
partitioner_full_module_path: str | None = None,
4282
):
43-
self = object.__new__(cls)
44-
self._value_ = value
45-
self._partitioner_shortname = partitioner_shortname
46-
self._importable_package_dependencies = tuple(importable_package_dependencies)
47-
self._extra_name = extra_name
48-
self._extensions = tuple(extensions)
49-
self._canonical_mime_type = canonical_mime_type
50-
self._alias_mime_types = tuple(alias_mime_types)
51-
return self
83+
return _create_file_type_enum(
84+
cls,
85+
value,
86+
partitioner_shortname,
87+
importable_package_dependencies,
88+
extra_name,
89+
extensions,
90+
canonical_mime_type,
91+
alias_mime_types,
92+
partitioner_full_module_path,
93+
)
5294

5395
def __lt__(self, other: FileType) -> bool:
5496
"""Makes `FileType` members comparable with relational operators, at least with `<`.
@@ -132,7 +174,7 @@ def is_partitionable(self) -> bool:
132174
distinguishing file-types like WAV, ZIP, EMPTY, and UNK which are legitimate file-types
133175
but have no associated partitioner.
134176
"""
135-
return bool(self._partitioner_shortname)
177+
return bool(self._partitioner_shortname) or bool(self._partitioner_full_module_path)
136178

137179
@property
138180
def mime_type(self) -> str:
@@ -154,6 +196,9 @@ def partitioner_function_name(self) -> str:
154196
# -- Raise when this property is accessed on a FileType member that has no partitioner
155197
# -- shortname. This prevents a harder-to-find bug from appearing far away from this call
156198
# -- when code would try to `getattr(module, None)` or whatever.
199+
if full_module_path := self._partitioner_full_module_path:
200+
return full_module_path.split(".")[-1]
201+
157202
if (shortname := self._partitioner_shortname) is None:
158203
raise ValueError(
159204
f"`.partitioner_function_name` is undefined because FileType.{self.name} is not"
@@ -171,6 +216,9 @@ def partitioner_module_qname(self) -> str:
171216
# -- Raise when this property is accessed on a FileType member that has no partitioner
172217
# -- shortname. This prevents a harder-to-find bug from appearing far away from this call
173218
# -- when code would try to `importlib.import_module(None)` or whatever.
219+
if full_module_path := self._partitioner_full_module_path:
220+
return ".".join(full_module_path.split(".")[:-1])
221+
174222
if (shortname := self._partitioner_shortname) is None:
175223
raise ValueError(
176224
f"`.partitioner_module_qname` is undefined because FileType.{self.name} is not"
@@ -446,3 +494,42 @@ def partitioner_shortname(self) -> str | None:
446494
"inode/x-empty",
447495
cast(list[str], []),
448496
)
497+
498+
499+
def create_file_type(
    name: str,
    *,
    canonical_mime_type: str,
    importable_package_dependencies: Iterable[str] | None = None,
    extra_name: str | None = None,
    extensions: Iterable[str] | None = None,
    alias_mime_types: Iterable[str] | None = None,
) -> FileType:
    """Register a new `FileType` member named `name` and return it.

    `name` is used as both the member name and the member value. The member is created
    without a partitioner; use `register_partitioner` to attach one. Calling this again
    with the same `name` replaces the previously registered member of that name.

    Args:
        name: Name (and value) of the new member, e.g. "FOO".
        canonical_mime_type: MIME-type stored as the canonical one for this file type.
        importable_package_dependencies: Optional package names stored on the member.
        extra_name: Optional extra name stored on the member.
        extensions: Optional file extensions (e.g. ".foo") stored on the member.
        alias_mime_types: Optional additional MIME-types stored on the member.

    Returns:
        The newly created `FileType` member.
    """
    type_ = _create_file_type_enum(
        FileType,
        name,
        None,
        importable_package_dependencies or (),
        extra_name,
        extensions or (),
        canonical_mime_type,
        alias_mime_types or (),
        None,
    )
    type_._name_ = name
    FileType._member_map_[name] = type_

    # -- Also register the member by value so `FileType(name)` resolves to it. Enum pickling
    # -- (and `copy.deepcopy()` on older Pythons) reconstructs a member by calling the enum
    # -- class with its value, which fails when only `_member_map_` knows the member.
    # -- Never displace a differently-named member that already owns this value.
    current_owner = FileType._value2member_map_.get(name)
    if current_owner is None or current_owner._name_ == name:
        FileType._value2member_map_[name] = type_

    return type_
523+
524+
525+
_P = ParamSpec("_P")
526+
527+
528+
def register_partitioner(
    file_type: FileType,
) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
    """Return a decorator that records the decorated function as `file_type`'s partitioner.

    The decorator stores the function's dotted location, `<module>.<function name>`, in
    `file_type._partitioner_full_module_path` and returns the function unchanged.
    """

    def _bind(partitioner: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
        dotted_location = ".".join((partitioner.__module__, partitioner.__name__))
        file_type._partitioner_full_module_path = dotted_location
        return partitioner

    return _bind

0 commit comments

Comments
 (0)