Skip to content

Commit 1d68bb2

Browse files
authored
feat: apply method to apply cleaning bricks to elements (#102)
* add apply method to apply cleaners to elements * bump version * add check for string output * documentations for the apply method * change interface to *cleaners
1 parent b1cce16 commit 1d68bb2

File tree

5 files changed

+76
-3
lines changed

5 files changed

+76
-3
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.3.2-dev0
1+
## 0.3.2
22

33
* Added `translate_text` brick for translating text between languages
4+
* Added an `apply` method to make it easier to apply cleaners to elements
45

56
## 0.3.1
67

docs/source/elements.rst

+32
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,35 @@ elements.
1111
* ``NarrativeText`` - Sections of a document that include well-formed prose. Sub-class of ``Text``.
1212
* ``Title`` - Headings and sub-headings within a document. Sub-class of ``Text``.
1313
* ``ListItem`` - A text element that is part of an ordered or unordered list. Sub-class of ``Text``.
14+
15+
16+
#########################################
17+
Applying Cleaning Bricks to Text Elements
18+
#########################################
19+
20+
You can apply cleaning bricks to a text element by using the ``apply`` method. The
21+
apply method accepts any function that takes a string as input and produces a string
22+
as output. Use the ``partial`` function from ``functools`` if you need to set additional
23+
args or kwargs for your cleaning brick. The ``apply`` method accepts one or more
24+
cleaners as positional arguments; unpack a list of cleaners with ``*cleaners``.
25+
26+
Examples:
27+
28+
.. code:: python
29+
30+
from functools import partial
31+
32+
from unstructured.cleaners.core import clean_prefix
33+
from unstructured.cleaners.translate import translate_text
34+
from unstructured.documents.elements import ListItem
35+
36+
cleaners = [
37+
partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
38+
partial(translate_text, target_lang="ru"),
39+
]
40+
41+
item = ListItem(text="[1] A Textbook on Crocodile Habitats")
42+
item.apply(*cleaners)
43+
44+
# The output will be: Учебник по крокодильным средам обитания
45+
print(item)

test_unstructured/documents/test_elements.py

+28
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
from functools import partial
2+
import pytest
3+
4+
from unstructured.cleaners.core import clean_prefix
5+
from unstructured.cleaners.translate import translate_text
16
from unstructured.documents.elements import Element, NoID, Text
27

38

@@ -9,3 +14,26 @@ def test_text_id():
914
def test_element_defaults_to_blank_id():
    """An ``Element`` built with no arguments carries a ``NoID`` instance as its id."""
    default_id = Element().id
    assert isinstance(default_id, NoID)
17+
18+
19+
def test_text_element_apply_cleaners():
    """A single cleaner passed to ``apply`` rewrites the element's text in place."""
    strip_citation_marker = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation_marker)

    assert str(element) == "A Textbook on Crocodile Habitats"
24+
25+
26+
def test_text_element_apply_multiple_cleaners():
    """Several cleaners passed to ``apply`` are chained: prefix removal, then translation."""
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(
        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
        partial(translate_text, target_lang="ru"),
    )

    assert str(element) == "Учебник по крокодильным средам обитания"
34+
35+
36+
def test_apply_raises_if_func_does_not_produce_string():
    """``apply`` raises ``ValueError`` when the cleaner returns something other than a string."""

    def returns_int(_text):
        return 1

    element = Text(text="[1] A Textbook on Crocodile Habitats")
    with pytest.raises(ValueError):
        element.apply(returns_int)

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.3.2-dev0" # pragma: no cover
1+
__version__ = "0.3.2" # pragma: no cover

unstructured/documents/elements.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from abc import ABC
22
import hashlib
3-
from typing import Union
3+
from typing import Callable, Union
44

55

66
class NoID(ABC):
@@ -36,6 +36,18 @@ def __str__(self):
3636
def __eq__(self, other):
3737
return self.text == other.text
3838

39+
def apply(self, *cleaners: Callable) -> None:
    """Applies one or more cleaning bricks to the text element, in the order given.

    The first cleaner receives the element's current text; each later cleaner
    receives the output of the one before it. Every cleaner should take a string
    as input and produce a string as output. Use ``functools.partial`` to bind any
    additional args or kwargs a cleaning brick needs.

    Raises:
        ValueError: If any cleaner in the chain returns a non-string value. The
            element's text is left unchanged in that case.
    """
    cleaned_text = self.text
    for cleaner in cleaners:
        cleaned_text = cleaner(cleaned_text)
        # Validate after every step rather than once at the end, so a bad
        # intermediate result is reported here instead of surfacing as an
        # unrelated error inside the next cleaner (or being silently coerced).
        if not isinstance(cleaned_text, str):
            raise ValueError("Cleaner produced a non-string output.")

    # Only commit once the whole chain has succeeded.
    self.text = cleaned_text
50+
3951

4052
class NarrativeText(Text):
4153
"""NarrativeText is an element consisting of multiple, well-formulated sentences. This

0 commit comments

Comments
 (0)