Skip to content

Commit f3756ab

Browse files
authored
feat: Cleaning bricks for removing prefixes and postfixes (#62)
* added prefix and postfix cleaners * added test for pre and postfix cleaners * added docs for prefix and postfix bricks * changelog and bump version * add dev to version
1 parent 64f2d3a commit f3756ab

File tree

5 files changed

+113
-3
lines changed

5 files changed

+113
-3
lines changed

Diff for: CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.2.3-dev0
2+
3+
* Add cleaning bricks for removing prefixes and postfixes
4+
15
## 0.2.2
26

37
* Add staging brick for Datasaur

Diff for: docs/source/bricks.rst

+46
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,52 @@ Examples:
314314
replace_unicode_characters("'()[]{};:'\",.?/\\-_")
315315
316316
317+
``clean_prefix``
318+
----------------
319+
320+
Removes the prefix from a string if it matches a specified pattern.
321+
322+
Options:
323+
324+
* Ignores case if ``ignore_case`` is set to ``True``. The default is ``False``.
325+
* Strips leading whitespace if ``strip`` is set to ``True``. The default is ``True``.
326+
327+
328+
Examples:
329+
330+
.. code:: python
331+
332+
from unstructured.cleaners.core import clean_prefix
333+
334+
text = "SUMMARY: This is the best summary of all time!"
335+
336+
# Returns "This is the best summary of all time!"
337+
clean_prefix(text, r"(SUMMARY|DESCRIPTION):", ignore_case=True)
338+
339+
340+
``clean_postfix``
341+
-----------------
342+
343+
Removes the postfix from a string if it matches a specified pattern.
344+
345+
Options:
346+
347+
* Ignores case if ``ignore_case`` is set to ``True``. The default is ``False``.
348+
* Strips trailing whitespace if ``strip`` is set to ``True``. The default is ``True``.
349+
350+
351+
Examples:
352+
353+
.. code:: python
354+
355+
from unstructured.cleaners.core import clean_postfix
356+
357+
text = "The end! END"
358+
359+
# Returns "The end!"
360+
clean_postfix(text, r"(END|STOP)", ignore_case=True)
361+
362+
317363
#######
318364
Staging
319365
#######

Diff for: test_unstructured/cleaners/test_core.py

+26
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,32 @@ def test_clean_trailing_punctuation(text, expected):
8383
assert core.clean(text=text, trailing_punctuation=True) == expected
8484

8585

86+
@pytest.mark.parametrize(
    "text, pattern, ignore_case, strip, expected",
    [
        # Either alternative of the grouped pattern is removed from the start.
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        ("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
        # With strip disabled, the whitespace after the prefix is kept.
        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, False, " A great SUMMARY"),
        # With ignore_case enabled, a lowercase prefix matches the uppercase pattern.
        ("summary: A great SUMMARY", r"(SUMMARY|DESC):", True, True, "A great SUMMARY"),
    ],
)
def test_clean_prefix(text, pattern, ignore_case, strip, expected):
    """Checks clean_prefix against each parametrized input/expected pair."""
    cleaned = core.clean_prefix(text, pattern, ignore_case=ignore_case, strip=strip)
    assert cleaned == expected
97+
98+
99+
@pytest.mark.parametrize(
    "text, pattern, ignore_case, strip, expected",
    [
        # Either alternative of the grouped pattern is removed from the end.
        ("The END! END", r"(END|STOP)", False, True, "The END!"),
        ("The END! STOP", r"(END|STOP)", False, True, "The END!"),
        # With strip disabled, the whitespace before the postfix is kept.
        ("The END! END", r"(END|STOP)", False, False, "The END! "),
        # With ignore_case enabled, a lowercase postfix matches the uppercase pattern.
        ("The END! end", r"(END|STOP)", True, True, "The END!"),
    ],
)
def test_clean_postfix(text, pattern, ignore_case, strip, expected):
    """Checks clean_postfix against each parametrized input/expected pair."""
    cleaned = core.clean_postfix(text, pattern, ignore_case=ignore_case, strip=strip)
    assert cleaned == expected
110+
111+
86112
@pytest.mark.parametrize(
87113
# NOTE(yuming): Tests combined cleaners
88114
"text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected",

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.2" # pragma: no cover
1+
__version__ = "0.2.3-dev0" # pragma: no cover

Diff for: unstructured/cleaners/core.py

+36-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1+
import re
12
import sys
23
import unicodedata
34
from unstructured.nlp.patterns import UNICODE_BULLETS_RE
4-
import re
55

66

77
def clean_bullets(text) -> str:
@@ -80,6 +80,40 @@ def clean_trailing_punctuation(text: str) -> str:
8080
return text.strip().rstrip(".,:;")
8181

8282

83+
def clean_prefix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str:
    """Removes a prefix from a string according to the specified pattern. Strips leading
    whitespace from the result if the strip parameter is set to True.

    Input
    -----
    text: The text to clean
    pattern: The pattern for the prefix. Can be a simple string or a regex pattern. The
        whole pattern is anchored to the start of the text, so top-level alternation
        such as "SUMMARY:|DESC:" only ever removes a leading match. The pattern is
        embedded in a larger expression, so use ignore_case rather than a leading
        inline flag such as "(?i)".
    ignore_case: If True, ignores case in the pattern
    strip: If True, removes leading whitespace from the cleaned string.

    Returns the text with the matching prefix removed; the text is returned unchanged
    (apart from optional stripping) when the pattern does not match at the start.
    """
    flags = re.IGNORECASE if ignore_case else 0
    # Wrap the pattern in a non-capturing group so the "^" anchor applies to every
    # alternative. A bare "^A|B" would also delete "B" wherever it occurs in the text.
    anchored_pattern = "^(?:{0})".format(pattern)
    clean_text = re.sub(anchored_pattern, "", text, flags=flags)
    return clean_text.lstrip() if strip else clean_text
98+
99+
100+
def clean_postfix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str:
    """Removes a postfix from a string according to the specified pattern. Strips trailing
    whitespace from the result if the strip parameter is set to True.

    Input
    -----
    text: The text to clean
    pattern: The pattern for the postfix. Can be a simple string or a regex pattern. The
        whole pattern is anchored to the end of the text, so top-level alternation
        such as "END|STOP" only ever removes a trailing match. The pattern is
        embedded in a larger expression, so use ignore_case rather than a leading
        inline flag such as "(?i)".
    ignore_case: If True, ignores case in the pattern
    strip: If True, removes trailing whitespace from the cleaned string.

    Returns the text with the matching postfix removed; the text is returned unchanged
    (apart from optional stripping) when the pattern does not match at the end.
    """
    flags = re.IGNORECASE if ignore_case else 0
    # Wrap the pattern in a non-capturing group so the "$" anchor applies to every
    # alternative. A bare "A|B$" would also delete "A" wherever it occurs in the text.
    anchored_pattern = "(?:{0})$".format(pattern)
    clean_text = re.sub(anchored_pattern, "", text, flags=flags)
    return clean_text.rstrip() if strip else clean_text
115+
116+
83117
def clean(
84118
text: str,
85119
extra_whitespace: bool = False,
@@ -91,7 +125,7 @@ def clean(
91125
"""Cleans text.
92126
93127
Input
94-
-------
128+
-----
95129
extra_whitespace: Whether to clean extra whitespace characters in text.
96130
dashes: Whether to clean dash characters in text.
97131
bullets: Whether to clean unicode bullets from a section of text.

0 commit comments

Comments
 (0)