feat: Cleaning bricks for removing prefixes and postfixes (#62)

MthwRobinson · web-flow · commit f3756abc9017 · 2022-11-10T12:24:58.000-05:00
* added prefix and postfix cleaners

* added test for pre and postfix cleaners

* added docs for prefix and postfix bricks

* changelog and bump version

* add dev to version
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.2.3-dev0
+
+* Add cleaning bricks for removing prefixes and postfixes
+
 ## 0.2.2
 
 * Add staging brick for Datasaur
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -314,6 +314,52 @@ Examples:
   replace_unicode_characters("'()[]{};:'\",.?/\\-_")
 
 
+``clean_prefix``
+----------------
+
+Removes the prefix from a string if it matches a specified pattern.
+
+Options:
+
+* Ignores case if ``ignore_case`` is set to ``True``. The default is ``False``.
+* Strips leading whitespace if ``strip`` is set to ``True``. The default is ``True``.
+
+
+Examples:
+
+.. code:: python
+
+  from unstructured.cleaners.core import clean_prefix
+
+  text = "SUMMARY: This is the best summary of all time!"
+
+  # Returns "This is the best summary of all time!"
+  clean_prefix(text, r"(SUMMARY|DESCRIPTION):", ignore_case=True)
+
+
+``clean_postfix``
+-----------------
+
+Removes the postfix from a string if it matches a specified pattern.
+
+Options:
+
+* Ignores case if ``ignore_case`` is set to ``True``. The default is ``False``.
+* Strips trailing whitespace if ``strip`` is set to ``True``. The default is ``True``.
+
+
+Examples:
+
+.. code:: python
+
+  from unstructured.cleaners.core import clean_postfix
+
+  text = "The end! END"
+
+  # Returns "The end!"
+  clean_postfix(text, r"(END|STOP)", ignore_case=True)
+
+
 #######
 Staging
 #######
diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
@@ -83,6 +83,32 @@ def test_clean_trailing_punctuation(text, expected):
     assert core.clean(text=text, trailing_punctuation=True) == expected
 
 
+@pytest.mark.parametrize(
+    "text, pattern, ignore_case, strip, expected",
+    [
+        # First alternative matches at the start; the whitespace left behind is stripped
+        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
+        # Second alternative matches at the start
+        ("DESC: A great SUMMARY", r"(SUMMARY|DESC):", False, True, "A great SUMMARY"),
+        # strip=False keeps the space that followed the removed prefix
+        ("SUMMARY: A great SUMMARY", r"(SUMMARY|DESC):", False, False, " A great SUMMARY"),
+        # ignore_case=True lets the upper-case pattern match a lower-case prefix
+        ("summary: A great SUMMARY", r"(SUMMARY|DESC):", True, True, "A great SUMMARY"),
+    ],
+)
+def test_clean_prefix(text, pattern, ignore_case, strip, expected):
+    """Checks clean_prefix against each combination of ignore_case and strip listed above."""
+    assert core.clean_prefix(text, pattern, ignore_case, strip) == expected
+
+
+@pytest.mark.parametrize(
+    "text, pattern, ignore_case, strip, expected",
+    [
+        # First alternative matches at the end; the earlier "END!" in the text is kept
+        ("The END! END", r"(END|STOP)", False, True, "The END!"),
+        # Second alternative matches at the end
+        ("The END! STOP", r"(END|STOP)", False, True, "The END!"),
+        # strip=False keeps the space that preceded the removed postfix
+        ("The END! END", r"(END|STOP)", False, False, "The END! "),
+        # ignore_case=True lets the upper-case pattern match a lower-case postfix
+        ("The END! end", r"(END|STOP)", True, True, "The END!"),
+    ],
+)
+def test_clean_postfix(text, pattern, ignore_case, strip, expected):
+    """Checks clean_postfix against each combination of ignore_case and strip listed above."""
+    assert core.clean_postfix(text, pattern, ignore_case, strip) == expected
+
+
 @pytest.mark.parametrize(
     # NOTE(yuming): Tests combined cleaners
     "text, extra_whitespace, dashes, bullets, lowercase, trailing_punctuation, expected",
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.2"  # pragma: no cover
+__version__ = "0.2.3-dev0"  # pragma: no cover
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
@@ -1,7 +1,7 @@
+import re
 import sys
 import unicodedata
 from unstructured.nlp.patterns import UNICODE_BULLETS_RE
-import re
 
 
 def clean_bullets(text) -> str:
@@ -80,6 +80,40 @@ def clean_trailing_punctuation(text: str) -> str:
     return text.strip().rstrip(".,:;")
 
 
+def clean_prefix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str:
+    """Removes a prefix from a string if the start of the string matches the specified
+    pattern. Strips leading whitespace from the result if the strip parameter is set to True.
+
+    Input
+    -----
+    text: The text to clean
+    pattern: The pattern for the prefix. Can be a simple string or a regex pattern. The
+        pattern is inserted as-is after a "^" anchor: regex metacharacters in it are not
+        escaped, and a top-level alternation such as "A|B" should be wrapped in a group
+        ("(A|B)"), otherwise only the first alternative is anchored to the start.
+    ignore_case: If True, ignores case in the pattern
+    strip: If True, removes leading whitespace from the cleaned string.
+
+    Returns the cleaned string.
+    """
+    flags = re.IGNORECASE if ignore_case else 0
+    clean_text = re.sub(r"^{0}".format(pattern), "", text, flags=flags)
+    clean_text = clean_text.lstrip() if strip else clean_text
+    return clean_text
+
+
+def clean_postfix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str:
+    """Removes a postfix from a string if the end of the string matches the specified
+    pattern. Strips trailing whitespace from the result if the strip parameter is set to True.
+
+    Input
+    -----
+    text: The text to clean
+    pattern: The pattern for the postfix. Can be a simple string or a regex pattern. The
+        pattern is inserted as-is before a "$" anchor: regex metacharacters in it are not
+        escaped, and a top-level alternation such as "A|B" should be wrapped in a group
+        ("(A|B)"), otherwise only the last alternative is anchored to the end.
+    ignore_case: If True, ignores case in the pattern
+    strip: If True, removes trailing whitespace from the cleaned string.
+
+    Returns the cleaned string.
+    """
+    flags = re.IGNORECASE if ignore_case else 0
+    clean_text = re.sub(r"{0}$".format(pattern), "", text, flags=flags)
+    clean_text = clean_text.rstrip() if strip else clean_text
+    return clean_text
+
+
 def clean(
     text: str,
     extra_whitespace: bool = False,
@@ -91,7 +125,7 @@ def clean(
     """Cleans text.
 
     Input
-    -------
+    -----
     extra_whitespace: Whether to clean extra whitespace characters in text.
     dashes: Whether to clean dash characters in text.
     bullets: Whether to clean unicode bullets from a section of text.

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.2.2" # pragma: no cover`
	`1`	`+__version__ = "0.2.3-dev0" # pragma: no cover`