Skip to content

Commit 300c564

Browse files
authored
feat: Cleaning bricks to extract text before/after a pattern (#63)
* brick to extract text before * brick for extract text after * tests for extract before and after * updated docs * changelog and bump version * fix typo * fix another typo * positive -> non-negative
1 parent f3756ab commit 300c564

File tree

5 files changed

+118
-2
lines changed

5 files changed

+118
-2
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.2.3-dev0
1+
## 0.2.3
22

33
* Add cleaning bricks for removing prefixes and postfixes
4+
* Add cleaning bricks for extracting text before and after a pattern
45

56
## 0.2.2
67

docs/source/bricks.rst

+46
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,52 @@ Examples:
360360
clean_postfix(text, r"(END|STOP)", ignore_case=True)
361361
362362
363+
``extract_text_before``
364+
-----------------------
365+
366+
Extracts text that occurs before the specified pattern.
367+
368+
Options:
369+
370+
* If ``index`` is set, extract before the ``(index + 1)``\ th occurrence of the pattern. The default is ``0``.
371+
* Strips trailing whitespace if ``strip`` is set to ``True``. The default is ``True``.
372+
373+
374+
Examples:
375+
376+
.. code:: python
377+
378+
from unstructured.cleaners.extract import extract_text_before
379+
380+
text = "Here I am! STOP Look at me! STOP I'm flying! STOP"
381+
382+
# Returns "Here I am!"
383+
extract_text_before(text, r"STOP")
384+
385+
386+
``extract_text_after``
387+
----------------------
388+
389+
Extracts text that occurs after the specified pattern.
390+
391+
Options:
392+
393+
* If ``index`` is set, extract after the ``(index + 1)``\ th occurrence of the pattern. The default is ``0``.
394+
* Strips leading whitespace if ``strip`` is set to ``True``. The default is ``True``.
395+
396+
397+
Examples:
398+
399+
.. code:: python
400+
401+
from unstructured.cleaners.extract import extract_text_after
402+
403+
text = "SPEAKER 1: Look at me, I'm flying!"
404+
405+
# Returns "Look at me, I'm flying!"
406+
extract_text_after(text, r"SPEAKER \d{1}:")
407+
408+
363409
#######
364410
Staging
365411
#######
+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import pytest
2+
3+
import unstructured.cleaners.extract as extract
4+
5+
6+
def test_get_indexed_match_raises_with_bad_index():
    """A negative index must be rejected with a ValueError."""
    negative_index = -1
    with pytest.raises(ValueError):
        extract._get_indexed_match("BLAH BLAH BLAH", "BLAH", negative_index)
9+
10+
11+
def test_get_indexed_match_raises_with_index_too_high():
    """Asking for an occurrence past the last match must raise a ValueError."""
    # The text holds three matches (indices 0-2), so index 4 is out of range.
    out_of_range_index = 4
    with pytest.raises(ValueError):
        extract._get_indexed_match("BLAH BLAH BLAH", "BLAH", out_of_range_index)
14+
15+
16+
def test_extract_text_before():
    """Text before the second "BLAH" is returned with trailing whitespace removed."""
    text = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    extracted = extract.extract_text_before(text, "BLAH", 1)
    assert extracted == "Teacher: BLAH"
19+
20+
21+
def test_extract_text_after():
    """Text after the first "BLAH;" is returned with leading whitespace removed."""
    text = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    extracted = extract.extract_text_after(text, "BLAH;", 0)
    assert extracted == "Student: BLAH BLAH BLAH!"

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.3-dev0" # pragma: no cover
1+
__version__ = "0.2.3" # pragma: no cover

unstructured/cleaners/extract.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import re
2+
3+
4+
def _get_indexed_match(text: str, pattern: str, index: int = 0) -> re.Match:
    """Return the match object for the ``index``-th (zero-based) occurrence of ``pattern``.

    Input
    -----
    text: The string to search.
    pattern: A regular expression, passed to ``re.finditer``.
    index: Zero-based position of the wanted match among the matches found in ``text``.

    Raises
    ------
    ValueError: If ``index`` is not a non-negative integer, or if ``text`` contains
        fewer than ``index + 1`` matches of ``pattern`` (including no matches at all).
    """
    if not isinstance(index, int) or index < 0:
        raise ValueError(f"The index is {index}. Index must be a non-negative integer.")

    # Stays at -1 when the pattern never matches, so the error path below never
    # reads an unbound loop variable.
    largest_index = -1
    for largest_index, result in enumerate(re.finditer(pattern, text)):
        if largest_index == index:
            # Stop scanning as soon as the requested occurrence is found.
            return result

    if largest_index < 0:
        raise ValueError(
            f"Result with index {index} was not found. The pattern did not match the text.",
        )
    raise ValueError(
        f"Result with index {index} was not found. The largest index was {largest_index}.",
    )
17+
18+
19+
def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Return the part of ``text`` that precedes an occurrence of ``pattern``.

    The occurrence is selected with ``index`` (zero-based); the default of 0 selects
    the first occurrence. Locating the occurrence is delegated to
    ``_get_indexed_match``, and any exception it raises propagates to the caller.

    Input
    -----
    text: The string to extract from.
    pattern: The regular expression marking where the extracted text ends.
    index: Zero-based occurrence of the pattern to cut at.
    strip: If True, removes trailing whitespace from the extracted string
    """
    match_start = _get_indexed_match(text, pattern, index).start()
    leading_text = text[:match_start]
    if strip:
        return leading_text.rstrip()
    return leading_text
32+
33+
34+
def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Extracts text that occurs after the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
    index.

    Locating the occurrence is delegated to ``_get_indexed_match``, and any exception
    it raises propagates to the caller.

    Input
    -----
    text: The string to extract from.
    pattern: The regular expression marking where the extracted text begins.
    index: Zero-based occurrence of the pattern to cut at.
    strip: If True, removes leading whitespace from the extracted string
    """
    regex_match = _get_indexed_match(text, pattern, index)
    # Everything from the end of the selected match to the end of the text.
    after_text = text[regex_match.end():]
    return after_text.lstrip() if strip else after_text

0 commit comments

Comments
 (0)