Skip to content

Commit 32c79ca

Browse files
authored
chore: use only regex for contains_english_word. (#382)
Updates the characters used to split text when creating candidate English words. Now uses a regex to strip non-alphabetic characters from each word. Note: This was originally an attempt to speed up contains_english_word(), but there is no measurable change in performance.
1 parent e5dd9d5 commit 32c79ca

File tree

4 files changed

+19
-8
lines changed

4 files changed

+19
-8
lines changed

Diff for: CHANGELOG.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.5.8-dev5
1+
## 0.5.8-dev6
22

33
### Enhancements
44

@@ -44,8 +44,6 @@
4444

4545
## 0.5.6
4646

47-
* Fix problem with PDF partition (duplicated test)
48-
4947
### Enhancements
5048

5149
* `contains_english_word()`, used heavily in text processing, is 10x faster.
@@ -57,6 +55,8 @@
5755

5856
### Fixes
5957

58+
* Fix problem with PDF partition (duplicated test)
59+
6060
## 0.5.4
6161

6262
### Enhancements

Diff for: test_unstructured/partition/test_text_type.py

+5
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,11 @@ def test_contains_verb(text, expected, monkeypatch):
190190
("Parrot Beak", True),
191191
("parrot beak", True),
192192
("parrot!", True),
193+
("?parrot", True),
194+
("zombie?parrot", True),
195+
("notaWordHa 'parrot'", True),
196+
("notaWordHa'parrot'", False),
197+
('notaWordHa "parrot,"', True),
193198
("daljdf adlfajldj ajadfa", False),
194199
("BTAR ADFJA L", False),
195200
("Unstructured Technologies", True),

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.8-dev5" # pragma: no cover
1+
__version__ = "0.5.8-dev6" # pragma: no cover

Diff for: unstructured/partition/text_type.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
2121

2222
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
23-
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")
23+
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")
24+
NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]")
2425

2526

2627
def is_possible_narrative_text(
@@ -188,11 +189,16 @@ def contains_english_word(text: str) -> bool:
188189
text = text.lower()
189190
words = ENGLISH_WORD_SPLIT_RE.split(text)
190191
for word in words:
191-
# NOTE(robinson) - to ignore punctuation at the ends of words like "best."
192-
word = "".join([character for character in word if character.isalpha()])
192+
# NOTE(Crag): Remove any non-lowercase alphabetical
193+
# characters. These removed chars will usually be trailing or
194+
# leading characters not already matched in ENGLISH_WORD_SPLIT_RE.
195+
# The possessive case is also generally ok:
196+
# "beggar's" -> "beggars" (still an english word)
197+
# and of course:
198+
# "'beggars'"-> "beggars" (also still an english word)
199+
word = NON_LOWERCASE_ALPHA_RE.sub("", word)
193200
if len(word) > 1 and word in ENGLISH_WORDS:
194201
return True
195-
196202
return False
197203

198204

0 commit comments

Comments
 (0)