chore: use only regex for contains_english_word. (#382)

cragwolfe · web-flow · commit 32c79caee30a · 2023-03-30T16:57:43.000Z
Updates the set of characters used to split text into candidate English words. Now uses a regex to strip non-alphabetic characters from each word.

Note: This was originally an attempt to speed up contains_english_word(), but there was no measurable change in performance.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.5.8-dev5
+## 0.5.8-dev6
 
 ### Enhancements
 
@@ -44,8 +44,6 @@
 
 ## 0.5.6
 
-* Fix problem with PDF partition (duplicated test)
-
 ### Enhancements
 
 * `contains_english_word()`, used heavily in text processing, is 10x faster.
@@ -57,6 +55,8 @@
 
 ### Fixes
 
+* Fix problem with PDF partition (duplicated test)
+
 ## 0.5.4
 
 ### Enhancements
diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py
@@ -190,6 +190,11 @@ def test_contains_verb(text, expected, monkeypatch):
         ("Parrot Beak", True),
         ("parrot beak", True),
         ("parrot!", True),
+        ("?parrot", True),
+        ("zombie?parrot", True),
+        ("notaWordHa 'parrot'", True),
+        ("notaWordHa'parrot'", False),
+        ('notaWordHa "parrot,"', True),
         ("daljdf adlfajldj ajadfa", False),
         ("BTAR ADFJA L", False),
         ("Unstructured Technologies", True),
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.8-dev5"  # pragma: no cover
+__version__ = "0.5.8-dev6"  # pragma: no cover
diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py
@@ -20,7 +20,8 @@
 from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
 
 POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
-ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")
+ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")
+NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]")
 
 
 def is_possible_narrative_text(
@@ -188,11 +189,16 @@ def contains_english_word(text: str) -> bool:
     text = text.lower()
     words = ENGLISH_WORD_SPLIT_RE.split(text)
     for word in words:
-        # NOTE(robinson) - to ignore punctuation at the ends of words like "best."
-        word = "".join([character for character in word if character.isalpha()])
+        # NOTE(Crag): Remove every character that is not a lowercase
+        # ASCII letter (a-z).  These removed chars will usually be trailing or
+        # leading characters not already matched in ENGLISH_WORD_SPLIT_RE.
+        # The possessive case is also generally ok:
+        #   "beggar's" -> "beggars" (still an english word)
+        # and of course:
+        #   "'beggars'"-> "beggars" (also still an english word)
+        word = NON_LOWERCASE_ALPHA_RE.sub("", word)
         if len(word) > 1 and word in ENGLISH_WORDS:
             return True
-
     return False
 
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.8-dev5" # pragma: no cover`
	`1`	`+__version__ = "0.5.8-dev6" # pragma: no cover`