Unstructured-IO
diff --git a/‎CHANGELOG.md
Lines changed: 2 additions & 2 deletions b/‎CHANGELOG.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/source/bricks.rst
Lines changed: 2 additions & 2 deletions b/‎docs/source/bricks.rst
Lines changed: 2 additions & 2 deletions
diff --git a/‎setup.py
Lines changed: 2 additions & 0 deletions b/‎setup.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎test_unstructured/partition/test_text_type.py
Lines changed: 8 additions & 4 deletions b/‎test_unstructured/partition/test_text_type.py
Lines changed: 8 additions & 4 deletions
diff --git a/‎unstructured/__version__.py
Lines changed: 1 addition & 1 deletion b/‎unstructured/__version__.py
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-## 0.4.5-dev4
+## 0.4.5-dev5
 
 * Loosen the default cap threshold to `0.5`.
 * Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
@@ -10,7 +10,7 @@
 * Adds an `Address` element for capturing elements that only contain an address.
 * Suppress the `UserWarning` when detectron is called.
 * Checks that titles and narrative text have at least one English word.
-* Checks that titles and narrative text are at least 75% alpha characters.
+* Checks that titles and narrative text are at least 50% alpha characters.
 * Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
   environment variable for controlling the max number of words in a title.
 
 
@@ -252,7 +252,7 @@ for consideration as narrative text. The function performs the following checks
   takes precedence over the kwarg.
 * If the text contains too many non-alpha characters it is
   not narrative text.
-  The default is to expect a minimum of 75% alpha characters
+  The default is to expect a minimum of 50% alpha characters
   (not counting spaces). You can change the minimum value with the
   ``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_RATIO`` environment variable.
   The environment variable takes precedence over the kwarg.
@@ -290,7 +290,7 @@ for consideration as a title. The function performs the following checks:
   the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
   variable takes precedence over the kwarg.
 * If a text contains too many non-alpha characters it is not a
-  title. The default is to expect a minimum of 75% alpha characters
+  title. The default is to expect a minimum of 50% alpha characters
   (not counting spaces). You can change the minimum value with the
   ``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_TITLE_NON_ALPHA_RATIO`` environment variable.
   The environment variable takes precedence over the kwarg.
 
@@ -72,4 +72,6 @@
         ],
         "local-inference": ["unstructured-inference>=0.2.4"],
     },
+    package_dir={"unstructured": "unstructured"},
+    package_data={"unstructured": ["nlp/*.txt"]},
 )
@@ -46,8 +46,8 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
     monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
     monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
     monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
-    has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
-    assert has_verb is expected
+    is_possible_narrative = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
+    assert is_possible_narrative is expected
 
 
 @pytest.mark.parametrize(
@@ -65,6 +65,9 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
         ("BTAR ADFJA L", False),  # Doesn't have english words
         ("ITEM 1A. RISK FACTORS " * 15, False),  # Title is too long
         ("/--------BREAK-------/", False),  # Contains too many non-alpha characters
+        ("1.A.RISKS", True),  # Tests that "RISKS" gets flagged as an english word
+        ("1. Unstructured Technologies", True),  # Make sure we're English words :-)
+        ("Big/Brown/Sheet", True),
     ],
 )
 def test_is_possible_title(text, expected, monkeypatch):
@@ -144,11 +147,12 @@ def test_contains_verb(text, expected, monkeypatch):
         ("daljdf adlfajldj ajadfa", False),
         ("BTAR ADFJA L", False),
         ("Unstructured Technologies", True),
+        ("1.A.RISKS", True),  # Test crammed together words get picked up
+        ("Big/Brown/Sheep", True),
     ],
 )
 def test_contains_english_word(text, expected, monkeypatch):
-    has_verb = text_type.contains_english_word(text)
-    assert has_verb is expected
+    assert text_type.contains_english_word(text) is expected
 
 
 @pytest.mark.parametrize(
 
@@ -1 +1 @@
-__version__ = "0.4.5-dev4"  # pragma: no cover
+__version__ = "0.4.5-dev5"  # pragma: no cover
Original file line number	Diff line number	Diff line change
`@@ -72,4 +72,6 @@`
`72`	`72`	`],`
`73`	`73`	`"local-inference": ["unstructured-inference>=0.2.4"],`
`74`	`74`	`},`
	`75`	`+ package_dir={"unstructured": "unstructured"},`
	`76`	`+ package_data={"unstructured": ["nlp/*.txt"]},`
`75`	`77`	`)`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.4.5-dev4" # pragma: no cover`
	`1`	`+__version__ = "0.4.5-dev5" # pragma: no cover`