Skip to content

Commit a7ca58e

Browse files
authored
fix: more english words; split on punctuation (#191)
* add a bigger list of english words * update thresholds and add tests * update docs; bump version * fix version * add additional english words back in * linting, linting, linting * add slashes * work -> word
1 parent 0589344 commit a7ca58e

File tree

8 files changed

+427177
-19
lines changed

8 files changed

+427177
-19
lines changed

Diff for: CHANGELOG.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.4.5-dev4
1+
## 0.4.5-dev5
22

33
* Loosen the default cap threshold to `0.5`.
44
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
@@ -10,7 +10,7 @@
1010
* Adds an `Address` element for capturing elements that only contain an address.
1111
* Suppress the `UserWarning` when detectron is called.
1212
* Checks that titles and narrative text have at least one English word.
13-
* Checks that titles and narrative text are at least 75% alpha characters.
13+
* Checks that titles and narrative text are at least 50% alpha characters.
1414
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
1515
environment variable for controlling the max number of words in a title.
1616

Diff for: docs/source/bricks.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ for consideration as narrative text. The function performs the following checks
252252
takes precedence over the kwarg.
253253
* If the text contains too many non-alpha characters it is
254254
not narrative text.
255-
The default is to expect a minimum of 75% alpha characters
255+
The default is to expect a minimum of 50% alpha characters
256256
(not counting spaces). You can change the minimum value with the
257257
``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_RATIO`` environment variable.
258258
The environment variable takes precedence over the kwarg.
@@ -290,7 +290,7 @@ for consideration as a title. The function performs the following checks:
290290
the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
291291
variable takes precedence over the kwarg.
292292
* If a text contains too many non-alpha characters it is not a
293-
title. The default is to expect a minimum of 75% alpha characters
293+
title. The default is to expect a minimum of 50% alpha characters
294294
(not counting spaces). You can change the minimum value with the
295295
``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_TITLE_NON_ALPHA_RATIO`` environment variable.
296296
The environment variable takes precedence over the kwarg.

Diff for: setup.py

+2
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,6 @@
7272
],
7373
"local-inference": ["unstructured-inference>=0.2.4"],
7474
},
75+
package_dir={"unstructured": "unstructured"},
76+
package_data={"unstructured": ["nlp/*.txt"]},
7577
)

Diff for: test_unstructured/partition/test_text_type.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
4646
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
4747
monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
4848
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
49-
has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
50-
assert has_verb is expected
49+
is_possible_narrative = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
50+
assert is_possible_narrative is expected
5151

5252

5353
@pytest.mark.parametrize(
@@ -65,6 +65,9 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
6565
("BTAR ADFJA L", False), # Doesn't have english words
6666
("ITEM 1A. RISK FACTORS " * 15, False), # Title is too long
6767
("/--------BREAK-------/", False), # Contains too many non-alpha characters
68+
("1.A.RISKS", True), # Tests that "RISKS" gets flagged as an english word
69+
("1. Unstructured Technologies", True), # Make sure we're English words :-)
70+
("Big/Brown/Sheet", True),
6871
],
6972
)
7073
def test_is_possible_title(text, expected, monkeypatch):
@@ -144,11 +147,12 @@ def test_contains_verb(text, expected, monkeypatch):
144147
("daljdf adlfajldj ajadfa", False),
145148
("BTAR ADFJA L", False),
146149
("Unstructured Technologies", True),
150+
("1.A.RISKS", True), # Test crammed together words get picked up
151+
("Big/Brown/Sheep", True),
147152
],
148153
)
149154
def test_contains_english_word(text, expected, monkeypatch):
150-
has_verb = text_type.contains_english_word(text)
151-
assert has_verb is expected
155+
assert text_type.contains_english_word(text) is expected
152156

153157

154158
@pytest.mark.parametrize(

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.5-dev4" # pragma: no cover
1+
__version__ = "0.4.5-dev5" # pragma: no cover

0 commit comments

Comments
 (0)