Skip to content

Commit 0589344

Browse files
authored
fix: require a minimum prop of alpha characters for titles and narrative text (#190)
* added alpha ratio check * added tests for alpha ratio * bump changelog and update docs * update changelog/version; update docs * ofr -> or
1 parent 1230a16 commit 0589344

File tree

5 files changed

+107
-16
lines changed

5 files changed

+107
-16
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.4.5-dev3
1+
## 0.4.5-dev4
22

33
* Loosen the default cap threshold to `0.5`.
44
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
@@ -10,6 +10,7 @@
1010
* Adds an `Address` element for capturing elements that only contain an address.
1111
* Suppress the `UserWarning` when detectron is called.
1212
* Checks that titles and narrative text have at least one English word.
13+
* Checks that titles and narrative text are at least 75% alpha characters.
1314
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
1415
environment variable for controlling the max number of words in a title.
1516

Diff for: docs/source/bricks.rst

+12-1
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,12 @@ for consideration as narrative text. The function performs the following checks
250250
``cap_threshold=1.0``. You can also set the threshold by using the
251251
``UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable
252252
takes precedence over the kwarg.
253+
* If the text contains too many non-alpha characters it is
254+
not narrative text.
255+
The default is to expect a minimum of 75% alpha characters
256+
(not counting spaces). You can change the minimum value with the
257+
``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD`` environment variable.
258+
The environment variable takes precedence over the kwarg.
253259
* The cap ratio test does not apply to text that is all uppercase.
254260

255261

@@ -280,9 +286,14 @@ for consideration as a title. The function performs the following checks:
280286

281287
* Empty text cannot be a title
282288
* Text that is all numeric cannot be a title.
283-
* If a title contains too many words it is not a title. The default max length is ``15``. You can change the max length with
289+
* If a title contains too many words it is not a title. The default max length is ``12``. You can change the max length with
284290
the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
285291
variable takes precedence over the kwarg.
292+
* If the text contains too many non-alpha characters it is not a
293+
title. The default is to expect a minimum of 75% alpha characters
294+
(not counting spaces). You can change the minimum value with the
295+
``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD`` environment variable.
296+
The environment variable takes precedence over the kwarg.
286297
* Narrative text must contain at least one English word (if ``language`` is set to "en")
287298
* If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5.
288299
* If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles.

Diff for: test_unstructured/partition/test_text_type.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def test_headings_are_not_narrative_text(text, expected):
3838
("7", False), # Fails because it is numeric
3939
("intellectual property", False), # Fails because it does not contain a verb
4040
("Dal;kdjfal adawels adfjwalsdf. Addad jaja fjawlek", False),
41+
("---------------Aske the teacher for an apple----------", False), # Too many non-alpha
4142
("", False), # Doesn't have english words # Fails because it is empty
4243
],
4344
)
@@ -63,13 +64,13 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
6364
("To My Dearest Friends,", False), # Ends with a comma
6465
("BTAR ADFJA L", False), # Doesn't have english words
6566
("ITEM 1A. RISK FACTORS " * 15, False), # Title is too long
67+
("/--------BREAK-------/", False), # Contains too many non-alpha characters
6668
],
6769
)
6870
def test_is_possible_title(text, expected, monkeypatch):
6971
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
7072
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
71-
has_verb = text_type.is_possible_title(text)
72-
assert has_verb is expected
73+
assert text_type.is_possible_title(text) is expected
7374

7475

7576
@pytest.mark.parametrize(
@@ -178,6 +179,30 @@ def test_set_caps_ratio_with_environment_variable(monkeypatch):
178179
mock_exceeds.assert_called_once_with(text, threshold=0.8)
179180

180181

182+
def test_set_title_non_alpha_threshold_with_environment_variable(monkeypatch):
    """Checks that UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD overrides the default threshold
    that is_possible_title passes to under_non_alpha_ratio."""
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    # NOTE - Environment variable values are strings, so set "0.8" rather than the float 0.8.
    # The function under test converts the value back with float().
    monkeypatch.setenv("UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD", "0.8")

    text = "/--------------- All the king's horses----------------/"
    with patch.object(text_type, "under_non_alpha_ratio", return_value=False) as mock_under:
        text_type.is_possible_title(text)

    # The threshold must arrive as the parsed float, not the kwarg default of 0.75
    mock_under.assert_called_once_with(text, threshold=0.8)
192+
193+
194+
def test_set_narrative_text_non_alpha_threshold_with_environment_variable(monkeypatch):
    """Checks that UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD overrides the default
    threshold that is_possible_narrative_text passes to under_non_alpha_ratio."""
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    # NOTE - Environment variable values are strings, so set "0.8" rather than the float 0.8.
    # The function under test converts the value back with float().
    monkeypatch.setenv("UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD", "0.8")

    text = "/--------------- All the king's horses----------------/"
    with patch.object(text_type, "under_non_alpha_ratio", return_value=False) as mock_under:
        text_type.is_possible_narrative_text(text)

    # The threshold must arrive as the parsed float, not the kwarg default of 0.75
    mock_under.assert_called_once_with(text, threshold=0.8)
204+
205+
181206
def test_set_title_max_word_length_with_environment_variable(monkeypatch):
182207
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
183208
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.5-dev3" # pragma: no cover
1+
__version__ = "0.4.5-dev4" # pragma: no cover

Diff for: unstructured/partition/text_type.py

+65-11
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
2020

2121

22-
def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language: str = "en") -> bool:
22+
def is_possible_narrative_text(
23+
text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.75, language: str = "en"
24+
) -> bool:
2325
"""Checks to see if the text passes all of the checks for a narrative text section.
2426
You can change the cap threshold using the cap_threshold kwarg or the
2527
UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD environment variable. The environment variable takes
@@ -28,11 +30,14 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
2830
Parameters
2931
----------
3032
text
31-
the input text
33+
The input text to check
3234
cap_threshold
33-
the percentage of capitalized words necessary to disqualify the segment as narrative
35+
The percentage of capitalized words necessary to disqualify the segment as narrative
36+
non_alpha_threshold
37+
The minimum proportion of alpha characters the text needs to be considered
38+
narrative text
3439
language
35-
the two letter language code for the text. defaults to "en" for English
40+
The two letter language code for the text. defaults to "en" for English
3641
"""
3742
if len(text) == 0:
3843
logger.debug("Not narrative. Text is empty.")
@@ -54,6 +59,12 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
5459
logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
5560
return False
5661

62+
non_alpha_threshold = float(
63+
os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD", non_alpha_threshold)
64+
)
65+
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
66+
return False
67+
5768
if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
5869
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
5970
return False
@@ -62,20 +73,26 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
6273

6374

6475
def is_possible_title(
65-
text: str, sentence_min_length: int = 5, title_max_word_length: int = 12, language: str = "en"
76+
text: str,
77+
sentence_min_length: int = 5,
78+
title_max_word_length: int = 12,
79+
non_alpha_threshold: float = 0.75,
80+
language: str = "en",
6681
) -> bool:
6782
"""Checks to see if the text passes all of the checks for a valid title.
6883
6984
Parameters
7085
----------
7186
text
72-
the input text
87+
The input text to check
7388
sentence_min_length
74-
the minimum number of words required to consider a section of text a sentence
89+
The minimum number of words required to consider a section of text a sentence
7590
title_max_word_length
76-
the maximum number of words a title can contain
91+
The maximum number of words a title can contain
92+
non_alpha_threshold
93+
The minimum proportion of alpha characters the text needs to be considered a title
7794
language
78-
the two letter language code for the text. defaults to "en" for English
95+
The two letter language code for the text. defaults to "en" for English
7996
"""
8097
if len(text) == 0:
8198
logger.debug("Not a title. Text is empty.")
@@ -89,6 +106,12 @@ def is_possible_title(
89106
if len(text.split(" ")) > title_max_word_length:
90107
return False
91108

109+
non_alpha_threshold = float(
110+
os.environ.get("UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD", non_alpha_threshold)
111+
)
112+
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
113+
return False
114+
92115
# NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
93116
if text.endswith(","):
94117
return False
@@ -177,9 +200,40 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
177200
return count
178201

179202

203+
def under_non_alpha_ratio(text: str, threshold: float = 0.75) -> bool:
    """Checks if the proportion of alpha characters in the text snippet falls below a given
    threshold. This helps prevent text like "-----------BREAK---------" from being tagged
    as a title or narrative text. The ratio does not count whitespace.

    Parameters
    ----------
    text
        The input string to test
    threshold
        If the proportion of alpha characters is below this threshold, the function
        returns True

    Returns
    -------
    bool
        True when the share of alpha characters (among non-whitespace characters) is
        strictly below the threshold. False otherwise, including for empty or
        whitespace-only text.
    """
    non_space_chars = [char for char in text if not char.isspace()]
    # NOTE - Empty and whitespace-only text has no characters to measure. Returning early
    # also avoids a ZeroDivisionError when computing the ratio below.
    if not non_space_chars:
        return False

    alpha_count = sum(1 for char in non_space_chars if char.isalpha())
    return alpha_count / len(non_space_chars) < threshold
223+
224+
180225
def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
181-
"""Checks the title ratio in a section of text. If a sufficient proportion of the text is
182-
capitalized."""
226+
"""Checks the title ratio in a section of text. If a sufficient proportion of the words
227+
are capitalized, that can be indicative of non-narrative text (i.e. "1A. Risk Factors").
228+
229+
Parameters
230+
----------
231+
text
232+
The input string to test
233+
threshold
234+
If the percentage of words beginning with a capital letter exceeds this threshold,
235+
the function returns True
236+
"""
183237
# NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
184238
# The assumption is that sections with multiple sentences are not titles.
185239
if sentence_count(text, 3) > 1:

0 commit comments

Comments
 (0)