Skip to content

Commit 558ee63

Browse files
authored
feat: ability to skip English language specific checks with env var (#224)
* add language env var * update docs * version and bump change log
1 parent a68dc35 commit 558ee63

File tree

5 files changed

+34
-3
lines changed

5 files changed

+34
-3
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
## 0.4.9-dev1
1+
## 0.4.9-dev2
22

33
* Added ingest modules and s3 connector
44
* Default to `url=None` for `partition_pdf` and `partition_image`
5+
* Add ability to skip English-specific checks by setting the `UNSTRUCTURED_LANGUAGE` env var to `""`.
56

67
## 0.4.8
78

Diff for: docs/source/bricks.rst

+3
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ for consideration as narrative text. The function performs the following checks
280280
``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_RATIO`` environment variable.
281281
The environment variable takes precedence over the kwarg.
282282
* The cap ratio test does not apply to text that is all uppercase.
283+
* If you use the ``language=""`` kwarg or set the ``UNSTRUCTURED_LANGUAGE`` environment variable to ``""``, the function will skip the verb check and the English word check.
283284

284285

285286
Examples:
@@ -320,6 +321,8 @@ for consideration as a title. The function performs the following checks:
320321
* The text must contain at least one English word (if ``language`` is set to "en")
321322
* If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5.
322323
* If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles.
324+
* If you use the ``language=""`` kwarg or set the ``UNSTRUCTURED_LANGUAGE`` environment variable to ``""``, the function will skip the English word check.
325+
323326

324327

325328
Examples:

Diff for: test_unstructured/partition/test_text_type.py

+25
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,31 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
5050
assert is_possible_narrative is expected
5151

5252

53+
def test_text_type_handles_non_english_examples():
54+
narrative_text = "Я говорю по-русски. Вы тоже?"
55+
title = "Риски"
56+
57+
assert text_type.is_possible_narrative_text(narrative_text, language="en") is False
58+
assert text_type.is_possible_narrative_text(narrative_text, language="") is True
59+
60+
assert text_type.is_possible_narrative_text(title, language="en") is False
61+
assert text_type.is_possible_narrative_text(title, language="") is False
62+
63+
assert text_type.is_possible_title(title, language="en") is False
64+
assert text_type.is_possible_title(title, language="") is True
65+
66+
67+
def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
68+
monkeypatch.setenv("UNSTRUCTURED_LANGUAGE", "")
69+
70+
narrative_text = "Я говорю по-русски. Вы тоже?"
71+
title = "Риски"
72+
73+
assert text_type.is_possible_narrative_text(narrative_text) is True
74+
assert text_type.is_possible_narrative_text(title) is False
75+
assert text_type.is_possible_title(title) is True
76+
77+
5378
@pytest.mark.parametrize(
5479
"text, expected",
5580
[

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.9-dev1" # pragma: no cover
1+
__version__ = "0.4.9-dev2" # pragma: no cover

Diff for: unstructured/partition/text_type.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def is_possible_narrative_text(
4949
logger.debug(f"Not narrative. Text is all numeric:\n\n{text}")
5050
return False
5151

52+
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
5253
if language == "en" and not contains_english_word(text):
5354
return False
5455

@@ -67,7 +68,7 @@ def is_possible_narrative_text(
6768
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
6869
return False
6970

70-
if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
71+
if (sentence_count(text, 3) < 2) and (not contains_verb(text)) and language == "en":
7172
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
7273
return False
7374

@@ -118,6 +119,7 @@ def is_possible_title(
118119
if text.endswith(","):
119120
return False
120121

122+
language = os.environ.get("UNSTRUCTURED_LANGUAGE", language)
121123
if language == "en" and not contains_english_word(text):
122124
return False
123125

0 commit comments

Comments
 (0)