Commit b1cce16

feat: translate_text cleaning brick (#101)
* initial implementation for translate brick
* more input validation
* tests for translate brick
* added docs
* bumped version
* chinese and arabic tests
* re-run pip-compile
* add torch to dependencies
* cleanup doc string
* fix long string
* fix typo in docs
* take out empty string check
* return string if string is empty
* added huggingface into make install
1 parent 1700d4d commit b1cce16

File tree

8 files changed (+203, -5 lines)


Diff for: CHANGELOG.md (+4)

@@ -1,3 +1,7 @@
+## 0.3.2-dev0
+
+* Added `translate_text` brick for translating text between languages
+
 ## 0.3.1
 
 * Added __init.py__ to `partition`

Diff for: Makefile (+1, -1)

@@ -17,7 +17,7 @@ install-base: install-base-pip-packages install-nltk-models
 
 ## install: installs all test, dev, and experimental requirements
 .PHONY: install
-install: install-base-pip-packages install-dev install-nltk-models install-test
+install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface
 
 .PHONY: install-ci
 install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface

Diff for: docs/source/bricks.rst (+31)

@@ -447,6 +447,37 @@ Examples:
     extract_text_after(text, r"SPEAKER \d{1}:")
 
 
+``translate_text``
+------------------
+
+The ``translate_text`` cleaning brick translates text between languages. ``translate_text``
+uses the `Helsinki NLP MT models <https://huggingface.co/Helsinki-NLP>`_ from
+``transformers`` for machine translation. It works for Russian, Chinese, Arabic, and many
+other languages.
+
+Parameters:
+
+* ``text``: the input string to translate.
+* ``source_lang``: the two-letter language code for the source language of the text.
+  If ``source_lang`` is not specified, the language will be detected using ``langdetect``.
+* ``target_lang``: the two-letter language code for the target language of the translation.
+  Defaults to ``"en"``.
+
+Examples:
+
+.. code:: python
+
+    from unstructured.cleaners.translate import translate_text
+
+    # Output is "I'm a Berliner!"
+    translate_text("Ich bin ein Berliner!")
+
+    # Output is "I can also translate Russian!"
+    translate_text("Я тоже можно переводать русский язык!", "ru", "en")
+
 #######
 Staging
 #######
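
A note on direction: the examples above translate into English, the default ``target_lang``. Translating out of English should work the same way whenever a matching Helsinki-NLP checkpoint exists on the HuggingFace hub; a minimal sketch, assuming the `opus-mt-en-de` model is available:

    from unstructured.cleaners.translate import translate_text

    # English -> German; translate_text resolves this to the
    # Helsinki-NLP/opus-mt-en-de model (assumed available on the hub)
    translate_text("I am a Berliner!", "en", "de")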

Diff for: requirements/huggingface.txt (+21, -3)

@@ -15,7 +15,9 @@ certifi==2022.9.24
 charset-normalizer==2.1.1
     # via requests
 click==8.1.3
-    # via nltk
+    # via
+    #   nltk
+    #   sacremoses
 deprecated==1.2.13
     # via argilla
 filelock==3.8.2
@@ -35,7 +37,11 @@ idna==3.4
     #   requests
     #   rfc3986
 joblib==1.2.0
-    # via nltk
+    # via
+    #   nltk
+    #   sacremoses
+langdetect==1.0.9
+    # via unstructured (setup.py)
 lxml==4.9.1
     # via unstructured (setup.py)
 monotonic==1.6
@@ -69,33 +75,45 @@ pyyaml==6.0
 regex==2022.10.31
     # via
     #   nltk
+    #   sacremoses
     #   transformers
 requests==2.28.1
     # via
     #   huggingface-hub
     #   transformers
 rfc3986[idna2008]==1.5.0
     # via httpx
+sacremoses==0.0.53
+    # via unstructured (setup.py)
+sentencepiece==0.1.97
+    # via unstructured (setup.py)
 six==1.16.0
-    # via python-dateutil
+    # via
+    #   langdetect
+    #   python-dateutil
+    #   sacremoses
 sniffio==1.3.0
     # via
     #   httpcore
     #   httpx
 tokenizers==0.13.2
     # via transformers
+torch==1.13.0
+    # via unstructured (setup.py)
 tqdm==4.64.1
     # via
     #   argilla
     #   huggingface-hub
     #   nltk
+    #   sacremoses
     #   transformers
 transformers==4.23.1
     # via unstructured (setup.py)
 typing-extensions==4.4.0
     # via
     #   huggingface-hub
     #   pydantic
+    #   torch
 urllib3==1.26.13
     # via requests
 wrapt==1.13.3

Diff for: setup.py (+4)

@@ -54,6 +54,10 @@
 ],
 extras_require={
     "huggingface": [
+        "langdetect",
+        "sacremoses",
+        "sentencepiece",
+        "torch",
         "transformers",
     ],
 },
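
These pins live under the `huggingface` key of `extras_require`, so translation support stays opt-in. Installing the extra should pull in all five dependencies; the `install-huggingface` Make target wired into `make install` above presumably does the equivalent from the pinned requirements file:

    pip install "unstructured[huggingface]"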

Diff for: test_unstructured/cleaners/test_translate.py (+55)

@@ -0,0 +1,55 @@
+import pytest
+
+import unstructured.cleaners.translate as translate
+
+
+def test_get_opus_mt_model_name():
+    model_name = translate._get_opus_mt_model_name("ru", "en")
+    assert model_name == "Helsinki-NLP/opus-mt-ru-en"
+
+
+@pytest.mark.parametrize("code", ["way-too-long", "a", "", None])
+def test_validate_language_code(code):
+    with pytest.raises(ValueError):
+        translate._validate_language_code(code)
+
+
+def test_translate_returns_same_text_if_dest_is_same():
+    text = "This is already in English!"
+    assert translate.translate_text(text, "en", "en") == text
+
+
+def test_translate_returns_same_text_if_text_is_empty():
+    text = " "
+    assert translate.translate_text(text) == text
+
+
+def test_translate_with_language_specified():
+    text = "Ich bin ein Berliner!"
+    assert translate.translate_text(text, "de") == "I'm a Berliner!"
+
+
+def test_translate_with_no_language_specified():
+    text = "Ich bin ein Berliner!"
+    assert translate.translate_text(text) == "I'm a Berliner!"
+
+
+def test_translate_raises_with_bad_language():
+    text = "Ich bin ein Berliner!"
+    with pytest.raises(ValueError):
+        translate.translate_text(text, "zz")
+
+
+def test_translate_works_with_russian():
+    text = "Я тоже можно переводать русский язык!"
+    assert translate.translate_text(text) == "I can also translate Russian!"
+
+
+def test_translate_works_with_chinese():
+    text = "網站有中、英文版本"
+    assert translate.translate_text(text) == "Website available in Chinese and English"
+
+
+def test_translate_works_with_arabic():
+    text = "مرحباً بكم في متجرنا"
+    assert translate.translate_text(text) == "Welcome to our store."

Diff for: unstructured/__version__.py (+1, -1)

@@ -1 +1 @@
-__version__ = "0.3.1" # pragma: no cover
+__version__ = "0.3.2-dev0" # pragma: no cover

Diff for: unstructured/cleaners/translate.py (+86)

@@ -0,0 +1,86 @@
+from typing import List, Optional
+import warnings
+
+import langdetect
+from transformers import MarianMTModel, MarianTokenizer
+
+from unstructured.staging.huggingface import chunk_by_attention_window
+from unstructured.nlp.tokenize import sent_tokenize
+
+
+def _get_opus_mt_model_name(source_lang: str, target_lang: str):
+    """Constructs the name of the MarianMT machine translation model based on the
+    source and target language."""
+    return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
+
+
+def _validate_language_code(language_code: str):
+    if not isinstance(language_code, str) or len(language_code) != 2:
+        raise ValueError(
+            f"Invalid language code: {language_code}. Language codes must be two letter strings."
+        )
+
+
+def translate_text(text, source_lang: Optional[str] = None, target_lang: str = "en") -> str:
+    """Translates the foreign language text. If the source language is not specified, the
+    function will attempt to detect it using langdetect.
+
+    Parameters
+    ----------
+    text: str
+        The text to translate
+    source_lang: Optional[str]
+        The two letter language code for the language of the input text. If source_lang is
+        not provided, the function will try to detect it.
+    target_lang: str
+        The two letter language code for the target language. Defaults to "en".
+    """
+    if text.strip() == "":
+        return text
+
+    _source_lang: str = source_lang if source_lang is not None else langdetect.detect(text)
+    # NOTE(robinson) - Chinese gets detected with codes zh-cn, zh-tw, zh-hk for various
+    # Chinese variants. We normalize these because there is a single model for Chinese
+    # machine translation
+    if _source_lang.startswith("zh"):
+        _source_lang = "zh"
+
+    _validate_language_code(target_lang)
+    _validate_language_code(_source_lang)
+
+    if target_lang == _source_lang:
+        return text
+
+    model_name = _get_opus_mt_model_name(_source_lang, target_lang)
+
+    try:
+        tokenizer = MarianTokenizer.from_pretrained(model_name)
+        model = MarianMTModel.from_pretrained(model_name)
+    except OSError:
+        raise ValueError(
+            f"Transformers could not find the translation model {model_name}. "
+            "The requested source/target language combo is not supported."
+        )
+
+    chunks: List[str] = chunk_by_attention_window(text, tokenizer, split_function=sent_tokenize)
+
+    translated_chunks: List[str] = list()
+    for chunk in chunks:
+        translated_chunks.append(_translate_text(chunk, model, tokenizer))
+
+    return " ".join(translated_chunks)
+
+
+def _translate_text(text, model, tokenizer):
+    """Translates text using the specified model and tokenizer."""
+    # NOTE(robinson) - Suppresses the HuggingFace UserWarning resulting from the "max_length"
+    # key in the MarianMT config. The warning states that "max_length" will be deprecated
+    # in transformers v5
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        translated = model.generate(
+            **tokenizer([text], return_tensors="pt", padding="max_length", max_length=512)
+        )
+    return [tokenizer.decode(t, max_new_tokens=512, skip_special_tokens=True) for t in translated][
+        0
+    ]
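
For inputs longer than the model's attention window, `translate_text` splits the text with `chunk_by_attention_window` (using `sent_tokenize` as the split function), translates each chunk, and re-joins the results with spaces. A minimal sketch of that chunking step in isolation, assuming the `Helsinki-NLP/opus-mt-de-en` checkpoint is available:

    from transformers import MarianTokenizer

    from unstructured.nlp.tokenize import sent_tokenize
    from unstructured.staging.huggingface import chunk_by_attention_window

    # A long German passage that would overflow a single 512-token window
    long_german_text = "Ich bin ein Berliner! " * 500

    # Split on sentence boundaries so each chunk fits the attention window;
    # translate_text then runs the model on every chunk independently.
    tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-de-en")
    chunks = chunk_by_attention_window(long_german_text, tokenizer, split_function=sent_tokenize)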
