Issue/unicode error (#608)

christinestraub · web-flow · commit a1fed6d4c691 · 2023-05-23T13:35:38.000-07:00
This PR adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.6.9-dev1
+## 0.6.9-dev2
 
 ### Enhancements
 
@@ -8,6 +8,7 @@
 
 ### Fixes
 
+* Adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding.
 * Adds additional MIME types for CSV
 
 ## 0.6.8
diff --git a/example-docs/fake-text-utf-16-le.txt b/example-docs/fake-text-utf-16-le.txt
diff --git a/example-docs/fake-text-utf-16.txt b/example-docs/fake-text-utf-16.txt
diff --git a/example-docs/fake-text-utf-32.txt b/example-docs/fake-text-utf-32.txt
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -18,6 +18,8 @@ certifi==2022.12.7
     #   unstructured (setup.py)
 cffi==1.15.1
     # via cryptography
+chardet==5.1.0
+    # via unstructured (setup.py)
 charset-normalizer==3.1.0
     # via
     #   pdfminer-six
diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -35,6 +35,8 @@ cffi==1.15.1
     # via argon2-cffi-bindings
 cfgv==3.3.1
     # via pre-commit
+chardet==5.1.0
+    # via -r requirements/dev.in
 click==8.1.3
     # via pip-tools
 comm==0.1.3
diff --git a/setup.py b/setup.py
@@ -51,6 +51,7 @@
     },
     install_requires=[
         "argilla",
+        "chardet",
         "lxml",
         "msg_parser",
         "nltk",
diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
@@ -30,6 +30,17 @@ def test_partition_text_from_filename(filename, encoding):
     assert elements == EXPECTED_OUTPUT
 
 
+@pytest.mark.parametrize(
+    "filename",
+    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
+)
+def test_partition_text_from_filename_default_encoding(filename):
+    """Partition the utf-16/utf-32 named example docs by path without passing an encoding."""
+    doc_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    elements = partition_text(filename=doc_path)
+    assert len(elements) > 0
+    assert elements == EXPECTED_OUTPUT
+
+
+
 @pytest.mark.parametrize(
     ("filename", "encoding", "error"),
     [
@@ -51,6 +62,18 @@ def test_partition_text_from_file():
     assert elements == EXPECTED_OUTPUT
 
 
+@pytest.mark.parametrize(
+    "filename",
+    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
+)
+def test_partition_text_from_file_default_encoding(filename):
+    """Partition the utf-16/utf-32 named example docs from a text-mode handle, no encoding given."""
+    doc_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    with open(doc_path) as handle:
+        elements = partition_text(file=handle)
+    assert len(elements) > 0
+    assert elements == EXPECTED_OUTPUT
+
+
+
 def test_partition_text_from_bytes_file():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
     with open(filename, "rb") as f:
@@ -59,6 +82,18 @@ def test_partition_text_from_bytes_file():
     assert elements == EXPECTED_OUTPUT
 
 
+@pytest.mark.parametrize(
+    "filename",
+    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
+)
+def test_partition_text_from_bytes_file_default_encoding(filename):
+    """Partition the utf-16/utf-32 named example docs from a binary handle, no encoding given."""
+    doc_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    with open(doc_path, "rb") as handle:
+        elements = partition_text(file=handle)
+    assert len(elements) > 0
+    assert elements == EXPECTED_OUTPUT
+
+
+
 def test_partition_text_from_text():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
     with open(filename) as f:
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.9-dev1"  # pragma: no cover
+__version__ = "0.6.9-dev2"  # pragma: no cover
diff --git a/unstructured/file_utils/encoding.py b/unstructured/file_utils/encoding.py
@@ -0,0 +1,103 @@
+from typing import IO, Optional, Tuple
+
+import chardet
+
+# Minimum chardet confidence for accepting its detected encoding; below this value
+# detect_file_encoding falls back to trying COMMON_ENCODINGS.
+ENCODE_REC_THRESHOLD = 0.5
+
+# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
+# Tried in list order by detect_file_encoding when chardet's result is rejected.
+# NOTE(review): iso_8859_1 maps every byte value, so it should decode any input and the
+# entries listed after it may never be reached — confirm the intended order.
+COMMON_ENCODINGS = [
+    "utf_8",
+    "iso_8859_1",
+    "ascii",
+    "big5",
+    "utf_16",
+    "utf_16_be",
+    "utf_16_le",
+    "utf_32",
+    "utf_32_be",
+    "utf_32_le",
+    "euc_jis_2004",
+    "euc_jisx0213",
+    "euc_jp",
+    "euc_kr",
+    "gb18030",
+    "shift_jis",
+    "shift_jis_2004",
+    "shift_jisx0213",
+]
+
+
+def detect_file_encoding(filename: str = "", file: Optional[IO] = None) -> Tuple[str, str]:
+    """Detect the text encoding of a file and decode its contents.
+
+    The raw bytes are read from ``filename`` when it is non-empty, otherwise from ``file``.
+    chardet's guess is used when its confidence is at least ENCODE_REC_THRESHOLD; otherwise
+    each entry of COMMON_ENCODINGS is tried in order against the same bytes.
+
+    Returns:
+        A ``(encoding, file_text)`` tuple: the encoding that was used and the decoded text.
+
+    Raises:
+        FileNotFoundError: if neither ``filename`` nor ``file`` is provided.
+        UnicodeDecodeError: if the selected encoding cannot decode the data, or no fallback
+            encoding succeeds.
+    """
+    if filename:
+        with open(filename, "rb") as f:
+            binary_data = f.read()
+    elif file:
+        # Streams without a ``mode`` attribute (e.g. io.BytesIO) are read directly instead of
+        # failing with AttributeError.
+        if "b" in getattr(file, "mode", "b"):
+            binary_data = file.read()
+        else:
+            # Text-mode handle: reopen the underlying path to get the undecoded bytes.
+            with open(file.name, "rb") as f:
+                binary_data = f.read()
+    else:
+        raise FileNotFoundError("No filename nor file were specified")
+
+    result = chardet.detect(binary_data)
+    encoding = result["encoding"]
+    confidence = result["confidence"]
+
+    if encoding is None or confidence < ENCODE_REC_THRESHOLD:
+        # Encoding detection failed, fallback to predefined encodings. Decode the bytes
+        # already in memory: reopening ``filename`` breaks when only ``file`` was supplied.
+        for enc in COMMON_ENCODINGS:
+            try:
+                file_text = binary_data.decode(enc)
+                encoding = enc
+                break
+            except (UnicodeDecodeError, UnicodeError):
+                continue
+        else:
+            raise UnicodeDecodeError(
+                "Unable to determine the encoding of the file or match it with any "
+                "of the specified encodings.",
+                binary_data,
+                0,
+                len(binary_data),
+                "Invalid encoding",
+            )
+
+    else:
+        file_text = binary_data.decode(encoding)
+
+    return encoding, file_text
+
+
+def read_txt_file(
+    filename: str = "",
+    file: Optional[IO] = None,
+    encoding: Optional[str] = None,
+) -> Tuple[str, str]:
+    """Read a plain text document and return ``(encoding, file_text)``.
+
+    The source is ``filename`` when it is non-empty, otherwise ``file``. When ``encoding`` is
+    given it is used as-is and any decoding error propagates to the caller; when it is not
+    given, the encoding is determined by ``detect_file_encoding``.
+
+    Raises:
+        FileNotFoundError: if neither ``filename`` nor ``file`` is provided.
+    """
+    if not filename and not file:
+        raise FileNotFoundError("No filename was specified")
+
+    if not encoding:
+        # No encoding requested: let the detector pick one and decode.
+        if filename:
+            return detect_file_encoding(filename)
+        return detect_file_encoding(file=file)
+
+    if filename:
+        with open(filename, encoding=encoding) as f:
+            return encoding, f.read()
+
+    raw = file.read()
+    # Binary handles yield bytes that still need decoding; text handles are returned unchanged.
+    file_text = raw.decode(encoding) if isinstance(raw, bytes) else raw
+    return encoding, file_text
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
@@ -11,6 +11,7 @@
     Text,
     Title,
 )
+from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN
 from unstructured.partition.common import exactly_one
@@ -31,7 +32,7 @@ def partition_text(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
     text: Optional[str] = None,
-    encoding: Optional[str] = "utf-8",
+    encoding: Optional[str] = None,
     paragraph_grouper: Optional[Callable[[str], str]] = None,
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
@@ -60,16 +61,10 @@ def partition_text(
     exactly_one(filename=filename, file=file, text=text)
 
     if filename is not None:
-        with open(filename, encoding=encoding) as f:
-            try:
-                file_text = f.read()
-            except (UnicodeDecodeError, UnicodeError) as error:
-                raise error
+        encoding, file_text = read_txt_file(filename=filename, encoding=encoding)
 
     elif file is not None:
-        file_text = file.read()
-        if isinstance(file_text, bytes):
-            file_text = file_text.decode(encoding)
+        encoding, file_text = read_txt_file(file=file, encoding=encoding)
 
     elif text is not None:
         file_text = str(text)

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.6.9-dev1" # pragma: no cover`
	`1`	`+__version__ = "0.6.9-dev2" # pragma: no cover`