Skip to content

Commit a1fed6d

Browse files
Issue/unicode error (#608)
This PR adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding.
1 parent a787196 commit a1fed6d

File tree

11 files changed

+150
-11
lines changed

11 files changed

+150
-11
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.6.9-dev1
1+
## 0.6.9-dev2
22

33
### Enhancements
44

@@ -8,6 +8,7 @@
88

99
### Fixes
1010

11+
* Adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding.
1112
* Adds additional MIME types for CSV
1213

1314
## 0.6.8

Diff for: example-docs/fake-text-utf-16-le.txt

376 Bytes
Binary file not shown.

Diff for: example-docs/fake-text-utf-16.txt

378 Bytes
Binary file not shown.

Diff for: example-docs/fake-text-utf-32.txt

756 Bytes
Binary file not shown.

Diff for: requirements/base.txt

+2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ certifi==2022.12.7
1818
# unstructured (setup.py)
1919
cffi==1.15.1
2020
# via cryptography
21+
chardet==5.1.0
22+
# via unstructured (setup.py)
2123
charset-normalizer==3.1.0
2224
# via
2325
# pdfminer-six

Diff for: requirements/dev.txt

+2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ cffi==1.15.1
3535
# via argon2-cffi-bindings
3636
cfgv==3.3.1
3737
# via pre-commit
38+
chardet==5.1.0
39+
# via -r requirements/dev.in
3840
click==8.1.3
3941
# via pip-tools
4042
comm==0.1.3

Diff for: setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
},
5252
install_requires=[
5353
"argilla",
54+
"chardet",
5455
"lxml",
5556
"msg_parser",
5657
"nltk",

Diff for: test_unstructured/partition/test_text.py

+35
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,17 @@ def test_partition_text_from_filename(filename, encoding):
3030
assert elements == EXPECTED_OUTPUT
3131

3232

33+
@pytest.mark.parametrize(
    "filename",
    [
        "fake-text-utf-16.txt",
        "fake-text-utf-16-le.txt",
        "fake-text-utf-32.txt",
    ],
)
def test_partition_text_from_filename_default_encoding(filename):
    """Partition UTF-16/UTF-32 example docs by path without passing an encoding."""
    doc_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    parsed = partition_text(filename=doc_path)
    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
42+
43+
3344
@pytest.mark.parametrize(
3445
("filename", "encoding", "error"),
3546
[
@@ -51,6 +62,18 @@ def test_partition_text_from_file():
5162
assert elements == EXPECTED_OUTPUT
5263

5364

65+
@pytest.mark.parametrize(
    "filename",
    [
        "fake-text-utf-16.txt",
        "fake-text-utf-16-le.txt",
        "fake-text-utf-32.txt",
    ],
)
def test_partition_text_from_file_default_encoding(filename):
    """Partition UTF-16/UTF-32 example docs via a text-mode handle, no encoding given."""
    doc_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    # The handle is opened in text mode with no explicit encoding on purpose:
    # the test only passes the handle along, it never reads from it here.
    with open(doc_path) as text_handle:
        parsed = partition_text(file=text_handle)
    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
75+
76+
5477
def test_partition_text_from_bytes_file():
5578
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
5679
with open(filename, "rb") as f:
@@ -59,6 +82,18 @@ def test_partition_text_from_bytes_file():
5982
assert elements == EXPECTED_OUTPUT
6083

6184

85+
@pytest.mark.parametrize(
    "filename",
    [
        "fake-text-utf-16.txt",
        "fake-text-utf-16-le.txt",
        "fake-text-utf-32.txt",
    ],
)
def test_partition_text_from_bytes_file_default_encoding(filename):
    """Partition UTF-16/UTF-32 example docs via a binary handle, no encoding given."""
    doc_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(doc_path, "rb") as raw_handle:
        parsed = partition_text(file=raw_handle)
    assert len(parsed) > 0
    assert parsed == EXPECTED_OUTPUT
95+
96+
6297
def test_partition_text_from_text():
6398
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
6499
with open(filename) as f:

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.9-dev1" # pragma: no cover
1+
__version__ = "0.6.9-dev2" # pragma: no cover

Diff for: unstructured/file_utils/encoding.py

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
from typing import IO, Optional, Tuple
2+
3+
import chardet
4+
5+
ENCODE_REC_THRESHOLD = 0.5
6+
7+
# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
8+
COMMON_ENCODINGS = [
9+
"utf_8",
10+
"iso_8859_1",
11+
"ascii",
12+
"big5",
13+
"utf_16",
14+
"utf_16_be",
15+
"utf_16_le",
16+
"utf_32",
17+
"utf_32_be",
18+
"utf_32_le",
19+
"euc_jis_2004",
20+
"euc_jisx0213",
21+
"euc_jp",
22+
"euc_kr",
23+
"gb18030",
24+
"shift_jis",
25+
"shift_jis_2004",
26+
"shift_jisx0213",
27+
]
28+
29+
30+
def detect_file_encoding(filename: str = "", file: Optional[IO] = None) -> Tuple[str, str]:
    """Detect the encoding of a text document and return it with the decoded text.

    The raw bytes are read once, handed to ``chardet.detect``, and decoded with
    the reported encoding when its confidence is at least
    ``ENCODE_REC_THRESHOLD``. If detection is inconclusive, or the reported
    encoding cannot decode the data, each entry of ``COMMON_ENCODINGS`` is
    tried in order against the same in-memory bytes.

    Args:
        filename: Path of the document. Takes precedence over ``file``.
        file: An open file-like object. A handle whose ``mode`` is a string
            without ``"b"`` is treated as text-mode and its raw bytes are
            re-read from ``file.name``; any other handle is read directly.

    Returns:
        A ``(encoding, text)`` tuple: the encoding name that was used and the
        decoded document text.

    Raises:
        FileNotFoundError: If neither ``filename`` nor ``file`` is given.
        UnicodeDecodeError: If no candidate encoding can decode the data.
    """
    if filename:
        with open(filename, "rb") as f:
            binary_data = f.read()
    elif file is not None:
        # Not every file-like object exposes ``mode`` (``io.BytesIO`` does not)
        # and some expose a non-string one, so only a string mode lacking "b"
        # is taken as evidence of a text-mode handle.
        mode = getattr(file, "mode", None)
        if isinstance(mode, str) and "b" not in mode:
            # Reading through the text layer would decode with whatever
            # encoding the handle was opened with; fetch the raw bytes instead.
            with open(file.name, "rb") as f:
                binary_data = f.read()
        else:
            binary_data = file.read()
            if isinstance(binary_data, str):
                # In-memory text stream: the content is already decoded, so
                # re-encode it to give detection a uniform bytes input.
                binary_data = binary_data.encode("utf-8")
    else:
        raise FileNotFoundError("No filename nor file were specified")

    result = chardet.detect(binary_data)
    encoding = result["encoding"]
    confidence = result["confidence"]

    file_text = None
    if encoding is not None and confidence >= ENCODE_REC_THRESHOLD:
        try:
            file_text = binary_data.decode(encoding)
        except (UnicodeError, LookupError):
            # The guess was wrong, or names a codec Python does not know;
            # fall back to the predefined encodings below.
            file_text = None

    if file_text is None:
        # Decode the bytes already in memory rather than re-opening
        # ``filename``: it is empty when the caller passed ``file``.
        # NOTE(review): "iso_8859_1" maps every byte value, so entries listed
        # after it in COMMON_ENCODINGS are effectively never reached.
        for enc in COMMON_ENCODINGS:
            try:
                file_text = binary_data.decode(enc)
            except UnicodeError:
                continue
            encoding = enc
            break
        else:
            raise UnicodeDecodeError(
                "Unable to determine the encoding of the file or match it with any "
                "of the specified encodings.",
                binary_data,
                0,
                len(binary_data),
                "Invalid encoding",
            )

    return encoding, file_text
71+
72+
73+
def read_txt_file(
    filename: str = "",
    file: Optional[IO] = None,
    encoding: Optional[str] = None,
) -> Tuple[str, str]:
    """Read a plain text document and return its encoding and decoded text.

    When ``encoding`` is given it is used as-is and decoding errors propagate
    to the caller. When it is not given, the work is delegated to
    ``detect_file_encoding``.

    Args:
        filename: Path of the document. Takes precedence over ``file``.
        file: An open file-like object. Its content may be ``bytes`` (decoded
            with ``encoding``) or ``str`` (returned unchanged).
        encoding: Encoding to decode with; a falsy value requests detection.

    Returns:
        A ``(encoding, text)`` tuple.

    Raises:
        FileNotFoundError: If neither ``filename`` nor ``file`` is given.
        UnicodeError: If the content cannot be decoded with ``encoding``.
    """
    if filename:
        if not encoding:
            return detect_file_encoding(filename)
        with open(filename, encoding=encoding) as f:
            return encoding, f.read()

    # Compare against None so the branch depends only on whether a handle was
    # supplied, not on the handle's truthiness.
    if file is not None:
        if not encoding:
            return detect_file_encoding(file=file)
        file_content = file.read()
        if isinstance(file_content, bytes):
            return encoding, file_content.decode(encoding)
        # Text-mode handles already yield ``str``; nothing left to decode.
        return encoding, file_content

    raise FileNotFoundError("No filename was specified")

Diff for: unstructured/partition/text.py

+4-9
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
Text,
1212
Title,
1313
)
14+
from unstructured.file_utils.encoding import read_txt_file
1415
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
1516
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
1617
from unstructured.partition.common import exactly_one
@@ -31,7 +32,7 @@ def partition_text(
3132
filename: Optional[str] = None,
3233
file: Optional[IO] = None,
3334
text: Optional[str] = None,
34-
encoding: Optional[str] = "utf-8",
35+
encoding: Optional[str] = None,
3536
paragraph_grouper: Optional[Callable[[str], str]] = None,
3637
metadata_filename: Optional[str] = None,
3738
include_metadata: bool = True,
@@ -60,16 +61,10 @@ def partition_text(
6061
exactly_one(filename=filename, file=file, text=text)
6162

6263
if filename is not None:
63-
with open(filename, encoding=encoding) as f:
64-
try:
65-
file_text = f.read()
66-
except (UnicodeDecodeError, UnicodeError) as error:
67-
raise error
64+
encoding, file_text = read_txt_file(filename=filename, encoding=encoding)
6865

6966
elif file is not None:
70-
file_text = file.read()
71-
if isinstance(file_text, bytes):
72-
file_text = file_text.decode(encoding)
67+
encoding, file_text = read_txt_file(file=file, encoding=encoding)
7368

7469
elif text is not None:
7570
file_text = str(text)

0 commit comments

Comments
 (0)