Skip to content

Commit b855fd2

Browse files
authored
fix: fix html encoding to support foreign characters (#452)
* fix: fix html encoding to support foreign characters * version and changelog
1 parent 1dca0db commit b855fd2

File tree

4 files changed

+17
-3
lines changed

4 files changed

+17
-3
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.5.11-dev0
1+
## 0.5.11
22

33
### Enhancements
44

@@ -7,6 +7,7 @@
77
### Fixes
88

99
* Guard against null style attribute in docx document elements
10+
* Update HTML encoding to better support foreign language characters
1011

1112
## 0.5.10
1213

test_unstructured/partition/test_html_partition.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,9 @@ def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeyp
149149
# partition html should still work
150150
elements = partition_html(filename=read_only_file.resolve())
151151
assert len(elements) > 0
152+
153+
154+
def test_partition_html_processes_chinese_chracters():
155+
html_text = "<html><div><p>每日新闻</p></div></html>"
156+
elements = partition_html(text=html_text)
157+
assert elements[0].text == "每日新闻"

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.11-dev0" # pragma: no cover
1+
__version__ = "0.5.11" # pragma: no cover

unstructured/documents/xml.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,14 @@ def pages(self) -> List[Page]:
5555
def _read_xml(self, content):
5656
"""Reads in an XML file and converts it to an lxml element tree object."""
5757
if self.document_tree is None:
58-
document_tree = etree.fromstring(content.encode(), self.parser)
58+
try:
59+
document_tree = etree.fromstring(content, self.parser)
60+
# NOTE(robinson) - The following ValueError occurs with unicode strings. In that
61+
# case, we fall back to encoding the string and passing in bytes.
62+
# ValueError: Unicode strings with encoding declaration are not supported.
63+
# Please use bytes input or XML fragments without declaration.
64+
except ValueError:
65+
document_tree = etree.fromstring(content.encode(), self.parser)
5966

6067
if self.stylesheet:
6168
if isinstance(self.parser, etree.HTMLParser):

0 commit comments

Comments
 (0)