Skip to content

Commit b855fd2

Browse files
authored
fix: fix html encoding to support foreign characters (#452)
* fix: fix html encoding to support foreign characters * version and changelog
1 parent 1dca0db commit b855fd2

File tree

4 files changed

+17
-3
lines changed

4 files changed

+17
-3
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.5.11-dev0
1+
## 0.5.11
22

33
### Enhancements
44

@@ -7,6 +7,7 @@
77
### Fixes
88

99
* Guard against null style attribute in docx document elements
10+
* Update HTML encoding to better support foreign language characters
1011

1112
## 0.5.10
1213

Diff for: test_unstructured/partition/test_html_partition.py

+6
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,9 @@ def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeyp
149149
# partition html should still work
150150
elements = partition_html(filename=read_only_file.resolve())
151151
assert len(elements) > 0
152+
153+
154+
def test_partition_html_processes_chinese_chracters():
155+
html_text = "<html><div><p>每日新闻</p></div></html>"
156+
elements = partition_html(text=html_text)
157+
assert elements[0].text == "每日新闻"

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.11-dev0" # pragma: no cover
1+
__version__ = "0.5.11" # pragma: no cover

Diff for: unstructured/documents/xml.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,14 @@ def pages(self) -> List[Page]:
5555
def _read_xml(self, content):
5656
"""Reads in an XML file and converts it to an lxml element tree object."""
5757
if self.document_tree is None:
58-
document_tree = etree.fromstring(content.encode(), self.parser)
58+
try:
59+
document_tree = etree.fromstring(content, self.parser)
60+
# NOTE(robinson) - The following ValueError occurs with unicode strings. In that
61+
# case, we fall back to encoding the string and passing in bytes.
62+
# ValueError: Unicode strings with encoding declaration are not supported.
63+
# Please use bytes input or XML fragments without declaration.
64+
except ValueError:
65+
document_tree = etree.fromstring(content.encode(), self.parser)
5966

6067
if self.stylesheet:
6168
if isinstance(self.parser, etree.HTMLParser):

0 commit comments

Comments
 (0)