Skip to content

Commit 9c3c14e

Browse files
authored
fix: resolves UnicodeDecodeError in partition_email for emails with attachments (#158)
* split emails by \n= * added test for equivalence between html and plain text * changelog and bump version * add check for content disposition
1 parent 7ed5f71 commit 9c3c14e

File tree

4 files changed

+33
-14
lines changed

4 files changed

+33
-14
lines changed

CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
## 0.4.2-dev0
1+
## 0.4.2
2+
23
* Added `partition_image` to process documents in an image format.
4+
* Fixed utf-8 encoding error in `partition_email` with attachments for `text/html`
35

46

57
## 0.4.1

test_unstructured/partition/test_email.py

+12
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,18 @@ def test_partition_email_header():
122122
assert elements == HEADER_EXPECTED_OUTPUT
123123

124124

125+
def test_extract_email_text_matches_html():
    """Check that the text/plain and text/html parts of one email partition alike.

    Partitions ``fake-email-attachment.eml`` once per content source and asserts
    that both runs produce the same number of elements and that the elements
    are pairwise equal.
    """
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")

    assert len(elements_from_text) == len(elements_from_html)
    # NOTE(robinson) - checking each individually is necessary because the text/html returns
    # HTMLTitle, HTMLNarrativeText, etc
    # Compare the plain-text element against its HTML counterpart; comparing the
    # plain-text list with itself would always pass and verify nothing.
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        assert text_element == html_element
135+
136+
125137
def test_extract_attachment_info():
126138
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
127139
with open(filename, "r") as f:

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.2-dev0" # pragma: no cover
1+
__version__ = "0.4.2" # pragma: no cover

unstructured/partition/email.py

+17-12
Original file line numberDiff line numberDiff line change
@@ -181,30 +181,35 @@ def partition_email(
181181
else:
182182
raise ValueError("Only one of filename, file, or text can be specified.")
183183

184-
content_map: Dict[str, str] = {
185-
part.get_content_type(): part.get_payload() for part in msg.walk()
186-
}
184+
content_map: Dict[str, str] = {}
185+
for part in msg.walk():
186+
# NOTE(robinson) - content disposition is None for the content of the email itself.
187+
# Other dispositions include "attachment" for attachments
188+
if part.get_content_disposition() is not None:
189+
continue
190+
content_type = part.get_content_type()
191+
content_map[content_type] = part.get_payload()
187192

188193
content = content_map.get(content_source, "")
189194
if not content:
190195
raise ValueError(f"{content_source} content not found in email")
191196

192-
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
193-
# looks like the following, resulting in extraneous "=" characters in the output if
194-
# you don't clean it up
195-
# <ul> =
196-
# <li>Item 1</li>=
197-
# <li>Item 2<li>=
198-
# </ul>
199-
list_content = split_by_paragraph(content)
200-
201197
if content_source == "text/html":
198+
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
199+
# looks like the following, resulting in extraneous "=" characters in the output if
200+
# you don't clean it up
201+
# <ul> =
202+
# <li>Item 1</li>=
203+
# <li>Item 2<li>=
204+
# </ul>
205+
list_content = content.split("=\n")
202206
content = "".join(list_content)
203207
elements = partition_html(text=content)
204208
for element in elements:
205209
if isinstance(element, Text):
206210
element.apply(replace_mime_encodings)
207211
elif content_source == "text/plain":
212+
list_content = split_by_paragraph(content)
208213
elements = partition_text(text=content)
209214

210215
for idx, element in enumerate(elements):

0 commit comments

Comments
 (0)