fix: resolves UnicodeDecodeError in partition_email for emails with attachments (#158)

MthwRobinson · web-flow · commit 9c3c14e94da5 · 2023-01-17T11:33:45.000-05:00
* split emails by \n=

* added test for equivalence between html and plain text

* changelog and bump version

* add check for content disposition
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,7 @@
-## 0.4.2-dev0
+## 0.4.2
+
 * Added `partition_image` to process documents in an image format.
+* Fixed utf-8 encoding error in `partition_email` with attachments for `text/html`
 
 
 ## 0.4.1
diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
@@ -122,6 +122,18 @@ def test_partition_email_header():
     assert elements == HEADER_EXPECTED_OUTPUT
 
 
+def test_extract_email_text_matches_html():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
+    elements_from_text = partition_email(filename=filename, content_source="text/plain")
+    elements_from_html = partition_email(filename=filename, content_source="text/html")
+
+    assert len(elements_from_text) == len(elements_from_html)
+    # NOTE(robinson) - checking each individually is necessary because the text/html returns
+    # HTMLTitle, HTMLNarrativeText, etc
+    for i, element in enumerate(elements_from_text):
+        assert element == elements_from_html[i]
+
+
 def test_extract_attachment_info():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
     with open(filename, "r") as f:
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.2-dev0"  # pragma: no cover
+__version__ = "0.4.2"  # pragma: no cover
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
@@ -181,30 +181,35 @@ def partition_email(
     else:
         raise ValueError("Only one of filename, file, or text can be specified.")
 
-    content_map: Dict[str, str] = {
-        part.get_content_type(): part.get_payload() for part in msg.walk()
-    }
+    content_map: Dict[str, str] = {}
+    for part in msg.walk():
+        # NOTE(robinson) - content disposition is None for the content of the email itself.
+        # Other dispositions include "attachment" for attachments
+        if part.get_content_disposition() is not None:
+            continue
+        content_type = part.get_content_type()
+        content_map[content_type] = part.get_payload()
 
     content = content_map.get(content_source, "")
     if not content:
         raise ValueError(f"{content_source} content not found in email")
 
-    # NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
-    # looks like the following, resulting in extraneous "=" characters in the output if
-    # you don't clean it up
-    # <ul> =
-    #    <li>Item 1</li>=
-    #    <li>Item 2<li>=
-    # </ul>
-    list_content = split_by_paragraph(content)
-
     if content_source == "text/html":
+        # NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
+        # looks like the following, resulting in extraneous "=" characters in the output if
+        # you don't clean it up
+        # <ul> =
+        #    <li>Item 1</li>=
+        #    <li>Item 2<li>=
+        # </ul>
+        list_content = content.split("=\n")
         content = "".join(list_content)
         elements = partition_html(text=content)
         for element in elements:
             if isinstance(element, Text):
                 element.apply(replace_mime_encodings)
     elif content_source == "text/plain":
+        list_content = split_by_paragraph(content)
         elements = partition_text(text=content)
 
     for idx, element in enumerate(elements):

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.4.2-dev0" # pragma: no cover`
	`1`	`+__version__ = "0.4.2" # pragma: no cover`