Skip to content

Commit 5d1e61c

Browse files
authored
feat: add msg attachment support (#510)
* add msg function and fix bug in eml attachment function
1 parent 6874df9 commit 5d1e61c

File tree

7 files changed

+63
-4
lines changed

7 files changed

+63
-4
lines changed

Diff for: CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.5.15-dev2
2+
3+
### Features
4+
5+
* Adds support for extracting attachments from `.msg` files
6+
17
## 0.5.14-dev1
28

39
### Enhancements

Diff for: example-docs/fake-email-attachment.msg

15.5 KB
Binary file not shown.

Diff for: example-docs/fake-email-multiple-attachments.msg

4.12 MB
Binary file not shown.

Diff for: test_unstructured/partition/test_msg.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
NarrativeText,
1111
Title,
1212
)
13-
from unstructured.partition.msg import partition_msg
13+
from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
1414

1515
DIRECTORY = pathlib.Path(__file__).parent.resolve()
1616
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
@@ -22,6 +22,15 @@
2222
ListItem(text="Violets are blue"),
2323
]
2424

25+
ATTACH_EXPECTED_OUTPUT = [
26+
{
27+
"filename": "fake-attachment.txt",
28+
"extension": ".txt",
29+
"file_size": "unknown",
30+
"payload": b"Hey this is a fake attachment!",
31+
},
32+
]
33+
2534

2635
def test_partition_msg_from_filename():
2736
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
@@ -63,6 +72,13 @@ def test_partition_msg_from_file():
6372
assert elements == EXPECTED_MSG_OUTPUT
6473

6574

75+
def test_extract_attachment_info():
    """Attachment metadata and payload are extracted from a single-attachment .msg file."""
    # EXAMPLE_DOCS_DIRECTORY is DIRECTORY/../../example-docs, as used by the other tests.
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.msg")
    extracted = extract_msg_attachment_info(msg_path)
    # Guard against a vacuous pass before comparing the full structure.
    assert len(extracted) > 0
    assert extracted == ATTACH_EXPECTED_OUTPUT
80+
81+
6682
def test_partition_msg_raises_with_both_specified():
6783
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
6884
with open(filename, "rb") as f, pytest.raises(ValueError):

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.14-dev1" # pragma: no cover
1+
__version__ = "0.5.15-dev2" # pragma: no cover

Diff for: unstructured/partition/email.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,15 @@ def extract_attachment_info(
123123
output_dir: Optional[str] = None,
124124
) -> List[Dict[str, str]]:
125125
list_attachments = []
126-
attachment_info = {}
126+
127127
for part in message.walk():
128128
if "content-disposition" in part:
129129
cdisp = part["content-disposition"].split(";")
130130
cdisp = [clean_extra_whitespace(item) for item in cdisp]
131131

132132
for item in cdisp:
133+
attachment_info = {}
134+
133135
if item.lower() == "attachment":
134136
continue
135137
key, value = item.split("=")

Diff for: unstructured/partition/msg.py

+36-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import tempfile
2-
from typing import IO, List, Optional
2+
from typing import IO, Dict, List, Optional
33

44
import msg_parser
55

@@ -67,3 +67,38 @@ def build_msg_metadata(msg_obj: msg_parser.MsOxMessage) -> ElementMetadata:
6767
subject=getattr(msg_obj, "subject", None),
6868
date=email_date,
6969
)
70+
71+
72+
def extract_msg_attachment_info(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    output_dir: Optional[str] = None,
) -> List[Dict[str, str]]:
    """Extract information about each attachment of a ``.msg`` file.

    Exactly one of ``filename`` or ``file`` must be provided; this is enforced
    by ``exactly_one``.

    Parameters
    ----------
    filename
        Path of the ``.msg`` file to read.
    file
        A file-like object whose ``read()`` returns the contents of a ``.msg``
        file. The contents are spooled to a temporary file because the parser
        is given a path.
    output_dir
        If not None, each attachment's data is also written to this directory
        under the attachment's file name.

    Returns
    -------
    A list with one dict per attachment holding the keys ``"filename"``,
    ``"extension"``, ``"file_size"`` and ``"payload"`` (the attachment data).
    NOTE(review): the return annotation is kept as-is for compatibility even
    though ``"payload"`` carries the raw attachment data rather than a ``str``.
    """
    # Function-scope import so this block does not depend on the module's
    # top-level import list.
    import os

    exactly_one(filename=filename, file=file)

    tmp_path = None
    try:
        if filename is not None:
            msg_obj = msg_parser.MsOxMessage(filename)
        else:
            # The parser is handed a path, so copy the stream to disk first.
            # delete=False lets the file be reopened by name after closing;
            # it is removed in the ``finally`` clause below.
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp_path = tmp.name
                tmp.write(file.read())
            msg_obj = msg_parser.MsOxMessage(tmp_path)

        list_attachments = []

        for attachment in msg_obj.attachments:
            attachment_info = {
                "filename": attachment.AttachLongFilename,
                "extension": attachment.AttachExtension,
                "file_size": attachment.AttachmentSize,
                "payload": attachment.data,
            }
            list_attachments.append(attachment_info)

            if output_dir is not None:
                # The attachment name comes from the message itself; keep only
                # its final path component so it cannot escape ``output_dir``.
                safe_name = os.path.basename(attachment_info["filename"])
                output_path = os.path.join(output_dir, safe_name)
                with open(output_path, "wb") as f:
                    f.write(attachment.data)

        return list_attachments
    finally:
        # Clean up only after all attachment data has been consumed, so this
        # is safe regardless of when the parser reads the file.
        if tmp_path is not None:
            os.unlink(tmp_path)

0 commit comments

Comments
 (0)