Skip to content

Commit e0feba8

Browse files
authored
feat: Add Image element and find_embedded_image function (#130)
* add find_embedded_image
1 parent 7b3b594 commit e0feba8

File tree

8 files changed

+7754
-19
lines changed

8 files changed

+7754
-19
lines changed

Diff for: CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* Added new functions to extract header information `parse_received_data` and `partition_header`
1414
* Added new function to parse plain text files `partition_text`
1515
* Added new cleaners functions `extract_ip_address`, `extract_ip_address_name`, `extract_mapi_id`, `extract_datetimetz`
16+
* Added new `Image` element and function to find embedded images `find_embedded_image`
1617

1718
## 0.3.5
1819

Diff for: example-docs/email-with-image.eml

+3,828
Large diffs are not rendered by default.

Diff for: example-docs/fake-email-image-embedded.eml

+3,833
Large diffs are not rendered by default.

Diff for: requirements/dev.txt

+23-5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
#
55
# pip-compile requirements/dev.in
66
#
7+
appnope==0.1.3
8+
# via
9+
# ipykernel
10+
# ipython
711
argon2-cffi==21.3.0
812
# via notebook
913
argon2-cffi-bindings==21.2.0
@@ -36,6 +40,10 @@ executing==1.0.0
3640
# via stack-data
3741
fastjsonschema==2.16.2
3842
# via nbformat
43+
importlib-metadata==6.0.0
44+
# via nbconvert
45+
importlib-resources==5.10.2
46+
# via jsonschema
3947
ipykernel==6.15.3
4048
# via
4149
# ipywidgets
@@ -45,7 +53,7 @@ ipykernel==6.15.3
4553
# qtconsole
4654
ipython==8.6.0
4755
# via
48-
# -r dev.in
56+
# -r requirements/dev.in
4957
# ipykernel
5058
# ipywidgets
5159
# jupyter-console
@@ -64,7 +72,7 @@ jinja2==3.1.2
6472
jsonschema==4.16.0
6573
# via nbformat
6674
jupyter==1.0.0
67-
# via -r dev.in
75+
# via -r requirements/dev.in
6876
jupyter-client==7.3.5
6977
# via
7078
# ipykernel
@@ -76,7 +84,7 @@ jupyter-console==6.4.4
7684
# via jupyter
7785
jupyter-core==5.1.3
7886
# via
79-
# -r dev.in
87+
# -r requirements/dev.in
8088
# jupyter-client
8189
# nbconvert
8290
# nbformat
@@ -134,7 +142,9 @@ pexpect==4.8.0
134142
pickleshare==0.7.5
135143
# via ipython
136144
pip-tools==6.12.1
137-
# via -r dev.in
145+
# via -r requirements/dev.in
146+
pkgutil-resolve-name==1.3.10
147+
# via jsonschema
138148
platformdirs==2.5.4
139149
# via jupyter-core
140150
prometheus-client==0.14.1
@@ -190,6 +200,10 @@ terminado==0.15.0
190200
# via notebook
191201
tinycss2==1.1.1
192202
# via nbconvert
203+
tomli==2.0.1
204+
# via
205+
# build
206+
# pep517
193207
tornado==6.2
194208
# via
195209
# ipykernel
@@ -217,10 +231,14 @@ webencodings==0.5.1
217231
# tinycss2
218232
wheel==0.38.4
219233
# via
220-
# -r dev.in
234+
# -r requirements/dev.in
221235
# pip-tools
222236
widgetsnbextension==4.0.3
223237
# via ipywidgets
238+
zipp==3.11.0
239+
# via
240+
# importlib-metadata
241+
# importlib-resources
224242

225243
# The following packages are considered to be unsafe in a requirements file:
226244
# pip

Diff for: requirements/test.txt

+15-10
Original file line numberDiff line numberDiff line change
@@ -7,39 +7,39 @@
77
attrs==22.1.0
88
# via pytest
99
black==22.12.0
10-
# via -r test.in
10+
# via -r requirements/test.in
1111
certifi==2022.12.7
1212
# via
13-
# -r test.in
13+
# -r requirements/test.in
1414
# requests
1515
charset-normalizer==2.1.1
1616
# via requests
1717
click==8.1.3
1818
# via
19-
# -r test.in
19+
# -r requirements/test.in
2020
# black
2121
coverage[toml]==6.4.4
2222
# via
23-
# -r test.in
23+
# -r requirements/test.in
2424
# pytest-cov
2525
flake8==5.0.4
26-
# via -r test.in
26+
# via -r requirements/test.in
2727
idna==3.4
2828
# via
2929
# requests
3030
# yarl
3131
iniconfig==1.1.1
3232
# via pytest
3333
label-studio-sdk==0.0.15
34-
# via -r test.in
34+
# via -r requirements/test.in
3535
lxml==4.9.1
3636
# via label-studio-sdk
3737
mccabe==0.7.0
3838
# via flake8
3939
multidict==6.0.2
4040
# via yarl
4141
mypy==0.991
42-
# via -r test.in
42+
# via -r requirements/test.in
4343
mypy-extensions==0.4.3
4444
# via
4545
# black
@@ -65,23 +65,28 @@ pyparsing==3.0.9
6565
pytest==7.1.3
6666
# via pytest-cov
6767
pytest-cov==4.0.0
68-
# via -r test.in
68+
# via -r requirements/test.in
6969
pyyaml==6.0
7070
# via vcrpy
7171
requests==2.28.1
7272
# via label-studio-sdk
7373
six==1.16.0
7474
# via vcrpy
7575
tomli==2.0.1
76-
# via pytest
76+
# via
77+
# black
78+
# coverage
79+
# mypy
80+
# pytest
7781
typing-extensions==4.3.0
7882
# via
83+
# black
7984
# mypy
8085
# pydantic
8186
urllib3==1.26.12
8287
# via requests
8388
vcrpy==4.2.1
84-
# via -r test.in
89+
# via -r requirements/test.in
8590
wrapt==1.14.1
8691
# via vcrpy
8792
yarl==1.8.1

Diff for: test_unstructured/partition/test_email.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import pathlib
44
import pytest
55

6-
from unstructured.documents.elements import NarrativeText, Title, ListItem
6+
from unstructured.documents.elements import NarrativeText, Title, ListItem, Image
77
from unstructured.documents.email_elements import (
88
MetaData,
99
Recipient,
@@ -27,6 +27,15 @@
2727
ListItem(text="Violets are blue"),
2828
]
2929

30+
IMAGE_EXPECTED_OUTPUT = [
31+
NarrativeText(text="This is a test email to use for unit tests."),
32+
Title(text="Important points:"),
33+
NarrativeText(text="hello this is our logo."),
34+
Image(text="unstructured_logo.png"),
35+
ListItem(text="Roses are red"),
36+
ListItem(text="Violets are blue"),
37+
]
38+
3039
HEADER_EXPECTED_OUTPUT = [
3140
MetaData(name="MIME-Version", text="1.0"),
3241
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
@@ -97,6 +106,13 @@ def test_partition_email_from_text():
97106
assert elements == EXPECTED_OUTPUT
98107

99108

109+
def test_partition_email_from_filename_with_embedded_image():
    # Partition the plain-text part of the sample email that carries an inline
    # image tag and compare against the expected element sequence.
    eml_path = os.path.join(
        DIRECTORY, "..", "..", "example-docs", "fake-email-image-embedded.eml"
    )
    partitioned = partition_email(filename=eml_path, content_source="text/plain")

    assert len(partitioned) > 0
    assert partitioned == IMAGE_EXPECTED_OUTPUT
114+
115+
100116
def test_partition_email_header():
101117
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
102118
with open(filename, "r") as f:

Diff for: unstructured/documents/elements.py

+8
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,11 @@ class Title(Text):
7272
category = "Title"
7373

7474
pass
75+
76+
77+
class Image(Text):
    """Text element whose text holds metadata describing an image."""

    category = "Image"

Diff for: unstructured/partition/email.py

+29-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import sys
33
import re
44
from email.message import Message
5-
from typing import Dict, IO, List, Optional, Tuple
5+
from typing import Dict, IO, List, Optional, Tuple, Union
66

77
if sys.version_info < (3, 8):
88
from typing_extensions import Final
@@ -24,7 +24,7 @@
2424
ReceivedInfo,
2525
MetaData,
2626
)
27-
from unstructured.documents.elements import Element, Text
27+
from unstructured.documents.elements import Element, Text, Image, NarrativeText, Title
2828
from unstructured.partition.html import partition_html
2929
from unstructured.partition.text import split_by_paragraph, partition_text
3030

@@ -113,6 +113,25 @@ def extract_attachment_info(
113113
return list_attachments
114114

115115

116+
# Matches a single inline image tag such as "[image: logo.png]". The match is
# non-greedy so that two tags in the same text are not merged into one span.
_EMBEDDED_IMAGE_PATTERN = re.compile(r"\[image: .+?\]")


def has_embedded_image(element) -> Optional[re.Match]:
    """Look for an embedded-image tag of the form ``[image: <name>]`` in an element.

    Parameters
    ----------
    element
        Any object exposing a ``text`` string attribute.

    Returns
    -------
    The ``re.Match`` for the first tag found in ``element.text``, or ``None`` when
    the text contains no tag. The result is truthy exactly when a tag is present.
    """
    return _EMBEDDED_IMAGE_PATTERN.search(element.text)
120+
121+
122+
def find_embedded_image(
    element: Union[NarrativeText, Title], indices: re.Match
) -> Tuple[Element, Element]:
    """Split an embedded-image tag out of a text element.

    Parameters
    ----------
    element
        Element whose ``text`` contains a tag of the form ``[image: <name>]``.
        Its ``text`` attribute is modified in place.
    indices
        Match object locating the tag inside ``element.text`` (for example the
        result of ``has_embedded_image(element)``).

    Returns
    -------
    A tuple ``(image, element)``: a new ``Image`` whose text is the image name,
    and the same ``element`` with the tag removed from its text.
    """
    image_raw_info = element.text[indices.start() : indices.end()]

    # Split on the first colon only so that names containing ":" stay intact,
    # and drop the closing "]" before cleaning so whitespace in front of the
    # bracket is stripped as well.
    image_name = clean_extra_whitespace(image_raw_info.split(":", 1)[1][:-1])

    # Remove the tag exactly as it appears in the text. Rebuilding it from the
    # cleaned name would miss tags that were written with irregular spacing.
    element.text = element.text.replace(image_raw_info, "")

    return Image(text=image_name), element
133+
134+
116135
def partition_email(
117136
filename: Optional[str] = None,
118137
file: Optional[IO] = None,
@@ -171,7 +190,7 @@ def partition_email(
171190
raise ValueError(f"{content_source} content not found in email")
172191

173192
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
174-
# looks like the following, resulting in extraneous "=" chracters in the output if
193+
# looks like the following, resulting in extraneous "=" characters in the output if
175194
# you don't clean it up
176195
# <ul> =
177196
# <li>Item 1</li>=
@@ -188,6 +207,13 @@ def partition_email(
188207
elif content_source == "text/plain":
189208
elements = partition_text(text=content)
190209

210+
for idx, element in enumerate(elements):
211+
indices = has_embedded_image(element)
212+
if (isinstance(element, NarrativeText) or isinstance(element, Title)) and indices:
213+
image_info, clean_element = find_embedded_image(element, indices)
214+
elements[idx] = clean_element
215+
elements.insert(idx + 1, image_info)
216+
191217
header: List[Element] = list()
192218
if include_headers:
193219
header = partition_email_header(msg)

0 commit comments

Comments
 (0)