Skip to content

Commit c0457c1

Browse files
feat: include images when partitioning html (#3945)
Currently we [filter img tags](https://github.com/Unstructured-IO/unstructured/blob/2addb19473ba9e27af995291f57d35fb50bec4b0/unstructured/partition/html/partition.py#L226-L229) before tags are converted to Elements by the html partitioner. More importantly, we also don’t currently have a defined “block” / mapping to support these. This adds these mappings and the logic to process them. It also respects `extract_image_block_types` and `extract_image_block_to_payload` (as we do with pdfs) to determine whether base64 is included in the metadata. The partitioned Image Elements set their text to the img tag’s alt text if available. The partitioned Image Elements include the [url in the metadata](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/documents/elements.py#L209) (rather than image_base64) if the img tag src is a url. ## Testing Unit tests have been added for explicit coverage. Existing integration tests and other unit test fixtures have been updated to account for `Image` elements now present. --------- Co-authored-by: ryannikolaidis <[email protected]>
1 parent 74b0647 commit c0457c1

File tree

23 files changed

+1014
-264
lines changed

23 files changed

+1014
-264
lines changed

Diff for: CHANGELOG.md

+10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.16.26-dev1
2+
3+
### Enhancements
4+
5+
- **Add support for images in html partitioner.** `<img>` tags will now be parsed as `Image` elements. When `extract_image_block_types` includes `Image` and `extract_image_block_to_payload=True`, the `image_base64` will be included for images that specify the base64 data (rather than a url) as the source.
6+
7+
### Features
8+
9+
### Fixes
10+
111
## 0.16.25
212

313
### Enhancements

Diff for: test_unstructured/partition/html/test_partition.py

+64-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import io
88
import pathlib
9-
from typing import Any
9+
from typing import Any, Optional
1010

1111
import pytest
1212
from lxml import etree
@@ -24,6 +24,7 @@
2424
from unstructured.documents.elements import (
2525
Address,
2626
CompositeElement,
27+
ElementType,
2728
ListItem,
2829
NarrativeText,
2930
Table,
@@ -296,6 +297,68 @@ def test_it_does_not_extract_text_in_style_tags():
296297
assert element.text == "Lorem ipsum dolor"
297298

298299

300+
# -- image parsing behaviors ---------------------------------------------------------------------
301+
302+
303+
@pytest.mark.parametrize(
304+
("extract_to_payload", "extract_types", "expect_base64"),
305+
[
306+
(True, ["Image"], True),
307+
(True, [], False),
308+
(True, None, False),
309+
(False, ["Image"], False),
310+
],
311+
)
312+
def test_partition_html_base64_for_images(
313+
opts_args: dict[str, Any],
314+
extract_to_payload: bool,
315+
extract_types: Optional[list[str]],
316+
expect_base64: bool,
317+
):
318+
base64 = (
319+
"iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/"
320+
"w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=="
321+
)
322+
src = "data:image/png;base64," + base64
323+
alt_text = "Base64 Image"
324+
325+
html = f"""
326+
<div class="Page">
327+
<img src="{src}" alt="{alt_text}">
328+
</div>
329+
"""
330+
opts_args["text"] = html
331+
opts_args["extract_image_block_to_payload"] = extract_to_payload
332+
opts_args["extract_image_block_types"] = extract_types
333+
opts = HtmlPartitionerOptions(**opts_args)
334+
(element,) = list(_HtmlPartitioner.iter_elements(opts))
335+
336+
assert element.category == ElementType.IMAGE
337+
assert element.text == alt_text
338+
assert element.metadata.image_mime_type == "image/png"
339+
if expect_base64:
340+
assert element.metadata.image_base64 == base64
341+
else:
342+
assert element.metadata.image_base64 is None
343+
344+
345+
def test_partition_html_includes_url_for_images():
346+
url = "https://example.com/image.png"
347+
alt_text = "URL Image"
348+
# language=HTML
349+
html = f"""
350+
<div class="Page">
351+
<img src="{url}" alt="{alt_text}">
352+
</div>
353+
"""
354+
(image,) = partition_html(
355+
text=html,
356+
)
357+
assert image.category == ElementType.IMAGE
358+
assert image.text == alt_text
359+
assert image.metadata.url == url
360+
361+
299362
# -- table parsing behaviors ---------------------------------------------------------------------
300363

301364

Diff for: test_unstructured/partition/test_auto.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -210,15 +210,15 @@ def test_auto_partition_epub_from_filename():
210210
elements = partition(example_doc_path("winter-sports.epub"), strategy=PartitionStrategy.HI_RES)
211211

212212
assert len(elements) > 0
213-
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
213+
assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports")
214214

215215

216216
def test_auto_partition_epub_from_file():
217217
with open(example_doc_path("winter-sports.epub"), "rb") as f:
218218
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
219219

220220
assert len(elements) > 0
221-
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
221+
assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports")
222222

223223

224224
# ================================================================================================
@@ -430,7 +430,7 @@ def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
430430
def test_partition_md_from_url_works_with_embedded_html():
431431
url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
432432
elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES)
433-
assert "unstructured" in elements[0].text
433+
assert "unstructured" in elements[1].text
434434

435435

436436
# ================================================================================================

Diff for: test_unstructured/partition/test_epub.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ def test_partition_epub_from_filename():
1414

1515
assert len(elements) > 0
1616
assert isinstance(elements[0], Text)
17-
assert elements[0].text.startswith("a shared culture")
17+
assert elements[1].text.startswith("a shared culture")
1818
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
1919
assert {element.metadata.detection_origin for element in elements} == {"epub"}
2020

2121

2222
def test_partition_epub_from_filename_returns_table_in_elements():
2323
elements = partition_epub(example_doc_path("winter-sports.epub"))
24-
assert elements[10] == Table(
24+
assert elements[12] == Table(
2525
"Contents. List of Illustrations (In certain versions of this etext [in certain\nbrowsers]"
2626
" clicking on the image will bring up a larger\nversion.) (etext transcriber's note)"
2727
)
@@ -32,7 +32,7 @@ def test_partition_epub_from_file():
3232
elements = partition_epub(file=f)
3333

3434
assert len(elements) > 0
35-
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
35+
assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports")
3636

3737

3838
# -- .metadata.filename --------------------------------------------------------------------------

Diff for: test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/1605956.html

+37-28
Original file line numberDiff line numberDiff line change
@@ -13,89 +13,98 @@ <h1 class="Title" id="d5576cc299d7d8417c136933f890831c">
1313
<p class="NarrativeText" id="d36113941235a14bdacafa399698ee71">
1414
The overview is the first page visitors will see when they visit your space, so it helps to include some information on what the space is about and what your team is working on.
1515
</p>
16-
<p class="NarrativeText" id="21e1683c1bc71c40ea20081368bcc7f6">
16+
<img alt="" class="Image" id="2051072f068db11d81f2bcbd031f8c19"/>
17+
<p class="NarrativeText" id="156af6589ee1a114454df9aa55b88d85">
1718
Add a header image. This gives your overview visual appeal and makes it welcoming for visitors.
1819
</p>
19-
<p class="NarrativeText" id="65f03aec0f3637db38c5a3741968eeff">
20+
<p class="NarrativeText" id="618dd7e3cee45b5b0f04847b33879336">
2021
Explain what the space is for. Start by summarizing the purpose of the space. This could be your team's mission statement or a brief description of the kind of work you do.
2122
</p>
22-
<p class="NarrativeText" id="e2522f792c3c5ef32bf1ba342a282fdd">
23+
<p class="NarrativeText" id="ca6d9e5f81ae268b7bbf6b62dad3357b">
2324
Share team goals. Add links to your team's OKRs, project plans, and product roadmaps so visitors can quickly get a sense of your team's goals.
2425
</p>
25-
<p class="NarrativeText" id="bd058a2d2c45c92a3178e327564e135a">
26+
<p class="NarrativeText" id="cf63812b68970732916946496b13b763">
2627
Tell people how to contact you. Share your timezone and links to Slack channels, email aliases, or other contact details your team uses so visitors can contact you with questions or feedback about your team's work.
2728
</p>
28-
<h1 class="Title" id="eab79997042ec6e273d0a13383347a57">
29+
<h1 class="Title" id="82d520e252b220d5c4c6ce29ffb1ade1">
2930
Use shortcuts for easy access
3031
</h1>
31-
<p class="NarrativeText" id="29cdfa9dda669b1dac60890795ab526c">
32+
<p class="NarrativeText" id="b2d427efb6bb6f37c4afd368cefab926">
3233
Shortcuts are helpful for important pages that members of a space might need to get to often. These shortcuts are added and organized by the space administrator. Space admins can link to pages in the space, other related spaces, or relevant external web content as well as reorder the shortcuts as needed.
3334
</p>
34-
<h1 class="Title" id="3251fe353cdbb64ce5cf084aef00cd96">
35+
<img alt="" class="Image" id="d9f3cfd98a3c67adb56cfafae39d3e03"/>
36+
<h1 class="Title" id="84ef673952608f3ba8bc4d2fa9deab59">
3537
💭Start discussions with inline comments
3638
</h1>
37-
<p class="NarrativeText" id="29a93ef334092c2a12daf86b1c1b61fb">
39+
<p class="NarrativeText" id="bcb788a54a545e7f1448f6e4dacb91eb">
3840
Thoughtful responses can get lost and lose context as email replies pile up. And if you neglect to copy someone or want to add them later on, it's difficult for them to get up to speed. Inline comments allow anyone (or everyone) to huddle around an idea while referencing key information on the project page.
3941
</p>
40-
<p class="NarrativeText" id="15cc91b0ec273ab28ab202cd5e7836ea">
42+
<p class="NarrativeText" id="c9dd716e43dfb450e3ff4cf59a3b5c63">
4143
To leave an inline comment, highlight text on the page and the comment icon will appear.
4244
</p>
43-
<p class="NarrativeText" id="c606d30a11f8686a33c4f5305ab878fa">
45+
<img alt="" class="Image" id="46647a4ff2f932d50ca02a1ef0ac51a2"/>
46+
<p class="NarrativeText" id="3452f07fead697f48e719306657044a6">
4447
Team members with permission to access the page can respond to any comment. Plus, when a comment thread comes to its natural conclusion, comments can be resolved and cleared away.
4548
</p>
46-
<h1 class="Title" id="9cec5c4cb40b1424590a7d2255ba5d98">
49+
<h1 class="Title" id="025ce3293479133863a7a64723611197">
4750
👋Loop in team members with @mentions
4851
</h1>
49-
<p class="NarrativeText" id="158ce46e2f05121666d26652b44ce556">
52+
<p class="NarrativeText" id="0fa6faf7cc80d654c319b481e7c7ffce">
5053
@mentions on Confluence function like @mentions on social media platforms like Twitter, Instagram, and Slack. Type the @ symbol on a Confluence page or in a comment, begin spelling a team member's first name, and a list will appear. Select the individual to ask a question or assign a task.
5154
</p>
52-
<h1 class="Title" id="aedbcb95b475418adc9e82fb50e1832f">
55+
<img alt="" class="Image" id="df15c1a5963603656576632632e1dced"/>
56+
<h1 class="Title" id="964954bfb165e4c1aa687b78fba71144">
5357
👏Endorse ideas with reactions
5458
</h1>
55-
<p class="NarrativeText" id="9dcf5a605331e2e0db925a329a727df8">
59+
<p class="NarrativeText" id="fe5335fa2c3bc18a1cbb8425fe071e47">
5660
Use reactions when you want to support a comment or acknowledge you've seen one without clogging up the thread with another comment.
5761
</p>
58-
<p class="NarrativeText" id="a26e40b5555fb394e0844b7ae0118a90">
62+
<p class="NarrativeText" id="d336ac79f4cbd3245fad05bfbc4c8f2b">
5963
You can also use reactions on a page or blog post. The author of the content will be notified, and if enough team members react or add comments to the content, it'll be surfaced on Confluence home feed
6064
</p>
61-
<h1 class="Title" id="04dfe464a23b5192ca7465fca96e8a56">
65+
<img alt="" class="Image" id="984da83593997e86b62223f8d1b03a62"/>
66+
<h1 class="Title" id="9901914d311723f7f14e905d32ee94fd">
6267
Take your Confluence space to the next level
6368
</h1>
64-
<p class="NarrativeText" id="06b459a1ab6ee59cbf44705c24934f15">
69+
<p class="NarrativeText" id="30b4b4dc49d65a5a014b40312edbb424">
6570
Extend the capabilities of your Confluence pages by adding extra functionality or including dynamic content.
6671
</p>
67-
<p class="NarrativeText" id="7d4a53bc8e11c662ba62212041b24cf6">
72+
<p class="NarrativeText" id="a4d482bff56873324e2f2578c381e971">
6873
To add functionality:
6974
</p>
70-
<p class="NarrativeText" id="29eaf10632e9bd8a0f0c46ac3f6ff876">
75+
<p class="NarrativeText" id="f17948e62a99462cb4013796e97eea23">
7176
Type ' / ' to open the list of items available to use
7277
</p>
73-
<p class="NarrativeText" id="885e34b9230d70d0c3257eef2d3f6a0f">
78+
<p class="NarrativeText" id="62804fd3619c5c942cf3944315db132c">
7479
Find the item to be inserted and select it
7580
</p>
76-
<p class="UncategorizedText" id="258ee604863fd54e308f2925d07ebd79">
81+
<p class="UncategorizedText" id="80ba4f784cb65e206b17b76f79c55818">
7782
Select Insert
7883
</p>
79-
<h1 class="Title" id="04a5e0e0b40cb961c84088dcc67b26b7">
84+
<img alt="" class="Image" id="7927a0fdb568097efde58fdd68ed7e0a"/>
85+
<h1 class="Title" id="60a261f17ffc821a917909bfb88a6d70">
8086
Useful elements for Team space
8187
</h1>
82-
<p class="UncategorizedText" id="bd4f8d2535746efce21ce872c09ef973">
88+
<p class="UncategorizedText" id="39d32e21527ef07823ab779970d88f26">
8389
Introduce the team
8490
</p>
85-
<p class="NarrativeText" id="433789f2b20ca6275f62a944390e3c1d">
91+
<p class="NarrativeText" id="fd0d57485d0925b681a03e270faeeb06">
8692
Add user profiles to display a short summary of a given Confluence user's profile with their role, profile photo and contact details.
8793
</p>
88-
<p class="UncategorizedText" id="959ffe89453ca67c279ed576df24e196">
94+
<img alt="" class="Image" id="b82b06b66608a8353fc7f99608bd8b08"/>
95+
<img alt="" class="Image" id="32ce3055a4b209c2734306d8e7266c08"/>
96+
<p class="UncategorizedText" id="2bad3c29ae9bd81da3a1d4c52487b032">
8997
Share news and announcements with your team
9098
</p>
91-
<p class="NarrativeText" id="8b81b2db2cef191090cfa1d4204b8964">
99+
<p class="NarrativeText" id="aa92002440f8c5a41323b8f85d131665">
92100
Display a stream of latest blog posts so your team can easily see what's been going on.
93101
</p>
94-
<p class="NarrativeText" id="3fd46bb09e57e95f1211f475c45b575b">
102+
<p class="NarrativeText" id="b313e6521d8168c6c840f8113c0ebd27">
95103
Display a list of important pages
96104
</p>
97-
<p class="NarrativeText" id="5cbfe913e369743f1f14830c0b6572ab">
105+
<p class="NarrativeText" id="c4bffd5805a6c7d1cb196dcd505f13d1">
98106
Paste in page URLs to create smart links, or use the content report table to create a list of all the pages in the space.
99107
</p>
108+
<img alt="" class="Image" id="15e9a49d1413538015b1fd4d7dee1825"/>
100109
</body>
101110
</html>

Diff for: test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/229477.html

+33-27
Original file line numberDiff line numberDiff line change
@@ -46,85 +46,91 @@ <h1 class="Title" id="f4638a66942901e4533240afec32333a">
4646
<p class="UncategorizedText" id="9d2ea8da0d1c12bb3616cd3cb4e56128">
4747
Add team members to your space.
4848
</p>
49-
<h1 class="Title" id="8e206800f74b037f87bc91ce09a66587">
49+
<img alt="" class="Image" id="11d63c2d51214128c8caebb58f2bf06d"/>
50+
<h1 class="Title" id="3d68b97296629da6f56dbee7226fb9ea">
5051
Team member
5152
</h1>
52-
<p class="UncategorizedText" id="2c4cc93ed9393b0f05a3e564c436e13e">
53+
<p class="UncategorizedText" id="b14012a7e1df00e14688673e6836af91">
5354
Role
5455
</p>
55-
<p class="UncategorizedText" id="554c2527470d9fea2aaf8cefd8aa8ffc">
56+
<p class="UncategorizedText" id="2ee3fe067727e804a8089f8c0131cd7e">
5657
Responsibility
5758
</p>
58-
<h1 class="Title" id="feb3b3be79c77e3d661dc3fa522de26f">
59+
<img alt="" class="Image" id="e206acc35c25cd275875533feb308ecf"/>
60+
<h1 class="Title" id="e9f3973e622aaacb42556e6f29d140c0">
5961
Team member
6062
</h1>
61-
<p class="UncategorizedText" id="5a73ff028549542468675768deee0430">
63+
<p class="UncategorizedText" id="2b43cb7e0a29b1411d109e9a682940fa">
6264
Role
6365
</p>
64-
<p class="UncategorizedText" id="94d211691238a7f3f74db151876c6734">
66+
<p class="UncategorizedText" id="3560a31004a2e271125262ae3435cd80">
6567
Responsibility
6668
</p>
67-
<h1 class="Title" id="198d8ad5606c445ba4dcafd19926c65e">
69+
<img alt="" class="Image" id="48a5d1f209c8025b1cfb1d882658743e"/>
70+
<h1 class="Title" id="64c696a8ba912e8c86e3dacc55bcfd09">
6871
Team member
6972
</h1>
70-
<p class="UncategorizedText" id="776f1a1125f787afd3d193ede37edbf3">
73+
<p class="UncategorizedText" id="60781a8a6086a335e6ef8efa6e767f74">
7174
Role
7275
</p>
73-
<p class="UncategorizedText" id="7d9faf5ffc93c10998801ec69e82969d">
76+
<p class="UncategorizedText" id="47137487152e9d98851e213658f3b212">
7477
Responsibility
7578
</p>
76-
<h1 class="Title" id="46bdd16cf46259b25d67480f1467e0b0">
79+
<h1 class="Title" id="5189c62c2edeed476df22eaa2bb5af21">
7780
Contact us
7881
</h1>
79-
<p class="NarrativeText" id="80dadf7b66548e15b0b7f73c59ee50cf">
82+
<p class="NarrativeText" id="43e843feeaed82e03996b90693f9c8eb">
8083
How can someone reach out to your team?
8184
</p>
82-
<div class="EmailAddress" id="23168bef3f665803fb9ec74644a65674">
85+
<div class="EmailAddress" id="0bae84d0e5cdc716a1dce4f739b86469">
8386
8487
</div>
85-
<p class="UncategorizedText" id="02510c1509479158e837ac5d13f84bf5">
88+
<p class="UncategorizedText" id="4d103f0c3f7f3527c37f34a8c4e86782">
8689
Tickets
8790
</p>
88-
<p class="UncategorizedText" id="c59943bccf5535ffd752fe52a2f6a184">
91+
<p class="UncategorizedText" id="deda95e4491b693fdb7bb978868beefd">
8992
Jira board
9093
</p>
91-
<p class="UncategorizedText" id="21d150625554235f8fe3270ed63d2921">
94+
<p class="UncategorizedText" id="e35c7cd3ecffe9ca0e65935f3feebfbd">
9295
#channel
9396
</p>
94-
<h1 class="Title" id="29c4e13f95e215957a8d697601c3d1cc">
97+
<h1 class="Title" id="f953d1e45bf1cf4cd4985b61255a41e3">
9598
Important Pages
9699
</h1>
97-
<p class="NarrativeText" id="8bdacdf1a36489a491926616432b7b8e">
100+
<p class="NarrativeText" id="53c5427b05c4256bd7c7e03346e58b9f">
98101
List them here
99102
</p>
100-
<h1 class="Title" id="68accd9d0365712f54b96da661cce03d">
103+
<img alt="" class="Image" id="6e5310473567927ff094c33ba42ff201"/>
104+
<h1 class="Title" id="a139fb30a2382364053eb57aa180550f">
101105
Onboarding FAQs
102106
</h1>
103-
<p class="UncategorizedText" id="35aa0d02a38ad72c0ca0534155dbdeb8">
107+
<p class="UncategorizedText" id="eb784ba0d48bf9e06b53aed2ac3fbd72">
104108
Add resources for new hires
105109
</p>
106-
<h1 class="Title" id="ea538f1ebdd2ced67e8c86dcf50bc164">
110+
<img alt="" class="Image" id="768cfb8a51125da06add3109e7d155b3"/>
111+
<h1 class="Title" id="67503783d98953e33cdc2846b90c21fd">
107112
Meeting notes
108113
</h1>
109-
<p class="NarrativeText" id="6f4ae84a8d8a1d9005384f35e2ce793c">
114+
<p class="NarrativeText" id="27194483431e4365b86572cbc73b9af5">
110115
Add links to meeting notes
111116
</p>
112-
<h1 class="Title" id="9616030a71ad0e0654b28e61578d0443">
117+
<img alt="" class="Image" id="c941c078ee573a2bbca654a7b5ce68f4"/>
118+
<h1 class="Title" id="885ad7169d419802971c64780c7a7968">
113119
Team goals
114120
</h1>
115-
<p class="NarrativeText" id="d81cb76df56721595c0495e4f5e6094f">
121+
<p class="NarrativeText" id="4d12c0c0f2d8211bc2b3eae35ac4f854">
116122
List them here
117123
</p>
118-
<h1 class="Title" id="46c3bd98dbea47cb63923597c929b932">
124+
<h1 class="Title" id="019ded9026166e1794b589358870fe60">
119125
Team news
120126
</h1>
121-
<p class="NarrativeText" id="1558d5e9d97c1cbb5cbb5cb2b077f83d">
127+
<p class="NarrativeText" id="7cdd15b42c50cc95a64aa83149e72aec">
122128
Create a blog post to share team news. It will automatically appear here once it's published.
123129
</p>
124-
<h1 class="Title" id="c281ed85f2e1125c9aaf318fd5178d4d">
130+
<h1 class="Title" id="008813f1d7a4380879ff001294f8bc6e">
125131
Blog stream
126132
</h1>
127-
<p class="NarrativeText" id="4b401fd3bc190fce17f70000e0164772">
133+
<p class="NarrativeText" id="2a28d14ef4ba44c8f0098df26a520f23">
128134
Create a blog post to share news and announcements with your team and company.
129135
</p>
130136
</body>

0 commit comments

Comments
 (0)