Skip to content

Commit 66bf4b0

Browse files
feat: support extracting image url in html (#3955)
Also removes `image_mime_type` from an image's metadata when its base64 payload is not included. --------- Co-authored-by: ryannikolaidis <[email protected]>
1 parent 2dceac3 commit 66bf4b0

File tree

18 files changed

+63
-8
lines changed

18 files changed

+63
-8
lines changed

Diff for: CHANGELOG.md

+10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.17.1-dev0
2+
3+
### Enhancements
4+
5+
- **Add `image_url` metadata for images in the HTML partitioner.** `<img>` tags whose `src` is not inline `data:` content now include a new `image_url` metadata field containing the value of the `src` attribute.
6+
7+
### Features
8+
9+
### Fixes
10+
111
## 0.17.0
212

313
### Enhancements

Diff for: example-docs/fake-html-with-image-from-url.html

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<div>
2+
<p>Test page</p>
3+
<img src="https://avatars.githubusercontent.com/u/108372208?s=200&v=4" alt="Unstructured Logo" />
4+
</div>

Diff for: test_unstructured/partition/html/test_partition.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -335,28 +335,29 @@ def test_partition_html_base64_for_images(
335335

336336
assert element.category == ElementType.IMAGE
337337
assert element.text == alt_text
338-
assert element.metadata.image_mime_type == "image/png"
339338
if expect_base64:
340339
assert element.metadata.image_base64 == base64
340+
assert element.metadata.image_mime_type == "image/png"
341341
else:
342342
assert element.metadata.image_base64 is None
343+
assert element.metadata.image_mime_type is None
343344

344345

345346
def test_partition_html_includes_url_for_images():
346-
url = "https://example.com/image.png"
347+
image_url = "https://example.com/image.png"
347348
alt_text = "URL Image"
348349
# language=HTML
349350
html = f"""
350351
<div class="Page">
351-
<img src="{url}" alt="{alt_text}">
352+
<img src="{image_url}" alt="{alt_text}">
352353
</div>
353354
"""
354355
(image,) = partition_html(
355356
text=html,
356357
)
357358
assert image.category == ElementType.IMAGE
358359
assert image.text == alt_text
359-
assert image.metadata.url == url
360+
assert image.metadata.image_url == image_url
360361

361362

362363
# -- table parsing behaviors ---------------------------------------------------------------------

Diff for: test_unstructured/partition/test_auto.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -632,14 +632,21 @@ def test_auto_partition_html_element_extraction():
632632

633633
with tempfile.TemporaryDirectory() as tmpdir:
634634
elements = partition(
635-
example_doc_path("html-with-base64-image.html"),
635+
example_doc_path("fake-html-with-base64-image.html"),
636636
extract_image_block_types=extract_image_block_types,
637637
extract_image_block_to_payload=True,
638638
)
639639

640640
assert_element_extraction(elements, extract_image_block_types, True, tmpdir)
641641

642642

643+
def test_auto_partition_html_image_with_url():
644+
elements = partition(
645+
example_doc_path("fake-html-with-image-from-url.html"),
646+
)
647+
assert elements[1].metadata.image_url is not None
648+
649+
643650
def test_partition_pdf_does_not_raise_warning():
644651
# NOTE(robinson): This is the recommended way to check that no warning is emitted,
645652
# per the pytest docs.

Diff for: test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json

+9
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
"version": "1"
5656
},
5757
"filetype": "text/html",
58+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/overview.svg?version=1&modificationDate=1688907285640&cacheVersion=1&api=v2",
5859
"languages": [
5960
"eng"
6061
]
@@ -236,6 +237,7 @@
236237
"version": "1"
237238
},
238239
"filetype": "text/html",
240+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/shortcuts.svg?version=1&modificationDate=1688907288893&cacheVersion=1&api=v2",
239241
"languages": [
240242
"eng"
241243
]
@@ -326,6 +328,7 @@
326328
"version": "1"
327329
},
328330
"filetype": "text/html",
331+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/inline_comment.svg?version=1&modificationDate=1688907286335&cacheVersion=1&api=v2&width=442&height=99",
329332
"languages": [
330333
"eng"
331334
]
@@ -416,6 +419,7 @@
416419
"version": "1"
417420
},
418421
"filetype": "text/html",
422+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/teamspace_mention.svg?version=1&modificationDate=1688907289571&cacheVersion=1&api=v2&width=442&height=417",
419423
"languages": [
420424
"eng"
421425
]
@@ -500,6 +504,7 @@
500504
"version": "1"
501505
},
502506
"filetype": "text/html",
507+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/reactions.svg?version=1&modificationDate=1688907286993&cacheVersion=1&api=v2",
503508
"languages": [
504509
"eng"
505510
]
@@ -659,6 +664,7 @@
659664
"version": "1"
660665
},
661666
"filetype": "text/html",
667+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605956/slash_menu.svg?version=1&modificationDate=1688907287621&cacheVersion=1&api=v2&width=544&height=586",
662668
"languages": [
663669
"eng"
664670
]
@@ -755,6 +761,7 @@
755761
"version": "1"
756762
},
757763
"filetype": "text/html",
764+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/teamspace_introduce.svg?version=1&modificationDate=1688907290201&cacheVersion=1&api=v2",
758765
"languages": [
759766
"eng"
760767
]
@@ -776,6 +783,7 @@
776783
"version": "1"
777784
},
778785
"filetype": "text/html",
786+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/teamspace_announcements.svg?version=1&modificationDate=1688907290847&cacheVersion=1&api=v2",
779787
"languages": [
780788
"eng"
781789
]
@@ -905,6 +913,7 @@
905913
"version": "1"
906914
},
907915
"filetype": "text/html",
916+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605956/content_report.svg?version=1&modificationDate=1688907288249&cacheVersion=1&api=v2",
908917
"languages": [
909918
"eng"
910919
]

Diff for: test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json

+6
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@
328328
"version": "1"
329329
},
330330
"filetype": "text/html",
331+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/angie.svg?version=1&modificationDate=1688145926387&cacheVersion=1&api=v2&width=256&height=257",
331332
"languages": [
332333
"eng"
333334
]
@@ -412,6 +413,7 @@
412413
"version": "1"
413414
},
414415
"filetype": "text/html",
416+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/gael.svg?version=1&modificationDate=1688145927077&cacheVersion=1&api=v2&width=256&height=257",
415417
"languages": [
416418
"eng"
417419
]
@@ -496,6 +498,7 @@
496498
"version": "1"
497499
},
498500
"filetype": "text/html",
501+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/229477/claudia.svg?version=1&modificationDate=1688145927764&cacheVersion=1&api=v2&width=256&height=257",
499502
"languages": [
500503
"eng"
501504
]
@@ -766,6 +769,7 @@
766769
"version": "1"
767770
},
768771
"filetype": "text/html",
772+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/raised_hand.svg?version=1&modificationDate=1688145928452&cacheVersion=1&api=v2",
769773
"languages": [
770774
"eng"
771775
]
@@ -835,6 +839,7 @@
835839
"version": "1"
836840
},
837841
"filetype": "text/html",
842+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/ledger.svg?version=1&modificationDate=1688145929151&cacheVersion=1&api=v2",
838843
"languages": [
839844
"eng"
840845
]
@@ -904,6 +909,7 @@
904909
"version": "1"
905910
},
906911
"filetype": "text/html",
912+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/229477/astronaut.svg?version=1&modificationDate=1688145929790&cacheVersion=1&api=v2",
907913
"languages": [
908914
"eng"
909915
]

Diff for: test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json

+6
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@
328328
"version": "2"
329329
},
330330
"filetype": "text/html",
331+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/angie.svg?version=1&modificationDate=1688907281095&cacheVersion=1&api=v2&width=256&height=257",
331332
"languages": [
332333
"eng"
333334
]
@@ -412,6 +413,7 @@
412413
"version": "2"
413414
},
414415
"filetype": "text/html",
416+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/gael.svg?version=1&modificationDate=1688907281775&cacheVersion=1&api=v2&width=256&height=257",
415417
"languages": [
416418
"eng"
417419
]
@@ -496,6 +498,7 @@
496498
"version": "2"
497499
},
498500
"filetype": "text/html",
501+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605859/claudia.svg?version=1&modificationDate=1688907282424&cacheVersion=1&api=v2&width=256&height=257",
499502
"languages": [
500503
"eng"
501504
]
@@ -766,6 +769,7 @@
766769
"version": "2"
767770
},
768771
"filetype": "text/html",
772+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/raised_hand.svg?version=1&modificationDate=1688907283067&cacheVersion=1&api=v2",
769773
"languages": [
770774
"eng"
771775
]
@@ -835,6 +839,7 @@
835839
"version": "2"
836840
},
837841
"filetype": "text/html",
842+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/ledger.svg?version=1&modificationDate=1688907283728&cacheVersion=1&api=v2",
838843
"languages": [
839844
"eng"
840845
]
@@ -904,6 +909,7 @@
904909
"version": "2"
905910
},
906911
"filetype": "text/html",
912+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/attachments/1605859/astronaut.svg?version=1&modificationDate=1688907284407&cacheVersion=1&api=v2",
907913
"languages": [
908914
"eng"
909915
]

Diff for: test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json

+2
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@
321321
"version": "1"
322322
},
323323
"filetype": "text/html",
324+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605989/image-20230709-015203.png?version=1&modificationDate=1688907429067&cacheVersion=1&api=v2&width=680&height=259",
324325
"languages": [
325326
"eng",
326327
"fra"
@@ -802,6 +803,7 @@
802803
"version": "1"
803804
},
804805
"filetype": "text/html",
806+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1605989/image-20230709-020021.png?version=1&modificationDate=1688907429074&cacheVersion=1&api=v2&width=475&height=236",
805807
"languages": [
806808
"eng",
807809
"fra"

Diff for: test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json

+2
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@
321321
"version": "1"
322322
},
323323
"filetype": "text/html",
324+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1802252/image-20230709-015203.png?version=1&modificationDate=1689094907437&cacheVersion=1&api=v2&width=680&height=259",
324325
"languages": [
325326
"eng",
326327
"fra"
@@ -802,6 +803,7 @@
802803
"version": "1"
803804
},
804805
"filetype": "text/html",
806+
"image_url": "https://unstructured-ingest-test.atlassian.net/wiki/download/thumbnails/1802252/image-20230709-020021.png?version=1&modificationDate=1689094907442&cacheVersion=1&api=v2&width=475&height=236",
805807
"languages": [
806808
"eng",
807809
"fra"

Diff for: test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json

+1
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,7 @@
545545
"date_modified": "2023-08-17T18:48:00.000Z"
546546
},
547547
"filetype": "text/html",
548+
"image_url": "https://media4.giphy.com/media/26FPsOhR3tyQRTc2Y/giphy.gif?cid=7941fdc68sl3vdqajgosqug9hfhg3zq3t5yoflyy9p7y66q0&ep=v1_gifs_trending&rid=giphy.gif&ct=g",
548549
"languages": [
549550
"eng"
550551
]

Diff for: test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"element_id": "f714fa214dac2f441515c4f28370d279",
3333
"text": "",
3434
"metadata": {
35+
"image_url": "https://unstructuredio-dev-ed.develop.my.salesforce.com/servlet/servlet.ImageServer?oid=00DHu0000018RDe&esid=018Hu00001JMmTZ&from=int",
3536
"languages": [
3637
"eng"
3738
],

Diff for: test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"element_id": "68870d055535f48c7439ce67092768f6",
3333
"text": "",
3434
"metadata": {
35+
"image_url": "https://unstructuredio-dev-ed.develop.my.salesforce.com/servlet/servlet.ImageServer?oid=00DHu0000018RDe&esid=018Hu00001JMmTa&from=int",
3536
"languages": [
3637
"eng"
3738
],

Diff for: test_unstructured_ingest/src/local.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
3232
--input-path example-docs \
3333
--work-dir "$WORK_DIR"
3434

35-
"$SCRIPT_DIR"/check-num-files-output.sh 14 $OUTPUT_FOLDER_NAME
35+
"$SCRIPT_DIR"/check-num-files-output.sh 15 $OUTPUT_FOLDER_NAME
3636

3737
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.17.0" # pragma: no cover
1+
__version__ = "0.17.1-dev0" # pragma: no cover

Diff for: unstructured/documents/elements.py

+4
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ class ElementMetadata:
172172
file_directory: Optional[str]
173173
filename: Optional[str]
174174
filetype: Optional[str]
175+
image_url: Optional[str]
175176
image_path: Optional[str]
176177
image_base64: Optional[str]
177178
image_mime_type: Optional[str]
@@ -230,6 +231,7 @@ def __init__(
230231
header_footer_type: Optional[str] = None,
231232
image_base64: Optional[str] = None,
232233
image_mime_type: Optional[str] = None,
234+
image_url: Optional[str] = None,
233235
image_path: Optional[str] = None,
234236
is_continuation: Optional[bool] = None,
235237
languages: Optional[list[str]] = None,
@@ -274,6 +276,7 @@ def __init__(
274276
self.header_footer_type = header_footer_type
275277
self.image_base64 = image_base64
276278
self.image_mime_type = image_mime_type
279+
self.image_url = image_url
277280
self.image_path = image_path
278281
self.is_continuation = is_continuation
279282
self.languages = languages
@@ -490,6 +493,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
490493
"filename": cls.FIRST,
491494
"filetype": cls.FIRST,
492495
"header_footer_type": cls.DROP,
496+
"image_url": cls.DROP,
493497
"image_path": cls.DROP,
494498
"image_base64": cls.DROP,
495499
"image_mime_type": cls.DROP,

Diff for: unstructured/partition/html/parser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,7 @@ def iter_elements(self) -> Iterator[Element]:
502502
metadata=ElementMetadata(
503503
image_mime_type=img_mime_type,
504504
image_base64=img_base64,
505-
url=img_url,
505+
image_url=img_url,
506506
),
507507
)
508508

Diff for: unstructured/partition/html/partition.py

+1
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ def _iter_elements(self) -> Iterator[Element]:
223223
# -- remove <image_base64> if not requested --
224224
if not self._should_include_image_base64(e):
225225
e.metadata.image_base64 = None
226+
e.metadata.image_mime_type = None
226227
yield e
227228

228229
@lazyproperty

0 commit comments

Comments
 (0)