Skip to content

Commit 0fa5174

Browse files
authored
Image within div or span with no text is annotated as Image (#3962)
Ticket: https://unstructured-ai.atlassian.net/browse/ML-942. The HTML document in the following archive (once uncompressed) can be used to test the transformation using the `partition_html` function from the VLM partitioner: [recalibrating-risk-report.pdf.json.html.zip](https://github.com/user-attachments/files/19330528/recalibrating-risk-report.pdf.json.html.zip)
1 parent 7de630e commit 0fa5174

File tree

4 files changed

+32
-2
lines changed

4 files changed

+32
-2
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.17.2
2+
3+
* Fix an image inside a `<div>` or `<span>` with no text being parsed as "UncategorizedText" with an empty `.text`; it is now annotated as `Image`
4+
15
## 0.17.1
26

37
### Enhancements

test_unstructured/partition/html/test_html_to_ontology_parsing.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from bs4 import BeautifulSoup
22

3-
from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
3+
from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page
44
from unstructured.partition.html.html_utils import indent_html
55
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
66

@@ -672,3 +672,24 @@ def test_get_text_when_recursion_limit_activated():
672672
last_child = last_child.children[0]
673673

674674
assert last_child.to_text() == "some text"
675+
676+
677+
def test_uncategorizedtest_has_image_and_no_text():
678+
# language=HTML
679+
base_html = _wrap_with_body(
680+
"""
681+
<div class="Page">
682+
<div class="UncategorizedText">
683+
<img src="https://www.example.com/image.jpg"/>
684+
</div>
685+
</div>
686+
"""
687+
)
688+
689+
base_html = indent_html(base_html)
690+
691+
ontology: OntologyElement = parse_html_to_ontology(base_html)
692+
693+
element = ontology.children[0].children[0]
694+
assert type(element) is Image
695+
assert element.css_class_name == "Image"

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.17.1" # pragma: no cover
1+
__version__ = "0.17.2" # pragma: no cover

unstructured/partition/html/transformations.py

+5
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,11 @@ def extract_tag_and_ontology_class_from_tag(
437437
html_tag = "span"
438438
element_class = ontology.UncategorizedText
439439

440+
# Scenario 5: UncategorizedText has image and no text
441+
# Typically, this happens with a span or div tag with an image inside
442+
if element_class == ontology.UncategorizedText and soup.find("img") and not soup.text.strip():
443+
element_class = ontology.Image
444+
440445
return html_tag, element_class
441446

442447

0 commit comments

Comments
 (0)