Image within div or span with no text is annotated as Image (#3962)

ajjimeno · web-flow · commit 0fa5174bd741 · 2025-03-20T04:09:02.000Z
Ticket: https://unstructured-ai.atlassian.net/browse/ML-942 The HTML document in the following archive (once uncompressed) can be used to test the transformation with the `partition_html` function from the VLM partitioner: [recalibrating-risk-report.pdf.json.html.zip](https://github.com/user-attachments/files/19330528/recalibrating-risk-report.pdf.json.html.zip)
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.17.2
+
+* Fix: an image inside a `<div>` or `<span>` tag that has no text is now categorized as "Image" instead of "UncategorizedText" with an empty `.text`
+
 ## 0.17.1
 
 ### Enhancements
diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
 
-from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
+from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page
 from unstructured.partition.html.html_utils import indent_html
 from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
 
@@ -672,3 +672,24 @@ def test_get_text_when_recursion_limit_activated():
         last_child = last_child.children[0]
 
     assert last_child.to_text() == "some text"
+
+
+def test_uncategorizedtest_has_image_and_no_text():
+    # language=HTML
+    base_html = _wrap_with_body(
+        """
+        <div class="Page">
+    <div class="UncategorizedText">
+        <img src="https://www.example.com/image.jpg"/>
+    </div>
+    </div>
+    """
+    )
+
+    base_html = indent_html(base_html)
+
+    ontology: OntologyElement = parse_html_to_ontology(base_html)
+
+    element = ontology.children[0].children[0]
+    assert type(element) is Image
+    assert element.css_class_name == "Image"
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.17.1"  # pragma: no cover
+__version__ = "0.17.2"  # pragma: no cover
diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py
@@ -437,6 +437,11 @@ def extract_tag_and_ontology_class_from_tag(
         html_tag = "span"
         element_class = ontology.UncategorizedText
 
+    # Scenario 5: an UncategorizedText element contains an <img> and no text.
+    # Typically this is a span or div tag wrapping only an image, so it is reclassified as Image.
+    if element_class == ontology.UncategorizedText and soup.find("img") and not soup.text.strip():
+        element_class = ontology.Image
+
     return html_tag, element_class
 
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = "0.17.1" # pragma: no cover` |
| | `1` | `+__version__ = "0.17.2" # pragma: no cover` |