Skip to content

Commit e48d79e

Browse files
authored
image alt support (#3797)
1 parent 626f73a commit e48d79e

File tree

10 files changed

+192
-12
lines changed

10 files changed

+192
-12
lines changed

CHANGELOG.md

+9
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.16.7
2+
3+
### Enhancements
4+
- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. When set to `to_text` (the default), the alt text of `<img>` HTML tags is included in the text of the resulting elements.
5+
6+
### Features
7+
8+
### Fixes
9+
110
## 0.16.6
211

312
### Enhancements
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
2+
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
3+
<header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
4+
<img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
5+
<img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
6+
</header>
7+
</div>
8+
</body>

test_unstructured/documents/test_ontology_to_unstructured_parsing.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
181181
[
182182
("html_files/example.html", "unstructured_json_output/example.json"),
183183
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
184+
(
185+
"html_files/example_with_alternative_text.html",
186+
"unstructured_json_output/example_with_alternative_text.json",
187+
),
184188
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
185189
(
186190
"html_files/example_with_inline_fields.html",
@@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
191195
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
192196
html_file_path = Path(__file__).parent / html_file_path
193197
json_file_path = Path(__file__).parent / json_file_path
194-
195198
expected_json_elements = elements_from_json(str(json_file_path))
196199
html_code = html_file_path.read_text()
197200

198201
predicted_elements = partition_html(
199202
text=html_code, html_parser_version="v2", unique_element_ids=True
200203
)
204+
201205
assert len(expected_json_elements) == len(predicted_elements)
202206

203207
for i in range(len(expected_json_elements)):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
[
2+
{
3+
"element_id": "3a6b156a81764e17be128264241f8136",
4+
"metadata": {
5+
"category_depth": 0,
6+
"filetype": "text/html",
7+
"languages": [
8+
"eng"
9+
],
10+
"page_number": 1,
11+
"parent_id": "897a8a47377c4ad6aab839a929879537",
12+
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
13+
},
14+
"text": "",
15+
"type": "UncategorizedText"
16+
},
17+
{
18+
"element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
19+
"metadata": {
20+
"category_depth": 1,
21+
"filetype": "text/html",
22+
"languages": [
23+
"eng"
24+
],
25+
"page_number": 1,
26+
"parent_id": "3a6b156a81764e17be128264241f8136",
27+
"text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
28+
},
29+
"text": "",
30+
"type": "UncategorizedText"
31+
},
32+
{
33+
"element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
34+
"metadata": {
35+
"category_depth": 2,
36+
"filetype": "text/html",
37+
"languages": [
38+
"eng"
39+
],
40+
"page_number": 1,
41+
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
42+
"text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
43+
},
44+
"text": "New York logo",
45+
"type": "Image"
46+
},
47+
{
48+
"element_id": "40c32fd8-9a02-42b8-a587-884293881090",
49+
"metadata": {
50+
"category_depth": 2,
51+
"filetype": "text/html",
52+
"languages": [
53+
"eng"
54+
],
55+
"page_number": 1,
56+
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
57+
"text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
58+
},
59+
"text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
60+
"type": "Image"
61+
}
62+
]

test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py

+18
Original file line numberDiff line numberDiff line change
@@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
555555
assert len(unstructured_elements) == 2
556556
assert isinstance(unstructured_elements[0], Text)
557557
assert isinstance(unstructured_elements[1], NarrativeText)
558+
559+
560+
def test_alternate_text_from_image_is_passed():
561+
# language=HTML
562+
input_html = """
563+
<div class="Page">
564+
<table>
565+
<tr>
566+
<td rowspan="2">Example image nested in the table:</td>
567+
<td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
568+
</tr>
569+
</table>
570+
</div>add_img_alt_text
571+
"""
572+
page = parse_html_to_ontology(input_html)
573+
unstructured_elements = ontology_to_unstructured_elements(page)
574+
assert len(unstructured_elements) == 2
575+
assert "ALT TEXT Logo" in unstructured_elements[1].text
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from unstructured.partition.html import partition_html
2+
3+
4+
def test_alternative_image_text_can_be_included():
5+
# language=HTML
6+
html = """
7+
<div class="Page">
8+
<img src="my-logo.png" alt="ALT TEXT Logo"/>
9+
</div>
10+
"""
11+
_, image_to_text_alt_mode = partition_html(
12+
text=html,
13+
image_alt_mode="to_text",
14+
html_parser_version="v2",
15+
)
16+
assert "ALT TEXT Logo" in image_to_text_alt_mode.text
17+
18+
_, image_none_alt_mode = partition_html(
19+
text=html,
20+
image_alt_mode=None,
21+
html_parser_version="v2",
22+
)
23+
assert "ALT TEXT Logo" not in image_none_alt_mode.text
24+
25+
26+
def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
27+
# language=HTML
28+
html = """
29+
<div class="Page">
30+
<p class="Paragraph">
31+
<img src="my-logo.png" alt="ALT TEXT Logo"/>
32+
</p>
33+
</div>
34+
"""
35+
_, paragraph_to_text_alt_mode = partition_html(
36+
text=html,
37+
image_alt_mode="to_text",
38+
html_parser_version="v2",
39+
)
40+
assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text
41+
42+
_, paragraph_none_alt_mode = partition_html(
43+
text=html,
44+
image_alt_mode=None,
45+
html_parser_version="v2",
46+
)
47+
assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.6" # pragma: no cover
1+
__version__ = "0.16.7" # pragma: no cover

unstructured/documents/ontology.py

+21-5
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,27 @@ def to_html(self, add_children=True) -> str:
8989

9090
return result_html
9191

92-
def to_text(self, add_children=True) -> str:
92+
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
93+
"""
94+
Returns the text representation of the element.
95+
96+
Args:
97+
add_children: If True, the text of the children will be included.
98+
Otherwise, the text is extracted from the element's own `to_html()` output.
99+
add_img_alt_text: If True, the alt text of the image will be included.
100+
"""
93101
if self.children and add_children:
94-
children_text = " ".join(child.to_text().strip() for child in self.children)
102+
children_text = " ".join(
103+
child.to_text(add_children, add_img_alt_text).strip() for child in self.children
104+
)
95105
return children_text
96-
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
106+
107+
text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
108+
109+
if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
110+
text += f" {self.additional_attributes.get('alt', '')}"
111+
112+
return text.strip()
97113

98114
def _construct_attribute_string(self, attributes: dict) -> str:
99115
return " ".join(
@@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
473489
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
474490
allowed_tags: List[str] = Field(["input"], frozen=True)
475491

476-
def to_text(self, add_children=True) -> str:
477-
text = super().to_text()
492+
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
493+
text = super().to_text(add_children, add_img_alt_text)
478494
value = self.additional_attributes.get("value", "")
479495
if not value:
480496
return text

unstructured/partition/html/partition.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def partition_html(
3636
skip_headers_and_footers: bool = False,
3737
detection_origin: Optional[str] = None,
3838
html_parser_version: Literal["v1", "v2"] = "v1",
39+
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
3940
**kwargs: Any,
4041
) -> list[Element]:
4142
"""Partitions an HTML document into its constituent elements.
@@ -65,6 +66,9 @@ def partition_html(
6566
html_parser_version (Literal['v1', 'v2']):
6667
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
6768
use the ontology schema to parse the HTML document.
69+
70+
image_alt_mode (Literal['to_text']):
71+
When set to 'to_text', the v2 parser will include the alternative text of images in the output. When None, the alternative text is not included.
6872
"""
6973
# -- parser rejects an empty str, nip that edge-case in the bud here --
7074
if text is not None and text.strip() == "" and not file and not filename and not url:
@@ -81,6 +85,7 @@ def partition_html(
8185
skip_headers_and_footers=skip_headers_and_footers,
8286
detection_origin=detection_origin,
8387
html_parser_version=html_parser_version,
88+
image_alt_mode=image_alt_mode,
8489
)
8590

8691
return list(_HtmlPartitioner.iter_elements(opts))
@@ -102,6 +107,7 @@ def __init__(
102107
skip_headers_and_footers: bool,
103108
detection_origin: str | None,
104109
html_parser_version: Literal["v1", "v2"] = "v1",
110+
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
105111
):
106112
self._file_path = file_path
107113
self._file = file
@@ -113,6 +119,7 @@ def __init__(
113119
self._skip_headers_and_footers = skip_headers_and_footers
114120
self._detection_origin = detection_origin
115121
self._html_parser_version = html_parser_version
122+
self._image_alt_mode = image_alt_mode
116123

117124
@lazyproperty
118125
def detection_origin(self) -> str | None:
@@ -172,6 +179,11 @@ def html_parser_version(self) -> Literal["v1", "v2"]:
172179
"""When html_parser_version=='v2', HTML elements follow ontology schema."""
173180
return self._html_parser_version
174181

182+
@lazyproperty
183+
def add_img_alt_text(self) -> bool:
184+
"""When True, the alternative text of images is included in the output."""
185+
return self._image_alt_mode == "to_text"
186+
175187

176188
class _HtmlPartitioner:
177189
"""Partition HTML document into document-elements."""
@@ -239,5 +251,7 @@ def _from_ontology(self) -> List[Element]:
239251
"""Convert an ontology elements represented in HTML to an ontology element."""
240252
html_text = self._opts.html_text
241253
ontology = parse_html_to_ontology(html_text)
242-
unstructured_elements = ontology_to_unstructured_elements(ontology)
254+
unstructured_elements = ontology_to_unstructured_elements(
255+
ontology, add_img_alt_text=self._opts.add_img_alt_text
256+
)
243257
return unstructured_elements

unstructured/partition/html/transformations.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
2424
page_number: int = None,
2525
depth: int = 0,
2626
filename: str | None = None,
27+
add_img_alt_text: bool = True,
2728
) -> list[elements.Element]:
2829
"""
2930
Converts an OntologyElement object to a list of unstructured Element objects.
@@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
4445
parent_id (str, optional): The ID of the parent element. Defaults to None.
4546
page_number (int, optional): The page number of the element. Defaults to None.
4647
depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
47-
48+
filename (str, optional): The name of the file the element comes from. Defaults to None.
49+
add_img_alt_text (bool): Whether to include the alternative text of images
50+
in the output. Defaults to True.
4851
Returns:
4952
list[Element]: A list of unstructured Element objects.
5053
"""
@@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
7780
page_number=page_number,
7881
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
7982
filename=filename,
83+
add_img_alt_text=add_img_alt_text,
8084
)
8185
children += child
8286

@@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
8589
else:
8690
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
8791
html_code_of_ontology_element = ontology_element.to_html()
88-
element_text = ontology_element.to_text()
92+
element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)
8993

9094
unstructured_element = element_class(
9195
text=element_text,
@@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
278282
Args:
279283
html_code (str): The HTML code to be parsed.
280284
Parsing HTML will start from <div class="Page">.
281-
282285
Returns:
283286
OntologyElement: The parsed Element object.
284287
@@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
352355
Args:
353356
soup (Tag): The BeautifulSoup Tag object to be converted.
354357
recursion_depth (int): Flag to control limit of recursion depth.
355-
356358
Returns:
357359
OntologyElement: The converted OntologyElement object.
358360
"""

0 commit comments

Comments
 (0)