Skip to content

Commit 73d239f

Browse files
authored
feat: keep img tag's class attr (#4050)
This change affects HTML partitioning. Previously, when there was a table in the HTML, we stripped the `class` and `id` attributes from every tag inside the table. However, tables sometimes contain images (`img` tags) whose `class` attribute carries important information about the image. This change preserves the `class` attribute for `img` tags inside a table. The change is reflected in a table element's `metadata.text_as_html` attribute.
1 parent 7764fb6 commit 73d239f

File tree

4 files changed

+50
-5
lines changed

4 files changed

+50
-5
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
## 0.18.5-dev1
1+
## 0.18.5
22

33
### Enhancements
44

55
- **Bump dependencies and remove lingering Python 3.9 artifacts** Cleaned up some references to 3.9 that were left over when we dropped Python 3.9 support.
6+
- **`text_as_html` for Table element now keeps `img` tag's `class` attribute** Previously, when partitioning HTML, any tag inside a table was stripped of its `class` attribute. Now this attribute is preserved for `img` tags in the table element's `metadata.text_as_html`.
67

78
### Features
89

test_unstructured/documents/test_ontology_to_unstructured_parsing.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from pathlib import Path
22

33
import pytest
4+
from bs4 import BeautifulSoup
45

56
from unstructured.chunking.basic import chunk_elements
67
from unstructured.chunking.title import chunk_by_title
@@ -13,6 +14,7 @@
1314
Paragraph,
1415
Section,
1516
Table,
17+
remove_ids_and_class_from_table,
1618
)
1719
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
1820
from unstructured.partition.html import partition_html
@@ -24,6 +26,37 @@
2426
from unstructured.staging.base import elements_from_json
2527

2628

29+
def test_remove_ids_and_class_from_table():
30+
html_text = """
31+
<table>
32+
<tr class="TableRow">
33+
<td><img class="Signature" alt="cell 1"/></td>
34+
<td>cell 2</td>
35+
</tr>
36+
<tr>
37+
<td><IMG class="Signature" alt="cell 3"/></td>
38+
<td>cell 4</td>
39+
</tr>
40+
</table>
41+
"""
42+
soup = BeautifulSoup(html_text, "html.parser")
43+
assert (
44+
str(remove_ids_and_class_from_table(soup))
45+
== """
46+
<table>
47+
<tr>
48+
<td><img alt="cell 1" class="Signature"/></td>
49+
<td>cell 2</td>
50+
</tr>
51+
<tr>
52+
<td><img alt="cell 3" class="Signature"/></td>
53+
<td>cell 4</td>
54+
</tr>
55+
</table>
56+
"""
57+
)
58+
59+
2760
def test_page_number_is_passed_correctly():
2861
ontology = Document(
2962
children=[

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.5-dev1" # pragma: no cover
1+
__version__ = "0.18.5" # pragma: no cover

unstructured/documents/ontology.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,23 @@ def page_number(self) -> int | None:
147147
return None
148148

149149

150-
def remove_ids_and_class_from_table(soup: BeautifulSoup):
150+
def remove_ids_and_class_from_table(soup: BeautifulSoup) -> BeautifulSoup:
151+
"""
152+
Remove id and class attributes from tags inside tables,
153+
except preserve class attributes for img tags.
154+
155+
Args:
156+
soup: BeautifulSoup object containing the HTML
157+
158+
Returns:
159+
BeautifulSoup: Modified soup with attributes removed
160+
"""
151161
for tag in soup.find_all(True):
152-
if tag.name == "table": # type: ignore
162+
if tag.name.lower() == "table": # type: ignore
153163
continue # We keep table tag
154-
tag.attrs.pop("class", None) # type: ignore
155164
tag.attrs.pop("id", None) # type: ignore
165+
if tag.name.lower() != "img": # type: ignore
166+
tag.attrs.pop("class", None) # type: ignore
156167
return soup
157168

158169

0 commit comments

Comments
 (0)