Skip to content

Commit 8653c59

Browse files
Fix text_as_html appearing in every element metadata (#319)
This PR fixes the issue described here: Unstructured-IO/unstructured#2463. Now `text_as_html` will only be set for elements whose text is an HTML string (contains HTML tags). E.g. output for a **non**-HTML element: ```json { "element_id": "4a44dc15364204a80fe80e9039455cc1", "metadata": { "coordinates": { "layout_height": 3301, "layout_width": 2550, "points": [ [170, 13], [170, 140], [427, 140], [427, 13] ], "system": "PixelSpace" }, "file_directory": "/home/ubuntu/Documents", "filename": "purchasing-payment-policy-10.pdf", "filetype": "application/pdf", "languages": ["eng"], "last_modified": "2024-02-02T11:49:38", "page_number": 1, "parent_id": "e3b0c44298fc1c149afbf4c8996fb924" }, "text": "10", "type": "UncategorizedText" } ``` E.g. output for an HTML element: ```json { "element_id": "398766f59dd6b37bd38b6d612159cd3e", "metadata": { "coordinates": { "layout_height": 3301, "layout_width": 2550, "points": [ [433, 2180], [433, 2181], [2290, 2181], [2290, 2180] ], "system": "PixelSpace" }, "file_directory": "/home/ubuntu/Documents", "filename": "purchasing-payment-policy-10.pdf", "filetype": "application/pdf", "languages": ["eng"], "last_modified": "2024-02-02T11:49:38", "page_number": 1, "text_as_html": "<table><tbody><tr><td></td><td> Subject Matter Expert / Department</td><td> Contract Review Responsibility</td><td></td></tr><tbody></table>" }, "text": "Subject Matter Expert / Department Contract Review Responsibility", "type": "Table" } ```
1 parent ed5f2c2 commit 8653c59

File tree

4 files changed

+38
-8
lines changed

4 files changed

+38
-8
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.7.24
2+
3+
* fix: assign value to `text_as_html` element attribute only if `text` attribute contains HTML tags.
4+
15
## 0.7.23
26

37
* fix: added handling in `UnstructuredTableTransformerModel` for if `recognize` returns an empty

test_unstructured_inference/models/test_chippermodel.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pytest
44
import torch
55
from PIL import Image
6+
from unstructured_inference.inference.layoutelement import LayoutElement
67

78
from unstructured_inference.models import chipper
89
from unstructured_inference.models.base import get_model
@@ -422,3 +423,26 @@ def test_check_overlap(bbox1, bbox2, output):
422423
model = get_model("chipper")
423424

424425
assert model.check_overlap(bbox1, bbox2) == output
426+
427+
428+
def test_format_table_elements():
429+
table_html = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td></tr></table>"
430+
texts = [
431+
"Text",
432+
" - List element",
433+
table_html,
434+
None,
435+
]
436+
elements = [LayoutElement(bbox=mock.MagicMock(), text=text) for text in texts]
437+
formatted_elements = chipper.UnstructuredChipperModel.format_table_elements(elements)
438+
text_attributes = [fe.text for fe in formatted_elements]
439+
text_as_html_attributes = [
440+
fe.text_as_html if hasattr(fe, "text_as_html") else None for fe in formatted_elements
441+
]
442+
assert text_attributes == [
443+
"Text",
444+
" - List element",
445+
"Cell 1Cell 2Cell 3",
446+
None,
447+
]
448+
assert text_as_html_attributes == [None, None, table_html, None]
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.23" # pragma: no cover
1+
__version__ = "0.7.24" # pragma: no cover

unstructured_inference/models/chipper.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -171,16 +171,18 @@ def predict(self, image) -> List[LayoutElement]:
171171
return elements
172172

173173
@staticmethod
174-
def format_table_elements(elements):
175-
"""makes chipper table element return the same as other layout models
174+
def format_table_elements(elements: List[LayoutElement]) -> List[LayoutElement]:
175+
"""Makes chipper table element return the same as other layout models.
176176
177-
- copies the html representation to attribute text_as_html
178-
- strip html tags from the attribute text
177+
1. If `text` attribute is an html (has html tags in it), copies the `text`
178+
attribute to `text_as_html` attribute.
179+
2. Strips html tags from the `text` attribute.
179180
"""
180181
for element in elements:
181-
element.text_as_html = element.text
182-
element.text = strip_tags(element.text)
183-
182+
text = strip_tags(element.text) if element.text is not None else element.text
183+
if text != element.text:
184+
element.text_as_html = element.text # type: ignore[attr-defined]
185+
element.text = text
184186
return elements
185187

186188
def predict_tokens(

0 commit comments

Comments
 (0)