Skip to content

Commit 6966178

Browse files
authored
fix: track narrative text and figure captions in HTML documents (#309)
* fix for missing narrative text in partition_html
* fixes so existing tests pass
* tests for figure caption and narrative text
* bump version; changelog
1 parent e52dd5c commit 6966178

File tree

5 files changed

+37
-14
lines changed

5 files changed

+37
-14
lines changed

Diff for: CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.4.17-dev1
1+
## 0.5.0
22

33
### Enhancements
44

@@ -12,6 +12,8 @@ instantiating a class or running a function
1212
### Fixes
1313

1414
* Fix `process_document` file cleaning on failure
15+
* Fixes an error introduced in the metadata tracking commit that caused `NarrativeText`
16+
and `FigureCaption` elements to be represented as `Text` in HTML documents.
1517

1618
## 0.4.16
1719

Diff for: test_unstructured/partition/test_common.py

+26
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,19 @@ def test_normalize_layout_element_dict_caption():
3434
)
3535

3636

37+
def test_normalize_layout_element_dict_figure_caption():
38+
layout_element = {
39+
"type": "FigureCaption",
40+
"coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
41+
"text": "Some lovely text",
42+
}
43+
element = common.normalize_layout_element(layout_element)
44+
assert element == FigureCaption(
45+
text="Some lovely text",
46+
coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
47+
)
48+
49+
3750
def test_normalize_layout_element_dict_misc():
3851
layout_element = {
3952
"type": "Misc",
@@ -57,6 +70,19 @@ def test_normalize_layout_element_layout_element():
5770
)
5871

5972

73+
def test_normalize_layout_element_layout_element_narrative_text():
74+
layout_element = LayoutElement(
75+
type="NarrativeText",
76+
coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
77+
text="Some lovely text",
78+
)
79+
element = common.normalize_layout_element(layout_element)
80+
assert element == NarrativeText(
81+
text="Some lovely text",
82+
coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
83+
)
84+
85+
6086
def test_normalize_layout_element_checked_box():
6187
layout_element = LayoutElement(
6288
type="Checked",

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.17-dev1" # pragma: no cover
1+
__version__ = "0.5.0" # pragma: no cover

Diff for: unstructured/documents/elements.py

+2
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,8 @@ def __init__(
201201
TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
202202
"UncategorizedText": Text,
203203
"FigureCaption": FigureCaption,
204+
"Figure": FigureCaption,
205+
"Text": NarrativeText,
204206
"NarrativeText": NarrativeText,
205207
"ListItem": ListItem,
206208
"BulletedText": ListItem,

Diff for: unstructured/partition/common.py

+5-12
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,13 @@
22
from typing import List, Optional, Union
33

44
from unstructured.documents.elements import (
5+
TYPE_TO_TEXT_ELEMENT_MAP,
56
CheckBox,
67
Element,
78
ElementMetadata,
8-
FigureCaption,
99
ListItem,
10-
NarrativeText,
1110
PageBreak,
1211
Text,
13-
Title,
1412
)
1513
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
1614

@@ -31,20 +29,15 @@ def normalize_layout_element(layout_element) -> Union[Element, List[Element]]:
3129
coordinates = layout_dict.get("coordinates")
3230
element_type = layout_dict.get("type")
3331

34-
if element_type == "Title":
35-
return Title(text=text, coordinates=coordinates)
36-
elif element_type == "Text":
37-
return NarrativeText(text=text, coordinates=coordinates)
38-
elif element_type == "Figure":
39-
return FigureCaption(text=text, coordinates=coordinates)
40-
elif element_type == "List":
32+
if element_type == "List":
4133
return layout_list_to_list_items(text, coordinates)
34+
elif element_type in TYPE_TO_TEXT_ELEMENT_MAP:
35+
_element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type]
36+
return _element_class(text=text, coordinates=coordinates)
4237
elif element_type == "Checked":
4338
return CheckBox(checked=True, coordinates=coordinates)
4439
elif element_type == "Unchecked":
4540
return CheckBox(checked=False, coordinates=coordinates)
46-
elif element_type == "PageBreak":
47-
return PageBreak()
4841
else:
4942
return Text(text=text, coordinates=coordinates)
5043

0 commit comments

Comments
 (0)