Skip to content

Commit 5a91f0c

Browse files
authored
Fix layout parsing (#3754)
1 parent 2417f8e commit 5a91f0c

File tree

4 files changed

+44
-9
lines changed

4 files changed

+44
-9
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.16.3-dev1
1+
## 0.16.3-dev2
22

33
### Enhancements
44

@@ -7,6 +7,7 @@
77
### Fixes
88

99
* **V2 elements without first parent ID can be parsed**
10+
* **Fix missing elements when a layout element is parsed in V2 ontology**
1011

1112

1213
## 0.16.2

Diff for: test_unstructured/partition/html/test_html_to_ontology_parsing.py

+33-2
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def test_when_class_is_missing_it_can_be_inferred_from_type():
5959
expected_html = _wrap_with_body(
6060
"""
6161
<div class="Page">
62-
<aside class='Sidebar'>Some text</aside>
62+
<aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
6363
</div>
6464
"""
6565
)
@@ -87,7 +87,7 @@ def test_when_class_is_wrong_tag_name_is_overwritten():
8787
expected_html = _wrap_with_body(
8888
"""
8989
<div class="Page">
90-
<aside class='Sidebar'>Some text</aside>
90+
<aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
9191
</div>
9292
"""
9393
)
@@ -535,6 +535,8 @@ def test_malformed_html():
535535
# language=HTML
536536
expected_html = """
537537
<body class="Document">
538+
539+
<p class="Paragraph">
538540
Unclosed comment
539541
<div class="">
540542
<p>
@@ -554,6 +556,7 @@ def test_malformed_html():
554556
<p>
555557
Paragraph with invalid characters: � � �
556558
</p>
559+
</p>
557560
</body>
558561
"""
559562

@@ -563,3 +566,31 @@ def test_malformed_html():
563566
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
564567

565568
assert parsed_ontology == expected_html
569+
570+
571+
def test_text_is_wrapped_inside_layout_element():
572+
# language=HTML
573+
base_html = _wrap_with_body(
574+
"""
575+
<div class="Page">
576+
Text
577+
</div>
578+
"""
579+
)
580+
base_html = indent_html(base_html)
581+
582+
# language=HTML
583+
expected_html = _wrap_with_body(
584+
"""
585+
<div class="Page">
586+
<p class='Paragraph'>Text</p>
587+
</div>
588+
"""
589+
)
590+
591+
expected_html = indent_html(expected_html)
592+
593+
ontology: OntologyElement = parse_html_to_ontology(base_html)
594+
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
595+
596+
assert parsed_ontology == expected_html

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.3-dev1" # pragma: no cover
1+
__version__ = "0.16.3-dev2" # pragma: no cover

Diff for: unstructured/partition/html/transformations.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,17 @@ def ontology_to_unstructured_elements(
7979
),
8080
)
8181
]
82-
82+
childreen = []
8383
for child in ontology_element.children:
84-
elements_to_return += ontology_to_unstructured_elements(
84+
childreen += ontology_to_unstructured_elements(
8585
child,
8686
parent_id=ontology_element.id,
8787
page_number=page_number,
8888
depth=0 if isinstance(ontology_element, Document) else depth + 1,
8989
filename=filename,
9090
)
91+
92+
elements_to_return += childreen
9193
else:
9294
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
9395
ontology_element.__class__.__name__
@@ -98,7 +100,6 @@ def ontology_to_unstructured_elements(
98100
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
99101
)
100102
# TODO value attribute from form input should be added to the text
101-
102103
unstructured_element = element_class(
103104
text=element_text,
104105
element_id=ontology_element.id,
@@ -255,8 +256,10 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
255256
additional_attributes=escaped_attrs,
256257
)
257258

258-
has_children = (ontology_class != UncategorizedText) and any(
259-
isinstance(content, Tag) for content in soup.contents
259+
has_children = (
260+
(ontology_class != UncategorizedText)
261+
and any(isinstance(content, Tag) for content in soup.contents)
262+
or ontology_class().elementType == ElementTypeEnum.layout
260263
)
261264

262265
if has_children:

0 commit comments

Comments
 (0)