Skip to content

Commit 66d1e5a

Browse files
authored
Add max recursion limit and fix to_text() method (#3773)
1 parent df156eb commit 66d1e5a

File tree

6 files changed

+79
-21
lines changed

6 files changed

+79
-21
lines changed

Diff for: CHANGELOG.md

+11-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.16.5-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
- **Fix parsing in the HTML v2 parser.** A maximum recursion limit is now set, and the value is correctly extracted from the ontology element.
9+
10+
111
## 0.16.4
212

313
### Enhancements
@@ -9,7 +19,7 @@
919

1020
### Features
1121

12-
* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively.
22+
* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively.
1323

1424
### Fixes
1525

Diff for: test_unstructured/partition/html/test_html_to_ontology_parsing.py

+51-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from bs4 import BeautifulSoup
22

3-
from unstructured.documents.ontology import OntologyElement
3+
from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page
44
from unstructured.partition.html.html_utils import indent_html
5-
from unstructured.partition.html.transformations import parse_html_to_ontology
5+
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
66

77

88
def _wrap_with_body(html: str) -> str:
@@ -605,3 +605,52 @@ def test_text_in_form_field_value():
605605
form_field_value = page.children[0]
606606
assert form_field_value.text == ""
607607
assert form_field_value.to_text() == "Random Input Value"
608+
609+
610+
def test_to_text_when_form_field():
611+
ontology = Page(
612+
children=[
613+
Form(
614+
tag="input",
615+
additional_attributes={"value": "Random Input Value"},
616+
children=[
617+
FormFieldValue(
618+
tag="input",
619+
additional_attributes={"value": "Random Input Value"},
620+
)
621+
],
622+
)
623+
]
624+
)
625+
assert ontology.to_text(add_children=True) == "Random Input Value"
626+
627+
628+
def test_recursion_limit_is_limiting_parsing():
629+
# language=HTML
630+
broken_html = "some text"
631+
for i in range(100):
632+
broken_html = f"<p class='Paragraph'>{broken_html}</p>"
633+
broken_html = _wrap_with_body(broken_html)
634+
ontology = parse_html_to_ontology(broken_html)
635+
636+
iterator = 1
637+
last_child = ontology.children[0]
638+
while last_child.children:
639+
last_child = last_child.children[0]
640+
iterator += 1
641+
assert last_child.text.startswith('<p class="Paragraph">')
642+
assert iterator == RECURSION_LIMIT
643+
644+
645+
def test_get_text_when_recursion_limit_activated():
646+
broken_html = "some text"
647+
for i in range(100):
648+
broken_html = f"<p class='Paragraph'>{broken_html}</p>"
649+
broken_html = _wrap_with_body(broken_html)
650+
ontology = parse_html_to_ontology(broken_html)
651+
652+
last_child = ontology.children[0]
653+
while last_child.children:
654+
last_child = last_child.children[0]
655+
656+
assert last_child.to_text() == "some text"

Diff for: test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ def test_forms():
274274
assert expected_html == parsed_html
275275
expected_elements = _page_elements + [
276276
Text(
277-
text="Option 1 (Checked)",
277+
text="2 Option 1 (Checked)",
278278
element_id="2",
279279
detection_origin="vlm_partitioner",
280280
metadata=ElementMetadata(

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.4" # pragma: no cover
1+
__version__ = "0.16.5-dev0" # pragma: no cover

Diff for: unstructured/documents/ontology.py

+7-12
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class ElementTypeEnum(str, Enum):
4242

4343

4444
class OntologyElement(BaseModel):
45-
text: Optional[str] = Field(None, description="Text content of the element")
45+
text: Optional[str] = Field("", description="Text content of the element")
4646
css_class_name: Optional[str] = Field(
4747
default_factory=lambda: "", description="CSS class associated with the element"
4848
)
@@ -90,7 +90,10 @@ def to_html(self, add_children=True) -> str:
9090
return result_html
9191

9292
def to_text(self, add_children=True) -> str:
93-
return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings)
93+
if self.children and add_children:
94+
children_text = " ".join(child.to_text().strip() for child in self.children)
95+
return children_text
96+
return BeautifulSoup(self.to_html()).get_text().strip()
9497

9598
def _construct_attribute_string(self, attributes: dict) -> str:
9699
return " ".join(
@@ -450,15 +453,6 @@ class Form(OntologyElement):
450453
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
451454
allowed_tags: List[str] = Field(["form"], frozen=True)
452455

453-
def to_text(self, add_children=True) -> str:
454-
texts = [self.text] if self.text else []
455-
456-
if add_children:
457-
for child in self.children:
458-
texts.append(child.to_text(add_children=True))
459-
460-
return " ".join(filter(None, texts)).strip()
461-
462456

463457
class FormField(OntologyElement):
464458
description: str = Field("A property value of a form", frozen=True)
@@ -472,7 +466,8 @@ class FormFieldValue(OntologyElement):
472466
allowed_tags: List[str] = Field(["input"], frozen=True)
473467

474468
def to_text(self, add_children=True) -> str:
475-
return super().to_text() + self.additional_attributes.get("value", "")
469+
text = super().to_text() + self.additional_attributes.get("value", "")
470+
return text.strip()
476471

477472

478473
class Checkbox(OntologyElement):

Diff for: unstructured/partition/html/transformations.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
UncategorizedText,
3737
)
3838

39+
RECURSION_LIMIT = 50
40+
3941

4042
def ontology_to_unstructured_elements(
4143
ontology_element: OntologyElement,
@@ -68,7 +70,7 @@ def ontology_to_unstructured_elements(
6870
list[Element]: A list of unstructured Element objects.
6971
"""
7072
elements_to_return = []
71-
if ontology_element.elementType == ElementTypeEnum.layout:
73+
if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
7274

7375
if page_number is None and isinstance(ontology_element, Page):
7476
page_number = ontology_element.page_number
@@ -354,7 +356,7 @@ def remove_empty_tags(soup):
354356
return str(soup)
355357

356358

357-
def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
359+
def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None:
358360
"""
359361
Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
360362
First tries to recognize a class from Unstructured Ontology, then if class is matched tries
@@ -364,6 +366,7 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
364366
365367
Args:
366368
soup (Tag): The BeautifulSoup Tag object to be converted.
369+
recursion_depth (int): Current recursion depth (starts at 1); child tags are no longer parsed once it exceeds RECURSION_LIMIT.
367370
368371
Returns:
369372
OntologyElement: The converted OntologyElement object.
@@ -384,12 +387,13 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
384387
and any(isinstance(content, Tag) for content in soup.contents)
385388
or ontology_class().elementType == ElementTypeEnum.layout
386389
)
390+
should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT
387391

388-
if has_children:
392+
if should_unwrap_html:
389393
text = ""
390394
children = [
391395
(
392-
parse_html_to_ontology_element(child)
396+
parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1)
393397
if isinstance(child, Tag)
394398
else Paragraph(text=str(child).strip())
395399
)

0 commit comments

Comments
 (0)