Skip to content

Commit 4bcb46a

Browse files
committed
Added to_text to ontology element
1 parent f0f8d8d commit 4bcb46a

File tree

2 files changed

+8
-4
lines changed

2 files changed

+8
-4
lines changed

unstructured/documents/ontology.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -89,6 +89,9 @@ def to_html(self, add_children=True) -> str:
8989

9090
return result_html
9191

92+
def to_text(self, add_children=True) -> str:
93+
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
94+
9295
def _construct_attribute_string(self, attributes: dict) -> str:
9396
return " ".join(
9497
f'{key}="{value}"' if value else f"{key}" for key, value in attributes.items()
@@ -481,6 +484,9 @@ def to_html(self, add_children=True) -> str:
481484

482485
return f"<{self.html_tag_name} {combined_attr_str} />"
483486

487+
def to_text(self, add_children=True) -> str:
488+
return super().to_text() + self.text
489+
484490

485491
class Checkbox(OntologyElement):
486492
description: str = Field("A small box that can be checked or unchecked", frozen=True)

unstructured/partition/html/transformations.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,8 @@ def ontology_to_unstructured_elements(
9696
]
9797
element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name]
9898
html_code_of_ontology_element = ontology_element.to_html()
99-
element_text = (
100-
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
101-
)
102-
# TODO value attribute from form input should be added to the text
99+
element_text = ontology_element.to_text()
100+
103101
unstructured_element = element_class(
104102
text=element_text,
105103
element_id=ontology_element.id,

0 commit comments

Comments
 (0)