Skip to content

Commit 07ccf46

Browse files
committed
Added to_html for table and form value
1 parent b931015 commit 07ccf46

File tree

1 file changed

+54
-19
lines changed

1 file changed

+54
-19
lines changed

unstructured/documents/ontology.py

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from enum import Enum
2121
from typing import List, Optional
2222

23+
from bs4 import BeautifulSoup
2324
from pydantic import BaseModel, Field
2425

2526

@@ -75,32 +76,36 @@ def generate_unique_id() -> str:
7576

7677
def to_html(self, add_children=True) -> str:
    """Render this element as an HTML string.

    Any ``class`` entry in ``additional_attributes`` is ignored; the class
    attribute is built from ``css_class_name`` and placed before the other
    attributes.

    Args:
        add_children: Forwarded to ``_generate_children_html``; when false,
            no child markup is included.

    Returns:
        The markup produced by ``_generate_final_html``.
    """
    # Filter into a fresh dict so the element's own attributes are not mutated.
    extra_attrs = {
        name: value
        for name, value in self.additional_attributes.items()
        if name != "class"
    }
    rendered_extras = self._construct_attribute_string(extra_attrs)

    class_part = ""
    if self.css_class_name:
        class_part = f'class="{self.css_class_name}"'

    # strip() removes the separator left over when either side is empty.
    tag_attributes = " ".join((class_part, rendered_extras)).strip()

    return self._generate_final_html(
        tag_attributes,
        self._generate_children_html(add_children),
    )
91+
92+
def _construct_attribute_string(self, attributes: dict) -> str:
93+
return " ".join(
94+
f'{key}="{value}"' if value else f"{key}" for key, value in attributes.items()
9395
)
94-
text = "" if not self.text else self.text
96+
97+
def _generate_children_html(self, add_children: bool) -> str:
98+
if not add_children or not self.children:
99+
return ""
100+
return "".join(child.to_html() for child in self.children)
101+
102+
def _generate_final_html(self, attr_str: str, children_html: str) -> str:
103+
text = self.text or ""
95104

96105
if text or children_html:
97-
# This is either one or another, never both
98-
result_html = (
99-
f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
100-
)
106+
return f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
101107
else:
102-
result_html = f"<{self.html_tag_name} {attr_str} />"
103-
return result_html
108+
return f"<{self.html_tag_name} {attr_str} />"
104109

105110
@property
106111
def id(self) -> str | None:
@@ -254,6 +259,24 @@ class Table(OntologyElement):
254259
elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True)
255260
allowed_tags: List[str] = Field(["table"], frozen=True)
256261

262+
def to_html(self, add_children=True) -> str:
    """Render the element via the parent class, then strip ids and classes.

    Args:
        add_children: Forwarded unchanged to the parent ``to_html``.

    Returns:
        The parent's markup after ``_remove_ids_and_classes`` has removed
        every ``id`` and ``class`` attribute from it.
    """
    return self._remove_ids_and_classes(super().to_html(add_children=add_children))
268+
269+
def _remove_ids_and_classes(self, html: str) -> str:
    """Return ``html`` with every ``id`` and ``class`` attribute removed.

    The markup is parsed with BeautifulSoup's ``html.parser`` and serialized
    back to a string once the attributes have been dropped.

    Args:
        html: Markup to clean.

    Returns:
        The re-serialized markup without ``id`` or ``class`` attributes.
    """
    document = BeautifulSoup(html, "html.parser")

    # find_all(True) matches every tag in the parsed tree.
    for element in document.find_all(True):
        for attr_name in ("id", "class"):
            element.attrs.pop(attr_name, None)

    return str(document)
279+
257280

258281
class TableBody(OntologyElement):
259282
description: str = Field("A body of the table", frozen=True)
@@ -446,6 +469,18 @@ def __init__(self, **kwargs):
446469
super().__init__(**kwargs)
447470
self.text = self.additional_attributes.get("value", "")
448471

472+
def to_html(self, add_children=True) -> str:
    """Render this element as a single self-closing tag.

    Only attributes are emitted: the class from ``css_class_name`` first,
    then ``additional_attributes`` without their ``class`` entry. No text or
    child markup is written, and ``add_children`` is accepted but not used.

    Returns:
        A string of the form ``<tag attributes />``.
    """
    # Work on a copy so the element's own attributes are not mutated.
    extra_attrs = copy(self.additional_attributes)
    if "class" in extra_attrs:
        del extra_attrs["class"]
    rendered_extras = self._construct_attribute_string(extra_attrs)

    class_part = f'class="{self.css_class_name}"' if self.css_class_name else ""

    # strip() removes the separator left over when either side is empty.
    tag_attributes = " ".join((class_part, rendered_extras)).strip()

    return f"<{self.html_tag_name} {tag_attributes} />"
483+
449484

450485
class Checkbox(OntologyElement):
451486
description: str = Field("A small box that can be checked or unchecked", frozen=True)

0 commit comments

Comments
 (0)