Skip to content

Commit c2d17b1

Browse files
authored
Fix extracting value from field (#3774)
1 parent 66d1e5a commit c2d17b1

File tree

5 files changed

+27
-5
lines changed

5 files changed

+27
-5
lines changed

Diff for: CHANGELOG.md

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
## 0.16.5-dev0
1+
## 0.16.5-dev1
22

33
### Enhancements
44

Diff for: test_unstructured/documents/test_ontology_to_unstructured_parsing.py

+4
Original file line number | Diff line number | Diff line change
@@ -201,6 +201,10 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p
201201

202202
for i in range(len(expected_json_elements)):
203203
assert expected_json_elements[i] == predicted_elements[i]
204+
assert (
205+
expected_json_elements[i].metadata.text_as_html
206+
== predicted_elements[i].metadata.text_as_html
207+
)
204208

205209

206210
def test_inline_elements_are_squeezed():

Diff for: test_unstructured/partition/html/test_html_to_ontology_parsing.py

+15
Original file line number | Diff line number | Diff line change
@@ -607,6 +607,21 @@ def test_text_in_form_field_value():
607607
assert form_field_value.to_text() == "Random Input Value"
608608

609609

610+
def test_text_in_form_field_value_with_null_value():
611+
# language=HTML
612+
input_html = """
613+
<div class="Page">
614+
<input class="FormFieldValue" value=""/>
615+
</div>
616+
"""
617+
page = parse_html_to_ontology(input_html)
618+
619+
assert len(page.children) == 1
620+
form_field_value = page.children[0]
621+
assert form_field_value.text == ""
622+
assert form_field_value.to_text() == ""
623+
624+
610625
def test_to_text_when_form_field():
611626
ontology = Page(
612627
children=[

Diff for: unstructured/__version__.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.16.5-dev0" # pragma: no cover
1+
__version__ = "0.16.5-dev1" # pragma: no cover

Diff for: unstructured/documents/ontology.py

+6 -3
Original file line number | Diff line number | Diff line change
@@ -93,7 +93,7 @@ def to_text(self, add_children=True) -> str:
9393
if self.children and add_children:
9494
children_text = " ".join(child.to_text().strip() for child in self.children)
9595
return children_text
96-
return BeautifulSoup(self.to_html()).get_text().strip()
96+
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
9797

9898
def _construct_attribute_string(self, attributes: dict) -> str:
9999
return " ".join(
@@ -466,8 +466,11 @@ class FormFieldValue(OntologyElement):
466466
allowed_tags: List[str] = Field(["input"], frozen=True)
467467

468468
def to_text(self, add_children=True) -> str:
469-
text = super().to_text() + self.additional_attributes.get("value", "")
470-
return text.strip()
469+
text = super().to_text()
470+
value = self.additional_attributes.get("value", "")
471+
if not value:
472+
return text
473+
return f"{text} {value}".strip()
471474

472475

473476
class Checkbox(OntologyElement):

0 commit comments

Comments
 (0)