Skip to content

Commit 26c8714

Browse files
committed
Fix unit tests for 3.9 and rename param
1 parent 016e851 commit 26c8714

File tree

3 files changed

+18
-12
lines changed

3 files changed

+18
-12
lines changed

test_unstructured/documents/test_ontology_to_unstructured_parsing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p
172172
expected_json_elements = elements_from_json(json_file_path)
173173
html_code = Path(html_file_path).read_text()
174174

175-
predicted_elements = partition_html(text=html_code, contains_ontology_schema=True)
175+
predicted_elements = partition_html(text=html_code, html_parser_version="v2")
176176
assert len(expected_json_elements) == len(predicted_elements)
177177

178178
for i in range(len(expected_json_elements)):

unstructured/documents/ontology.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
allowed html tags, css classes and descriptions as metadata.
1414
"""
1515

16+
from __future__ import annotations
17+
1618
import uuid
1719
from copy import copy
1820
from enum import Enum

unstructured/partition/html/partition.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from __future__ import annotations
66

7-
from typing import IO, Any, Iterator, List, Optional, cast
7+
from typing import IO, Any, Iterator, List, Literal, Optional, cast
88

99
import requests
1010
from lxml import etree
@@ -35,7 +35,7 @@ def partition_html(
3535
ssl_verify: bool = True,
3636
skip_headers_and_footers: bool = False,
3737
detection_origin: Optional[str] = None,
38-
contains_ontology_schema: bool = False,
38+
html_parser_version: Literal["v1", "v2"] = "v1",
3939
**kwargs: Any,
4040
) -> list[Element]:
4141
"""Partitions an HTML document into its constituent elements.
@@ -61,6 +61,10 @@ def partition_html(
6161
The encoding method used to decode the text input. If None, utf-8 will be used.
6262
skip_headers_and_footers
6363
If True, ignores any content that is within <header> or <footer> tags
64+
65+
html_parser_version (Literal['v1', 'v2']):
66+
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
67+
use the ontology schema to parse the HTML document.
6468
"""
6569
# -- parser rejects an empty str, nip that edge-case in the bud here --
6670
if text is not None and text.strip() == "" and not file and not filename and not url:
@@ -76,7 +80,7 @@ def partition_html(
7680
ssl_verify=ssl_verify,
7781
skip_headers_and_footers=skip_headers_and_footers,
7882
detection_origin=detection_origin,
79-
contains_ontology_schema=contains_ontology_schema,
83+
html_parser_version=html_parser_version,
8084
)
8185

8286
return list(_HtmlPartitioner.iter_elements(opts))
@@ -97,7 +101,7 @@ def __init__(
97101
ssl_verify: bool,
98102
skip_headers_and_footers: bool,
99103
detection_origin: str | None,
100-
contains_ontology_schema: bool,
104+
html_parser_version: Literal["v1", "v2"] = "v1",
101105
):
102106
self._file_path = file_path
103107
self._file = file
@@ -108,7 +112,7 @@ def __init__(
108112
self._ssl_verify = ssl_verify
109113
self._skip_headers_and_footers = skip_headers_and_footers
110114
self._detection_origin = detection_origin
111-
self._contains_ontology_schema = contains_ontology_schema
115+
self._html_parser_version = html_parser_version
112116

113117
@lazyproperty
114118
def detection_origin(self) -> str | None:
@@ -164,9 +168,9 @@ def skip_headers_and_footers(self) -> bool:
164168
return self._skip_headers_and_footers
165169

166170
@lazyproperty
167-
def contains_ontology_schema(self) -> bool:
168-
"""When True, HTML elements follow ontology schema."""
169-
return self._contains_ontology_schema
171+
def html_parser_version(self) -> Literal["v1", "v2"]:
172+
"""HTML parser version to use; "v2" means the HTML elements follow the ontology schema."""
173+
return self._html_parser_version
170174

171175

172176
class _HtmlPartitioner:
@@ -186,9 +190,9 @@ def _iter_elements(self) -> Iterator[Element]:
186190
Elements appear in document order.
187191
"""
188192
elements_iter = (
189-
self._from_ontology
190-
if self._opts.contains_ontology_schema
191-
else self._main.iter_elements()
193+
self._main.iter_elements()
194+
if self._opts.html_parser_version == "v1"
195+
else self._from_ontology
192196
)
193197

194198
for e in elements_iter:

0 commit comments

Comments
 (0)