44
55from __future__ import annotations
66
7- from typing import IO , Any , Iterator , List , Optional , cast
7+ from typing import IO , Any , Iterator , List , Literal , Optional , cast
88
99import requests
1010from lxml import etree
@@ -35,7 +35,7 @@ def partition_html(
3535 ssl_verify : bool = True ,
3636 skip_headers_and_footers : bool = False ,
3737 detection_origin : Optional [str ] = None ,
38- contains_ontology_schema : bool = False ,
38+ html_parser_version : Literal [ "v1" , "v2" ] = "v1" ,
3939 ** kwargs : Any ,
4040) -> list [Element ]:
4141 """Partitions an HTML document into its constituent elements.
@@ -61,6 +61,10 @@ def partition_html(
6161 The encoding method used to decode the text input. If None, utf-8 will be used.
6262 skip_headers_and_footers
6363 If True, ignores any content that is within <header> or <footer> tags
64+
65+ html_parser_version
66+ The version of the HTML parser to use, either 'v1' (the default) or 'v2'. When 'v2', the
67+ parser uses the ontology schema to parse the HTML document.
6468 """
6569 # -- parser rejects an empty str, nip that edge-case in the bud here --
6670 if text is not None and text .strip () == "" and not file and not filename and not url :
@@ -76,7 +80,7 @@ def partition_html(
7680 ssl_verify = ssl_verify ,
7781 skip_headers_and_footers = skip_headers_and_footers ,
7882 detection_origin = detection_origin ,
79- contains_ontology_schema = contains_ontology_schema ,
83+ html_parser_version = html_parser_version ,
8084 )
8185
8286 return list (_HtmlPartitioner .iter_elements (opts ))
@@ -97,7 +101,7 @@ def __init__(
97101 ssl_verify : bool ,
98102 skip_headers_and_footers : bool ,
99103 detection_origin : str | None ,
100- contains_ontology_schema : bool ,
104+ html_parser_version : Literal [ "v1" , "v2" ] = "v1" ,
101105 ):
102106 self ._file_path = file_path
103107 self ._file = file
@@ -108,7 +112,7 @@ def __init__(
108112 self ._ssl_verify = ssl_verify
109113 self ._skip_headers_and_footers = skip_headers_and_footers
110114 self ._detection_origin = detection_origin
111- self ._contains_ontology_schema = contains_ontology_schema
115+ self ._html_parser_version = html_parser_version
112116
113117 @lazyproperty
114118 def detection_origin (self ) -> str | None :
@@ -164,9 +168,9 @@ def skip_headers_and_footers(self) -> bool:
164168 return self ._skip_headers_and_footers
165169
166170 @lazyproperty
167- def contains_ontology_schema (self ) -> bool :
168- """When True , HTML elements follow ontology schema."""
169- return self ._contains_ontology_schema
171+ def html_parser_version (self ) -> Literal [ "v1" , "v2" ] :
172+ """HTML parser version to use; when it is 'v2' , HTML elements follow ontology schema."""
173+ return self ._html_parser_version
170174
171175
172176class _HtmlPartitioner :
@@ -186,9 +190,9 @@ def _iter_elements(self) -> Iterator[Element]:
186190 Elements appear in document order.
187191 """
188192 elements_iter = (
189- self ._from_ontology
190- if self ._opts .contains_ontology_schema
191- else self ._main . iter_elements ()
193+ self ._from_ontology
194+ if self ._opts .html_parser_version == "v2"
195+ else self ._main . iter_elements ()
192196 )
193197
194198 for e in elements_iter :
0 commit comments