Skip to content

Commit 60abac2

Browse files
asai95Viktor Zhemchuzhnikov
and
Viktor Zhemchuzhnikov
authored
feat: allow custom parsers in partition_html (#251)
This allows partition_html to use a custom XMLParser or HTMLParser, which is useful when one needs to pass additional arguments to these parsers (beyond the built-in remove_comments=True). --------- Co-authored-by: Viktor Zhemchuzhnikov <[email protected]>
1 parent 1b8bf31 commit 60abac2

File tree

3 files changed

+11
-6
lines changed

3 files changed

+11
-6
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.4.12-dev1
1+
## 0.4.12-dev2
22

33
* Adds console_entrypoint for unstructured-ingest, other structure/doc updates related to ingest.
4+
* Adds `parser` parameter to `partition_html`.
45

56
## 0.4.11
67

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.12" # pragma: no cover
1+
__version__ = "0.4.12-dev2" # pragma: no cover

Diff for: unstructured/partition/html.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from unstructured.documents.elements import Element
66
from unstructured.documents.html import HTMLDocument
7+
from unstructured.documents.xml import VALID_PARSERS
78
from unstructured.partition.common import add_element_metadata, document_to_element_list
89

910

@@ -14,6 +15,7 @@ def partition_html(
1415
url: Optional[str] = None,
1516
include_page_breaks: bool = False,
1617
include_metadata: bool = True,
18+
parser: VALID_PARSERS = None,
1719
) -> List[Element]:
1820
"""Partitions an HTML document into its constituent elements.
1921
@@ -32,12 +34,14 @@ def partition_html(
3234
include_metadata
3335
Optionally allows for excluding metadata from the output. Primarily intended
3436
for when partition_html is called in other partition bricks (like partition_email)
37+
parser
38+
The parser to use for parsing the HTML document. If None, default parser will be used.
3539
"""
3640
if not any([filename, file, text, url]):
3741
raise ValueError("One of filename, file, or text must be specified.")
3842

3943
if filename is not None and not file and not text and not url:
40-
document = HTMLDocument.from_file(filename)
44+
document = HTMLDocument.from_file(filename, parser=parser)
4145

4246
elif file is not None and not filename and not text and not url:
4347
file_content = file.read()
@@ -46,11 +50,11 @@ def partition_html(
4650
else:
4751
file_text = file_content
4852

49-
document = HTMLDocument.from_string(file_text)
53+
document = HTMLDocument.from_string(file_text, parser=parser)
5054

5155
elif text is not None and not filename and not file and not url:
5256
_text: str = str(text)
53-
document = HTMLDocument.from_string(_text)
57+
document = HTMLDocument.from_string(_text, parser=parser)
5458

5559
elif url is not None and not filename and not file and not text:
5660
response = requests.get(url)
@@ -61,7 +65,7 @@ def partition_html(
6165
if not content_type.startswith("text/html"):
6266
raise ValueError(f"Expected content type text/html. Got {content_type}.")
6367

64-
document = HTMLDocument.from_string(response.text)
68+
document = HTMLDocument.from_string(response.text, parser=parser)
6569

6670
else:
6771
raise ValueError("Only one of filename, file, or text can be specified.")

0 commit comments

Comments
 (0)