
Commit 03a3ed8

Add parsing HTML to unstructured elements (#3732)
> This is a POC change; not everything works correctly yet and the code quality could be improved significantly.

This ticket adds parsing HTML to unstructured Elements and back.

How does it work? HTML has a tree structure, while unstructured Elements form a flat list. The HTML structure is traversed in DFS order, creating Elements and appending them to the list, so the reading order from the HTML is preserved. To be able to compose the tree again, all elements have IDs, and `metadata.parent_id` is leveraged.

How is the HTML preserved if there are 'layout' elements without text, or deeply nested HTML that is just text from the point of view of an unstructured Element? Each element is parsed back to HTML using the `metadata.text_as_html` field. For layout elements only the HTML tag is stored; for long text elements everything required to recreate the HTML is there. You can see examples in the unit tests or the .json file I attached.

Pros of the solution:

- Nothing had to be changed in the element types

Cons:

- There are elements without text, which may be confusing (they could be replaced by some special type)

The core transformation logic can be found in 2 functions in `unstructured/documents/transformations.py`.

Known bugs (they are minor):

- sometimes the HTML tag is changed incorrectly
- `metadata.category_depth` and `metadata.page_number` are not set
- a page break is not added between pages

How to test. Generate HTML:

```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")

    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse it to unstructured elements and back to HTML:

```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import (
    ontology_to_unstructured_elements,
    parse_html_to_ontology,
    unstructured_elements_to_ontology,
)
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")
    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())
    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached an example doc before and after running these scripts: [outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
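For intuition only, here is a minimal sketch (my illustration, not code from this change) of how a DFS-flattened element list can be recomposed into a tree; it assumes nothing beyond the `id` and `metadata.parent_id` fields described above:

```python3
from collections import defaultdict


def build_tree(elements):
    """Group a DFS-ordered flat element list by parent_id so the tree can be rebuilt."""
    children = defaultdict(list)
    roots = []
    for element in elements:
        parent_id = element.metadata.parent_id
        if parent_id is None:
            # No parent recorded: a root element (e.g. the Document node).
            roots.append(element)
        else:
            # DFS traversal order guarantees each child list keeps reading order.
            children[parent_id].append(element)
    return roots, children
```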
1 parent 6bceac1 · commit 03a3ed8

18 files changed: +2,572 −12 lines

Diff for: CHANGELOG.md (+3 −1)

```diff
@@ -1,4 +1,4 @@
-## 0.16.1-dev7
+## 0.16.1-dev8
 
 ### Enhancements
 
@@ -8,6 +8,8 @@
 
 ### Features
 
+* **Parsing HTML to Unstructured Elements and back**
+
 ### Fixes
 
 * **Remove unsupported chipper model**
```

Diff for: requirements/base.in (+1)

```diff
@@ -21,3 +21,4 @@ wrapt
 tqdm
 psutil
 python-oxmsg
+html5lib
```
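`html5lib` enters the base dependencies here, presumably in support of the new HTML parsing path. As a quick illustration of the library itself (my snippet, not from the diff): it parses HTML with browser-grade error recovery into an ElementTree:

```python3
import html5lib

# Parse HTML the way a browser would; namespaceHTMLElements=False keeps
# plain tag names ("p") instead of namespaced ones.
tree = html5lib.parse(
    '<p class="NarrativeText">Paragraph text</p>',
    namespaceHTMLElements=False,
)
print(tree.find(".//p").get("class"))  # -> NarrativeText
```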

Diff for: requirements/base.txt (+6 −3)

```diff
@@ -44,6 +44,8 @@ filetype==1.2.0
     # via -r ./base.in
 h11==0.14.0
     # via httpcore
+html5lib==1.1
+    # via -r ./base.in
 httpcore==1.0.6
     # via httpx
 httpx==0.27.2
@@ -92,7 +94,7 @@ pypdf==5.0.1
     # via unstructured-client
 python-dateutil==2.9.0.post0
     # via unstructured-client
-python-iso639==2024.4.27
+python-iso639==2024.10.22
     # via -r ./base.in
 python-magic==0.4.27
     # via -r ./base.in
@@ -111,6 +113,7 @@ requests-toolbelt==1.0.0
     # via unstructured-client
 six==1.16.0
     # via
+        # html5lib
         # langdetect
         # python-dateutil
         # unstructured-client
@@ -120,8 +123,6 @@ sniffio==1.3.1
         # httpx
 soupsieve==2.6
     # via beautifulsoup4
-tabulate==0.9.0
-    # via -r ./base.in
 tqdm==4.66.5
     # via
         # -r ./base.in
@@ -147,5 +148,7 @@ urllib3==1.26.20
         # -c ././deps/constraints.txt
         # requests
         # unstructured-client
+webencodings==0.5.1
+    # via html5lib
 wrapt==1.16.0
     # via -r ./base.in
```

Diff for: requirements/extra-pdf-image.txt (+2 −2)

```diff
@@ -44,7 +44,7 @@ flatbuffers==24.3.25
     # via onnxruntime
 fonttools==4.54.1
     # via matplotlib
-fsspec==2024.9.0
+fsspec==2024.10.0
     # via
         # huggingface-hub
         # torch
@@ -67,7 +67,7 @@ grpcio==1.67.0
         # grpcio-status
 grpcio-status==1.62.3
     # via google-api-core
-huggingface-hub==0.26.0
+huggingface-hub==0.26.1
     # via
         # timm
         # tokenizers
```

Diff for: requirements/huggingface.txt (+2 −2)

```diff
@@ -21,11 +21,11 @@ filelock==3.16.1
         # huggingface-hub
         # torch
         # transformers
-fsspec==2024.9.0
+fsspec==2024.10.0
     # via
         # huggingface-hub
         # torch
-huggingface-hub==0.26.0
+huggingface-hub==0.26.1
     # via
         # tokenizers
         # transformers
```

Diff for: requirements/test.txt (+1 −1)

```diff
@@ -236,7 +236,7 @@ wrapt==1.16.0
         # vcrpy
 xmljson==0.2.1
     # via label-studio-sdk
-yarl==1.15.5
+yarl==1.16.0
     # via vcrpy
 
 # The following packages are considered to be unsafe in a requirements file:
```

Diff for: test_unstructured/documents/html_files/example.html (+55, new file)

```html
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
  <div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
    <header class="Header" id="45b3d0053468484ba1c7b53998115412">
      <h1 class="Title" id="c95473e8a3704fc2b418697f9fddb27b">
        Header
      </h1>
      <time class="CalendarDate" id="379cbfdc16d44bd6a59e6cfabe6438d5">
        Date: October 30, 2023
      </time>
    </header>
    <form class="Form" id="637c2f6935fb4353a5f73025ce04619d">
      <label class="FormField" for="company-name" id="50027cccbe1948c9853ce0de37b635c2">
        From field name
      </label>
      <input class="FormFieldValue" id="0032242af75c4b37984ea7fea9aac74c" value="Example value"/>
    </form>
    <section class="Section" id="592422373ed741b68a077e2003f8ed81">
      <table class="Table" id="dc3792d4422e444f90876b56d0cfb20d">
        <thead class="TableHeader" id="50a5548a87e84024af590b3d2830d140">
          <tr class="TableRow" id="5e473d7742474412be72dc4e2c45bd4a">
            <th class="TableCellHeader" id="01800309aa42411c98ae30f85b23f399">
              Description
            </th>
            <th class="TableCellHeader" id="c2765b63d08946a2851955e79e301de4">
              Row header
            </th>
          </tr>
        </thead>
        <tbody class="TableBody" id="e0a9a8ffdd7148ad8b4a274b073d340a">
          <tr class="TableRow" id="77e829974632455191330b0b8545d1e3">
            <td class="TableCell" id="7fee12d4c5554b7da778d6f8fdec8a57">
              Value description
            </td>
            <td class="TableCell" id="5a7a33b0c57b4eb881a35bce9f87c831">
              <span class="Currency" id="87220f9d62c3482e92e7de72a26869cd">
                50 $
              </span>
              <span class="Measurement" id="0095b9efb90a4cca991e73547c7165f1">
                (1.32 %)
              </span>
            </td>
          </tr>
        </tbody>
      </table>
    </section>
    <section class="Section" id="1032242af75c4b37984ea7fea9aac74c">
      <h2 class="Subtitle" id="2a4e2c4a689f4f9a8c180b6b521e45c3">
        2. Subtitle
      </h2>
      <p class="NarrativeText" id="5591f7a4df01447e82515ce45f686fbe">
        Paragraph text
      </p>
    </section>
  </div>
</body>
```
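As a usage sketch (mine, not part of the diff), a fixture like this can be fed through the new parser; `partition_html(..., html_parser_version="v2")` is the entry point exercised in the tests below:

```python3
from pathlib import Path

from unstructured.partition.html import partition_html

# Partition the ontology-annotated fixture with the new v2 HTML parser.
html_text = Path("test_unstructured/documents/html_files/example.html").read_text()
elements = partition_html(text=html_text, html_parser_version="v2")

for element in elements:
    # metadata.parent_id ties the flat list back to the original HTML tree.
    print(element.category, element.metadata.parent_id)
```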
Diff for: test_unstructured/documents/… (+187, new file)
```python3
from pathlib import Path

import pytest

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.ontology import Column, Document, Page, Paragraph
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
from unstructured.partition.html import partition_html
from unstructured.partition.html.transformations import (
    ontology_to_unstructured_elements,
    parse_html_to_ontology,
)
from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_from_json


def test_page_number_is_passed_correctly():
    ontology = Document(
        children=[
            Page(
                children=[Paragraph(text="Paragraph1")],
                additional_attributes={"data-page-number": "1"},
            ),
            Page(
                children=[Paragraph(text="Paragraph2")],
                additional_attributes={"data-page-number": "2"},
            ),
        ]
    )
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    page1, p1, page2, p2 = unstructured_elements
    assert p1.metadata.page_number == 1
    assert p2.metadata.page_number == 2


def test_invalid_page_number_is_not_passed():
    ontology = Document(
        children=[
            Page(
                children=[Paragraph(text="Paragraph1")],
                additional_attributes={"data-page-number": "invalid"},
            )
        ]
    )
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    page1, p1 = unstructured_elements
    assert not p1.metadata.page_number


def test_depth_is_passed_correctly():
    ontology = Document(
        children=[
            Page(children=[Paragraph(text="Paragraph1")]),
            Page(
                children=[
                    Column(children=[Paragraph(text="Paragraph2")]),
                    Column(children=[Paragraph(text="Paragraph3")]),
                ]
            ),
        ]
    )

    unstructured_elements = ontology_to_unstructured_elements(ontology)
    page1, p1, page2, c1, p2, c2, p3 = unstructured_elements

    assert page1.metadata.category_depth == 0
    assert page2.metadata.category_depth == 0

    assert p1.metadata.category_depth == 1

    assert c2.metadata.category_depth == 1
    assert c1.metadata.category_depth == 1

    assert p2.metadata.category_depth == 2
    assert p3.metadata.category_depth == 2


def test_chunking_is_applied_on_elements():
    ontology = Document(
        children=[
            Page(children=[Paragraph(text="Paragraph1")]),
            Page(
                children=[
                    Column(children=[Paragraph(text="Paragraph2")]),
                    Column(children=[Paragraph(text="Paragraph3")]),
                ]
            ),
        ]
    )

    unstructured_elements = ontology_to_unstructured_elements(ontology)

    chunked_basic = chunk_elements(unstructured_elements)
    assert str(chunked_basic[0]) == "Paragraph1\n\nParagraph2\n\nParagraph3"
    chunked_by_title = chunk_by_title(unstructured_elements)
    assert str(chunked_by_title[0]) == "Paragraph1\n\nParagraph2\n\nParagraph3"


def test_embeddings_are_applied_on_elements(mocker):
    ontology = Document(
        children=[
            Page(children=[Paragraph(text="Paragraph1")]),
            Page(
                children=[
                    Column(children=[Paragraph(text="Paragraph2")]),
                    Column(children=[Paragraph(text="Paragraph3")]),
                ]
            ),
        ]
    )

    unstructured_elements = ontology_to_unstructured_elements(ontology)
    # Mocked client with the desired behavior for embed_documents
    mock_client = mocker.MagicMock()
    mock_client.embed_documents.return_value = [1, 2, 3, 4, 5, 6, 7]

    # Mock get_client to return our mock_client
    mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client)

    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key"))
    elements = encoder.embed_documents(
        elements=unstructured_elements,
    )

    assert len(elements) == 7

    page1, p1, page2, c1, p2, c2, p3 = elements

    assert p1.embeddings == 2
    assert p2.embeddings == 5
    assert p3.embeddings == 7


@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
    ],
)
def test_ingest(html_file_path, json_file_path):
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path

    html_code = html_file_path.read_text()
    expected_json_elements = elements_from_json(str(json_file_path))

    ontology = parse_html_to_ontology(html_code)
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    assert unstructured_elements == expected_json_elements


@pytest.mark.parametrize("json_file_path", ["unstructured_json_output/example.json"])
def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
    json_file_path = Path(__file__).parent / json_file_path

    expected_json_elements = elements_from_json(str(json_file_path))

    json_elements_text = json_file_path.read_text()
    elements = partition_json(text=json_elements_text)

    assert len(elements) == len(expected_json_elements)
    for i in range(len(elements)):
        assert elements[i] == expected_json_elements[i]
        # The partitioning output comes from a PDF file, so only the stem is compared
        # as the suffix is different: .pdf != .json
        assert Path(elements[i].metadata.filename).stem == json_file_path.stem


@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
    ],
)
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path

    expected_json_elements = elements_from_json(str(json_file_path))
    html_code = html_file_path.read_text()

    predicted_elements = partition_html(text=html_code, html_parser_version="v2")
    assert len(expected_json_elements) == len(predicted_elements)

    for i in range(len(expected_json_elements)):
        assert expected_json_elements[i] == predicted_elements[i]
```
