
Commit 03a3ed8

Add parsing HTML to unstructured elements (#3732)
> This is a POC change; not everything works correctly yet and the code quality could be improved significantly.

This ticket adds parsing HTML to unstructured Elements and back.

How does it work? HTML has a tree structure, while unstructured Elements form a flat list. The HTML structure is traversed in DFS order, creating Elements and appending them to the list, so the reading order from the HTML is preserved. To be able to compose the tree again, all elements have IDs, and `metadata.parent_id` is leveraged.

How is the HTML preserved if there are 'layout' elements without text, or deeply nested HTML that is just text from the point of view of an unstructured Element? Each element is parsed back to HTML using the `metadata.text_as_html` field. For layout elements only the HTML tag is stored; for long text elements everything required to recreate the HTML is there. You can see examples in the unit tests or the .json file I attached.

Pros of the solution:

- Nothing had to be changed in the element types

Cons:

- There are elements without text, which may be confusing (they could be replaced by some special type)

The core transformation logic can be found in 2 functions in `unstructured/documents/transformations.py`.

Known bugs (they are minor):

- sometimes the HTML tag is changed incorrectly
- `metadata.category_depth` and `metadata.page_number` are not set
- a page break is not added between pages

How to test. Generate HTML:

```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")

    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse it to unstructured elements and back to HTML:

```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import (
    ontology_to_unstructured_elements,
    parse_html_to_ontology,
    unstructured_elements_to_ontology,
)
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")
    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())
    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached an example doc before and after running these scripts: [outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
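For intuition only, here is a minimal sketch (my illustration, not code from this change) of how a DFS-flattened element list can be recomposed into a tree; it assumes nothing beyond the `id` and `metadata.parent_id` fields described above:

```python3
from collections import defaultdict


def build_tree(elements):
    """Group a DFS-ordered flat element list by parent_id so the tree can be rebuilt."""
    children = defaultdict(list)
    roots = []
    for element in elements:
        parent_id = element.metadata.parent_id
        if parent_id is None:
            # No parent recorded: a root element (e.g. the Document node).
            roots.append(element)
        else:
            # DFS traversal order guarantees each child list keeps reading order.
            children[parent_id].append(element)
    return roots, children
```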
1 parent 6bceac1 · commit 03a3ed8

18 files changed: +2,572 −12 lines

Diff for: CHANGELOG.md (+3 −1)

```diff
@@ -1,4 +1,4 @@
-## 0.16.1-dev7
+## 0.16.1-dev8
 
 ### Enhancements
 
@@ -8,6 +8,8 @@
 
 ### Features
 
+* **Parsing HTML to Unstructured Elements and back**
+
 ### Fixes
 
 * **Remove unsupported chipper model**
```

Diff for: requirements/base.in (+1)

```diff
@@ -21,3 +21,4 @@ wrapt
 tqdm
 psutil
 python-oxmsg
+html5lib
```
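`html5lib` enters the base dependencies here, presumably in support of the new HTML parsing path. As a quick illustration of the library itself (my snippet, not from the diff): it parses HTML with browser-grade error recovery into an ElementTree:

```python3
import html5lib

# Parse HTML the way a browser would; namespaceHTMLElements=False keeps
# plain tag names ("p") instead of namespaced ones.
tree = html5lib.parse(
    '<p class="NarrativeText">Paragraph text</p>',
    namespaceHTMLElements=False,
)
print(tree.find(".//p").get("class"))  # -> NarrativeText
```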

Diff for: requirements/base.txt (+6 −3)

```diff
@@ -44,6 +44,8 @@ filetype==1.2.0
     # via -r ./base.in
 h11==0.14.0
     # via httpcore
+html5lib==1.1
+    # via -r ./base.in
 httpcore==1.0.6
     # via httpx
 httpx==0.27.2
@@ -92,7 +94,7 @@ pypdf==5.0.1
     # via unstructured-client
 python-dateutil==2.9.0.post0
     # via unstructured-client
-python-iso639==2024.4.27
+python-iso639==2024.10.22
     # via -r ./base.in
 python-magic==0.4.27
     # via -r ./base.in
@@ -111,6 +113,7 @@ requests-toolbelt==1.0.0
     # via unstructured-client
 six==1.16.0
     # via
+        # html5lib
         # langdetect
         # python-dateutil
         # unstructured-client
@@ -120,8 +123,6 @@ sniffio==1.3.1
         # httpx
 soupsieve==2.6
     # via beautifulsoup4
-tabulate==0.9.0
-    # via -r ./base.in
 tqdm==4.66.5
     # via
         # -r ./base.in
@@ -147,5 +148,7 @@ urllib3==1.26.20
         # -c ././deps/constraints.txt
         # requests
         # unstructured-client
+webencodings==0.5.1
+    # via html5lib
 wrapt==1.16.0
     # via -r ./base.in
```

Diff for: requirements/extra-pdf-image.txt (+2 −2)

```diff
@@ -44,7 +44,7 @@ flatbuffers==24.3.25
     # via onnxruntime
 fonttools==4.54.1
     # via matplotlib
-fsspec==2024.9.0
+fsspec==2024.10.0
     # via
         # huggingface-hub
         # torch
@@ -67,7 +67,7 @@ grpcio==1.67.0
         # grpcio-status
 grpcio-status==1.62.3
     # via google-api-core
-huggingface-hub==0.26.0
+huggingface-hub==0.26.1
     # via
         # timm
         # tokenizers
```

Diff for: requirements/huggingface.txt (+2 −2)

```diff
@@ -21,11 +21,11 @@ filelock==3.16.1
         # huggingface-hub
         # torch
         # transformers
-fsspec==2024.9.0
+fsspec==2024.10.0
     # via
         # huggingface-hub
         # torch
-huggingface-hub==0.26.0
+huggingface-hub==0.26.1
     # via
         # tokenizers
         # transformers
```

Diff for: requirements/test.txt (+1 −1)

```diff
@@ -236,7 +236,7 @@ wrapt==1.16.0
         # vcrpy
 xmljson==0.2.1
     # via label-studio-sdk
-yarl==1.15.5
+yarl==1.16.0
     # via vcrpy
 
 # The following packages are considered to be unsafe in a requirements file:
```

Diff for: test_unstructured/documents/html_files/example.html (+55, new file)

```html
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
  <div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
    <header class="Header" id="45b3d0053468484ba1c7b53998115412">
      <h1 class="Title" id="c95473e8a3704fc2b418697f9fddb27b">
        Header
      </h1>
      <time class="CalendarDate" id="379cbfdc16d44bd6a59e6cfabe6438d5">
        Date: October 30, 2023
      </time>
    </header>
    <form class="Form" id="637c2f6935fb4353a5f73025ce04619d">
      <label class="FormField" for="company-name" id="50027cccbe1948c9853ce0de37b635c2">
        From field name
      </label>
      <input class="FormFieldValue" id="0032242af75c4b37984ea7fea9aac74c" value="Example value"/>
    </form>
    <section class="Section" id="592422373ed741b68a077e2003f8ed81">
      <table class="Table" id="dc3792d4422e444f90876b56d0cfb20d">
        <thead class="TableHeader" id="50a5548a87e84024af590b3d2830d140">
          <tr class="TableRow" id="5e473d7742474412be72dc4e2c45bd4a">
            <th class="TableCellHeader" id="01800309aa42411c98ae30f85b23f399">
              Description
            </th>
            <th class="TableCellHeader" id="c2765b63d08946a2851955e79e301de4">
              Row header
            </th>
          </tr>
        </thead>
        <tbody class="TableBody" id="e0a9a8ffdd7148ad8b4a274b073d340a">
          <tr class="TableRow" id="77e829974632455191330b0b8545d1e3">
            <td class="TableCell" id="7fee12d4c5554b7da778d6f8fdec8a57">
              Value description
            </td>
            <td class="TableCell" id="5a7a33b0c57b4eb881a35bce9f87c831">
              <span class="Currency" id="87220f9d62c3482e92e7de72a26869cd">
                50 $
              </span>
              <span class="Measurement" id="0095b9efb90a4cca991e73547c7165f1">
                (1.32 %)
              </span>
            </td>
          </tr>
        </tbody>
      </table>
    </section>
    <section class="Section" id="1032242af75c4b37984ea7fea9aac74c">
      <h2 class="Subtitle" id="2a4e2c4a689f4f9a8c180b6b521e45c3">
        2. Subtitle
      </h2>
      <p class="NarrativeText" id="5591f7a4df01447e82515ce45f686fbe">
        Paragraph text
      </p>
    </section>
  </div>
</body>
```
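As a usage sketch (mine, not part of the diff), a fixture like this can be fed through the new parser; `partition_html(..., html_parser_version="v2")` is the entry point exercised in the tests below:

```python3
from pathlib import Path

from unstructured.partition.html import partition_html

# Partition the ontology-annotated fixture with the new v2 HTML parser.
html_text = Path("test_unstructured/documents/html_files/example.html").read_text()
elements = partition_html(text=html_text, html_parser_version="v2")

for element in elements:
    # metadata.parent_id ties the flat list back to the original HTML tree.
    print(element.category, element.metadata.parent_id)
```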
Diff for: test_unstructured/documents/… (+187, new file)
```python3
from pathlib import Path

import pytest

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.ontology import Column, Document, Page, Paragraph
from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
from unstructured.partition.html import partition_html
from unstructured.partition.html.transformations import (
    ontology_to_unstructured_elements,
    parse_html_to_ontology,
)
from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_from_json


def test_page_number_is_passed_correctly():
    ontology = Document(
        children=[
            Page(
                children=[Paragraph(text="Paragraph1")],
                additional_attributes={"data-page-number": "1"},
            ),
            Page(
                children=[Paragraph(text="Paragraph2")],
                additional_attributes={"data-page-number": "2"},
            ),
        ]
    )
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    page1, p1, page2, p2 = unstructured_elements
    assert p1.metadata.page_number == 1
    assert p2.metadata.page_number == 2


def test_invalid_page_number_is_not_passed():
    ontology = Document(
        children=[
            Page(
                children=[Paragraph(text="Paragraph1")],
                additional_attributes={"data-page-number": "invalid"},
            )
        ]
    )
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    page1, p1 = unstructured_elements
    assert not p1.metadata.page_number


def test_depth_is_passed_correctly():
    ontology = Document(
        children=[
            Page(children=[Paragraph(text="Paragraph1")]),
            Page(
                children=[
                    Column(children=[Paragraph(text="Paragraph2")]),
                    Column(children=[Paragraph(text="Paragraph3")]),
                ]
            ),
        ]
    )

    unstructured_elements = ontology_to_unstructured_elements(ontology)
    page1, p1, page2, c1, p2, c2, p3 = unstructured_elements

    assert page1.metadata.category_depth == 0
    assert page2.metadata.category_depth == 0

    assert p1.metadata.category_depth == 1

    assert c2.metadata.category_depth == 1
    assert c1.metadata.category_depth == 1

    assert p2.metadata.category_depth == 2
    assert p3.metadata.category_depth == 2


def test_chunking_is_applied_on_elements():
    ontology = Document(
        children=[
            Page(children=[Paragraph(text="Paragraph1")]),
            Page(
                children=[
                    Column(children=[Paragraph(text="Paragraph2")]),
                    Column(children=[Paragraph(text="Paragraph3")]),
                ]
            ),
        ]
    )

    unstructured_elements = ontology_to_unstructured_elements(ontology)

    chunked_basic = chunk_elements(unstructured_elements)
    assert str(chunked_basic[0]) == "Paragraph1\n\nParagraph2\n\nParagraph3"
    chunked_by_title = chunk_by_title(unstructured_elements)
    assert str(chunked_by_title[0]) == "Paragraph1\n\nParagraph2\n\nParagraph3"


def test_embeddings_are_applied_on_elements(mocker):
    ontology = Document(
        children=[
            Page(children=[Paragraph(text="Paragraph1")]),
            Page(
                children=[
                    Column(children=[Paragraph(text="Paragraph2")]),
                    Column(children=[Paragraph(text="Paragraph3")]),
                ]
            ),
        ]
    )

    unstructured_elements = ontology_to_unstructured_elements(ontology)
    # Mocked client with the desired behavior for embed_documents
    mock_client = mocker.MagicMock()
    mock_client.embed_documents.return_value = [1, 2, 3, 4, 5, 6, 7]

    # Mock get_client to return our mock_client
    mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client)

    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key"))
    elements = encoder.embed_documents(
        elements=unstructured_elements,
    )

    assert len(elements) == 7

    page1, p1, page2, c1, p2, c2, p3 = elements

    assert p1.embeddings == 2
    assert p2.embeddings == 5
    assert p3.embeddings == 7


@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
    ],
)
def test_ingest(html_file_path, json_file_path):
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path

    html_code = html_file_path.read_text()
    expected_json_elements = elements_from_json(str(json_file_path))

    ontology = parse_html_to_ontology(html_code)
    unstructured_elements = ontology_to_unstructured_elements(ontology)
    assert unstructured_elements == expected_json_elements


@pytest.mark.parametrize("json_file_path", ["unstructured_json_output/example.json"])
def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
    json_file_path = Path(__file__).parent / json_file_path

    expected_json_elements = elements_from_json(str(json_file_path))

    json_elements_text = json_file_path.read_text()
    elements = partition_json(text=json_elements_text)

    assert len(elements) == len(expected_json_elements)
    for i in range(len(elements)):
        assert elements[i] == expected_json_elements[i]
        # The partitioning output comes from a PDF file, so only the stem is compared
        # as the suffix is different: .pdf != .json
        assert Path(elements[i].metadata.filename).stem == json_file_path.stem


@pytest.mark.parametrize(
    ("html_file_path", "json_file_path"),
    [
        ("html_files/example.html", "unstructured_json_output/example.json"),
    ],
)
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
    html_file_path = Path(__file__).parent / html_file_path
    json_file_path = Path(__file__).parent / json_file_path

    expected_json_elements = elements_from_json(str(json_file_path))
    html_code = html_file_path.read_text()

    predicted_elements = partition_html(text=html_code, html_parser_version="v2")
    assert len(expected_json_elements) == len(predicted_elements)

    for i in range(len(expected_json_elements)):
        assert expected_json_elements[i] == predicted_elements[i]
```
