Skip to content

Commit c3af03d

Browse files
feat: expose converters deckerd -> html and back (#3233)
This PR exposes functions in the evaluation module for easy conversion between tables in Deckerd and HTML formats, which are useful in evaluation experiments.
1 parent f23d180 commit c3af03d

File tree

3 files changed

+122
-2
lines changed

3 files changed

+122
-2
lines changed

Diff for: CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
### Features
99

10+
* **Expose conversion functions for tables.** Adds public functions to convert tables from HTML to the Deckerd format and back.
11+
1012
### Fixes
1113

1214
* **Fix an error publishing docker images.** Update user in docker-smoke-test to reflect changes made by the amd64 image pull from the "unstructured" "wolfi-base" image.

Diff for: test_unstructured/metrics/test_text_extraction.py

+80
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44

55
from unstructured.metrics import text_extraction
66
from unstructured.metrics.table.table_extraction import (
7+
deckerd_table_to_html,
78
extract_cells_from_table_as_cells,
89
extract_cells_from_text_as_html,
10+
html_table_to_deckerd,
911
)
1012
from unstructured.partition.auto import partition
1113

@@ -556,3 +558,81 @@ def test_cells_extraction_from_prediction_when_missing_prediction():
556558
example_element = {"type": "Table", "metadata": {"text_as_html": "", "table_as_cells": []}}
557559
assert extract_cells_from_text_as_html(example_element) is None
558560
assert extract_cells_from_table_as_cells(example_element) is None
561+
562+
563+
def _trim_html(html: str) -> str:
564+
html_lines = [line.strip() for line in html.split("\n") if line]
565+
return "".join(html_lines)
566+
567+
568+
# Round-trip test: every HTML table below is converted to Deckerd cells with
# html_table_to_deckerd and back with deckerd_table_to_html; the result must
# equal the input once _trim_html has removed line breaks and per-line padding.
@pytest.mark.parametrize(
569+
"html_to_test",
570+
[
571+
# Case 1: one header cell and one body cell, no spans.
"""
572+
<table>
573+
<thead>
574+
<tr>
575+
<th>Month A.</th>
576+
</tr>
577+
</thead>
578+
<tbody>
579+
<tr>
580+
<td>22</td>
581+
</tr>
582+
</tbody>
583+
</table>
584+
""",
# Case 2: three-column header row followed by two body rows, no spans.
585+
"""
586+
<table>
587+
<thead>
588+
<tr>
589+
<th>Month A.</th>
590+
<th>Month B.</th>
591+
<th>Month C.</th>
592+
</tr>
593+
</thead>
594+
<tbody>
595+
<tr>
596+
<td>11</td>
597+
<td>12</td>
598+
<td>13</td>
599+
</tr>
600+
<tr>
601+
<td>21</td>
602+
<td>22</td>
603+
<td>23</td>
604+
</tr>
605+
</tbody>
606+
</table>
607+
""",
# Case 3: two header rows and two body rows mixing rowspan and colspan;
# the first header cell (rowspan="2") covers both header rows.
608+
"""
609+
<table>
610+
<thead>
611+
<tr>
612+
<th rowspan="2">h12col1</th>
613+
<th colspan="2">h1col23</th>
614+
<th>h1col4</th>
615+
</tr>
616+
<tr>
617+
<th>h2col2</th>
618+
<th colspan="2">h2col34</th>
619+
</tr>
620+
</thead>
621+
<tbody>
622+
<tr>
623+
<td>r3col1</td>
624+
<td>r3col2</td>
625+
<td colspan="2" rowspan="2">r34col34</td>
626+
</tr>
627+
<tr>
628+
<td colspan="2">r4col12</td>
629+
</tr>
630+
</tbody>
631+
</table>
632+
""",
633+
],
634+
)
635+
def test_deckerd_html_converter(html_to_test):
# HTML -> Deckerd cells -> HTML
636+
deckerd_table = html_table_to_deckerd(html_to_test)
637+
html_table = deckerd_table_to_html(deckerd_table)
# The regenerated HTML is compared against the flattened form of the input.
638+
assert _trim_html(html_to_test) == html_table

Diff for: unstructured/metrics/table/table_extraction.py

+40-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import Any, Dict, List
44

55
from bs4 import BeautifulSoup
6+
from unstructured_inference.models.tables import cells_to_html
67

78
EMPTY_CELL = {
89
"row_index": "",
@@ -37,7 +38,7 @@ def _move_cells_for_spanned_cells(cells: List[Dict[str, Any]]):
3738
return sorted_cells
3839

3940

40-
def _html_table_to_deckerd(content: str) -> List[Dict[str, Any]]:
41+
def html_table_to_deckerd(content: str) -> List[Dict[str, Any]]:
4142
"""Convert html format to Deckerd table structure.
4243
4344
Args:
@@ -66,6 +67,36 @@ def _html_table_to_deckerd(content: str) -> List[Dict[str, Any]]:
6667
return _move_cells_for_spanned_cells(table_data)
6768

6869

70+
def deckerd_table_to_html(cells: List[Dict[str, Any]]) -> str:
    """Convert Deckerd table structure to html format.

    Row 0 is always treated as a header row. When a cell that starts in row 0
    spans several rows (``h`` > 1), every row it covers is treated as a header
    row as well.

    Args:
        cells: List of dictionaries where each dictionary represents a cell in the table.
            Each cell is read through the keys ``x``, ``y``, ``w``, ``h`` and ``content``.

    Returns:
        A string with the html content of the table.
    """
    # The header depth is a number of ROWS, so it must come from the vertical span
    # ("h") of the cells starting in row 0, not from their horizontal span ("w").
    # `default=0` means "no header rows" when no cell starts in row 0, instead of
    # max() raising ValueError on an empty sequence.
    header_length = max((cell["h"] for cell in cells if cell["y"] == 0), default=0)
    header_rows = set(range(header_length))

    transformer_cells = [
        {
            # rows / columns covered by the cell, derived from its origin and span
            "row_nums": list(range(cell["y"], cell["y"] + cell["h"])),
            "column_nums": list(range(cell["x"], cell["x"] + cell["w"])),
            "w": cell["w"],
            "h": cell["h"],
            "cell text": cell["content"],
            "column header": cell["y"] in header_rows,
        }
        for cell in cells
    ]

    # reuse the existing function to convert to HTML
    return cells_to_html(transformer_cells)
98+
99+
69100
def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
70101
"""Convert html format to table structure. As a middle step it converts
71102
html to the Deckerd format as it's more convenient to work with.
@@ -76,7 +107,7 @@ def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
76107
Returns:
77108
A list of dictionaries where each dictionary represents a cell in the table.
78109
"""
79-
deckerd_cells = _html_table_to_deckerd(content)
110+
deckerd_cells = html_table_to_deckerd(content)
80111
return _convert_table_from_deckerd(deckerd_cells)
81112

82113

@@ -160,11 +191,18 @@ def extract_and_convert_tables_from_prediction(
160191
)
161192

162193
extract_cells_fn = source_type_to_extraction_strategies[source_type]
194+
fallback_extract_cells_fn = (
195+
extract_cells_from_table_as_cells
196+
if source_type == "cells"
197+
else extract_cells_from_text_as_html
198+
)
163199

164200
predicted_table_data = []
165201
for element in file_elements:
166202
if element.get("type") == "Table":
167203
extracted_cells = extract_cells_fn(element)
204+
if not extracted_cells:
205+
extracted_cells = fallback_extract_cells_fn(element)
168206
if extracted_cells:
169207
sorted_cells = _sort_table_cells(extracted_cells)
170208
predicted_table_data.append(sorted_cells)

0 commit comments

Comments
 (0)