Skip to content

Commit 74e6b84

Browse files
authored
feat: add metadata tracking to document elements (#225)
* add metadata field to elements
* metadata tracking for pdf/image
* metadata for html
* update expected outputs
* metadata for the rest of the document types
* take out file metadata for now
* add url to tables
* added metadata to test_auto
* bump version
* added coordinates to __init__
* fix coordinates in tests
1 parent b8dce61 commit 74e6b84

File tree

17 files changed

+152
-1677
lines changed

17 files changed

+152
-1677
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
## 0.4.9-dev2
1+
## 0.4.9
22

33
* Added ingest modules and s3 connector
44
* Default to `url=None` for `partition_pdf` and `partition_image`
55
* Add ability to skip English specific check by setting the `UNSTRUCTURED_LANGUAGE` env var to `""`.
6+
* Document `Element` objects now track metadata
67

78
## 0.4.8
89

test_unstructured/partition/test_auto.py

+7
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def test_auto_partition_email_from_filename():
2525
elements = partition(filename=filename)
2626
assert len(elements) > 0
2727
assert elements == EXPECTED_EMAIL_OUTPUT
28+
assert elements[0].metadata.filename == filename
2829

2930

3031
def test_auto_partition_email_from_file():
@@ -83,6 +84,7 @@ def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_ele
8384

8485
elements = partition(filename=filename)
8586
assert elements == expected_docx_elements
87+
assert elements[0].metadata.filename == filename
8688

8789

8890
def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
@@ -98,6 +100,7 @@ def test_auto_partition_html_from_filename():
98100
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
99101
elements = partition(filename=filename)
100102
assert len(elements) > 0
103+
assert elements[0].metadata.filename == filename
101104

102105

103106
def test_auto_partition_html_from_file():
@@ -129,6 +132,7 @@ def test_auto_partition_text_from_filename():
129132
elements = partition(filename=filename)
130133
assert len(elements) > 0
131134
assert elements == EXPECTED_TEXT_OUTPUT
135+
assert elements[0].metadata.filename == filename
132136

133137

134138
def test_auto_partition_text_from_file():
@@ -149,6 +153,8 @@ def test_auto_partition_pdf_from_filename():
149153
assert isinstance(elements[1], NarrativeText)
150154
assert elements[1].text.startswith("Zejiang Shen 1")
151155

156+
assert elements[0].metadata.filename == filename
157+
152158

153159
def test_auto_partition_pdf_from_file():
154160
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
@@ -206,6 +212,7 @@ def test_auto_partition_pptx_from_filename():
206212
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
207213
elements = partition(filename=filename)
208214
assert elements == EXPECTED_PPTX_OUTPUT
215+
assert elements[0].metadata.filename == filename
209216

210217

211218
def test_auto_with_page_breaks():

test_unstructured/partition/test_common.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def test_normalize_layout_element_dict():
1818
"text": "Some lovely text",
1919
}
2020
element = common.normalize_layout_element(layout_element)
21-
assert element == Title(text="Some lovely text", coordinates=[[2, 2], [3, 4], [5, 6], [7, 8]])
21+
assert element == Title(text="Some lovely text", coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]])
2222

2323

2424
def test_normalize_layout_element_dict_caption():
@@ -29,7 +29,7 @@ def test_normalize_layout_element_dict_caption():
2929
}
3030
element = common.normalize_layout_element(layout_element)
3131
assert element == FigureCaption(
32-
text="Some lovely text", coordinates=[[2, 2], [3, 4], [5, 6], [7, 8]]
32+
text="Some lovely text", coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]]
3333
)
3434

3535

@@ -40,7 +40,7 @@ def test_normalize_layout_element_dict_misc():
4040
"text": "Some lovely text",
4141
}
4242
element = common.normalize_layout_element(layout_element)
43-
assert element == Text(text="Some lovely text", coordinates=[[2, 2], [3, 4], [5, 6], [7, 8]])
43+
assert element == Text(text="Some lovely text", coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]])
4444

4545

4646
def test_normalize_layout_element_layout_element():
@@ -51,7 +51,7 @@ def test_normalize_layout_element_layout_element():
5151
)
5252
element = common.normalize_layout_element(layout_element)
5353
assert element == NarrativeText(
54-
text="Some lovely text", coordinates=[[2, 2], [3, 4], [5, 6], [7, 8]]
54+
text="Some lovely text", coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]]
5555
)
5656

5757

test_unstructured/staging/test_base_staging.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,9 @@ def test_convert_to_isd_csv(output_csv_file):
4848
isd_csv_string = base.convert_to_isd_csv(elements)
4949
csv_file.write(isd_csv_string)
5050

51-
fieldnames = ["type", "text"]
5251
with open(output_csv_file, "r") as csv_file:
5352
csv_rows = csv.DictReader(csv_file)
54-
assert all(set(row.keys()) == set(fieldnames) for row in csv_rows)
53+
assert all(set(row.keys()) == set(base.TABLE_FIELDNAMES) for row in csv_rows)
5554

5655

5756
def test_convert_to_dataframe():

0 commit comments

Comments
 (0)