Skip to content

Commit 6be07a5

Browse files
authored
feat: update auto.partition() function to recognize Unstructured json (#337)
1 parent 1580c1b commit 6be07a5

File tree

5 files changed

+162
-0
lines changed

5 files changed

+162
-0
lines changed

Diff for: CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
### Enhancements
44

5+
* `auto.partition()` can now load Unstructured ISD json documents.
56
* Simplify partitioning functions.
67
* Improve logging for ingest CLI.
78

Diff for: test_unstructured/partition/test_json.py

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import os
2+
import pathlib
3+
import tempfile
4+
5+
import pytest
6+
7+
from unstructured.partition.auto import partition
8+
from unstructured.partition.json import partition_json
9+
from unstructured.staging.base import elements_to_json
10+
11+
# Directory that contains this test module; example documents are located
# relative to it by the tests below.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# Example documents that are partitioned, serialized to JSON, and read back
# by the parametrized round-trip tests.
test_files = [
    "fake-text.txt",
    "layout-parser-paper-fast.pdf",
    "fake-html.html",
    "fake.doc",
    "fake-email.eml",
    "fake-power-point.ppt",
    "fake.docx",
    "fake-power-point.pptx",
]
23+
24+
25+
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename(filename: str):
    """Round-trip an example document through JSON and reload it by file path.

    The document is partitioned with ``partition``, written out with
    ``elements_to_json``, and read back with ``partition_json(filename=...)``.
    The reloaded elements must equal the originals, in order.
    """
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, filename + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        test_elements = partition_json(filename=test_path)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    # The length check comes first so that zip() cannot hide a truncated result.
    assert len(elements) == len(test_elements)
    for original, reloaded in zip(elements, test_elements):
        assert original == reloaded
41+
42+
43+
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file(filename: str):
    """Round-trip an example document through JSON and reload it from a file object.

    The document is partitioned with ``partition``, written out with
    ``elements_to_json``, and read back with ``partition_json(file=...)`` using
    a text-mode handle. The reloaded elements must equal the originals, in order.
    """
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, filename + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path) as f:
            test_elements = partition_json(file=f)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    # The length check comes first so that zip() cannot hide a truncated result.
    assert len(elements) == len(test_elements)
    for original, reloaded in zip(elements, test_elements):
        assert original == reloaded
60+
61+
62+
@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text(filename: str):
    """Round-trip an example document through JSON and reload it from a string.

    The document is partitioned with ``partition``, written out with
    ``elements_to_json``, and the file contents are passed to
    ``partition_json(text=...)``. The reloaded elements must equal the
    originals, in order.
    """
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    elements = partition(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, filename + ".json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path) as f:
            text = f.read()
        test_elements = partition_json(text=text)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    # The length check comes first so that zip() cannot hide a truncated result.
    assert len(elements) == len(test_elements)
    for original, reloaded in zip(elements, test_elements):
        assert original == reloaded
80+
81+
82+
def test_partition_json_raises_with_none_specified():
    """partition_json must raise ValueError when called with no input source."""
    with pytest.raises(ValueError):
        partition_json()
85+
86+
87+
def test_partition_json_raises_with_too_many_specified():
    """partition_json must raise ValueError for every combination of two or
    more of ``filename``, ``file`` and ``text``."""
    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    elements = partition(filename=path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, "fake-text.txt.json")
        elements_to_json(elements, filename=test_path, indent=2)
        with open(test_path) as f:
            text = f.read()

            # Run the checks while the handle is still open and the file still
            # exists, so every argument is individually usable and the
            # ValueError can only come from the "only one source" check.
            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, text=text)

            with pytest.raises(ValueError):
                partition_json(file=f, text=text)

            with pytest.raises(ValueError):
                partition_json(filename=test_path, file=f, text=text)

Diff for: unstructured/file_utils/filetype.py

+5
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ class FileType(Enum):
8383
EML = 40
8484
RTF = 41
8585
TXT = 42
86+
JSON = 43
8687

8788
# Markup Types
8889
HTML = 50
@@ -116,6 +117,7 @@ def __lt__(self, other):
116117
".xls": FileType.XLS,
117118
".ppt": FileType.PPT,
118119
".rtf": FileType.RTF,
120+
".json": FileType.JSON,
119121
}
120122

121123

@@ -154,6 +156,9 @@ def detect_filetype(
154156
if mime_type == "application/pdf":
155157
return FileType.PDF
156158

159+
elif mime_type == "application/json":
160+
return FileType.JSON
161+
157162
elif mime_type in DOCX_MIME_TYPES:
158163
return FileType.DOCX
159164

Diff for: unstructured/partition/auto.py

+3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from unstructured.partition.email import partition_email
77
from unstructured.partition.html import partition_html
88
from unstructured.partition.image import partition_image
9+
from unstructured.partition.json import partition_json
910
from unstructured.partition.md import partition_md
1011
from unstructured.partition.pdf import partition_pdf
1112
from unstructured.partition.ppt import partition_ppt
@@ -67,6 +68,8 @@ def partition(
6768
return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
6869
elif filetype == FileType.PPTX:
6970
return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
71+
elif filetype == FileType.JSON:
72+
return partition_json(filename=filename, file=file)
7073
else:
7174
msg = "Invalid file" if not filename else f"Invalid file {filename}"
7275
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")

Diff for: unstructured/partition/json.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import json
2+
import re
3+
from typing import IO, List, Optional
4+
5+
from unstructured.documents.elements import Element
6+
from unstructured.staging.base import dict_to_elements
7+
8+
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{"
9+
10+
11+
def partition_json(
12+
filename: Optional[str] = None,
13+
file: Optional[IO] = None,
14+
text: Optional[str] = None,
15+
) -> List[Element]:
16+
"""Partitions an .json document into its constituent elements."""
17+
if not any([filename, file, text]):
18+
raise ValueError("One of filename, file, or text must be specified.")
19+
20+
if filename is not None and not file and not text:
21+
with open(filename, encoding="utf8") as f:
22+
file_text = f.read()
23+
24+
elif file is not None and not filename and not text:
25+
file_text = file.read()
26+
27+
elif text is not None and not filename and not file:
28+
file_text = str(text)
29+
30+
else:
31+
raise ValueError("Only one of filename, file, or text can be specified.")
32+
33+
# NOTE(Nathan): we expect file_text to be a list of dicts (optimization)
34+
if not re.match(LIST_OF_DICTS_PATTERN, file_text):
35+
raise ValueError("Json schema does not match the Unstructured schema")
36+
37+
try:
38+
dict = json.loads(file_text)
39+
elements = dict_to_elements(dict)
40+
except json.JSONDecodeError:
41+
raise ValueError("Not a valid json")
42+
43+
# NOTE(Nathan): in future PR, try extracting items that look like text
44+
# if file_text is a valid json but not an unstructured json
45+
46+
return elements

0 commit comments

Comments
 (0)