Skip to content

Commit de31df5

Browse files
authored
feat: Adds a helper function to convert ISD dicts to elements (#39)
* updated category name for ListItem * added brick to convert isd to elements * bump version * added isd_to_elements to documentation
1 parent 2871941 commit de31df5

File tree

6 files changed

+64
-6
lines changed

6 files changed

+64
-6
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.2.1-dev9
1+
## 0.2.1
22

3+
* Added brick to convert an ISD dictionary to a list of elements
34
* Update `PDFDocument` to use the `from_file` method
45
* Added staging brick for CSV format for ISD (Initial Structured Data) format.
56
* Added staging brick for separating text into attention window size chunks for `transformers`.

docs/source/bricks.rst

+21
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,27 @@ Examples:
338338
isd = convert_to_isd(elements)
339339
340340
341+
``isd_to_elements``
342+
-------------------
343+
344+
Converts outputs from initial structured data (ISD) format back to a list of ``Text`` elements.
345+
346+
Examples:
347+
348+
.. code:: python
349+
350+
from unstructured.staging.base import isd_to_elements
351+
352+
isd = [
353+
{"text": "My Title", "type": "Title"},
354+
{"text": "My Narrative", "type": "NarrativeText"}
355+
]
356+
357+
# elements will look like:
358+
# [Title(text="My Title"), NarrativeText(text="My Narrative")]
359+
elements = isd_to_elements(isd)
360+
361+
341362
``convert_to_isd_csv``
342363
----------------------
343364

test_unstructured/staging/test_base_staging.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import unstructured.staging.base as base
66

7-
from unstructured.documents.elements import Title, NarrativeText
7+
from unstructured.documents.elements import Title, NarrativeText, ListItem
88

99

1010
@pytest.fixture
@@ -23,6 +23,23 @@ def test_convert_to_isd():
2323
assert isd[1]["type"] == "NarrativeText"
2424

2525

26+
def test_isd_to_elements():
27+
isd = [
28+
{"text": "Blurb1", "type": "NarrativeText"},
29+
{"text": "Blurb2", "type": "Title"},
30+
{"text": "Blurb3", "type": "ListItem"},
31+
{"text": "Blurb4", "type": "BulletedText"},
32+
]
33+
34+
elements = base.isd_to_elements(isd)
35+
assert elements == [
36+
NarrativeText(text="Blurb1"),
37+
Title(text="Blurb2"),
38+
ListItem(text="Blurb3"),
39+
ListItem(text="Blurb4"),
40+
]
41+
42+
2643
def test_convert_to_isd_csv(output_csv_file):
2744

2845
elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.1-dev9" # pragma: no cover
1+
__version__ = "0.2.1" # pragma: no cover

unstructured/documents/elements.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,11 @@ class NarrativeText(Text):
4747

4848

4949
class ListItem(Text):
50-
"""BulletedText is a NarrativeText element that is part of a bulleted list."""
50+
"""ListItem is a NarrativeText element that is part of a list."""
5151

52-
category = "BulletedText"
52+
category = "ListItem"
53+
54+
pass
5355

5456

5557
class Title(Text):

unstructured/staging/base.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import csv
33
from typing import Dict, List
44

5-
from unstructured.documents.elements import Text
5+
from unstructured.documents.elements import Text, NarrativeText, Title, ListItem
66

77

88
def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
@@ -14,6 +14,23 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
1414
return isd
1515

1616

17+
def isd_to_elements(isd: List[Dict[str, str]]) -> List[Text]:
18+
"""Converts an Initial Structured Data (ISD) dictionary to a list of Text elements."""
19+
elements: List[Text] = list()
20+
21+
for item in isd:
22+
if item["type"] == "NarrativeText":
23+
elements.append(NarrativeText(text=item["text"]))
24+
elif item["type"] == "Title":
25+
elements.append(Title(text=item["text"]))
26+
# NOTE(robinson) - "BulletedText" is in there for backward compatibility. ListItem used
27+
# to be called BulletedText in an earlier version
28+
elif item["type"] in ["ListItem", "BulletedText"]:
29+
elements.append(ListItem(text=item["text"]))
30+
31+
return elements
32+
33+
1734
def convert_to_isd_csv(elements: List[Text]) -> str:
1835
"""
1936
Returns the representation of document elements as an Initial Structured Data (ISD)

0 commit comments

Comments
 (0)