Skip to content

Commit e42a1dd

Browse files
committed
Extend default export labels, add convenience methods
Signed-off-by: Christoph Auer <[email protected]>
1 parent ba79b4a commit e42a1dd

File tree

1 file changed

+41
-19
lines changed

1 file changed

+41
-19
lines changed

docling_core/types/experimental/document.py

Lines changed: 41 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,22 @@
2929
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
3030
CURRENT_VERSION: Final = "1.0.0"
3131

32+
DEFAULT_EXPORT_LABELS = {
33+
DocItemLabel.TITLE,
34+
DocItemLabel.DOCUMENT_INDEX,
35+
DocItemLabel.SECTION_HEADER,
36+
DocItemLabel.PARAGRAPH,
37+
DocItemLabel.CAPTION,
38+
DocItemLabel.TABLE,
39+
DocItemLabel.PICTURE,
40+
DocItemLabel.FORMULA,
41+
DocItemLabel.CHECKBOX_UNSELECTED,
42+
DocItemLabel.CHECKBOX_SELECTED,
43+
DocItemLabel.TEXT,
44+
DocItemLabel.LIST_ITEM,
45+
DocItemLabel.CODE,
46+
}
47+
3248

3349
class BasePictureData(BaseModel): # TBD
3450
"""BasePictureData."""
@@ -930,20 +946,16 @@ def print_element_tree(self):
930946
elif isinstance(item, DocItem):
931947
print(" " * level, f"{ix}: {item.label.value}")
932948

949+
def export_to_dict(self) -> Dict:
950+
"""export_to_dict."""
951+
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
952+
933953
def export_to_markdown( # noqa: C901
934954
self,
935955
delim: str = "\n\n",
936956
from_element: int = 0,
937957
to_element: Optional[int] = None,
938-
labels: list[DocItemLabel] = [
939-
DocItemLabel.TITLE,
940-
DocItemLabel.SECTION_HEADER,
941-
DocItemLabel.PARAGRAPH,
942-
DocItemLabel.CAPTION,
943-
DocItemLabel.TABLE,
944-
DocItemLabel.PICTURE,
945-
DocItemLabel.TEXT,
946-
],
958+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
947959
strict_text: bool = False,
948960
image_placeholder: str = "<!-- image -->",
949961
) -> str:
@@ -964,7 +976,7 @@ def export_to_markdown( # noqa: C901
964976
:param delim: str: (Default value = "\n\n")
965977
:param from_element: int: (Default value = 0)
966978
:param to_element: Optional[int]: (Default value = None)
967-
:param labels: list[DocItemLabel]
979+
:param labels: set[DocItemLabel]
968980
:param "subtitle-level-1":
969981
:param "paragraph":
970982
:param "caption":
@@ -1089,19 +1101,29 @@ def export_to_markdown( # noqa: C901
10891101
result = delim.join(md_texts)
10901102
return result
10911103

1104+
def export_to_text( # noqa: C901
1105+
self,
1106+
delim: str = "\n\n",
1107+
from_element: int = 0,
1108+
to_element: Optional[int] = None,
1109+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1110+
) -> str:
1111+
"""export_to_text."""
1112+
return self.export_to_markdown(
1113+
delim,
1114+
from_element,
1115+
to_element,
1116+
labels,
1117+
strict_text=True,
1118+
image_placeholder="",
1119+
)
1120+
10921121
def export_to_document_tokens(
10931122
self,
10941123
delim: str = "\n\n",
10951124
from_element: int = 0,
10961125
to_element: Optional[int] = None,
1097-
labels: list[DocItemLabel] = [
1098-
DocItemLabel.TITLE,
1099-
DocItemLabel.SECTION_HEADER,
1100-
DocItemLabel.PARAGRAPH,
1101-
DocItemLabel.CAPTION,
1102-
DocItemLabel.TABLE,
1103-
DocItemLabel.TEXT,
1104-
],
1126+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
11051127
xsize: int = 100,
11061128
ysize: int = 100,
11071129
add_location: bool = True,
@@ -1120,7 +1142,7 @@ def export_to_document_tokens(
11201142
:param delim: str: (Default value = "\n\n")
11211143
:param from_element: int: (Default value = 0)
11221144
:param to_element: Optional[int]: (Default value = None)
1123-
:param labels: list[DocItemLabel]
1145+
:param labels: set[DocItemLabel]
11241146
:param xsize: int: (Default value = 100)
11251147
:param ysize: int: (Default value = 100)
11261148
:param add_location: bool: (Default value = True)

0 commit comments

Comments
 (0)