Skip to content

Commit 05974a0

Browse files
work ongoing
Signed-off-by: Peter Staar <[email protected]>
1 parent fa28430 commit 05974a0

File tree

2 files changed

+64
-11
lines changed

2 files changed

+64
-11
lines changed

docling_core/experimental/serializer/outline.py

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
MarkdownDocSerializer,
2727
MarkdownMetaSerializer,
2828
MarkdownParams,
29+
MarkdownTextSerializer,
2930
)
3031
from docling_core.types.doc import (
3132
BaseMeta,
@@ -35,23 +36,39 @@
3536
InlineGroup,
3637
KeyValueItem,
3738
ListGroup,
39+
MetaFieldName,
3840
NodeItem,
3941
PictureItem,
4042
SummaryMetaField,
43+
SectionHeaderItem,
4144
TableItem,
4245
TextItem,
46+
TitleItem,
4347
)
4448

45-
def _default_outline_node(item: NodeItem):
46-
return f"[{item.self_ref}] [{item.__class__.__name__}:{item.label.value}]"
49+
def _default_outline_node(item: NodeItem) -> str:
50+
# return f"[{item.self_ref}] [{item.__class__.__name__}:{item.label.value}]"
51+
return f"[reference={item.self_ref}]"
4752

48-
class MarkdownSummaryMode(str, Enum):
53+
def _default_summary(summary:str) -> str:
54+
return f"(summary={summary})"
55+
56+
class OutlineMode(str, Enum):
4957
"""Display mode for document summary output."""
5058

5159
OUTLINE = "outline"
5260
TABLE_OF_CONTENTS = "table_of_contents"
5361

5462

63+
class OutlineParams(MarkdownParams):
64+
"""Markdown-specific serialization parameters for outline.
65+
66+
Inherits MarkdownParams to retain Markdown behaviors (escaping, links, etc.).
67+
"""
68+
69+
mode: OutlineMode = OutlineMode.OUTLINE
70+
71+
5572
class _OutlineTextSerializer(BaseTextSerializer):
5673
"""_Outline class for text item serializers."""
5774

@@ -64,10 +81,45 @@ def serialize(
6481
**kwargs: Any,
6582
) -> SerializationResult:
6683
"""Serializes the passed item."""
84+
prepend = ""
85+
if isinstance(item, TitleItem) or isinstance(item, SectionHeaderItem):
86+
# MarkdownDocSerializer requires a doc instance; pass through current doc
87+
_md_serializer = MarkdownDocSerializer(doc=doc)
88+
_serializer = MarkdownTextSerializer()
89+
90+
res = _serializer.serialize(item=item, doc_serializer=_md_serializer, doc=doc)
91+
prepend = res.text
92+
93+
summary = ""
94+
if item.meta and \
95+
(field_val := getattr(item.meta, MetaFieldName.SUMMARY)) is not None and \
96+
isinstance(field_val, SummaryMetaField):
97+
summary = _default_summary(field_val.text)
98+
99+
reference = _default_outline_node(item)
100+
101+
text = " ".join([prepend, reference, summary])
102+
67103
return create_ser_result(
68-
text=_default_outline_node(item)
104+
text=text
69105
)
70106

107+
"""
108+
def _serialize_meta_field(
109+
self, meta: BaseMeta, name: str, mark_meta: bool
110+
) -> Optional[str]:
111+
if (field_val := getattr(meta, name)) is not None and isinstance(
112+
field_val, SummaryMetaField
113+
):
114+
txt = field_val.text
115+
return (
116+
f"[{self._humanize_text(name, title=True)}] {txt}"
117+
if mark_meta
118+
else txt
119+
)
120+
else:
121+
return None
122+
"""
71123

72124
class _OutlineTableSerializer(BaseTableSerializer):
73125
"""_Outline class for table item serializers."""
@@ -247,4 +299,4 @@ class OutlineDocSerializer(MarkdownDocSerializer):
247299

248300
meta_serializer: BaseMetaSerializer = _OutlineMetaSerializer()
249301

250-
302+
params: OutlineParams = OutlineParams()

test/test_outline_serializer.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,30 @@
11
from pathlib import Path
22

3-
from docling_core.experimental.serializer.outline import OutlineDocSerializer
4-
from docling_core.transforms.serializer.markdown import MarkdownParams
3+
from docling_core.experimental.serializer.outline import (
4+
OutlineDocSerializer,
5+
OutlineParams,
6+
)
57
from docling_core.types.doc import DoclingDocument
68

79

810
def test_outline_serializer_basic():
911
src = Path("test/data/doc/2408.09869_p1.json")
1012
doc = DoclingDocument.load_from_json(filename=src)
1113

12-
print("MARKDOWN: \n\n")
14+
print("\n\nMARKDOWN: \n\n")
1315
print(doc.export_to_markdown())
1416

1517
# Include non-meta content alongside metadata (include_non_meta=True) when building the outline
16-
params = MarkdownParams(include_non_meta=False)
18+
params = OutlineParams(include_non_meta=True)
1719
ser = OutlineDocSerializer(doc=doc, params=params)
1820

1921
res = ser.serialize()
2022
actual = res.text
2123

22-
print("SUMMARY: \n\n")
24+
print("\n\nSUMMARY: \n\n")
2325
print(actual)
2426

2527
assert isinstance(actual, str)
2628
# Expect summaries from title and section header to appear
2729
assert "This is a title." in actual
2830
assert "This is a section header." in actual
29-

0 commit comments

Comments
 (0)