Skip to content

Commit 9f08d35

Browse files
committed
update Markdown serialization
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 6ce1dba commit 9f08d35

15 files changed

+497
-11
lines changed

docling_core/transforms/serializer/base.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from typing import Any, Optional, Union
1010

1111
from pydantic import AnyUrl, BaseModel
12+
from typing_extensions import deprecated
1213

1314
from docling_core.types.doc.document import (
1415
DocItem,
@@ -258,6 +259,7 @@ def serialize_captions(
258259
"""Serialize the item's captions."""
259260
...
260261

262+
@deprecated("Use serialize_meta() instead.")
261263
@abstractmethod
262264
def serialize_annotations(
263265
self,
@@ -267,6 +269,15 @@ def serialize_annotations(
267269
"""Serialize the item's annotations."""
268270
...
269271

272+
@abstractmethod
def serialize_meta(self, item: NodeItem, **kwargs: Any) -> SerializationResult:
    """Serialize the item's meta.

    Abstract hook: concrete serializers provide the implementation.

    :param item: Node whose meta is to be serialized.
    :param kwargs: Additional keyword arguments accepted by the implementation.
    :returns: The serialization result for the item's meta.
    """
    ...
280+
270281
@abstractmethod
271282
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
272283
"""Get references to excluded items."""
@@ -287,6 +298,26 @@ def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
287298
...
288299

289300

301+
class BaseMetaSerializer(ABC):
    """Base class for meta serializers.

    Subclasses implement :meth:`serialize`. ``_humanize_text`` is a shared
    helper for turning underscore-separated identifiers into display text.
    """

    @abstractmethod
    def serialize(
        self,
        *,
        item: NodeItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serializes the meta of the passed item.

        :param item: Node whose meta is to be serialized.
        :param doc: Document passed alongside the item.
        :param kwargs: Additional keyword arguments accepted by the
            implementation.
        :returns: The serialization result for the item's meta.
        """
        ...

    def _humanize_text(self, text: str, title: bool = False) -> str:
        """Turn an underscore-separated identifier into readable text.

        Underscores act as word separators. Empty segments produced by
        leading, trailing or repeated underscores are dropped, so the result
        never starts with a space or contains doubled spaces caused by them
        (e.g. ``"_bar__chart"`` -> ``"Bar chart"``).

        :param text: Identifier to humanize, e.g. ``"bar_chart"``.
        :param title: If True, capitalize every word (``str.title``);
            otherwise capitalize only the first character
            (``str.capitalize``).
        :returns: The humanized text.
        """
        # Splitting and re-joining (instead of chained replace calls) keeps
        # the first character a letter, so capitalize() actually applies.
        words = " ".join(part for part in text.split("_") if part)
        return words.title() if title else words.capitalize()
318+
319+
320+
@deprecated("Use BaseMetaSerializer() instead.")
290321
class BaseAnnotationSerializer(ABC):
291322
"""Base class for annotation serializers."""
292323

docling_core/transforms/serializer/common.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#
55

66
"""Define base classes for serialization."""
7+
import logging
78
import re
89
import sys
910
from abc import abstractmethod
@@ -22,6 +23,7 @@
2223
BaseInlineSerializer,
2324
BaseKeyValueSerializer,
2425
BaseListSerializer,
26+
BaseMetaSerializer,
2527
BasePictureSerializer,
2628
BaseTableSerializer,
2729
BaseTextSerializer,
@@ -56,6 +58,9 @@
5658
_DEFAULT_LAYERS = {cl for cl in ContentLayer}
5759

5860

61+
_logger = logging.getLogger(__name__)
62+
63+
5964
class _PageBreakNode(NodeItem):
6065
"""Page break node."""
6166

@@ -215,6 +220,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
215220
list_serializer: BaseListSerializer
216221
inline_serializer: BaseInlineSerializer
217222

223+
meta_serializer: Optional[BaseMetaSerializer] = None
218224
annotation_serializer: BaseAnnotationSerializer
219225

220226
params: CommonParams = CommonParams()
@@ -435,6 +441,13 @@ def get_parts(
435441
)
436442
if part.text:
437443
parts.append(part)
444+
445+
part = self.serialize_meta(
446+
item=node,
447+
**kwargs,
448+
)
449+
if part.text:
450+
parts.append(part)
438451
return parts
439452

440453
@override
@@ -528,6 +541,30 @@ def serialize_captions(
528541
text_res = ""
529542
return create_ser_result(text=text_res, span_source=results)
530543

544+
@override
def serialize_meta(
    self,
    item: NodeItem,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize the item's meta.

    Delegates to ``self.meta_serializer`` when one is configured, forwarding
    the item, ``self.doc`` and any extra keyword arguments. When no meta
    serializer is set, a warning is logged and an empty result is returned.

    :param item: Node whose meta is to be serialized.
    :param kwargs: Extra keyword arguments forwarded to the meta serializer.
    :returns: The meta serializer's result, or an empty-text result.
    """
    serializer = self.meta_serializer
    if not serializer:
        # NOTE(review): this warning is emitted on every call made without a
        # configured meta serializer — confirm that is the intended verbosity.
        _logger.warning("No meta serializer found.")
        # Only DocItem instances are passed on as span source.
        span_source = item if isinstance(item, DocItem) else []
        return create_ser_result(text="", span_source=span_source)
    return serializer.serialize(item=item, doc=self.doc, **kwargs)
566+
567+
# TODO deprecate
531568
@override
532569
def serialize_annotations(
533570
self,

docling_core/transforms/serializer/markdown.py

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from pathlib import Path
1212
from typing import Any, Optional, Union
1313

14-
from pydantic import AnyUrl, BaseModel, PositiveInt
14+
from pydantic import AnyUrl, BaseModel, Field, PositiveInt
1515
from tabulate import tabulate
1616
from typing_extensions import override
1717

@@ -23,6 +23,7 @@
2323
BaseInlineSerializer,
2424
BaseKeyValueSerializer,
2525
BaseListSerializer,
26+
BaseMetaSerializer,
2627
BasePictureSerializer,
2728
BaseTableSerializer,
2829
BaseTextSerializer,
@@ -36,6 +37,7 @@
3637
)
3738
from docling_core.types.doc.base import ImageRefMode
3839
from docling_core.types.doc.document import (
40+
BaseMeta,
3941
CodeItem,
4042
ContentLayer,
4143
DescriptionAnnotation,
@@ -52,14 +54,18 @@
5254
KeyValueItem,
5355
ListGroup,
5456
ListItem,
57+
MoleculeMetaField,
5558
NodeItem,
5659
PictureClassificationData,
60+
PictureClassificationMetaField,
5761
PictureItem,
5862
PictureMoleculeData,
5963
PictureTabularChartData,
6064
RichTableCell,
6165
SectionHeaderItem,
66+
SummaryMetaField,
6267
TableItem,
68+
TabularChartMetaField,
6369
TextItem,
6470
TitleItem,
6571
)
@@ -102,8 +108,18 @@ class MarkdownParams(CommonParams):
102108
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
103109
escape_underscores: bool = True
104110
escape_html: bool = True
105-
include_annotations: bool = True
106-
mark_annotations: bool = False
111+
include_meta: bool = Field(default=True, description="Include item meta.")
112+
mark_meta: bool = Field(default=False, description="Mark meta sections.")
113+
include_annotations: bool = Field(
114+
default=True,
115+
description="Include item annotations.",
116+
deprecated="Use include_meta instead.",
117+
)
118+
mark_annotations: bool = Field(
119+
default=False,
120+
description="Mark annotation sections.",
121+
deprecated="Use mark_meta instead.",
122+
)
107123
orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO
108124
ensure_valid_list_item_marker: bool = True
109125

@@ -245,9 +261,67 @@ def serialize(
245261
return create_ser_result(text=text, span_source=res_parts)
246262

247263

264+
class MarkdownMetaSerializer(BaseModel, BaseMetaSerializer):
    """Markdown-specific meta serializer."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the item's meta.

        Each declared model field and each custom field of ``item.meta`` is
        rendered individually; the non-``None`` fragments are joined with
        blank lines. Nothing is rendered when ``include_meta`` is disabled or
        the item has no meta.

        :param item: Node whose meta is to be serialized.
        :param doc: Document the item belongs to (not used here).
        :param kwargs: Keyword arguments used to build ``MarkdownParams``.
        :returns: Serialization result holding the joined meta text.
        """
        params = MarkdownParams(**kwargs)
        fragments: list[str] = []
        if params.include_meta and item.meta:
            meta = item.meta
            # Declared fields first, then any custom (extra) fields.
            field_names = [
                *meta.__class__.model_fields,
                *meta.get_custom_part(),
            ]
            for field_name in field_names:
                fragment = self._serialize_meta_field(
                    meta, field_name, params.mark_meta
                )
                if fragment is not None:
                    fragments.append(fragment)
        return create_ser_result(
            text="\n\n".join(fragments),
            span_source=item if isinstance(item, DocItem) else [],
        )

    def _serialize_meta_field(
        self, meta: BaseMeta, name: str, mark_meta: bool
    ) -> Optional[str]:
        """Render a single meta field, or return ``None`` to skip it.

        :param meta: Meta object holding the field.
        :param name: Attribute name of the field on ``meta``.
        :param mark_meta: If True, prefix the text with the humanized,
            title-cased field name in square brackets.
        :returns: The rendered text, or ``None`` when the field is unset,
            is a tabular chart, or stringifies to an empty value.
        """
        value = getattr(meta, name)
        if value is None:
            return None

        # NOTE: currently only considering field type, not field name
        if isinstance(value, SummaryMetaField):
            body = value.text
        elif isinstance(value, PictureClassificationMetaField):
            body = self._humanize_text(value.get_main_prediction().class_name)
        elif isinstance(value, MoleculeMetaField):
            body = value.smi
        elif isinstance(value, TabularChartMetaField):
            # suppressing tabular chart serialization
            return None
        else:
            # Generic fallback: falsy values stringify to "" and are skipped.
            body = str(value or "")
            if not body:
                return None

        if not mark_meta:
            return body
        return f"[{self._humanize_text(name, title=True)}] {body}"
319+
320+
248321
class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
249322
"""Markdown-specific annotation serializer."""
250323

324+
@override
251325
def serialize(
252326
self,
253327
*,
@@ -629,6 +703,7 @@ class MarkdownDocSerializer(DocSerializer):
629703
list_serializer: BaseListSerializer = MarkdownListSerializer()
630704
inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
631705

706+
meta_serializer: BaseMetaSerializer = MarkdownMetaSerializer()
632707
annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
633708

634709
params: MarkdownParams = MarkdownParams()

docling_core/types/doc/document.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4685,6 +4685,10 @@ def save_as_markdown(
46854685
included_content_layers: Optional[set[ContentLayer]] = None,
46864686
page_break_placeholder: Optional[str] = None,
46874687
include_annotations: bool = True,
4688+
*,
4689+
include_meta: bool = True,
4690+
mark_meta: bool = False,
4691+
use_legacy_annotations: bool = False,
46884692
):
46894693
"""Save to markdown."""
46904694
if isinstance(filename, str):
@@ -4714,6 +4718,9 @@ def save_as_markdown(
47144718
included_content_layers=included_content_layers,
47154719
page_break_placeholder=page_break_placeholder,
47164720
include_annotations=include_annotations,
4721+
use_legacy_annotations=use_legacy_annotations,
4722+
include_meta=include_meta,
4723+
mark_meta=mark_meta,
47174724
)
47184725

47194726
with open(filename, "w", encoding="utf-8") as fw:
@@ -4738,6 +4745,10 @@ def export_to_markdown( # noqa: C901
47384745
page_break_placeholder: Optional[str] = None, # e.g. "<!-- page break -->",
47394746
include_annotations: bool = True,
47404747
mark_annotations: bool = False,
4748+
*,
4749+
include_meta: bool = True,
4750+
mark_meta: bool = False,
4751+
use_legacy_annotations: bool = False,
47414752
) -> str:
47424753
r"""Serialize to Markdown.
47434754
@@ -4783,6 +4794,15 @@ def export_to_markdown( # noqa: C901
47834794
:param mark_annotations: bool: Whether to mark annotations in the export; only
47844795
relevant if include_annotations is True. (Default value = False).
47854796
:type mark_annotations: bool = False
4797+
:param use_legacy_annotations: bool: Whether to use legacy annotation serialization.
4798+
(Default value = False).
4799+
:type use_legacy_annotations: bool = False
4800+
:param include_meta: bool: Whether to include meta in the export.
4801+
(Default value = True).
4802+
:type include_meta: bool = True
4803+
:param mark_meta: bool: Whether to mark meta in the export; only
4804+
relevant if include_meta is True. (Default value = False).
4805+
:type mark_meta: bool = False
47864806
:returns: The exported Markdown representation.
47874807
:rtype: str
47884808
"""
@@ -4813,7 +4833,9 @@ def export_to_markdown( # noqa: C901
48134833
indent=indent,
48144834
wrap_width=text_width if text_width > 0 else None,
48154835
page_break_placeholder=page_break_placeholder,
4816-
include_annotations=include_annotations,
4836+
include_meta=include_meta and not use_legacy_annotations,
4837+
mark_meta=mark_meta,
4838+
include_annotations=include_annotations and use_legacy_annotations,
48174839
mark_annotations=mark_annotations,
48184840
),
48194841
)

0 commit comments

Comments
 (0)