Skip to content

Commit 80951c5

Browse files
committed
feat(experimental): add new DocTags serializer
Signed-off-by: Panos Vagenas <[email protected]>
1 parent c80b583 commit 80951c5

File tree

8 files changed

+330
-9
lines changed

8 files changed

+330
-9
lines changed
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
"""Define classes for DocTags serialization."""
2+
3+
from typing import Any, Final, Optional
4+
from xml.dom.minidom import parseString
5+
6+
from pydantic import BaseModel
7+
from typing_extensions import override
8+
9+
from docling_core.transforms.serializer.base import (
10+
BaseDocSerializer,
11+
BaseMetaSerializer,
12+
BasePictureSerializer,
13+
SerializationResult,
14+
)
15+
from docling_core.transforms.serializer.common import create_ser_result
16+
from docling_core.transforms.serializer.doctags import (
17+
DocTagsDocSerializer,
18+
DocTagsParams,
19+
DocTagsPictureSerializer,
20+
_get_delim,
21+
_wrap,
22+
)
23+
from docling_core.types.doc import (
24+
BaseMeta,
25+
DescriptionMetaField,
26+
DocItem,
27+
DoclingDocument,
28+
MetaFieldName,
29+
MoleculeMetaField,
30+
NodeItem,
31+
PictureClassificationMetaField,
32+
PictureItem,
33+
SummaryMetaField,
34+
TableData,
35+
TabularChartMetaField,
36+
)
37+
from docling_core.types.doc.labels import DocItemLabel
38+
from docling_core.types.doc.tokens import DocumentToken
39+
40+
# Version string of the DocTags format produced by this module; it is written
# into the <version> element that IDocTagsDocSerializer.serialize_doc emits
# right after the opening document tag.
DOCTAGS_VERSION: Final = "1.0.0"
41+
42+
43+
class IDocTagsParams(DocTagsParams):
    """Serialization parameters for the DocTags serializers of this module."""

    # Defaults to True here, i.e. tags are requested in self-closing form.
    do_self_closing: bool = True

    # Indentation unit handed to the XML pretty-printer; a falsy value (None
    # or an empty string) turns pretty-printing off.
    pretty_indentation: Optional[str] = " " * 2
48+
49+
50+
class IDocTagsMetaSerializer(BaseModel, BaseMetaSerializer):
    """Meta serializer that renders item metadata as a ``<meta>`` element.

    Every meta field that yields output becomes one child tag; the children
    are enclosed in ``<meta>...</meta>`` only when at least one is present.
    """

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the meta fields of ``item``.

        Args:
            item: Node whose ``meta`` object is serialized.
            **kwargs: Values used to build the :class:`IDocTagsParams`
                (``allowed_meta_names`` and ``blocked_meta_names`` are read).

        Returns:
            A result whose text is the ``<meta>`` element, or an empty string
            when the item has no meta or no field produced any output.
        """
        params = IDocTagsParams(**kwargs)

        texts: list[str] = []
        if item.meta:
            # Declared model fields first, followed by the keys reported by
            # get_custom_part().
            keys = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for key in keys:
                if (
                    params.allowed_meta_names is not None
                    and key not in params.allowed_meta_names
                ):
                    continue
                if key in params.blocked_meta_names:
                    continue
                if tmp := self._serialize_meta_field(item.meta, key):
                    texts.append(tmp)

        if texts:
            texts = ["<meta>", *texts, "</meta>"]
        return create_ser_result(
            text="".join(texts),
            span_source=item if isinstance(item, DocItem) else [],
        )

    def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
        """Render one meta field as a tag.

        Args:
            meta: Meta object to read the field from.
            name: Name of the field (standard or custom).

        Returns:
            The serialized tag, or ``None`` when the field is unset, is
            deliberately suppressed, or is a standard field holding a value of
            an unexpected type.
        """
        field_val = getattr(meta, name)
        if field_val is None:
            return None

        if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
            return f"<summary>{field_val.text}</summary>"
        if name == MetaFieldName.DESCRIPTION and isinstance(
            field_val, DescriptionMetaField
        ):
            return f"<description>{field_val.text}</description>"
        if name == MetaFieldName.CLASSIFICATION and isinstance(
            field_val, PictureClassificationMetaField
        ):
            class_name = self._humanize_text(
                field_val.get_main_prediction().class_name
            )
            return f"<classification>{class_name}</classification>"
        if name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
            return f"<molecule>{field_val.smi}</molecule>"
        if name == MetaFieldName.TABULAR_CHART and isinstance(
            field_val, TabularChartMetaField
        ):
            # suppressing tabular chart serialization
            return None
        if name not in {v.value for v in MetaFieldName}:
            # custom field: wrap its string form in a tag named after the key
            return _wrap(text=str(field_val or ""), wrap_tag=name)

        # A standard field whose value is not of the expected type: skip it.
        # Previously this path fell through to ``return txt`` with ``txt``
        # never assigned, raising UnboundLocalError.
        return None
123+
124+
125+
class IDocTagsPictureSerializer(DocTagsPictureSerializer):
    """Picture item serializer for the DocTags format of this module.

    The output is one wrapping element that contains, in this order, the
    item's meta block, its location tokens, the tabular chart data encoded as
    OTSL, and its captions -- each only when present and enabled.
    """

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the passed picture item.

        Args:
            item: Picture to serialize.
            doc_serializer: Document serializer used for excluded refs, meta
                and captions.
            doc: Document the picture belongs to.
            **kwargs: Values used to build the serialization parameters; also
                forwarded to the ``doc_serializer`` calls.

        Returns:
            A result whose text is the wrapped picture element, or an empty
            string when nothing was produced.
        """
        # Build the module's own parameter class (as the meta serializer
        # does) so that defaults such as ``do_self_closing`` are the same
        # across this serializer family when a value is absent from kwargs.
        params = IDocTagsParams(**kwargs)
        res_parts: list[SerializationResult] = []

        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
            if item.meta:
                meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
                if meta_res.text:
                    res_parts.append(meta_res)

            body = ""
            if params.add_location:
                body += item.get_location_tokens(
                    doc=doc,
                    xsize=params.xsize,
                    ysize=params.ysize,
                    self_closing=params.do_self_closing,
                )

            # handle tabular chart data
            chart_data: Optional[TableData] = None
            if item.meta and item.meta.tabular_chart:
                chart_data = item.meta.tabular_chart.chart_data
            if chart_data and chart_data.table_cells:
                # export through a throwaway document to obtain the OTSL form
                temp_doc = DoclingDocument(name="temp")
                temp_table = temp_doc.add_table(data=chart_data)
                body += temp_table.export_to_otsl(
                    temp_doc,
                    add_cell_location=False,
                    self_closing=params.do_self_closing,
                )
            res_parts.append(create_ser_result(text=body, span_source=item))

        if params.add_caption:
            cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
            if cap_res.text:
                res_parts.append(cap_res)

        text_res = "".join([r.text for r in res_parts])
        if text_res:
            # NOTE(review): the wrapper is always the picture token. The
            # former ``is_chart`` flag was never set to True, so the chart
            # label was unreachable -- confirm whether charts should be
            # wrapped with DocItemLabel.CHART instead.
            token = DocumentToken.create_token_name_from_doc_item_label(
                label=DocItemLabel.PICTURE,
            )
            text_res = _wrap(text=text_res, wrap_tag=token)
        return create_ser_result(text=text_res, span_source=res_parts)
185+
186+
187+
class IDocTagsDocSerializer(DocTagsDocSerializer):
    """DocTags document serializer producing a versioned, XML-style document."""

    picture_serializer: BasePictureSerializer = IDocTagsPictureSerializer()
    meta_serializer: BaseMetaSerializer = IDocTagsMetaSerializer()
    params: IDocTagsParams = IDocTagsParams()

    @override
    def _meta_is_wrapped(self) -> bool:
        """Report that meta is emitted inside the item's own element.

        Returning True makes the generic ``serialize`` skip its stand-alone
        meta part; the item serializers call ``serialize_meta`` themselves.
        """
        return True

    @override
    def serialize_doc(
        self,
        *,
        parts: list[SerializationResult],
        **kwargs: Any,
    ) -> SerializationResult:
        """Assemble the serialized parts into the final document.

        Args:
            parts: Serialized items; parts with empty text are dropped.
            **kwargs: Unused here.

        Returns:
            A result whose text is the document element, starting with a
            ``<version>`` child, pretty-printed when
            ``params.pretty_indentation`` is set.
        """
        delim = _get_delim(params=self.params)
        text_res = delim.join([p.text for p in parts if p.text])

        if self.params.add_page_break:
            # The page-break token has no content, so with self-closing
            # enabled it must be written as <.../>; an unclosed tag would
            # make the XML parsing below fail.
            closing = "/" if self.params.do_self_closing else ""
            page_sep = f"<{DocumentToken.PAGE_BREAK.value}{closing}>"
            for full_match, _, _ in self._get_page_breaks(text=text_res):
                text_res = text_res.replace(full_match, page_sep)

        wrap_tag = DocumentToken.DOCUMENT.value
        text_res = (
            f"<{wrap_tag}><version>{DOCTAGS_VERSION}</version>"
            f"{text_res}{delim}</{wrap_tag}>"
        )

        if self.params.pretty_indentation and (
            my_root := parseString(text_res).documentElement
        ):
            text_res = my_root.toprettyxml(indent=self.params.pretty_indentation)
            # drop the whitespace-only lines left by the pretty-printer
            text_res = "\n".join(
                [line for line in text_res.split("\n") if line.strip()]
            )
        return create_ser_result(text=text_res, span_source=parts)

docling_core/transforms/serializer/common.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,9 @@ def _serialize_body(self, **kwargs) -> SerializationResult:
318318
res = self.serialize_doc(parts=subparts, **kwargs)
319319
return res
320320

321+
def _meta_is_wrapped(self) -> bool:
322+
return False
323+
321324
@override
322325
def serialize(
323326
self,
@@ -339,7 +342,7 @@ def serialize(
339342
my_item = item or self.doc.body
340343

341344
if my_item == self.doc.body:
342-
if my_item.meta:
345+
if my_item.meta and not self._meta_is_wrapped():
343346
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
344347
if meta_part.text:
345348
parts.append(meta_part)
@@ -358,7 +361,7 @@ def serialize(
358361

359362
my_visited.add(my_item.self_ref)
360363

361-
if my_item.meta:
364+
if my_item.meta and not self._meta_is_wrapped():
362365
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
363366
if meta_part.text:
364367
parts.append(meta_part)
@@ -605,7 +608,6 @@ def serialize_meta(
605608
text="", span_source=item if isinstance(item, DocItem) else []
606609
)
607610
else:
608-
_logger.warning("No meta serializer found.")
609611
return create_ser_result(
610612
text="", span_source=item if isinstance(item, DocItem) else []
611613
)

docling_core/transforms/serializer/doctags.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ class Mode(str, Enum):
7777

7878
mode: Mode = Mode.HUMAN_FRIENDLY
7979

80+
do_self_closing: bool = False
81+
8082

8183
def _get_delim(params: DocTagsParams) -> str:
8284
if params.mode == DocTagsParams.Mode.HUMAN_FRIENDLY:
@@ -110,11 +112,17 @@ def serialize(
110112
)
111113
parts: list[str] = []
112114

115+
if item.meta:
116+
meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
117+
if meta_res.text:
118+
parts.append(meta_res.text)
119+
113120
if params.add_location:
114121
location = item.get_location_tokens(
115122
doc=doc,
116123
xsize=params.xsize,
117124
ysize=params.ysize,
125+
self_closing=params.do_self_closing,
118126
)
119127
if location:
120128
parts.append(location)
@@ -184,6 +192,7 @@ def serialize(
184192
doc=doc,
185193
xsize=params.xsize,
186194
ysize=params.ysize,
195+
self_closing=params.do_self_closing,
187196
)
188197
res_parts.append(create_ser_result(text=loc_text, span_source=item))
189198

@@ -233,6 +242,7 @@ def serialize(
233242
doc=doc,
234243
xsize=params.xsize,
235244
ysize=params.ysize,
245+
self_closing=params.do_self_closing,
236246
)
237247

238248
# handle classification data
@@ -353,6 +363,7 @@ def serialize(
353363
doc=doc,
354364
xsize=params.xsize,
355365
ysize=params.ysize,
366+
self_closing=params.do_self_closing,
356367
)
357368

358369
# mapping from source_cell_id to a list of target_cell_ids
@@ -493,6 +504,7 @@ def _get_inline_location_tags(
493504
page_h=page_h,
494505
xsize=params.xsize,
495506
ysize=params.ysize,
507+
self_closing=params.do_self_closing,
496508
)
497509

498510
return SerializationResult(
@@ -628,6 +640,7 @@ def serialize_captions(
628640
doc=self.doc,
629641
xsize=params.xsize,
630642
ysize=params.ysize,
643+
self_closing=params.do_self_closing,
631644
)
632645
results.append(create_ser_result(text=loc_txt))
633646
results.append(cap_res)

docling_core/types/doc/document.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1541,6 +1541,7 @@ def get_location_tokens(
15411541
new_line: str = "", # deprecated
15421542
xsize: int = 500,
15431543
ysize: int = 500,
1544+
self_closing: bool = False,
15441545
) -> str:
15451546
"""Get the location string for the BaseCell."""
15461547
if not len(self.prov):
@@ -1556,6 +1557,7 @@ def get_location_tokens(
15561557
page_h=page_h,
15571558
xsize=xsize,
15581559
ysize=ysize,
1560+
self_closing=self_closing,
15591561
)
15601562
location += loc_str
15611563

@@ -2245,6 +2247,7 @@ def export_to_otsl(
22452247
add_cell_text: bool = True,
22462248
xsize: int = 500,
22472249
ysize: int = 500,
2250+
self_closing: bool = False,
22482251
**kwargs: Any,
22492252
) -> str:
22502253
"""Export the table as OTSL."""
@@ -2300,6 +2303,7 @@ def export_to_otsl(
23002303
page_h=page_h,
23012304
xsize=xsize,
23022305
ysize=ysize,
2306+
self_closing=self_closing,
23032307
)
23042308

23052309
if rowstart == i and colstart == j:

0 commit comments

Comments
 (0)