Skip to content

Commit 13aafcb

Browse files
committed
Merge branch 'main' of github.com:DS4SD/docling-core into robustify-page-filtering
2 parents 40922f0 + 3fe8b5a commit 13aafcb

File tree

13 files changed

+730
-14
lines changed

13 files changed

+730
-14
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
## [v2.52.0](https://github.com/docling-project/docling-core/releases/tag/v2.52.0) - 2025-11-20
2+
3+
### Feature
4+
5+
* **experimental:** Add new DocTags serializer ([#412](https://github.com/docling-project/docling-core/issues/412)) ([`c9e5fb4`](https://github.com/docling-project/docling-core/commit/c9e5fb4a1ceb1ec0cae8ebae5f3eb844c0a2198a))
6+
* Convert regions into TableData ([#430](https://github.com/docling-project/docling-core/issues/430)) ([`c80b583`](https://github.com/docling-project/docling-core/commit/c80b58369c5bbb1be779a241fee146aa1b3a3685))
7+
18
## [v2.51.1](https://github.com/docling-project/docling-core/releases/tag/v2.51.1) - 2025-11-14
29

310
### Fix
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
"""Define classes for DocTags serialization."""
2+
3+
from typing import Any, Final, Optional
4+
from xml.dom.minidom import parseString
5+
6+
from pydantic import BaseModel
7+
from typing_extensions import override
8+
9+
from docling_core.transforms.serializer.base import (
10+
BaseDocSerializer,
11+
BaseMetaSerializer,
12+
BasePictureSerializer,
13+
SerializationResult,
14+
)
15+
from docling_core.transforms.serializer.common import create_ser_result
16+
from docling_core.transforms.serializer.doctags import (
17+
DocTagsDocSerializer,
18+
DocTagsParams,
19+
DocTagsPictureSerializer,
20+
_get_delim,
21+
_wrap,
22+
)
23+
from docling_core.types.doc import (
24+
BaseMeta,
25+
DescriptionMetaField,
26+
DocItem,
27+
DoclingDocument,
28+
MetaFieldName,
29+
MoleculeMetaField,
30+
NodeItem,
31+
PictureClassificationMetaField,
32+
PictureItem,
33+
SummaryMetaField,
34+
TableData,
35+
TabularChartMetaField,
36+
)
37+
from docling_core.types.doc.labels import DocItemLabel
38+
from docling_core.types.doc.tokens import DocumentToken
39+
40+
DOCTAGS_VERSION: Final = "1.0.0"
41+
42+
43+
class IDocTagsParams(DocTagsParams):
    """Serialization parameters used by the IDocTags serializers.

    Adds pretty-printing control on top of ``DocTagsParams`` and sets the
    default for ``do_self_closing`` to ``True``.
    """

    # Forwarded as ``self_closing`` to the location-token and OTSL export calls.
    do_self_closing: bool = True
    # Indent string given to ``toprettyxml``; a falsy value (``None`` or ``""``)
    # leaves the serialized document un-prettified.
    pretty_indentation: Optional[str] = "  "
48+
49+
50+
class IDocTagsMetaSerializer(BaseModel, BaseMetaSerializer):
    """DocTags-specific meta serializer."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the meta fields of ``item`` into a ``<meta>`` element.

        Candidate keys are the declared model fields of ``item.meta`` followed
        by the keys of its custom part. A key is emitted only if it passes the
        ``allowed_meta_names`` / ``blocked_meta_names`` filters and its field
        serializes to non-empty text.

        Args:
            item: Node whose ``meta`` is serialized.
            **kwargs: Values used to build ``IDocTagsParams``.

        Returns:
            A result whose text is ``<meta>…</meta>`` around the serialized
            fields, or the empty string when no field produced any text.
        """
        params = IDocTagsParams(**kwargs)

        texts: list[str] = []
        if item.meta:
            keys = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for key in keys:
                if (
                    params.allowed_meta_names is not None
                    and key not in params.allowed_meta_names
                ):
                    continue
                if key in params.blocked_meta_names:
                    continue
                if field_text := self._serialize_meta_field(item.meta, key):
                    texts.append(field_text)

        if texts:
            texts = ["<meta>", *texts, "</meta>"]
        return create_ser_result(
            text="".join(texts),
            span_source=item if isinstance(item, DocItem) else [],
        )

    def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
        """Serialize a single meta field to its DocTags element.

        Args:
            meta: Meta object holding the field.
            name: Attribute name of the field on ``meta``.

        Returns:
            The element text, or ``None`` when the field is unset, is a
            tabular chart (deliberately suppressed here), or is a standard
            field whose value has an unexpected type.
        """
        field_val = getattr(meta, name)
        if field_val is None:
            return None

        if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
            return f"<summary>{field_val.text}</summary>"
        if name == MetaFieldName.DESCRIPTION and isinstance(
            field_val, DescriptionMetaField
        ):
            return f"<description>{field_val.text}</description>"
        if name == MetaFieldName.CLASSIFICATION and isinstance(
            field_val, PictureClassificationMetaField
        ):
            class_name = self._humanize_text(
                field_val.get_main_prediction().class_name
            )
            return f"<classification>{class_name}</classification>"
        if name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
            return f"<molecule>{field_val.smi}</molecule>"
        if name == MetaFieldName.TABULAR_CHART and isinstance(
            field_val, TabularChartMetaField
        ):
            # suppressing tabular chart serialization
            return None
        if name not in {v.value for v in MetaFieldName}:
            # Custom (non-standard) field: wrap its string form in a tag
            # named after the field.
            return _wrap(text=str(field_val or ""), wrap_tag=name)

        # Standard field name whose value is not of the expected type.
        # Previously this path fell through to ``return txt`` with ``txt``
        # never assigned, raising UnboundLocalError; emit nothing instead.
        return None
123+
124+
125+
class IDocTagsPictureSerializer(DocTagsPictureSerializer):
    """DocTags-specific picture item serializer."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize a picture item into a DocTags picture element.

        The element content is, in order: the serialized meta (if any), the
        location tokens (when ``add_location``), the OTSL export of the
        tabular chart data found in ``item.meta`` (if any), and the captions
        (when ``add_caption``). Meta, location and chart data are skipped
        when the item is among the serializer's excluded refs.

        Args:
            item: Picture item to serialize.
            doc_serializer: Serializer used for meta, captions and the
                excluded-refs lookup.
            doc: Document the item belongs to.
            **kwargs: Values used to build ``IDocTagsParams``; also forwarded
                to the ``doc_serializer`` calls.

        Returns:
            The wrapped picture element, or an empty-text result when no
            part produced any text.
        """
        # Use the IDocTags params (as the sibling meta serializer does) so
        # that the IDocTags defaults apply when a key such as
        # ``do_self_closing`` is absent from ``kwargs``.
        params = IDocTagsParams(**kwargs)
        res_parts: list[SerializationResult] = []

        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):

            if item.meta:
                meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
                if meta_res.text:
                    res_parts.append(meta_res)

            body = ""
            if params.add_location:
                body += item.get_location_tokens(
                    doc=doc,
                    xsize=params.xsize,
                    ysize=params.ysize,
                    self_closing=params.do_self_closing,
                )

            # handle tabular chart data
            chart_data: Optional[TableData] = None
            if item.meta and item.meta.tabular_chart:
                chart_data = item.meta.tabular_chart.chart_data
            if chart_data and chart_data.table_cells:
                # Export through a throwaway document so the table export
                # can be reused for the chart data.
                temp_doc = DoclingDocument(name="temp")
                temp_table = temp_doc.add_table(data=chart_data)
                otsl_content = temp_table.export_to_otsl(
                    temp_doc,
                    add_cell_location=False,
                    self_closing=params.do_self_closing,
                )
                body += otsl_content
            res_parts.append(create_ser_result(text=body, span_source=item))

        if params.add_caption:
            cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
            if cap_res.text:
                res_parts.append(cap_res)

        text_res = "".join([r.text for r in res_parts])
        if text_res:
            # No chart detection happens in this serializer, so the element
            # always uses the picture token.
            token = DocumentToken.create_token_name_from_doc_item_label(
                label=DocItemLabel.PICTURE,
            )
            text_res = _wrap(text=text_res, wrap_tag=token)
        return create_ser_result(text=text_res, span_source=res_parts)
185+
186+
187+
class IDocTagsDocSerializer(DocTagsDocSerializer):
    """Document-level serializer producing versioned, optionally pretty DocTags."""

    picture_serializer: BasePictureSerializer = IDocTagsPictureSerializer()
    meta_serializer: BaseMetaSerializer = IDocTagsMetaSerializer()
    params: IDocTagsParams = IDocTagsParams()

    @override
    def _meta_is_wrapped(self) -> bool:
        """Report that meta is wrapped for this serializer (always ``True``)."""
        return True

    @override
    def serialize_doc(
        self,
        *,
        parts: list[SerializationResult],
        **kwargs: Any,
    ) -> SerializationResult:
        """Assemble the already-serialized parts into one DocTags document.

        Non-empty part texts are joined with the mode-dependent delimiter,
        page-break matches are replaced by the page-break token when enabled,
        and the result is wrapped in the document tag together with a
        ``<version>`` element. When ``pretty_indentation`` is set, the
        document is re-rendered through ``xml.dom.minidom`` and blank lines
        are dropped.
        """
        delim = _get_delim(params=self.params)
        non_empty = [part.text for part in parts if part.text]
        doc_text = delim.join(non_empty)

        if self.params.add_page_break:
            page_sep = f"<{DocumentToken.PAGE_BREAK.value}>"
            for matched, _before, _after in self._get_page_breaks(text=doc_text):
                doc_text = doc_text.replace(matched, page_sep)

        root_tag = DocumentToken.DOCUMENT.value
        opening = f"<{root_tag}><version>{DOCTAGS_VERSION}</version>"
        closing = f"</{root_tag}>"
        doc_text = f"{opening}{doc_text}{delim}{closing}"

        indent = self.params.pretty_indentation
        if indent:
            root = parseString(doc_text).documentElement
            if root:
                pretty = root.toprettyxml(indent=indent)
                # toprettyxml may leave whitespace-only lines; drop them.
                doc_text = "\n".join(
                    ln for ln in pretty.split("\n") if ln.strip()
                )
        return create_ser_result(text=doc_text, span_source=parts)

docling_core/transforms/serializer/common.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ class CommonParams(BaseModel):
209209
use_legacy_annotations: bool = Field(
210210
default=False,
211211
description="Use legacy annotation serialization.",
212-
deprecated="Legacy annotations considered only when meta not present.",
212+
deprecated="Ignored field; legacy annotations considered only when meta not present.",
213213
)
214214
allowed_meta_names: Optional[set[str]] = Field(
215215
default=None,
@@ -318,6 +318,9 @@ def _serialize_body(self, **kwargs) -> SerializationResult:
318318
res = self.serialize_doc(parts=subparts, **kwargs)
319319
return res
320320

321+
def _meta_is_wrapped(self) -> bool:
322+
return False
323+
321324
@override
322325
def serialize(
323326
self,
@@ -339,7 +342,7 @@ def serialize(
339342
my_item = item or self.doc.body
340343

341344
if my_item == self.doc.body:
342-
if my_item.meta:
345+
if my_item.meta and not self._meta_is_wrapped():
343346
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
344347
if meta_part.text:
345348
parts.append(meta_part)
@@ -358,7 +361,7 @@ def serialize(
358361

359362
my_visited.add(my_item.self_ref)
360363

361-
if my_item.meta:
364+
if my_item.meta and not self._meta_is_wrapped():
362365
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
363366
if meta_part.text:
364367
parts.append(meta_part)
@@ -605,7 +608,6 @@ def serialize_meta(
605608
text="", span_source=item if isinstance(item, DocItem) else []
606609
)
607610
else:
608-
_logger.warning("No meta serializer found.")
609611
return create_ser_result(
610612
text="", span_source=item if isinstance(item, DocItem) else []
611613
)

docling_core/transforms/serializer/doctags.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ class Mode(str, Enum):
7777

7878
mode: Mode = Mode.HUMAN_FRIENDLY
7979

80+
do_self_closing: bool = False
81+
8082

8183
def _get_delim(params: DocTagsParams) -> str:
8284
if params.mode == DocTagsParams.Mode.HUMAN_FRIENDLY:
@@ -110,11 +112,17 @@ def serialize(
110112
)
111113
parts: list[str] = []
112114

115+
if item.meta:
116+
meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
117+
if meta_res.text:
118+
parts.append(meta_res.text)
119+
113120
if params.add_location:
114121
location = item.get_location_tokens(
115122
doc=doc,
116123
xsize=params.xsize,
117124
ysize=params.ysize,
125+
self_closing=params.do_self_closing,
118126
)
119127
if location:
120128
parts.append(location)
@@ -184,6 +192,7 @@ def serialize(
184192
doc=doc,
185193
xsize=params.xsize,
186194
ysize=params.ysize,
195+
self_closing=params.do_self_closing,
187196
)
188197
res_parts.append(create_ser_result(text=loc_text, span_source=item))
189198

@@ -233,6 +242,7 @@ def serialize(
233242
doc=doc,
234243
xsize=params.xsize,
235244
ysize=params.ysize,
245+
self_closing=params.do_self_closing,
236246
)
237247

238248
# handle classification data
@@ -353,6 +363,7 @@ def serialize(
353363
doc=doc,
354364
xsize=params.xsize,
355365
ysize=params.ysize,
366+
self_closing=params.do_self_closing,
356367
)
357368

358369
# mapping from source_cell_id to a list of target_cell_ids
@@ -493,6 +504,7 @@ def _get_inline_location_tags(
493504
page_h=page_h,
494505
xsize=params.xsize,
495506
ysize=params.ysize,
507+
self_closing=params.do_self_closing,
496508
)
497509

498510
return SerializationResult(
@@ -628,6 +640,7 @@ def serialize_captions(
628640
doc=self.doc,
629641
xsize=params.xsize,
630642
ysize=params.ysize,
643+
self_closing=params.do_self_closing,
631644
)
632645
results.append(create_ser_result(text=loc_txt))
633646
results.append(cap_res)

docling_core/types/doc/base.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Models for the base data types."""
22

33
from enum import Enum
4-
from typing import Any, List, Tuple
4+
from typing import Any, List, Optional, Tuple
55

66
from pydantic import BaseModel, FieldSerializationInfo, field_serializer
77

@@ -231,6 +231,31 @@ def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
231231
coord_origin=CoordOrigin.BOTTOMLEFT,
232232
)
233233

234+
def get_intersection_bbox(self, other: "BoundingBox") -> Optional["BoundingBox"]:
    """Compute the overlap between this bounding box and ``other``.

    Args:
        other: Box to intersect with; must use the same ``coord_origin``.

    Returns:
        The overlapping box in the shared coordinate origin, or ``None``
        when the boxes do not overlap with positive width and height.

    Raises:
        ValueError: If the two boxes have different coordinate origins.
    """
    origin = self.coord_origin
    if origin != other.coord_origin:
        raise ValueError("BoundingBoxes have different CoordOrigin")

    left = max(self.l, other.l)
    right = min(self.r, other.r)

    if origin == CoordOrigin.TOPLEFT:
        # y grows downwards: the overlap starts at the larger top value.
        top, bottom = max(self.t, other.t), min(self.b, other.b)
        is_empty = right <= left or bottom <= top
    else:
        # y grows upwards: the overlap starts at the smaller top value.
        top, bottom = min(self.t, other.t), max(self.b, other.b)
        is_empty = right <= left or top <= bottom

    if is_empty:
        return None
    return BoundingBox(l=left, t=top, r=right, b=bottom, coord_origin=origin)
258+
234259
def to_top_left_origin(self, page_height: float) -> "BoundingBox":
235260
"""to_top_left_origin.
236261

0 commit comments

Comments
 (0)