Skip to content

Commit 627ba61

Browse files
committed
move meta serialization into DocSerializer.serialize() to maintain seamless chunking integration
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 5fc98e3 commit 627ba61

File tree

4 files changed

+8663
-2580
lines changed

4 files changed

+8663
-2580
lines changed

docling_core/transforms/serializer/common.py

Lines changed: 136 additions & 130 deletions
Original file line number | Diff line number | Diff line change
@@ -209,15 +209,6 @@ class CommonParams(BaseModel):
209209
default=False, description="Use legacy annotation serialization."
210210
)
211211

212-
# allowed_meta_names: Optional[set[str]] = Field(
213-
# default=None,
214-
# description="Names of meta fields to include; if None, all fields will be included.",
215-
# )
216-
# blocked_meta_names: set[str] = Field(
217-
# default_factory=set,
218-
# description="Names of meta fields to block; takes precedence over allowed_meta_names.",
219-
# )
220-
221212
def merge_with_patch(self, patch: dict[str, Any]) -> Self:
222213
"""Create an instance by merging the provided patch dict on top of self."""
223214
res = self.model_copy(update=patch)
@@ -328,103 +319,130 @@ def serialize(
328319
) -> SerializationResult:
329320
"""Serialize a given node."""
330321
my_visited: set[str] = visited if visited is not None else set()
322+
parts: list[SerializationResult] = []
323+
delim: str = kwargs.get("delim", "\n")
324+
my_params = self.params.model_copy(update=kwargs)
331325
my_kwargs = {**self.params.model_dump(), **kwargs}
332326
empty_res = create_ser_result()
333-
if item is None or item == self.doc.body:
334-
if self.doc.body.self_ref not in my_visited:
335-
my_visited.add(self.doc.body.self_ref)
336-
return self._serialize_body(**my_kwargs)
337-
else:
338-
return empty_res
339327

340-
my_visited.add(item.self_ref)
328+
my_item = item or self.doc.body
341329

342-
########
343-
# groups
344-
########
345-
if isinstance(item, ListGroup):
346-
part = self.list_serializer.serialize(
347-
item=item,
348-
doc_serializer=self,
349-
doc=self.doc,
350-
list_level=list_level,
351-
is_inline_scope=is_inline_scope,
352-
visited=my_visited,
353-
**my_kwargs,
354-
)
355-
elif isinstance(item, InlineGroup):
356-
part = self.inline_serializer.serialize(
357-
item=item,
358-
doc_serializer=self,
359-
doc=self.doc,
360-
list_level=list_level,
361-
visited=my_visited,
362-
**my_kwargs,
363-
)
364-
###########
365-
# doc items
366-
###########
367-
elif isinstance(item, TextItem):
368-
if item.self_ref in self._captions_of_some_item:
369-
# those captions will be handled by the floating item holding them
370-
return empty_res
330+
if my_item == self.doc.body:
331+
if my_item.meta and not my_params.use_legacy_annotations:
332+
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
333+
if meta_part.text:
334+
parts.append(meta_part)
335+
336+
if my_item.self_ref not in my_visited:
337+
my_visited.add(my_item.self_ref)
338+
part = self._serialize_body(**my_kwargs)
339+
if part.text:
340+
parts.append(part)
341+
return create_ser_result(
342+
text=delim.join([p.text for p in parts if p.text]),
343+
span_source=parts,
344+
)
371345
else:
372-
part = (
373-
self.text_serializer.serialize(
374-
item=item,
375-
doc_serializer=self,
376-
doc=self.doc,
377-
is_inline_scope=is_inline_scope,
378-
visited=my_visited,
379-
**my_kwargs,
346+
return empty_res
347+
348+
my_visited.add(my_item.self_ref)
349+
350+
if my_item.meta and not my_params.use_legacy_annotations:
351+
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
352+
if meta_part.text:
353+
parts.append(meta_part)
354+
355+
if my_params.include_non_meta:
356+
########
357+
# groups
358+
########
359+
if isinstance(my_item, ListGroup):
360+
part = self.list_serializer.serialize(
361+
item=my_item,
362+
doc_serializer=self,
363+
doc=self.doc,
364+
list_level=list_level,
365+
is_inline_scope=is_inline_scope,
366+
visited=my_visited,
367+
**my_kwargs,
368+
)
369+
elif isinstance(my_item, InlineGroup):
370+
part = self.inline_serializer.serialize(
371+
item=my_item,
372+
doc_serializer=self,
373+
doc=self.doc,
374+
list_level=list_level,
375+
visited=my_visited,
376+
**my_kwargs,
377+
)
378+
###########
379+
# doc items
380+
###########
381+
elif isinstance(my_item, TextItem):
382+
if my_item.self_ref in self._captions_of_some_item:
383+
# those captions will be handled by the floating item holding them
384+
return empty_res
385+
else:
386+
part = (
387+
self.text_serializer.serialize(
388+
item=my_item,
389+
doc_serializer=self,
390+
doc=self.doc,
391+
is_inline_scope=is_inline_scope,
392+
visited=my_visited,
393+
**my_kwargs,
394+
)
395+
if my_item.self_ref not in self.get_excluded_refs(**kwargs)
396+
else empty_res
380397
)
381-
if item.self_ref not in self.get_excluded_refs(**kwargs)
382-
else empty_res
398+
elif isinstance(my_item, TableItem):
399+
part = self.table_serializer.serialize(
400+
item=my_item,
401+
doc_serializer=self,
402+
doc=self.doc,
403+
visited=my_visited,
404+
**my_kwargs,
383405
)
384-
elif isinstance(item, TableItem):
385-
part = self.table_serializer.serialize(
386-
item=item,
387-
doc_serializer=self,
388-
doc=self.doc,
389-
visited=my_visited,
390-
**my_kwargs,
391-
)
392-
elif isinstance(item, PictureItem):
393-
part = self.picture_serializer.serialize(
394-
item=item,
395-
doc_serializer=self,
396-
doc=self.doc,
397-
visited=my_visited,
398-
**my_kwargs,
399-
)
400-
elif isinstance(item, KeyValueItem):
401-
part = self.key_value_serializer.serialize(
402-
item=item,
403-
doc_serializer=self,
404-
doc=self.doc,
405-
**my_kwargs,
406-
)
407-
elif isinstance(item, FormItem):
408-
part = self.form_serializer.serialize(
409-
item=item,
410-
doc_serializer=self,
411-
doc=self.doc,
412-
**my_kwargs,
413-
)
414-
elif isinstance(item, _PageBreakNode):
415-
part = _PageBreakSerResult(
416-
text=self._create_page_break(node=item),
417-
node=item,
418-
)
419-
else:
420-
part = self.fallback_serializer.serialize(
421-
item=item,
422-
doc_serializer=self,
423-
doc=self.doc,
424-
visited=my_visited,
425-
**my_kwargs,
426-
)
427-
return part
406+
elif isinstance(my_item, PictureItem):
407+
part = self.picture_serializer.serialize(
408+
item=my_item,
409+
doc_serializer=self,
410+
doc=self.doc,
411+
visited=my_visited,
412+
**my_kwargs,
413+
)
414+
elif isinstance(my_item, KeyValueItem):
415+
part = self.key_value_serializer.serialize(
416+
item=my_item,
417+
doc_serializer=self,
418+
doc=self.doc,
419+
**my_kwargs,
420+
)
421+
elif isinstance(my_item, FormItem):
422+
part = self.form_serializer.serialize(
423+
item=my_item,
424+
doc_serializer=self,
425+
doc=self.doc,
426+
**my_kwargs,
427+
)
428+
elif isinstance(my_item, _PageBreakNode):
429+
part = _PageBreakSerResult(
430+
text=self._create_page_break(node=my_item),
431+
node=my_item,
432+
)
433+
else:
434+
part = self.fallback_serializer.serialize(
435+
item=my_item,
436+
doc_serializer=self,
437+
doc=self.doc,
438+
visited=my_visited,
439+
**my_kwargs,
440+
)
441+
parts.append(part)
442+
443+
return create_ser_result(
444+
text=delim.join([p.text for p in parts if p.text]), span_source=parts
445+
)
428446

429447
# making some assumptions about the kwargs it can pass
430448
@override
@@ -454,28 +472,15 @@ def get_parts(
454472
else:
455473
my_visited.add(node.self_ref)
456474

457-
if (
458-
not params.use_legacy_annotations
459-
and node.self_ref not in self.get_excluded_refs(**kwargs)
460-
):
461-
part = self.serialize_meta(
462-
item=node,
463-
level=lvl,
464-
**kwargs,
465-
)
466-
if part.text:
467-
parts.append(part)
468-
469-
if params.include_non_meta:
470-
part = self.serialize(
471-
item=node,
472-
list_level=list_level,
473-
is_inline_scope=is_inline_scope,
474-
visited=my_visited,
475-
**kwargs,
476-
)
477-
if part.text:
478-
parts.append(part)
475+
part = self.serialize(
476+
item=node,
477+
list_level=list_level,
478+
is_inline_scope=is_inline_scope,
479+
visited=my_visited,
480+
**(dict(level=lvl) | kwargs),
481+
)
482+
if part.text:
483+
parts.append(part)
479484

480485
return parts
481486

@@ -578,20 +583,21 @@ def serialize_meta(
578583
) -> SerializationResult:
579584
"""Serialize the item's meta."""
580585
if self.meta_serializer:
581-
return self.meta_serializer.serialize(
582-
item=item,
583-
doc=self.doc,
584-
**kwargs,
585-
)
586+
if item.self_ref not in self.get_excluded_refs(**kwargs):
587+
return self.meta_serializer.serialize(
588+
item=item,
589+
doc=self.doc,
590+
**kwargs,
591+
)
592+
else:
593+
return create_ser_result(
594+
text="", span_source=item if isinstance(item, DocItem) else []
595+
)
586596
else:
587597
_logger.warning("No meta serializer found.")
588598
return create_ser_result(
589599
text="", span_source=item if isinstance(item, DocItem) else []
590600
)
591-
# return create_ser_result(
592-
# text=item.meta.model_dump_json() if item.meta else "",
593-
# span_source=item,
594-
# )
595601

596602
# TODO deprecate
597603
@override

docling_core/transforms/serializer/markdown.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -805,3 +805,22 @@ def serialize_doc(
805805
def requires_page_break(self) -> bool:
806806
"""Whether to add page breaks."""
807807
return self.params.page_break_placeholder is not None
808+
809+
@override
810+
def serialize(
811+
self,
812+
*,
813+
item: Optional[NodeItem] = None,
814+
list_level: int = 0,
815+
is_inline_scope: bool = False,
816+
visited: Optional[set[str]] = None,
817+
**kwargs: Any,
818+
) -> SerializationResult:
819+
"""Serialize a given node."""
820+
return super().serialize(
821+
item=item,
822+
list_level=list_level,
823+
is_inline_scope=is_inline_scope,
824+
visited=visited,
825+
**(dict(delim="\n\n") | kwargs),
826+
)

0 commit comments

Comments
 (0)