Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions docling_core/experimental/idoctags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
"""Define classes for DocTags serialization."""

from typing import Any, Final, Optional
from xml.dom.minidom import parseString

from pydantic import BaseModel
from typing_extensions import override

from docling_core.transforms.serializer.base import (
BaseDocSerializer,
BaseMetaSerializer,
BasePictureSerializer,
SerializationResult,
)
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.transforms.serializer.doctags import (
DocTagsDocSerializer,
DocTagsParams,
DocTagsPictureSerializer,
_get_delim,
_wrap,
)
from docling_core.types.doc import (
BaseMeta,
DescriptionMetaField,
DocItem,
DoclingDocument,
MetaFieldName,
MoleculeMetaField,
NodeItem,
PictureClassificationMetaField,
PictureItem,
SummaryMetaField,
TableData,
TabularChartMetaField,
)
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc.tokens import DocumentToken

# Version string written into the <version> element of every serialized
# document (see IDocTagsDocSerializer.serialize_doc).
DOCTAGS_VERSION: Final = "1.0.0"


class IDocTagsParams(DocTagsParams):
    """Serialization parameters for the experimental DocTags serializer."""

    # Emit self-closing tokens by default.
    do_self_closing: bool = True
    # Indentation unit used when pretty-printing the serialized document;
    # a falsy value (None or "") turns pretty-printing off.
    pretty_indentation: Optional[str] = "  "


class IDocTagsMetaSerializer(BaseModel, BaseMetaSerializer):
    """Serialize the meta fields of a node into a DocTags ``<meta>`` element."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the meta of ``item`` as a ``<meta>...</meta>`` fragment.

        Args:
            item: Node whose ``meta`` is serialized.
            **kwargs: Serialization parameters, validated via ``IDocTagsParams``.

        Returns:
            A result whose text is the wrapped, concatenated meta fields. The
            text is empty when the item has no meta or when no field yields
            any output (in that case no ``<meta>`` wrapper is emitted either).
        """
        params = IDocTagsParams(**kwargs)

        texts: list[str] = []
        if item.meta:
            # Declared model fields come first, followed by custom fields.
            candidate_keys = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for key in candidate_keys:
                if (
                    params.allowed_meta_names is not None
                    and key not in params.allowed_meta_names
                ):
                    continue
                if key in params.blocked_meta_names:
                    continue
                if field_text := self._serialize_meta_field(item.meta, key):
                    texts.append(field_text)

        if texts:
            texts.insert(0, "<meta>")
            texts.append("</meta>")
        return create_ser_result(
            text="".join(texts),
            span_source=item if isinstance(item, DocItem) else [],
        )

    def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
        """Serialize a single meta field into its DocTags element.

        Args:
            meta: Meta object holding the field.
            name: Attribute name of the field on ``meta``.

        Returns:
            The serialized element, or ``None`` when the field is unset, is a
            tabular chart (suppressed here), or is a standard field whose
            value does not have the expected type.
        """
        field_val = getattr(meta, name)
        if field_val is None:
            return None

        if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
            return f"<summary>{field_val.text}</summary>"
        if name == MetaFieldName.DESCRIPTION and isinstance(
            field_val, DescriptionMetaField
        ):
            return f"<description>{field_val.text}</description>"
        if name == MetaFieldName.CLASSIFICATION and isinstance(
            field_val, PictureClassificationMetaField
        ):
            class_name = self._humanize_text(
                field_val.get_main_prediction().class_name
            )
            return f"<classification>{class_name}</classification>"
        if name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
            return f"<molecule>{field_val.smi}</molecule>"
        if name == MetaFieldName.TABULAR_CHART and isinstance(
            field_val, TabularChartMetaField
        ):
            # suppressing tabular chart serialization
            return None
        if name not in {v.value for v in MetaFieldName}:
            # Non-standard (custom) field: wrap its string form in a tag
            # named after the field.
            return _wrap(text=str(field_val or ""), wrap_tag=name)

        # Standard field name carrying a value of an unexpected type: emit
        # nothing. Previously this path fell through to an unbound local and
        # raised UnboundLocalError.
        return None


class IDocTagsPictureSerializer(DocTagsPictureSerializer):
    """DocTags-specific picture item serializer."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize a picture item into a DocTags picture element.

        The element contains, in order: the item's meta (if any), its location
        tokens, the OTSL form of any tabular chart data, and its captions.

        Args:
            item: Picture item to serialize.
            doc_serializer: Document serializer used for meta, captions and
                the excluded-reference lookup.
            doc: Document the item belongs to.
            **kwargs: Serialization parameters, validated via ``IDocTagsParams``.

        Returns:
            The wrapped picture element, or an empty result when the item is
            excluded or produces no content.
        """
        # Use the IDocTags parameter model (as the meta serializer does) so
        # that IDocTags defaults, e.g. self-closing tokens, apply when a
        # parameter is not passed explicitly.
        params = IDocTagsParams(**kwargs)
        res_parts: list[SerializationResult] = []

        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
            if item.meta:
                meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
                if meta_res.text:
                    res_parts.append(meta_res)

            body = ""
            if params.add_location:
                body += item.get_location_tokens(
                    doc=doc,
                    xsize=params.xsize,
                    ysize=params.ysize,
                    self_closing=params.do_self_closing,
                )

            # handle tabular chart data
            chart_data: Optional[TableData] = None
            if item.meta and item.meta.tabular_chart:
                chart_data = item.meta.tabular_chart.chart_data
            if chart_data and chart_data.table_cells:
                # Export the chart table through a throwaway document so the
                # regular OTSL table export can be reused.
                temp_doc = DoclingDocument(name="temp")
                temp_table = temp_doc.add_table(data=chart_data)
                body += temp_table.export_to_otsl(
                    temp_doc,
                    add_cell_location=False,
                    self_closing=params.do_self_closing,
                )
            res_parts.append(create_ser_result(text=body, span_source=item))

            if params.add_caption:
                cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
                if cap_res.text:
                    res_parts.append(cap_res)

        text_res = "".join(r.text for r in res_parts)
        if text_res:
            # NOTE(review): no chart detection happens in this serializer, so
            # the picture token is always used, even when tabular chart data
            # is present -- confirm this is intended.
            token = DocumentToken.create_token_name_from_doc_item_label(
                label=DocItemLabel.PICTURE,
            )
            text_res = _wrap(text=text_res, wrap_tag=token)
        return create_ser_result(text=text_res, span_source=res_parts)


class IDocTagsDocSerializer(DocTagsDocSerializer):
    """DocTags document serializer producing a versioned document element."""

    picture_serializer: BasePictureSerializer = IDocTagsPictureSerializer()
    meta_serializer: BaseMetaSerializer = IDocTagsMetaSerializer()
    params: IDocTagsParams = IDocTagsParams()

    @override
    def _meta_is_wrapped(self) -> bool:
        """Return True: this serializer reports its meta as wrapped."""
        return True

    @override
    def serialize_doc(
        self,
        *,
        parts: list[SerializationResult],
        **kwargs: Any,
    ) -> SerializationResult:
        """Assemble the serialized parts into the final DocTags document.

        Non-empty parts are joined with the mode-dependent delimiter, page
        breaks are normalized to the page-break token, and the result is
        wrapped in the document element together with a ``<version>`` element.
        When ``pretty_indentation`` is set, the output is re-indented via
        ``xml.dom.minidom``; if it is not well-formed XML, the unformatted
        text is returned instead and a warning is logged.

        Args:
            parts: Serialization results of the document's items.
            **kwargs: Unused here; accepted for interface compatibility.

        Returns:
            The serialized document, with ``parts`` as its span source.
        """
        delim = _get_delim(params=self.params)
        text_res = delim.join([p.text for p in parts if p.text])

        if self.params.add_page_break:
            # Honor `do_self_closing` for the page-break token as well, as an
            # unclosed token cannot be parsed by the XML pretty-printer below.
            closing = "/" if self.params.do_self_closing else ""
            page_sep = f"<{DocumentToken.PAGE_BREAK.value}{closing}>"
            for full_match, _, _ in self._get_page_breaks(text=text_res):
                text_res = text_res.replace(full_match, page_sep)

        wrap_tag = DocumentToken.DOCUMENT.value
        text_res = (
            f"<{wrap_tag}><version>{DOCTAGS_VERSION}</version>"
            f"{text_res}{delim}</{wrap_tag}>"
        )

        if self.params.pretty_indentation:
            import logging
            from xml.parsers.expat import ExpatError

            try:
                root = parseString(text_res).documentElement
            except ExpatError as exc:
                # Pretty-printing is best-effort: item text may contain raw
                # markup characters or unclosed tokens.
                logging.getLogger(__name__).warning(
                    "DocTags output is not well-formed XML; "
                    "skipping pretty-printing: %s",
                    exc,
                )
                root = None
            if root:
                pretty = root.toprettyxml(indent=self.params.pretty_indentation)
                # Drop the whitespace-only lines minidom may insert.
                text_res = "\n".join(
                    line for line in pretty.split("\n") if line.strip()
                )
        return create_ser_result(text=text_res, span_source=parts)
10 changes: 6 additions & 4 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ class CommonParams(BaseModel):
use_legacy_annotations: bool = Field(
default=False,
description="Use legacy annotation serialization.",
deprecated="Legacy annotations considered only when meta not present.",
deprecated="Ignored field; legacy annotations considered only when meta not present.",
)
allowed_meta_names: Optional[set[str]] = Field(
default=None,
Expand Down Expand Up @@ -318,6 +318,9 @@ def _serialize_body(self, **kwargs) -> SerializationResult:
res = self.serialize_doc(parts=subparts, **kwargs)
return res

def _meta_is_wrapped(self) -> bool:
    """Tell `serialize` whether to skip emitting a separate meta part.

    `serialize` only appends an item's meta part when this returns False;
    a subclass can return True to suppress that part (presumably because the
    meta is then emitted elsewhere -- confirm against the subclass).
    """
    return False

@override
def serialize(
self,
Expand All @@ -339,7 +342,7 @@ def serialize(
my_item = item or self.doc.body

if my_item == self.doc.body:
if my_item.meta:
if my_item.meta and not self._meta_is_wrapped():
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
Expand All @@ -358,7 +361,7 @@ def serialize(

my_visited.add(my_item.self_ref)

if my_item.meta:
if my_item.meta and not self._meta_is_wrapped():
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
Expand Down Expand Up @@ -605,7 +608,6 @@ def serialize_meta(
text="", span_source=item if isinstance(item, DocItem) else []
)
else:
_logger.warning("No meta serializer found.")
return create_ser_result(
text="", span_source=item if isinstance(item, DocItem) else []
)
Expand Down
13 changes: 13 additions & 0 deletions docling_core/transforms/serializer/doctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ class Mode(str, Enum):

mode: Mode = Mode.HUMAN_FRIENDLY

do_self_closing: bool = False


def _get_delim(params: DocTagsParams) -> str:
if params.mode == DocTagsParams.Mode.HUMAN_FRIENDLY:
Expand Down Expand Up @@ -110,11 +112,17 @@ def serialize(
)
parts: list[str] = []

if item.meta:
meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
if meta_res.text:
parts.append(meta_res.text)

if params.add_location:
location = item.get_location_tokens(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)
if location:
parts.append(location)
Expand Down Expand Up @@ -184,6 +192,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)
res_parts.append(create_ser_result(text=loc_text, span_source=item))

Expand Down Expand Up @@ -233,6 +242,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)

# handle classification data
Expand Down Expand Up @@ -353,6 +363,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)

# mapping from source_cell_id to a list of target_cell_ids
Expand Down Expand Up @@ -493,6 +504,7 @@ def _get_inline_location_tags(
page_h=page_h,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)

return SerializationResult(
Expand Down Expand Up @@ -628,6 +640,7 @@ def serialize_captions(
doc=self.doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)
results.append(create_ser_result(text=loc_txt))
results.append(cap_res)
Expand Down
4 changes: 4 additions & 0 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -1541,6 +1541,7 @@ def get_location_tokens(
new_line: str = "", # deprecated
xsize: int = 500,
ysize: int = 500,
self_closing: bool = False,
) -> str:
"""Get the location string for the BaseCell."""
if not len(self.prov):
Expand All @@ -1556,6 +1557,7 @@ def get_location_tokens(
page_h=page_h,
xsize=xsize,
ysize=ysize,
self_closing=self_closing,
)
location += loc_str

Expand Down Expand Up @@ -2245,6 +2247,7 @@ def export_to_otsl(
add_cell_text: bool = True,
xsize: int = 500,
ysize: int = 500,
self_closing: bool = False,
**kwargs: Any,
) -> str:
"""Export the table as OTSL."""
Expand Down Expand Up @@ -2300,6 +2303,7 @@ def export_to_otsl(
page_h=page_h,
xsize=xsize,
ysize=ysize,
self_closing=self_closing,
)

if rowstart == i and colstart == j:
Expand Down
Loading
Loading