diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py
new file mode 100644
index 00000000..ddd1d9a0
--- /dev/null
+++ b/docling_core/experimental/idoctags.py
@@ -0,0 +1,224 @@
+"""Define classes for DocTags serialization."""
+
+from typing import Any, Final, Optional
+from xml.dom.minidom import parseString
+
+from pydantic import BaseModel
+from typing_extensions import override
+
+from docling_core.transforms.serializer.base import (
+ BaseDocSerializer,
+ BaseMetaSerializer,
+ BasePictureSerializer,
+ SerializationResult,
+)
+from docling_core.transforms.serializer.common import create_ser_result
+from docling_core.transforms.serializer.doctags import (
+ DocTagsDocSerializer,
+ DocTagsParams,
+ DocTagsPictureSerializer,
+ _get_delim,
+ _wrap,
+)
+from docling_core.types.doc import (
+ BaseMeta,
+ DescriptionMetaField,
+ DocItem,
+ DoclingDocument,
+ MetaFieldName,
+ MoleculeMetaField,
+ NodeItem,
+ PictureClassificationMetaField,
+ PictureItem,
+ SummaryMetaField,
+ TableData,
+ TabularChartMetaField,
+)
+from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc.tokens import DocumentToken
+
+# Format version that `IDocTagsDocSerializer.serialize_doc` embeds at the start
+# of the serialized document, right after the opening document tag.
+DOCTAGS_VERSION: Final = "1.0.0"
+
+
+class IDocTagsParams(DocTagsParams):
+    """Serialization parameters for the IDocTags format.
+
+    Extends ``DocTagsParams`` by switching self-closing tags on by default and
+    by adding the indentation unit used when the output is pretty-printed.
+    """
+
+    # Emit self-closing tokens by default (the base class defaults to False).
+    do_self_closing: bool = True
+    # Indentation unit handed to the XML pretty-printer; a falsy value
+    # (None or "") makes the document serializer skip pretty-printing.
+    pretty_indentation: Optional[str] = "  "
+
+
+class IDocTagsMetaSerializer(BaseModel, BaseMetaSerializer):
+    """Meta serializer for the IDocTags format.
+
+    Walks the declared and custom fields of an item's ``meta`` object and
+    concatenates their individual serializations into one result.
+    """
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the meta fields of ``item``.
+
+        Args:
+            item: Node whose ``meta`` is serialized. A falsy ``meta`` yields
+                an empty text.
+            **kwargs: Values used to build ``IDocTagsParams``; its
+                ``allowed_meta_names`` and ``blocked_meta_names`` decide
+                which fields are considered.
+
+        Returns:
+            A result whose text is the concatenation of every field that
+            produced a non-empty serialization.
+        """
+        params = IDocTagsParams(**kwargs)
+
+        elem_delim = ""
+        texts: list[str] = []
+        if item.meta:
+            # Declared model fields come first, then any custom entries.
+            keys = list(item.meta.__class__.model_fields) + list(
+                item.meta.get_custom_part()
+            )
+            for key in keys:
+                if (
+                    params.allowed_meta_names is not None
+                    and key not in params.allowed_meta_names
+                ):
+                    continue
+                if key in params.blocked_meta_names:
+                    continue
+                # Fields that serialize to None or "" are dropped.
+                if tmp := self._serialize_meta_field(item.meta, key):
+                    texts.append(tmp)
+        if texts:
+            # NOTE(review): both sentinels are empty strings and the delimiter
+            # is empty, so as written they do not alter the joined text --
+            # confirm whether wrapper tags were intended here.
+            texts.insert(0, "")
+            texts.append("")
+        return create_ser_result(
+            text=elem_delim.join(texts),
+            span_source=item if isinstance(item, DocItem) else [],
+        )
+
+    def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
+        """Serialize a single meta field.
+
+        Args:
+            meta: Meta object the field is read from.
+            name: Attribute name of the field on ``meta``.
+
+        Returns:
+            The serialized text, or ``None`` when the field is unset, is the
+            (suppressed) tabular chart field, or is a standard field whose
+            value does not have the expected type.
+        """
+        if (field_val := getattr(meta, name)) is None:
+            return None
+
+        if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
+            return f"{field_val.text}"
+        if name == MetaFieldName.DESCRIPTION and isinstance(
+            field_val, DescriptionMetaField
+        ):
+            return f"{field_val.text}"
+        if name == MetaFieldName.CLASSIFICATION and isinstance(
+            field_val, PictureClassificationMetaField
+        ):
+            class_name = self._humanize_text(
+                field_val.get_main_prediction().class_name
+            )
+            return f"{class_name}"
+        if name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
+            return f"{field_val.smi}"
+        if name == MetaFieldName.TABULAR_CHART and isinstance(
+            field_val, TabularChartMetaField
+        ):
+            # Tabular chart data is deliberately not serialized as meta.
+            return None
+        if name not in {v.value for v in MetaFieldName}:
+            # Custom (non-standard) field: wrap its string form in a tag
+            # named after the field.
+            return _wrap(text=str(field_val or ""), wrap_tag=name)
+
+        # Standard field name holding a value of an unexpected type: there is
+        # nothing sensible to emit. Previously this path fell through to an
+        # unbound local variable and raised UnboundLocalError.
+        return None
+
+
+class IDocTagsPictureSerializer(DocTagsPictureSerializer):
+    """Picture item serializer for the IDocTags format."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: PictureItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize a picture item.
+
+        Unless the item is among the excluded refs, the output consists of the
+        item's meta (via ``doc_serializer.serialize_meta``), its location
+        tokens (when ``add_location`` is set) and, if tabular chart data is
+        attached to the meta, that data exported as OTSL. Captions are
+        appended when ``add_caption`` is set. A non-empty result is wrapped in
+        the picture tag.
+
+        Args:
+            item: Picture to serialize.
+            doc_serializer: Serializer used for meta, captions and the
+                excluded-refs lookup.
+            doc: Document the picture belongs to.
+            **kwargs: Values used to build the serialization parameters; also
+                forwarded to the ``doc_serializer`` calls.
+
+        Returns:
+            The serialized picture; its text is empty when nothing was
+            produced.
+        """
+        # Use the IDocTags parameter set (as the sibling IDocTags serializers
+        # do) so that its defaults, e.g. self-closing tokens, also apply when
+        # the caller does not pass them explicitly.
+        params = IDocTagsParams(**kwargs)
+        res_parts: list[SerializationResult] = []
+        # NOTE(review): this flag is never set to True in this method, so the
+        # wrapping tag is always the picture token -- confirm whether attached
+        # chart data is supposed to switch it to the chart token.
+        is_chart = False
+
+        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+
+            if item.meta:
+                meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
+                if meta_res.text:
+                    res_parts.append(meta_res)
+
+            body = ""
+            if params.add_location:
+                body += item.get_location_tokens(
+                    doc=doc,
+                    xsize=params.xsize,
+                    ysize=params.ysize,
+                    self_closing=params.do_self_closing,
+                )
+
+            # handle tabular chart data
+            chart_data: Optional[TableData] = None
+            if item.meta and item.meta.tabular_chart:
+                chart_data = item.meta.tabular_chart.chart_data
+            if chart_data and chart_data.table_cells:
+                # Export through a throw-away document so the table export
+                # code can be reused for the chart data.
+                temp_doc = DoclingDocument(name="temp")
+                temp_table = temp_doc.add_table(data=chart_data)
+                otsl_content = temp_table.export_to_otsl(
+                    temp_doc,
+                    add_cell_location=False,
+                    self_closing=params.do_self_closing,
+                )
+                body += otsl_content
+            res_parts.append(create_ser_result(text=body, span_source=item))
+
+        if params.add_caption:
+            cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
+            if cap_res.text:
+                res_parts.append(cap_res)
+
+        text_res = "".join([r.text for r in res_parts])
+        if text_res:
+            token = DocumentToken.create_token_name_from_doc_item_label(
+                label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE,
+            )
+            text_res = _wrap(text=text_res, wrap_tag=token)
+        return create_ser_result(text=text_res, span_source=res_parts)
+
+
+class IDocTagsDocSerializer(DocTagsDocSerializer):
+    """Document serializer for the IDocTags format.
+
+    Wraps the serialized parts in the document tag together with
+    ``DOCTAGS_VERSION`` and optionally pretty-prints the result as XML.
+    """
+
+    picture_serializer: BasePictureSerializer = IDocTagsPictureSerializer()
+    meta_serializer: BaseMetaSerializer = IDocTagsMetaSerializer()
+    params: IDocTagsParams = IDocTagsParams()
+
+    @override
+    def _meta_is_wrapped(self) -> bool:
+        """Return ``True``, overriding the inherited meta-wrapping hook."""
+        return True
+
+    @override
+    def serialize_doc(
+        self,
+        *,
+        parts: list[SerializationResult],
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Assemble the final document text from the serialized parts.
+
+        Args:
+            parts: Serialized top-level parts; parts with empty text are
+                skipped.
+            **kwargs: Accepted for interface compatibility; not read here.
+
+        Returns:
+            The complete document, pretty-printed when
+            ``params.pretty_indentation`` is set.
+
+        Raises:
+            xml.parsers.expat.ExpatError: If pretty-printing is requested and
+                the assembled text is not well-formed XML.
+        """
+        delim = _get_delim(params=self.params)
+        text_res = delim.join([p.text for p in parts if p.text])
+
+        if self.params.add_page_break:
+            # Self-close the page-break tag when requested (same convention as
+            # the location tokens); an unclosed tag would make the XML parse
+            # below fail.
+            closer = "/" if self.params.do_self_closing else ""
+            page_sep = f"<{DocumentToken.PAGE_BREAK.value}{closer}>"
+            for full_match, _, _ in self._get_page_breaks(text=text_res):
+                text_res = text_res.replace(full_match, page_sep)
+
+        wrap_tag = DocumentToken.DOCUMENT.value
+        # The wrapper must end with a proper closing tag, otherwise the
+        # document is not well-formed and cannot be parsed for pretty-printing.
+        text_res = f"<{wrap_tag}>{DOCTAGS_VERSION}{text_res}{delim}</{wrap_tag}>"
+
+        if self.params.pretty_indentation and (
+            my_root := parseString(text_res).documentElement
+        ):
+            text_res = my_root.toprettyxml(indent=self.params.pretty_indentation)
+            # toprettyxml may emit whitespace-only lines; drop them.
+            text_res = "\n".join(
+                [line for line in text_res.split("\n") if line.strip()]
+            )
+        return create_ser_result(text=text_res, span_source=parts)
diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index 33f0a10e..177afe30 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -209,7 +209,7 @@ class CommonParams(BaseModel):
use_legacy_annotations: bool = Field(
default=False,
description="Use legacy annotation serialization.",
- deprecated="Legacy annotations considered only when meta not present.",
+ deprecated="Ignored field; legacy annotations considered only when meta not present.",
)
allowed_meta_names: Optional[set[str]] = Field(
default=None,
@@ -318,6 +318,9 @@ def _serialize_body(self, **kwargs) -> SerializationResult:
res = self.serialize_doc(parts=subparts, **kwargs)
return res
+ def _meta_is_wrapped(self) -> bool:
+ """Return True to make ``serialize`` skip emitting a separate meta part per item."""
+ return False
+
@override
def serialize(
self,
@@ -339,7 +342,7 @@ def serialize(
my_item = item or self.doc.body
if my_item == self.doc.body:
- if my_item.meta:
+ if my_item.meta and not self._meta_is_wrapped():
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
@@ -358,7 +361,7 @@ def serialize(
my_visited.add(my_item.self_ref)
- if my_item.meta:
+ if my_item.meta and not self._meta_is_wrapped():
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
@@ -605,7 +608,6 @@ def serialize_meta(
text="", span_source=item if isinstance(item, DocItem) else []
)
else:
- _logger.warning("No meta serializer found.")
return create_ser_result(
text="", span_source=item if isinstance(item, DocItem) else []
)
diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py
index 0deaa991..5d1d9c10 100644
--- a/docling_core/transforms/serializer/doctags.py
+++ b/docling_core/transforms/serializer/doctags.py
@@ -77,6 +77,8 @@ class Mode(str, Enum):
mode: Mode = Mode.HUMAN_FRIENDLY
+ do_self_closing: bool = False
+
def _get_delim(params: DocTagsParams) -> str:
if params.mode == DocTagsParams.Mode.HUMAN_FRIENDLY:
@@ -110,11 +112,17 @@ def serialize(
)
parts: list[str] = []
+ if item.meta:
+ meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
+ if meta_res.text:
+ parts.append(meta_res.text)
+
if params.add_location:
location = item.get_location_tokens(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
if location:
parts.append(location)
@@ -184,6 +192,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
res_parts.append(create_ser_result(text=loc_text, span_source=item))
@@ -233,6 +242,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
# handle classification data
@@ -353,6 +363,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
# mapping from source_cell_id to a list of target_cell_ids
@@ -493,6 +504,7 @@ def _get_inline_location_tags(
page_h=page_h,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
return SerializationResult(
@@ -628,6 +640,7 @@ def serialize_captions(
doc=self.doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
results.append(create_ser_result(text=loc_txt))
results.append(cap_res)
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index f5e89eeb..9ad4ed9c 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1541,6 +1541,7 @@ def get_location_tokens(
new_line: str = "", # deprecated
xsize: int = 500,
ysize: int = 500,
+ self_closing: bool = False,
) -> str:
"""Get the location string for the BaseCell."""
if not len(self.prov):
@@ -1556,6 +1557,7 @@ def get_location_tokens(
page_h=page_h,
xsize=xsize,
ysize=ysize,
+ self_closing=self_closing,
)
location += loc_str
@@ -2245,6 +2247,7 @@ def export_to_otsl(
add_cell_text: bool = True,
xsize: int = 500,
ysize: int = 500,
+ self_closing: bool = False,
**kwargs: Any,
) -> str:
"""Export the table as OTSL."""
@@ -2300,6 +2303,7 @@ def export_to_otsl(
page_h=page_h,
xsize=xsize,
ysize=ysize,
+ self_closing=self_closing,
)
if rowstart == i and colstart == j:
diff --git a/docling_core/types/doc/tokens.py b/docling_core/types/doc/tokens.py
index 5edbc5dc..028afcaa 100644
--- a/docling_core/types/doc/tokens.py
+++ b/docling_core/types/doc/tokens.py
@@ -267,12 +267,14 @@ def get_code_language_token(code_language: str) -> str:
return _CodeLanguageToken(f"<_{code_language}_>").value
@staticmethod
- def get_location_token(val: float, rnorm: int = 500): # TODO review
+ def get_location_token(
+ val: float, rnorm: int = 500, self_closing: bool = False
+ ): # TODO review
"""Function to get location tokens."""
val_ = round(rnorm * val)
val_ = max(val_, 0)
val_ = min(val_, rnorm - 1)
- return f"<{_LOC_PREFIX}{val_}>"
+ return f"<{_LOC_PREFIX}{val_}{'/' if self_closing else ''}>"
@staticmethod
def get_location(
@@ -281,6 +283,7 @@ def get_location(
page_h: float,
xsize: int = 500, # TODO review
ysize: int = 500, # TODO review
+ self_closing: bool = False,
):
"""Get the location string give bbox and page-dim."""
assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
@@ -291,10 +294,18 @@ def get_location(
x1 = bbox[2] / page_w
y1 = bbox[3] / page_h
- x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
- y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
- x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
- y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
+ x0_tok = DocumentToken.get_location_token(
+ val=min(x0, x1), rnorm=xsize, self_closing=self_closing
+ )
+ y0_tok = DocumentToken.get_location_token(
+ val=min(y0, y1), rnorm=ysize, self_closing=self_closing
+ )
+ x1_tok = DocumentToken.get_location_token(
+ val=max(x0, x1), rnorm=xsize, self_closing=self_closing
+ )
+ y1_tok = DocumentToken.get_location_token(
+ val=max(y0, y1), rnorm=ysize, self_closing=self_closing
+ )
loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
diff --git a/test/data/doc/dummy_doc_with_meta.gt.dt b/test/data/doc/dummy_doc_with_meta.gt.dt
new file mode 100644
index 00000000..1979777c
--- /dev/null
+++ b/test/data/doc/dummy_doc_with_meta.gt.dt
@@ -0,0 +1,4 @@
+DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis
+CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1Figure 1: Four examples of complex page layouts across different document categories
+
+
diff --git a/test/data/doc/dummy_doc_with_meta.gt.idt.xml b/test/data/doc/dummy_doc_with_meta.gt.idt.xml
new file mode 100644
index 00000000..2d6b544b
--- /dev/null
+++ b/test/data/doc/dummy_doc_with_meta.gt.idt.xml
@@ -0,0 +1,39 @@
+
+ 1.0.0
+
+
+ This is a title.
+ More stuff here.
+
+
+
+
+
+ DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis
+
+
+
+ ...
+ Bar chart
+ CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
+ {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}}
+
+
+
+
+
+
+
+
+
+
+ Figure 1: Four examples of complex page layouts across different document categories
+
+
+
+
+
+
+
+
+
diff --git a/test/test_serialization.py b/test/test_serialization.py
index c0de487d..0aebee22 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -4,6 +4,7 @@
import pytest
+from docling_core.experimental.idoctags import IDocTagsDocSerializer
from docling_core.transforms.serializer.common import _DEFAULT_LABELS
from docling_core.transforms.serializer.doctags import DocTagsDocSerializer
from docling_core.transforms.serializer.html import (
@@ -585,3 +586,26 @@ def test_doctags_inline_and_formatting():
ser = DocTagsDocSerializer(doc=doc)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.dt"), actual=actual)
+
+
+def test_doctags_meta():
+    """Compare the DocTags output of a document with meta to its ground truth."""
+    yaml_path = Path("./test/data/doc/dummy_doc_with_meta.yaml")
+    serializer = DocTagsDocSerializer(doc=DoclingDocument.load_from_yaml(yaml_path))
+
+    serialized = serializer.serialize().text
+    verify(exp_file=yaml_path.with_suffix(".gt.dt"), actual=serialized)
+
+
+# ===============================
+# IDocTags tests
+# ===============================
+
+
+def test_idoctags_meta():
+    """Compare the IDocTags output of a document with meta to its ground truth."""
+    yaml_path = Path("./test/data/doc/dummy_doc_with_meta.yaml")
+    serializer = IDocTagsDocSerializer(doc=DoclingDocument.load_from_yaml(yaml_path))
+
+    serialized = serializer.serialize().text
+    verify(exp_file=yaml_path.with_suffix(".gt.idt.xml"), actual=serialized)