Skip to content

Commit 193edad

Browse files
committed
feat(experimental): add new DocTags serializer
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 9b9e8a7 commit 193edad

File tree

3 files changed

+63
-0
lines changed

3 files changed

+63
-0
lines changed
Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,44 @@
1+
"""Define classes for DocTags serialization."""
2+
3+
from typing import Any, Final
4+
5+
from typing_extensions import override
6+
7+
from docling_core.transforms.serializer.base import SerializationResult
8+
from docling_core.transforms.serializer.common import create_ser_result
9+
from docling_core.transforms.serializer.doctags import (
10+
DocTagsDocSerializer,
11+
DocTagsParams,
12+
_get_delim,
13+
)
14+
from docling_core.types.doc.tokens import DocumentToken
15+
16+
# Version string that IDocTagsDocSerializer.serialize_doc embeds in the
# <version>...</version> element at the start of the serialized document.
DOCTAGS_VERSION: Final = "1.0.0"
17+
18+
19+
class IDocTagsParams(DocTagsParams):
    """Serialization parameters for the IDocTags serializer.

    Currently declares no fields of its own beyond those of ``DocTagsParams``.
    """
21+
22+
23+
class IDocTagsDocSerializer(DocTagsDocSerializer):
    """Document serializer producing DocTags output with a version element."""

    @override
    def serialize_doc(
        self,
        *,
        parts: list[SerializationResult],
        **kwargs: Any,
    ) -> SerializationResult:
        """Assemble the serialized parts into a single versioned document.

        Args:
            parts: Serialization results to combine; parts whose text is
                empty are left out of the joined output.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A serialization result whose text is the joined parts wrapped in
            the document tag, preceded by a ``<version>`` element carrying
            ``DOCTAGS_VERSION``, with ``parts`` as the span source.
        """
        separator = _get_delim(params=self.params)
        body = separator.join(part.text for part in parts if part.text)

        if self.params.add_page_break:
            page_break_tag = f"<{DocumentToken.PAGE_BREAK.value}>"
            # Swap every match reported by _get_page_breaks for the
            # page-break token.
            for placeholder, _, _ in self._get_page_breaks(text=body):
                body = body.replace(placeholder, page_break_tag)

        doc_tag = DocumentToken.DOCUMENT.value
        opening = f"<{doc_tag}><version>{DOCTAGS_VERSION}</version>"
        closing = f"</{doc_tag}>"
        # The delimiter is appended after the body so the closing tag follows
        # it, exactly as the joined parts are separated from one another.
        return create_ser_result(
            text=f"{opening}{body}{separator}{closing}",
            span_source=parts,
        )

test/data/doc/dummy_doc.igt.dt

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
<doctag><version>1.0.0</version><title><loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</title>
2+
<chart><loc_297><loc_125><loc_457><loc_499><bar_chart><smiles>CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1</smiles><caption><loc_210><loc_196><loc_245><loc_213>Figure 1: Four examples of complex page layouts across different document categories</caption></chart>
3+
<otsl><loc_210><loc_196><loc_245><loc_213></otsl>
4+
</doctag>

test/test_serialization.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from typing_extensions import override
77

8+
from docling_core.experimental.idoctags import IDocTagsDocSerializer
89
from docling_core.transforms.serializer.base import (
910
BaseDocSerializer,
1011
SerializationResult,
@@ -637,3 +638,17 @@ def test_doctags_inline_and_formatting():
637638
ser = DocTagsDocSerializer(doc=doc)
638639
actual = ser.serialize().text
639640
verify(exp_file=src.with_suffix(".gt.dt"), actual=actual)
641+
642+
643+
# ===============================
644+
# IDocTags tests
645+
# ===============================
646+
647+
648+
def test_idoctags_basic():
    """Serialize the dummy document with IDocTags and compare to ground truth."""
    yaml_path = Path("./test/data/doc/dummy_doc.yaml")
    document = DoclingDocument.load_from_yaml(yaml_path)

    serialized = IDocTagsDocSerializer(doc=document).serialize()
    # The expected output sits next to the YAML source with an ".igt.dt" suffix.
    verify(exp_file=yaml_path.with_suffix(".igt.dt"), actual=serialized.text)

0 commit comments

Comments
 (0)