Skip to content

Commit abd0ce6

Browse files
committed
fix: rich table triplet serialization
Signed-off-by: Vdaleke <[email protected]>
1 parent 6d48bf5 commit abd0ce6

File tree

4 files changed

+81
-5
lines changed

4 files changed

+81
-5
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,11 @@ def serialize(
6969
parts.append(cap_res)
7070

7171
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
72-
table_df = item.export_to_dataframe(doc)
72+
table_df = item.export_to_dataframe(
73+
doc,
74+
doc_serializer=doc_serializer,
75+
**kwargs,
76+
)
7377
if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
7478

7579
# copy header as first row and shift all rows by one

docling_core/types/doc/document.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1840,7 +1840,9 @@ def _migrate_annotations_to_meta(self) -> Self:
18401840
return self
18411841

18421842
def export_to_dataframe(
1843-
self, doc: Optional["DoclingDocument"] = None
1843+
self,
1844+
doc: Optional["DoclingDocument"] = None,
1845+
**kwargs: Any,
18441846
) -> pd.DataFrame:
18451847
"""Export the table as a Pandas DataFrame."""
18461848
if doc is None:
@@ -1876,14 +1878,14 @@ def export_to_dataframe(
18761878
columns = ["" for _ in range(self.data.num_cols)]
18771879
for i in range(num_headers):
18781880
for j, cell in enumerate(self.data.grid[i]):
1879-
col_name = cell._get_text(doc=doc)
1881+
col_name = cell._get_text(doc=doc, **kwargs)
18801882
if columns[j] != "":
18811883
col_name = f".{col_name}"
18821884
columns[j] += col_name
18831885

18841886
# Create table data
18851887
table_data = [
1886-
[cell._get_text(doc=doc) for cell in row]
1888+
[cell._get_text(doc=doc, **kwargs) for cell in row]
18871889
for row in self.data.grid[num_headers:]
18881890
]
18891891

test/data/chunker/0c_out_chunks.json

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
{
2+
"root": [
3+
{
4+
"text": "cell 0,0, 1 = cell 0,1. cell 1,0, 1 = <em><p>text in italic</p></em>. <ul>\n<li>list item 1</li>\n<li>list item 2</li>\n</ul>, 1 = cell 2,1. cell 3,0, 1 = inner cell 0,0, 1 = inner cell 0,1. inner cell 0,0, 2 = inner cell 0,2. inner cell 1,0, 1 = inner cell 1,1. inner cell 1,0, 2 = inner cell 1,2. <p>Some text in a generic group.</p>\n<p>More text in the group.</p>, 1 = cell 4,1",
5+
"meta": {
6+
"schema_name": "docling_core.transforms.chunker.DocMeta",
7+
"version": "1.0.0",
8+
"doc_items": [
9+
{
10+
"self_ref": "#/tables/0",
11+
"parent": {
12+
"$ref": "#/body"
13+
},
14+
"children": [
15+
{
16+
"$ref": "#/texts/1"
17+
},
18+
{
19+
"$ref": "#/groups/0"
20+
},
21+
{
22+
"$ref": "#/tables/1"
23+
},
24+
{
25+
"$ref": "#/groups/1"
26+
}
27+
],
28+
"content_layer": "body",
29+
"label": "table",
30+
"prov": []
31+
}
32+
],
33+
"headings": [
34+
"Rich tables"
35+
]
36+
}
37+
}
38+
]
39+
}

test/test_hierarchical_chunker.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,17 @@
1010
ChunkingDocSerializer,
1111
ChunkingSerializerProvider,
1212
DocChunk,
13+
TripletTableSerializer,
14+
)
15+
from docling_core.transforms.serializer.html import HTMLDocSerializer
16+
from docling_core.transforms.serializer.markdown import (
17+
MarkdownTableSerializer,
1318
)
14-
from docling_core.transforms.serializer.markdown import MarkdownTableSerializer
1519
from docling_core.types.doc import DoclingDocument as DLDocument
1620
from docling_core.types.doc.document import DoclingDocument
1721

1822
from .test_data_gen_flag import GEN_TEST_DATA
23+
from .test_docling_doc import _construct_rich_table_doc
1924

2025

2126
def _process(act_data, exp_path_str):
@@ -71,3 +76,29 @@ def get_serializer(self, doc: DoclingDocument):
7176
act_data=act_data,
7277
exp_path_str="test/data/chunker/0b_out_chunks.json",
7378
)
79+
80+
81+
def test_chunk_rich_table_custom_serializer():
82+
doc = _construct_rich_table_doc()
83+
84+
class MySerializerProvider(ChunkingSerializerProvider):
85+
def get_serializer(self, doc: DoclingDocument):
86+
return HTMLDocSerializer(
87+
doc=doc,
88+
table_serializer=TripletTableSerializer(),
89+
)
90+
91+
chunker = HierarchicalChunker(
92+
merge_list_items=True,
93+
serializer_provider=MySerializerProvider(),
94+
)
95+
96+
chunks = chunker.chunk(dl_doc=doc)
97+
act_data = dict(
98+
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
99+
)
100+
101+
_process(
102+
act_data=act_data,
103+
exp_path_str="test/data/chunker/0c_out_chunks.json",
104+
)

0 commit comments

Comments
 (0)