Skip to content

Commit 3dce367

Browse files
refactor(xlsx): use GroupLabel.SHEET and serializer-based heading rendering
Replace the SectionHeaderItem injection approach with a cleaner architecture:

- msexcel_backend: use GroupLabel.SHEET (already in docling-core) with the plain sheet name instead of GroupLabel.SECTION with the "sheet: <name>" prefix.
- Add MsExcelMarkdownFallbackSerializer / MsExcelMarkdownDocSerializer in docling/utils/markdown.py: they render GroupLabel.SHEET groups as ## headings without polluting the document model with synthetic heading nodes.
- Update tests: e2e comparisons use MsExcelMarkdownDocSerializer; remove the test_sheet_names_as_headings assertions that relied on SectionHeaderItem nodes; update test_chartsheet and test_table_with_title accordingly.
- Regenerate all xlsx ground truth files to reflect the new document structure.

Closes #3229

Signed-off-by: Smeet Agrawal <smeetagrawal23@gmail.com>
1 parent dd6fa7c commit 3dce367

20 files changed

Lines changed: 191 additions & 463 deletions

docling/backend/msexcel_backend.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -234,18 +234,11 @@ def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
234234
# do not rely on sheet.max_column, sheet.max_row if there are images
235235
page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
236236

237-
content_layer = self._get_sheet_content_layer(sheet)
238237
self.parents[0] = doc.add_group(
239238
parent=None,
240-
label=GroupLabel.SECTION,
241-
name=f"sheet: {name}",
242-
content_layer=content_layer,
243-
)
244-
doc.add_heading(
245-
text=name,
246-
level=1,
247-
parent=self.parents[0],
248-
content_layer=content_layer,
239+
label=GroupLabel.SHEET,
240+
name=name,
241+
content_layer=self._get_sheet_content_layer(sheet),
249242
)
250243
doc = self._convert_sheet(doc, sheet)
251244
width, height = self._find_page_size(doc, page_no)

docling/utils/markdown.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""Markdown serialization utilities for docling."""
2+
3+
from typing import Any
4+
5+
from docling_core.transforms.serializer.base import (
6+
BaseFallbackSerializer,
7+
BaseDocSerializer,
8+
SerializationResult,
9+
)
10+
from docling_core.transforms.serializer.common import create_ser_result
11+
from docling_core.transforms.serializer.markdown import (
12+
MarkdownDocSerializer,
13+
MarkdownFallbackSerializer,
14+
)
15+
from docling_core.types.doc.document import DoclingDocument, GroupItem, NodeItem
16+
from docling_core.types.doc.labels import GroupLabel
17+
from typing_extensions import override
18+
19+
20+
class MsExcelMarkdownFallbackSerializer(MarkdownFallbackSerializer):
    """Markdown fallback serializer that titles worksheet groups.

    A ``GroupItem`` labelled ``GroupLabel.SHEET`` is serialized as a level-2
    Markdown heading (``##``) carrying the group's ``name``, followed by the
    serialized children of that group. Every other item is delegated
    unchanged to ``MarkdownFallbackSerializer``.
    """

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize ``item``, prefixing sheet groups with a heading.

        Args:
            item: Node to serialize.
            doc_serializer: Document serializer used to obtain the parts of
                a sheet group.
            doc: Document being serialized; only forwarded to the parent
                implementation.
            **kwargs: Extra options forwarded to
                ``doc_serializer.get_parts`` or to the parent
                implementation.

        Returns:
            For a sheet group, the heading followed by the joined child
            text; otherwise whatever the parent implementation returns.
        """
        # Anything that is not a worksheet group keeps the stock behaviour.
        if not isinstance(item, GroupItem) or item.label != GroupLabel.SHEET:
            return super().serialize(
                item=item, doc_serializer=doc_serializer, doc=doc, **kwargs
            )

        child_parts = doc_serializer.get_parts(item=item, **kwargs)
        body = "\n\n".join(part.text for part in child_parts if part.text)
        title = f"## {item.name}"
        # A sheet without serialized content still yields its bare heading.
        rendered = f"{title}\n\n{body}" if body else title
        return create_ser_result(text=rendered, span_source=child_parts)
47+
48+
49+
class MsExcelMarkdownDocSerializer(MarkdownDocSerializer):
    """Markdown document serializer for Excel-sourced ``DoclingDocument`` objects.

    The only difference from ``MarkdownDocSerializer`` is the default
    fallback serializer: :class:`MsExcelMarkdownFallbackSerializer` is used,
    so worksheet groups (``GroupLabel.SHEET``) are rendered with their name
    as a Markdown heading. The backend therefore does not need to inject
    heading nodes into the document model.

    Usage::

        from docling.utils.markdown import MsExcelMarkdownDocSerializer

        serializer = MsExcelMarkdownDocSerializer(doc=result.document)
        md = serializer.serialize().text
    """

    # Replaces the parent's default fallback serializer with the sheet-aware one.
    fallback_serializer: BaseFallbackSerializer = MsExcelMarkdownFallbackSerializer()
Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
item-0 at level 0: unspecified: group _root_
2-
item-1 at level 1: section: group sheet: Sheet1
3-
item-2 at level 2: section_header: Sheet1
4-
item-3 at level 2: table with [7x3]
5-
item-4 at level 1: section: group sheet: Sheet2
6-
item-5 at level 2: section_header: Sheet2
7-
item-6 at level 2: table with [9x4]
8-
item-7 at level 2: table with [5x3]
9-
item-8 at level 2: table with [5x3]
10-
item-9 at level 1: section: group sheet: Sheet3
11-
item-10 at level 2: section_header: Sheet3
12-
item-11 at level 2: table with [7x3]
13-
item-12 at level 2: table with [7x3]
14-
item-13 at level 2: picture
2+
item-1 at level 1: sheet: group Sheet1
3+
item-2 at level 2: table with [7x3]
4+
item-3 at level 1: sheet: group Sheet2
5+
item-4 at level 2: table with [9x4]
6+
item-5 at level 2: table with [5x3]
7+
item-6 at level 2: table with [5x3]
8+
item-7 at level 1: sheet: group Sheet3
9+
item-8 at level 2: table with [7x3]
10+
item-9 at level 2: table with [7x3]
11+
item-10 at level 2: picture

tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json

Lines changed: 9 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -41,26 +41,20 @@
4141
"$ref": "#/body"
4242
},
4343
"children": [
44-
{
45-
"$ref": "#/texts/0"
46-
},
4744
{
4845
"$ref": "#/tables/0"
4946
}
5047
],
5148
"content_layer": "body",
52-
"name": "sheet: Sheet1",
53-
"label": "section"
49+
"name": "Sheet1",
50+
"label": "sheet"
5451
},
5552
{
5653
"self_ref": "#/groups/1",
5754
"parent": {
5855
"$ref": "#/body"
5956
},
6057
"children": [
61-
{
62-
"$ref": "#/texts/1"
63-
},
6458
{
6559
"$ref": "#/tables/1"
6660
},
@@ -72,18 +66,15 @@
7266
}
7367
],
7468
"content_layer": "body",
75-
"name": "sheet: Sheet2",
76-
"label": "section"
69+
"name": "Sheet2",
70+
"label": "sheet"
7771
},
7872
{
7973
"self_ref": "#/groups/2",
8074
"parent": {
8175
"$ref": "#/body"
8276
},
8377
"children": [
84-
{
85-
"$ref": "#/texts/2"
86-
},
8778
{
8879
"$ref": "#/tables/4"
8980
},
@@ -95,81 +86,25 @@
9586
}
9687
],
9788
"content_layer": "body",
98-
"name": "sheet: Sheet3",
99-
"label": "section"
89+
"name": "Sheet3",
90+
"label": "sheet"
10091
},
10192
{
10293
"self_ref": "#/groups/3",
10394
"parent": {
10495
"$ref": "#/body"
10596
},
10697
"children": [
107-
{
108-
"$ref": "#/texts/3"
109-
},
11098
{
11199
"$ref": "#/tables/6"
112100
}
113101
],
114102
"content_layer": "invisible",
115-
"name": "sheet: Sheet4",
116-
"label": "section"
117-
}
118-
],
119-
"texts": [
120-
{
121-
"self_ref": "#/texts/0",
122-
"parent": {
123-
"$ref": "#/groups/0"
124-
},
125-
"children": [],
126-
"content_layer": "body",
127-
"label": "section_header",
128-
"prov": [],
129-
"orig": "Sheet1",
130-
"text": "Sheet1",
131-
"level": 1
132-
},
133-
{
134-
"self_ref": "#/texts/1",
135-
"parent": {
136-
"$ref": "#/groups/1"
137-
},
138-
"children": [],
139-
"content_layer": "body",
140-
"label": "section_header",
141-
"prov": [],
142-
"orig": "Sheet2",
143-
"text": "Sheet2",
144-
"level": 1
145-
},
146-
{
147-
"self_ref": "#/texts/2",
148-
"parent": {
149-
"$ref": "#/groups/2"
150-
},
151-
"children": [],
152-
"content_layer": "body",
153-
"label": "section_header",
154-
"prov": [],
155-
"orig": "Sheet3",
156-
"text": "Sheet3",
157-
"level": 1
158-
},
159-
{
160-
"self_ref": "#/texts/3",
161-
"parent": {
162-
"$ref": "#/groups/3"
163-
},
164-
"children": [],
165-
"content_layer": "invisible",
166-
"label": "section_header",
167-
"prov": [],
168-
"orig": "Sheet4",
169-
"text": "Sheet4",
170-
"level": 1
103+
"name": "Sheet4",
104+
"label": "sheet"
171105
}
172106
],
107+
"texts": [],
173108
"pictures": [
174109
{
175110
"self_ref": "#/pictures/0",
Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
item-0 at level 0: unspecified: group _root_
2-
item-1 at level 1: section: group sheet: SalesData
3-
item-2 at level 2: section_header: SalesData
4-
item-3 at level 2: table with [21x4]
2+
item-1 at level 1: sheet: group SalesData
3+
item-2 at level 2: table with [21x4]

tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -32,33 +32,16 @@
3232
"$ref": "#/body"
3333
},
3434
"children": [
35-
{
36-
"$ref": "#/texts/0"
37-
},
3835
{
3936
"$ref": "#/tables/0"
4037
}
4138
],
4239
"content_layer": "body",
43-
"name": "sheet: SalesData",
44-
"label": "section"
45-
}
46-
],
47-
"texts": [
48-
{
49-
"self_ref": "#/texts/0",
50-
"parent": {
51-
"$ref": "#/groups/0"
52-
},
53-
"children": [],
54-
"content_layer": "body",
55-
"label": "section_header",
56-
"prov": [],
57-
"orig": "SalesData",
58-
"text": "SalesData",
59-
"level": 1
40+
"name": "SalesData",
41+
"label": "sheet"
6042
}
6143
],
44+
"texts": [],
6245
"pictures": [],
6346
"tables": [
6447
{
Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
item-0 at level 0: unspecified: group _root_
2-
item-1 at level 1: section: group sheet: Duck Observations
3-
item-2 at level 2: section_header: Duck Observations
4-
item-3 at level 2: table with [7x4]
5-
item-4 at level 1: section: group sheet: Duck Chart
6-
item-5 at level 2: section_header: Duck Chart
2+
item-1 at level 1: sheet: group Duck Observations
3+
item-2 at level 2: table with [7x4]
4+
item-3 at level 1: sheet: group Duck Chart

tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json

Lines changed: 5 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -35,60 +35,26 @@
3535
"$ref": "#/body"
3636
},
3737
"children": [
38-
{
39-
"$ref": "#/texts/0"
40-
},
4138
{
4239
"$ref": "#/tables/0"
4340
}
4441
],
4542
"content_layer": "body",
46-
"name": "sheet: Duck Observations",
47-
"label": "section"
43+
"name": "Duck Observations",
44+
"label": "sheet"
4845
},
4946
{
5047
"self_ref": "#/groups/1",
5148
"parent": {
5249
"$ref": "#/body"
5350
},
54-
"children": [
55-
{
56-
"$ref": "#/texts/1"
57-
}
58-
],
59-
"content_layer": "body",
60-
"name": "sheet: Duck Chart",
61-
"label": "section"
62-
}
63-
],
64-
"texts": [
65-
{
66-
"self_ref": "#/texts/0",
67-
"parent": {
68-
"$ref": "#/groups/0"
69-
},
70-
"children": [],
71-
"content_layer": "body",
72-
"label": "section_header",
73-
"prov": [],
74-
"orig": "Duck Observations",
75-
"text": "Duck Observations",
76-
"level": 1
77-
},
78-
{
79-
"self_ref": "#/texts/1",
80-
"parent": {
81-
"$ref": "#/groups/1"
82-
},
8351
"children": [],
8452
"content_layer": "body",
85-
"label": "section_header",
86-
"prov": [],
87-
"orig": "Duck Chart",
88-
"text": "Duck Chart",
89-
"level": 1
53+
"name": "Duck Chart",
54+
"label": "sheet"
9055
}
9156
],
57+
"texts": [],
9258
"pictures": [],
9359
"tables": [
9460
{

0 commit comments

Comments
 (0)