Skip to content

Commit dd6fa7c

Browse files
feat(xlsx): render sheet names as headings in markdown export
GroupItem nodes with label=GroupLabel.SECTION (used to represent Excel sheet names) now receive a level-1 heading item containing the sheet name as a child, so sheet names are emitted as Markdown section headings during export_to_markdown(). This preserves the logical document structure and makes multi-sheet workbooks easier to navigate in downstream RAG pipelines.

Closes #3229

Signed-off-by: Smeet Agrawal <smeetagrawal23@gmail.com>
1 parent e8fb5ea commit dd6fa7c

26 files changed

Lines changed: 459 additions & 62 deletions

docling/backend/msexcel_backend.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,18 @@ def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
234234
# do not rely on sheet.max_column, sheet.max_row if there are images
235235
page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
236236

237+
content_layer = self._get_sheet_content_layer(sheet)
237238
self.parents[0] = doc.add_group(
238239
parent=None,
239240
label=GroupLabel.SECTION,
240241
name=f"sheet: {name}",
241-
content_layer=self._get_sheet_content_layer(sheet),
242+
content_layer=content_layer,
243+
)
244+
doc.add_heading(
245+
text=name,
246+
level=1,
247+
parent=self.parents[0],
248+
content_layer=content_layer,
242249
)
243250
doc = self._convert_sheet(doc, sheet)
244251
width, height = self._find_page_size(doc, page_no)
Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
item-0 at level 0: unspecified: group _root_
22
item-1 at level 1: section: group sheet: Sheet1
3-
item-2 at level 2: table with [7x3]
4-
item-3 at level 1: section: group sheet: Sheet2
5-
item-4 at level 2: table with [9x4]
6-
item-5 at level 2: table with [5x3]
7-
item-6 at level 2: table with [5x3]
8-
item-7 at level 1: section: group sheet: Sheet3
9-
item-8 at level 2: table with [7x3]
10-
item-9 at level 2: table with [7x3]
11-
item-10 at level 2: picture
3+
item-2 at level 2: section_header: Sheet1
4+
item-3 at level 2: table with [7x3]
5+
item-4 at level 1: section: group sheet: Sheet2
6+
item-5 at level 2: section_header: Sheet2
7+
item-6 at level 2: table with [9x4]
8+
item-7 at level 2: table with [5x3]
9+
item-8 at level 2: table with [5x3]
10+
item-9 at level 1: section: group sheet: Sheet3
11+
item-10 at level 2: section_header: Sheet3
12+
item-11 at level 2: table with [7x3]
13+
item-12 at level 2: table with [7x3]
14+
item-13 at level 2: picture

tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@
4141
"$ref": "#/body"
4242
},
4343
"children": [
44+
{
45+
"$ref": "#/texts/0"
46+
},
4447
{
4548
"$ref": "#/tables/0"
4649
}
@@ -55,6 +58,9 @@
5558
"$ref": "#/body"
5659
},
5760
"children": [
61+
{
62+
"$ref": "#/texts/1"
63+
},
5864
{
5965
"$ref": "#/tables/1"
6066
},
@@ -75,6 +81,9 @@
7581
"$ref": "#/body"
7682
},
7783
"children": [
84+
{
85+
"$ref": "#/texts/2"
86+
},
7887
{
7988
"$ref": "#/tables/4"
8089
},
@@ -95,6 +104,9 @@
95104
"$ref": "#/body"
96105
},
97106
"children": [
107+
{
108+
"$ref": "#/texts/3"
109+
},
98110
{
99111
"$ref": "#/tables/6"
100112
}
@@ -104,7 +116,60 @@
104116
"label": "section"
105117
}
106118
],
107-
"texts": [],
119+
"texts": [
120+
{
121+
"self_ref": "#/texts/0",
122+
"parent": {
123+
"$ref": "#/groups/0"
124+
},
125+
"children": [],
126+
"content_layer": "body",
127+
"label": "section_header",
128+
"prov": [],
129+
"orig": "Sheet1",
130+
"text": "Sheet1",
131+
"level": 1
132+
},
133+
{
134+
"self_ref": "#/texts/1",
135+
"parent": {
136+
"$ref": "#/groups/1"
137+
},
138+
"children": [],
139+
"content_layer": "body",
140+
"label": "section_header",
141+
"prov": [],
142+
"orig": "Sheet2",
143+
"text": "Sheet2",
144+
"level": 1
145+
},
146+
{
147+
"self_ref": "#/texts/2",
148+
"parent": {
149+
"$ref": "#/groups/2"
150+
},
151+
"children": [],
152+
"content_layer": "body",
153+
"label": "section_header",
154+
"prov": [],
155+
"orig": "Sheet3",
156+
"text": "Sheet3",
157+
"level": 1
158+
},
159+
{
160+
"self_ref": "#/texts/3",
161+
"parent": {
162+
"$ref": "#/groups/3"
163+
},
164+
"children": [],
165+
"content_layer": "invisible",
166+
"label": "section_header",
167+
"prov": [],
168+
"orig": "Sheet4",
169+
"text": "Sheet4",
170+
"level": 1
171+
}
172+
],
108173
"pictures": [
109174
{
110175
"self_ref": "#/pictures/0",

tests/data/groundtruth/docling_v2/xlsx_01.xlsx.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
## Sheet1
2+
13
| first | second | third |
24
|----------|-----------|---------|
35
| 1 | 5 | 9 |
@@ -7,6 +9,8 @@
79
| 5 | 1 | -3 |
810
| 6 | 0 | -6 |
911

12+
## Sheet2
13+
1014
| col-1 | col-2 | col-3 | col-4 |
1115
|---------|---------|---------|---------|
1216
| 1 | 2 | 3 | 4 |
@@ -32,6 +36,8 @@
3236
| 3 | 6 | 9 |
3337
| 4 | 8 | 12 |
3438

39+
## Sheet3
40+
3541
| first | header | header |
3642
|----------|----------|----------|
3743
| first | second | third |
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
item-0 at level 0: unspecified: group _root_
22
item-1 at level 1: section: group sheet: SalesData
3-
item-2 at level 2: table with [21x4]
3+
item-2 at level 2: section_header: SalesData
4+
item-3 at level 2: table with [21x4]

tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
"$ref": "#/body"
3333
},
3434
"children": [
35+
{
36+
"$ref": "#/texts/0"
37+
},
3538
{
3639
"$ref": "#/tables/0"
3740
}
@@ -41,7 +44,21 @@
4144
"label": "section"
4245
}
4346
],
44-
"texts": [],
47+
"texts": [
48+
{
49+
"self_ref": "#/texts/0",
50+
"parent": {
51+
"$ref": "#/groups/0"
52+
},
53+
"children": [],
54+
"content_layer": "body",
55+
"label": "section_header",
56+
"prov": [],
57+
"orig": "SalesData",
58+
"text": "SalesData",
59+
"level": 1
60+
}
61+
],
4562
"pictures": [],
4663
"tables": [
4764
{

tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
## SalesData
2+
13
| Product | Date | Quantity | Revenue |
24
|-----------|---------------------|------------|-----------|
35
| Widget A | 2024-01-01 00:00:00 | 5 | 5000 |
Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
item-0 at level 0: unspecified: group _root_
22
item-1 at level 1: section: group sheet: Duck Observations
3-
item-2 at level 2: table with [7x4]
4-
item-3 at level 1: section: group sheet: Duck Chart
3+
item-2 at level 2: section_header: Duck Observations
4+
item-3 at level 2: table with [7x4]
5+
item-4 at level 1: section: group sheet: Duck Chart
6+
item-5 at level 2: section_header: Duck Chart

tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@
3535
"$ref": "#/body"
3636
},
3737
"children": [
38+
{
39+
"$ref": "#/texts/0"
40+
},
3841
{
3942
"$ref": "#/tables/0"
4043
}
@@ -48,13 +51,44 @@
4851
"parent": {
4952
"$ref": "#/body"
5053
},
51-
"children": [],
54+
"children": [
55+
{
56+
"$ref": "#/texts/1"
57+
}
58+
],
5259
"content_layer": "body",
5360
"name": "sheet: Duck Chart",
5461
"label": "section"
5562
}
5663
],
57-
"texts": [],
64+
"texts": [
65+
{
66+
"self_ref": "#/texts/0",
67+
"parent": {
68+
"$ref": "#/groups/0"
69+
},
70+
"children": [],
71+
"content_layer": "body",
72+
"label": "section_header",
73+
"prov": [],
74+
"orig": "Duck Observations",
75+
"text": "Duck Observations",
76+
"level": 1
77+
},
78+
{
79+
"self_ref": "#/texts/1",
80+
"parent": {
81+
"$ref": "#/groups/1"
82+
},
83+
"children": [],
84+
"content_layer": "body",
85+
"label": "section_header",
86+
"prov": [],
87+
"orig": "Duck Chart",
88+
"text": "Duck Chart",
89+
"level": 1
90+
}
91+
],
5892
"pictures": [],
5993
"tables": [
6094
{
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
1+
## Duck Observations
2+
13
| Year | Freshwater Ducks | Saltwater Ducks | Ducks |
24
|--------|--------------------|-------------------|---------|
35
| 2019 | 120 | 80 | 200 |
46
| 2020 | 135 | 95 | 230 |
57
| 2021 | 150 | 100 | 250 |
68
| 2022 | 170 | 110 | 280 |
79
| 2023 | 160 | 120 | 280 |
8-
| 2024 | 180 | 130 | 310 |
10+
| 2024 | 180 | 130 | 310 |
11+
12+
## Duck Chart

0 commit comments

Comments
 (0)