Skip to content

Commit 34e9875

Browse files
committed
add change and updated test data
Signed-off-by: Panos Vagenas <[email protected]>
1 parent b16f835 commit 34e9875

File tree

3 files changed

+145
-49
lines changed

3 files changed

+145
-49
lines changed

docling/backend/md_backend.py

Lines changed: 129 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
import re
33
import warnings
44
from copy import deepcopy
5+
from enum import Enum
56
from io import BytesIO
67
from pathlib import Path
7-
from typing import List, Optional, Set, Union
8+
from typing import List, Literal, Optional, Set, Union
89

910
import marko
1011
import marko.element
@@ -21,7 +22,8 @@
2122
)
2223
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
2324
from marko import Markdown
24-
from pydantic import AnyUrl, TypeAdapter
25+
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
26+
from typing_extensions import Annotated
2527

2628
from docling.backend.abstract_backend import DeclarativeDocumentBackend
2729
from docling.backend.html_backend import HTMLDocumentBackend
@@ -35,6 +37,31 @@
3537
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
3638

3739

40+
class _PendingCreationType(str, Enum):
    """Kinds of document item whose creation can be pending.

    The values match the ``kind`` discriminators of the creation payload
    models (``"heading"`` and ``"list_item"``).
    """

    HEADING = "heading"
    LIST_ITEM = "list_item"
45+
46+
47+
class _HeadingCreationPayload(BaseModel):
    """Pending-creation record for a heading item."""

    # Discriminator tag; fixed to "heading".
    kind: Literal["heading"] = "heading"
    # Heading level, later handed to the heading-creation helper.
    level: int
50+
51+
52+
class _ListItemCreationPayload(BaseModel):
    """Pending-creation record for a list item; carries only its tag."""

    # Discriminator tag; fixed to "list_item".
    kind: Literal["list_item"] = "list_item"
54+
55+
56+
# Union of the pending-creation payload models; the ``kind`` field is declared
# as the discriminator that tells the member models apart.
_CreationPayload = Annotated[
    Union[
        _HeadingCreationPayload,
        _ListItemCreationPayload,
    ],
    Field(discriminator="kind"),
]
63+
64+
3865
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
3966
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
4067
# This regex will match any sequence of underscores
@@ -155,13 +182,62 @@ def _close_table(self, doc: DoclingDocument):
155182
doc.add_table(data=table_data)
156183
return
157184

185+
def _create_list_item(
    self,
    doc: DoclingDocument,
    parent_item: Optional[NodeItem],
    text: str,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
    """Add a list item to ``doc`` and return it.

    When ``parent_item`` is neither an ``OrderedList`` nor an
    ``UnorderedList``, a warning is logged and a new unordered list is
    created under ``parent_item`` to hold the item. The item is marked as
    enumerated only when its list parent is an ``OrderedList``.

    Args:
        doc: Document the item is added to.
        parent_item: Intended parent node; may be ``None``.
        text: Text of the list item.
        formatting: Optional formatting forwarded to ``doc.add_list_item``.
        hyperlink: Optional hyperlink forwarded to ``doc.add_list_item``.

    Returns:
        Whatever ``doc.add_list_item`` returns for the new item.
    """
    list_parent = parent_item
    if not isinstance(list_parent, (OrderedList, UnorderedList)):
        _log.warning("ListItem would have not had a list parent, adding one.")
        list_parent = doc.add_unordered_list(parent=list_parent)

    # Evaluated after the fallback above so a freshly added unordered list
    # yields a non-enumerated item.
    is_ordered = isinstance(list_parent, OrderedList)
    return doc.add_list_item(
        text=text,
        enumerated=is_ordered,
        parent=list_parent,
        formatting=formatting,
        hyperlink=hyperlink,
    )
204+
205+
def _create_heading_item(
    self,
    doc: DoclingDocument,
    parent_item: Optional[NodeItem],
    text: str,
    level: int,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
    """Add a title or heading to ``doc`` and return it.

    A ``level`` of 1 is added through ``doc.add_title``; any other level is
    added through ``doc.add_heading`` with ``level - 1``.

    Args:
        doc: Document the item is added to.
        parent_item: Parent node for the new item; may be ``None``.
        text: Text of the title/heading.
        level: Heading level supplied by the caller.
        formatting: Optional formatting forwarded to the document call.
        hyperlink: Optional hyperlink forwarded to the document call.

    Returns:
        Whatever ``doc.add_title`` / ``doc.add_heading`` returns.
    """
    shared_kwargs = dict(
        text=text,
        parent=parent_item,
        formatting=formatting,
        hyperlink=hyperlink,
    )
    if level == 1:
        return doc.add_title(**shared_kwargs)
    return doc.add_heading(level=level - 1, **shared_kwargs)
230+
158231
def _iterate_elements( # noqa: C901
159232
self,
160233
*,
161234
element: marko.element.Element,
162235
depth: int,
163236
doc: DoclingDocument,
164237
visited: Set[marko.element.Element],
238+
creation_stack: list[
239+
_CreationPayload
240+
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
165241
parent_item: Optional[NodeItem] = None,
166242
formatting: Optional[Formatting] = None,
167243
hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -177,28 +253,17 @@ def _iterate_elements( # noqa: C901
177253
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
178254
)
179255

180-
if len(element.children) == 1:
181-
child = element.children[0]
182-
snippet_text = str(child.children) # type: ignore
183-
visited.add(child)
184-
else:
185-
snippet_text = "" # inline group will be created
186-
187-
if element.level == 1:
188-
parent_item = doc.add_title(
189-
text=snippet_text,
190-
parent=parent_item,
256+
if len(element.children) > 1: # inline group will be created further down
257+
parent_item = self._create_heading_item(
258+
doc=doc,
259+
parent_item=parent_item,
260+
text="",
261+
level=element.level,
191262
formatting=formatting,
192263
hyperlink=hyperlink,
193264
)
194265
else:
195-
parent_item = doc.add_heading(
196-
text=snippet_text,
197-
level=element.level - 1,
198-
parent=parent_item,
199-
formatting=formatting,
200-
hyperlink=hyperlink,
201-
)
266+
creation_stack.append(_HeadingCreationPayload(level=element.level))
202267

203268
elif isinstance(element, marko.block.List):
204269
has_non_empty_list_items = False
@@ -224,22 +289,16 @@ def _iterate_elements( # noqa: C901
224289
self._close_table(doc)
225290
_log.debug(" - List item")
226291

227-
if len(child.children) == 1:
228-
snippet_text = str(child.children[0].children) # type: ignore
229-
visited.add(child)
292+
if len(child.children) > 1: # inline group will be created further down
293+
parent_item = self._create_list_item(
294+
doc=doc,
295+
parent_item=parent_item,
296+
text="",
297+
formatting=formatting,
298+
hyperlink=hyperlink,
299+
)
230300
else:
231-
snippet_text = "" # inline group will be created
232-
is_numbered = isinstance(parent_item, OrderedList)
233-
if not isinstance(parent_item, (OrderedList, UnorderedList)):
234-
_log.warning("ListItem would have not had a list parent, adding one.")
235-
parent_item = doc.add_unordered_list(parent=parent_item)
236-
parent_item = doc.add_list_item(
237-
enumerated=is_numbered,
238-
parent=parent_item,
239-
text=snippet_text,
240-
formatting=formatting,
241-
hyperlink=hyperlink,
242-
)
301+
creation_stack.append(_ListItemCreationPayload())
243302

244303
elif isinstance(element, marko.inline.Image):
245304
self._close_table(doc)
@@ -285,13 +344,38 @@ def _iterate_elements( # noqa: C901
285344
self.md_table_buffer.append(snippet_text)
286345
elif snippet_text:
287346
self._close_table(doc)
288-
doc.add_text(
289-
label=DocItemLabel.TEXT,
290-
parent=parent_item,
291-
text=snippet_text,
292-
formatting=formatting,
293-
hyperlink=hyperlink,
294-
)
347+
348+
if creation_stack:
349+
while len(creation_stack) > 0:
350+
to_create = creation_stack.pop()
351+
if isinstance(to_create, _ListItemCreationPayload):
352+
parent_item = self._create_list_item(
353+
doc=doc,
354+
parent_item=parent_item,
355+
text=snippet_text,
356+
formatting=formatting,
357+
hyperlink=hyperlink,
358+
)
359+
elif isinstance(to_create, _HeadingCreationPayload):
360+
# not keeping as parent_item as logic for correctly tracking
361+
# that is not implemented yet (section components not captured
362+
# as heading children in marko)
363+
self._create_heading_item(
364+
doc=doc,
365+
parent_item=parent_item,
366+
text=snippet_text,
367+
level=to_create.level,
368+
formatting=formatting,
369+
hyperlink=hyperlink,
370+
)
371+
else:
372+
doc.add_text(
373+
label=DocItemLabel.TEXT,
374+
parent=parent_item,
375+
text=snippet_text,
376+
formatting=formatting,
377+
hyperlink=hyperlink,
378+
)
295379

296380
elif isinstance(element, marko.inline.CodeSpan):
297381
self._close_table(doc)
@@ -353,7 +437,6 @@ def _iterate_elements( # noqa: C901
353437
parent_item = doc.add_inline_group(parent=parent_item)
354438

355439
processed_block_types = (
356-
# marko.block.Heading,
357440
marko.block.CodeBlock,
358441
marko.block.FencedCode,
359442
marko.inline.RawText,
@@ -369,6 +452,7 @@ def _iterate_elements( # noqa: C901
369452
depth=depth + 1,
370453
doc=doc,
371454
visited=visited,
455+
creation_stack=creation_stack,
372456
parent_item=parent_item,
373457
formatting=formatting,
374458
hyperlink=hyperlink,
@@ -405,13 +489,15 @@ def convert(self) -> DoclingDocument:
405489
# Parse the markdown into an abstract syntax tree (AST)
406490
marko_parser = Markdown()
407491
parsed_ast = marko_parser.parse(self.markdown)
492+
print(f"{parsed_ast=}")
408493
# Start iterating from the root of the AST
409494
self._iterate_elements(
410495
element=parsed_ast,
411496
depth=0,
412497
doc=doc,
413498
parent_item=None,
414499
visited=set(),
500+
creation_stack=[],
415501
)
416502
self._close_table(doc=doc) # handle any last hanging table
417503

tests/data/groundtruth/docling_v2/inline_and_formatting.md.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` .
1111
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
1212
4. Push to the branch ( `git push origin feature/AmazingFeature` )
1313
5. Open a Pull Request
14-
6. [&lt;RawText children='Whole list item has same formatting'&gt;]
14+
6. **Whole list item has same formatting**
1515
7. List item has *mixed or partial* formatting
1616

17-
# [&lt;RawText children='Whole heading is italic'&gt;]
17+
*# Whole heading is italic*
1818

1919
Bar
2020

tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -424,14 +424,19 @@ texts:
424424
- children: []
425425
content_layer: body
426426
enumerated: true
427+
formatting:
428+
bold: true
429+
italic: false
430+
strikethrough: false
431+
underline: false
427432
label: list_item
428433
marker: '-'
429-
orig: '[<RawText children=''Whole list item has same formatting''>]'
434+
orig: Whole list item has same formatting
430435
parent:
431436
$ref: '#/groups/2'
432437
prov: []
433438
self_ref: '#/texts/27'
434-
text: '[<RawText children=''Whole list item has same formatting''>]'
439+
text: Whole list item has same formatting
435440
- children:
436441
- $ref: '#/groups/7'
437442
content_layer: body
@@ -478,13 +483,18 @@ texts:
478483
text: formatting
479484
- children: []
480485
content_layer: body
486+
formatting:
487+
bold: false
488+
italic: true
489+
strikethrough: false
490+
underline: false
481491
label: title
482-
orig: '[<RawText children=''Whole heading is italic''>]'
492+
orig: Whole heading is italic
483493
parent:
484494
$ref: '#/body'
485495
prov: []
486496
self_ref: '#/texts/32'
487-
text: '[<RawText children=''Whole heading is italic''>]'
497+
text: Whole heading is italic
488498
- children: []
489499
content_layer: body
490500
label: text

0 commit comments

Comments
 (0)