22import re
33import warnings
44from copy import deepcopy
5+ from enum import Enum
56from io import BytesIO
67from pathlib import Path
7- from typing import List , Optional , Set , Union
8+ from typing import List , Literal , Optional , Set , Union
89
910import marko
1011import marko .element
2122)
2223from docling_core .types .doc .document import Formatting , OrderedList , UnorderedList
2324from marko import Markdown
24- from pydantic import AnyUrl , TypeAdapter
25+ from pydantic import AnyUrl , BaseModel , Field , TypeAdapter
26+ from typing_extensions import Annotated
2527
2628from docling .backend .abstract_backend import DeclarativeDocumentBackend
2729from docling .backend .html_backend import HTMLDocumentBackend
3537_STOP_MARKER = f"#_#_{ _MARKER_BODY } _STOP_#_#"
3638
3739
class _PendingCreationType(str, Enum):
    """Kinds of document items whose creation can be deferred.

    The member values match the ``kind`` literals used by the creation
    payload models defined below ("heading" and "list_item").
    """

    HEADING = "heading"
    LIST_ITEM = "list_item"
45+
46+
class _HeadingCreationPayload(BaseModel):
    """Pending request to create a heading once its text is known."""

    # Discriminator tag; fixed to "heading" for this payload type.
    kind: Literal["heading"] = "heading"
    # Heading level as reported by the Markdown parser (1 is the top level).
    level: int
50+
51+
class _ListItemCreationPayload(BaseModel):
    """Pending request to create a list item once its text is known."""

    # Discriminator tag; fixed to "list_item" for this payload type.
    kind: Literal["list_item"] = "list_item"
54+
55+
# Tagged union of all pending-creation payloads; pydantic selects the concrete
# model by inspecting the ``kind`` field.
_CreationPayload = Annotated[
    Union[_HeadingCreationPayload, _ListItemCreationPayload],
    Field(discriminator="kind"),
]
63+
64+
3865class MarkdownDocumentBackend (DeclarativeDocumentBackend ):
3966 def _shorten_underscore_sequences (self , markdown_text : str , max_length : int = 10 ):
4067 # This regex will match any sequence of underscores
@@ -155,13 +182,62 @@ def _close_table(self, doc: DoclingDocument):
155182 doc .add_table (data = table_data )
156183 return
157184
def _create_list_item(
    self,
    doc: DoclingDocument,
    parent_item: Optional[NodeItem],
    text: str,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
    """Add a list item to ``doc`` and return it.

    If ``parent_item`` is not an ordered or unordered list, a warning is
    logged and a new unordered list is created under ``parent_item`` to hold
    the item.

    Args:
        doc: Document the item is added to.
        parent_item: Intended parent node; may be ``None``.
        text: Text of the list item.
        formatting: Optional formatting forwarded to the new item.
        hyperlink: Optional hyperlink forwarded to the new item.

    Returns:
        The item returned by ``doc.add_list_item``.
    """
    if isinstance(parent_item, (OrderedList, UnorderedList)):
        list_parent = parent_item
    else:
        _log.warning("ListItem would have not had a list parent, adding one.")
        list_parent = doc.add_unordered_list(parent=parent_item)

    # The item is enumerated only when its (possibly new) parent is ordered.
    return doc.add_list_item(
        text=text,
        enumerated=isinstance(list_parent, OrderedList),
        parent=list_parent,
        formatting=formatting,
        hyperlink=hyperlink,
    )
204+
def _create_heading_item(
    self,
    doc: DoclingDocument,
    parent_item: Optional[NodeItem],
    text: str,
    level: int,
    formatting: Optional[Formatting] = None,
    hyperlink: Optional[Union[AnyUrl, Path]] = None,
):
    """Add a title or heading to ``doc`` and return it.

    A ``level`` of 1 is added through ``doc.add_title``; any other level is
    added through ``doc.add_heading`` with the level shifted down by one.

    Args:
        doc: Document the item is added to.
        parent_item: Parent node for the new item; may be ``None``.
        text: Text of the heading.
        level: Heading level as given by the caller (1 produces a title).
        formatting: Optional formatting forwarded to the new item.
        hyperlink: Optional hyperlink forwarded to the new item.

    Returns:
        The item returned by ``doc.add_title`` or ``doc.add_heading``.
    """
    if level != 1:
        # Level 1 is reserved for the title, so headings are offset by one.
        return doc.add_heading(
            text=text,
            level=level - 1,
            parent=parent_item,
            formatting=formatting,
            hyperlink=hyperlink,
        )

    return doc.add_title(
        text=text,
        parent=parent_item,
        formatting=formatting,
        hyperlink=hyperlink,
    )
230+
158231 def _iterate_elements ( # noqa: C901
159232 self ,
160233 * ,
161234 element : marko .element .Element ,
162235 depth : int ,
163236 doc : DoclingDocument ,
164237 visited : Set [marko .element .Element ],
238+ creation_stack : list [
239+ _CreationPayload
240+ ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
165241 parent_item : Optional [NodeItem ] = None ,
166242 formatting : Optional [Formatting ] = None ,
167243 hyperlink : Optional [Union [AnyUrl , Path ]] = None ,
@@ -177,28 +253,17 @@ def _iterate_elements( # noqa: C901
177253 f" - Heading level { element .level } , content: { element .children [0 ].children } " # type: ignore
178254 )
179255
180- if len (element .children ) == 1 :
181- child = element .children [0 ]
182- snippet_text = str (child .children ) # type: ignore
183- visited .add (child )
184- else :
185- snippet_text = "" # inline group will be created
186-
187- if element .level == 1 :
188- parent_item = doc .add_title (
189- text = snippet_text ,
190- parent = parent_item ,
256+ if len (element .children ) > 1 : # inline group will be created further down
257+ parent_item = self ._create_heading_item (
258+ doc = doc ,
259+ parent_item = parent_item ,
260+ text = "" ,
261+ level = element .level ,
191262 formatting = formatting ,
192263 hyperlink = hyperlink ,
193264 )
194265 else :
195- parent_item = doc .add_heading (
196- text = snippet_text ,
197- level = element .level - 1 ,
198- parent = parent_item ,
199- formatting = formatting ,
200- hyperlink = hyperlink ,
201- )
266+ creation_stack .append (_HeadingCreationPayload (level = element .level ))
202267
203268 elif isinstance (element , marko .block .List ):
204269 has_non_empty_list_items = False
@@ -224,22 +289,16 @@ def _iterate_elements( # noqa: C901
224289 self ._close_table (doc )
225290 _log .debug (" - List item" )
226291
227- if len (child .children ) == 1 :
228- snippet_text = str (child .children [0 ].children ) # type: ignore
229- visited .add (child )
292+ if len (child .children ) > 1 : # inline group will be created further down
293+ parent_item = self ._create_list_item (
294+ doc = doc ,
295+ parent_item = parent_item ,
296+ text = "" ,
297+ formatting = formatting ,
298+ hyperlink = hyperlink ,
299+ )
230300 else :
231- snippet_text = "" # inline group will be created
232- is_numbered = isinstance (parent_item , OrderedList )
233- if not isinstance (parent_item , (OrderedList , UnorderedList )):
234- _log .warning ("ListItem would have not had a list parent, adding one." )
235- parent_item = doc .add_unordered_list (parent = parent_item )
236- parent_item = doc .add_list_item (
237- enumerated = is_numbered ,
238- parent = parent_item ,
239- text = snippet_text ,
240- formatting = formatting ,
241- hyperlink = hyperlink ,
242- )
301+ creation_stack .append (_ListItemCreationPayload ())
243302
244303 elif isinstance (element , marko .inline .Image ):
245304 self ._close_table (doc )
@@ -285,13 +344,38 @@ def _iterate_elements( # noqa: C901
285344 self .md_table_buffer .append (snippet_text )
286345 elif snippet_text :
287346 self ._close_table (doc )
288- doc .add_text (
289- label = DocItemLabel .TEXT ,
290- parent = parent_item ,
291- text = snippet_text ,
292- formatting = formatting ,
293- hyperlink = hyperlink ,
294- )
347+
348+ if creation_stack :
349+ while len (creation_stack ) > 0 :
350+ to_create = creation_stack .pop ()
351+ if isinstance (to_create , _ListItemCreationPayload ):
352+ parent_item = self ._create_list_item (
353+ doc = doc ,
354+ parent_item = parent_item ,
355+ text = snippet_text ,
356+ formatting = formatting ,
357+ hyperlink = hyperlink ,
358+ )
359+ elif isinstance (to_create , _HeadingCreationPayload ):
360+ # not keeping as parent_item as logic for correctly tracking
361+ # that not implemented yet (section components not captured
362+ # as heading children in marko)
363+ self ._create_heading_item (
364+ doc = doc ,
365+ parent_item = parent_item ,
366+ text = snippet_text ,
367+ level = to_create .level ,
368+ formatting = formatting ,
369+ hyperlink = hyperlink ,
370+ )
371+ else :
372+ doc .add_text (
373+ label = DocItemLabel .TEXT ,
374+ parent = parent_item ,
375+ text = snippet_text ,
376+ formatting = formatting ,
377+ hyperlink = hyperlink ,
378+ )
295379
296380 elif isinstance (element , marko .inline .CodeSpan ):
297381 self ._close_table (doc )
@@ -353,7 +437,6 @@ def _iterate_elements( # noqa: C901
353437 parent_item = doc .add_inline_group (parent = parent_item )
354438
355439 processed_block_types = (
356- # marko.block.Heading,
357440 marko .block .CodeBlock ,
358441 marko .block .FencedCode ,
359442 marko .inline .RawText ,
@@ -369,6 +452,7 @@ def _iterate_elements( # noqa: C901
369452 depth = depth + 1 ,
370453 doc = doc ,
371454 visited = visited ,
455+ creation_stack = creation_stack ,
372456 parent_item = parent_item ,
373457 formatting = formatting ,
374458 hyperlink = hyperlink ,
@@ -405,13 +489,15 @@ def convert(self) -> DoclingDocument:
405489 # Parse the markdown into an abstract syntax tree (AST)
406490 marko_parser = Markdown ()
407491 parsed_ast = marko_parser .parse (self .markdown )
492+ print (f"{ parsed_ast = } " )
408493 # Start iterating from the root of the AST
409494 self ._iterate_elements (
410495 element = parsed_ast ,
411496 depth = 0 ,
412497 doc = doc ,
413498 parent_item = None ,
414499 visited = set (),
500+ creation_stack = [],
415501 )
416502 self ._close_table (doc = doc ) # handle any last hanging table
417503
0 commit comments