 from typing import Any, ClassVar, Iterator, Optional
 
 from pandas import DataFrame
-from pydantic import BaseModel, Field, conlist
+from pydantic import Field
 
-from docling_core.transforms.chunker import BaseChunker
-from docling_core.transforms.chunker.base import BaseChunk
+from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
 from docling_core.types.doc import DoclingDocument as DLDocument
 from docling_core.types.doc.document import (
     DocItem,
@@ -33,50 +32,42 @@
 _logger = logging.getLogger(__name__)
 
 
-class ChunkMeta(BaseModel):
-    """Data model for specific chunk metadata."""
+class DocMeta(BaseMeta):
+    """Data model for Hierarchical Chunker metadata."""
 
-    # TODO align paths type with _JSON_POINTER_REGEX
-    doc_items: conlist(DocItem, min_length=1) = Field(  # type: ignore
+    doc_items: list[DocItem] = Field(
         alias=_KEY_DOC_ITEMS,
+        min_length=1,
     )
-    headings: Optional[conlist(str, min_length=1)] = Field(  # type: ignore
+    headings: Optional[list[str]] = Field(
         default=None,
         alias=_KEY_HEADINGS,
+        min_length=1,
     )
-    captions: Optional[conlist(str, min_length=1)] = Field(  # type: ignore
+    captions: Optional[list[str]] = Field(
         default=None,
         alias=_KEY_CAPTIONS,
+        min_length=1,
    )
 
     excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
     excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
 
-    def export_json_dict(self) -> dict[str, Any]:
-        """Helper method for exporting non-None keys to JSON mode.
-
-        Returns:
-            dict[str, Any]: The exported dictionary.
-        """
-        return self.model_dump(mode="json", by_alias=True, exclude_none=True)
 
+class DocChunk(BaseChunk):
+    """Data model for Hierarchical Chunker chunks."""
 
-class Chunk(BaseChunk):
-    """Data model for specific chunk."""
-
-    meta: ChunkMeta
-
-    def export_json_dict(self) -> dict[str, Any]:
-        """Helper method for exporting non-None keys to JSON mode.
-
-        Returns:
-            dict[str, Any]: The exported dictionary.
-        """
-        return self.model_dump(mode="json", by_alias=True, exclude_none=True)
+    meta: DocMeta
 
 
 class HierarchicalChunker(BaseChunker):
-    """Chunker implementation leveraging the document layout."""
+    r"""Chunker implementation leveraging the document layout.
+
+    Args:
+        merge_list_items (bool): Whether to merge successive list items.
+            Defaults to True.
+        delim (str): Delimiter to use for merging text. Defaults to "\n".
+    """
 
     merge_list_items: bool = True
     delim: str = "\n"
@@ -129,9 +120,9 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
                         list_items.append(item)
                         continue
                     elif list_items:  # need to yield
-                        yield Chunk(
+                        yield DocChunk(
                             text=self.delim.join([i.text for i in list_items]),
-                            meta=ChunkMeta(
+                            meta=DocMeta(
                                 doc_items=list_items,
                                 headings=[
                                     heading_by_level[k]
@@ -148,7 +139,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
                     isinstance(item, TextItem)
                     and item.label == DocItemLabel.SECTION_HEADER
                 ):
-                    # TODO second branch not needed after cleanup above:
+                    # TODO second branch not needed once cleanup above complete:
                     level = item.level if isinstance(item, SectionHeaderItem) else 1
                     heading_by_level[level] = item.text
 
@@ -173,9 +164,9 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
                     ] or None
                 else:
                     continue
-                c = Chunk(
+                c = DocChunk(
                     text=text,
-                    meta=ChunkMeta(
+                    meta=DocMeta(
                         doc_items=[item],
                         headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                         or None,
@@ -185,9 +176,9 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
                 yield c
 
         if self.merge_list_items and list_items:  # need to yield
-            yield Chunk(
+            yield DocChunk(
                 text=self.delim.join([i.text for i in list_items]),
-                meta=ChunkMeta(
+                meta=DocMeta(
                     doc_items=list_items,
                     headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                     or None,
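
For context, a minimal usage sketch of the renamed API follows; it is not part of this diff. It assumes HierarchicalChunker is re-exported from docling_core.transforms.chunker and that "doc.json" (a hypothetical path) holds a DoclingDocument JSON export.

# Usage sketch only; the import location and file path below are assumptions.
from pathlib import Path

from docling_core.transforms.chunker import HierarchicalChunker  # assumed re-export
from docling_core.types.doc import DoclingDocument

# "doc.json" is a hypothetical DoclingDocument JSON export.
doc = DoclingDocument.model_validate_json(Path("doc.json").read_text())

chunker = HierarchicalChunker(merge_list_items=True, delim="\n")
for chunk in chunker.chunk(dl_doc=doc):
    # Each chunk is a DocChunk; its DocMeta carries doc_items, headings and captions.
    print(chunk.text)
    print(chunk.meta.headings, chunk.meta.captions)

The pydantic change above (plain list[...] annotations with Field(min_length=1) instead of conlist) keeps the same minimum-length validation while removing the need for the # type: ignore comments.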