@@ -10,6 +10,7 @@
 import logging
 from typing import Any, ClassVar, Iterator, Optional
 
+from pandas import DataFrame
 from pydantic import BaseModel, Field, conlist
 
 from docling_core.transforms.chunker import BaseChunker
@@ -20,15 +21,14 @@
     LevelNumber,
     ListItem,
     SectionHeaderItem,
+    TableItem,
     TextItem,
 )
 from docling_core.types.doc.labels import DocItemLabel
 
-_KEY_PATHS = "paths"
-_KEY_PROVS = "provs"
-_KEY_HEADINGS = "headings"
-
 _KEY_DOC_ITEMS = "doc_items"
+_KEY_HEADINGS = "headings"
+_KEY_CAPTIONS = "captions"
 
 _logger = logging.getLogger(__name__)
 
@@ -38,13 +38,16 @@ class ChunkMeta(BaseModel):
 
     # TODO align paths type with _JSON_POINTER_REGEX
     doc_items: conlist(DocItem, min_length=1) = Field(  # type: ignore
-        default=None,
         alias=_KEY_DOC_ITEMS,
     )
     headings: Optional[conlist(str, min_length=1)] = Field(  # type: ignore
         default=None,
         alias=_KEY_HEADINGS,
     )
+    captions: Optional[conlist(str, min_length=1)] = Field(  # type: ignore
+        default=None,
+        alias=_KEY_CAPTIONS,
+    )
 
     excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
     excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
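The new `captions` field mirrors `headings`: optional, non-empty when set, and keyed by its alias. Since `doc_items` stays in both exclusion lists, headings and captions survive into the embedding/LLM views of the metadata while the heavyweight item references are dropped. A minimal sketch of that behavior, assuming pydantic v2's `model_dump` and an illustrative `table_item` (neither is part of this diff):

    meta = ChunkMeta(
        doc_items=[table_item],  # assumed: some DocItem, e.g. a TableItem
        headings=["4. Results"],
        captions=["Table 1: Accuracy per model"],
    )
    # one plausible way the exclusion ClassVars get applied downstream:
    embed_view = meta.model_dump(exclude=set(ChunkMeta.excluded_embed))
    # embed_view keeps headings/captions but not doc_items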
@@ -78,6 +81,28 @@ class HierarchicalChunker(BaseChunker):
     merge_list_items: bool = True
     delim: str = "\n"
 
+    @classmethod
+    def _triplet_serialize(cls, table_df: DataFrame) -> str:
+
+        # copy header as first row and shift all rows by one
+        table_df.loc[-1] = table_df.columns  # type: ignore[call-overload]
+        table_df.index = table_df.index + 1
+        table_df = table_df.sort_index()
+
+        rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
+        cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
+
+        nrows = table_df.shape[0]
+        ncols = table_df.shape[1]
+        texts = [
+            f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
+            for i in range(1, nrows)
+            for j in range(1, ncols)
+        ]
+        output_text = ". ".join(texts)
+
+        return output_text
+
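The shifting above is easier to follow on a concrete input: the column header is copied in as row 0, column 0 is treated as row headers, and every remaining cell becomes a "row, col = value" triplet. A small sketch with an invented table (note that the `.loc[-1]` and index assignments mutate the frame passed in, so hand the method a disposable copy):

    import pandas as pd

    df = pd.DataFrame({"Model": ["A", "B"], "Accuracy": [0.91, 0.87]})
    print(HierarchicalChunker._triplet_serialize(table_df=df))
    # A, Accuracy = 0.91. B, Accuracy = 0.87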
     def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
         r"""Chunk the provided document.
 
@@ -90,9 +115,10 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
         heading_by_level: dict[LevelNumber, str] = {}
         list_items: list[TextItem] = []
         for item, level in dl_doc.iterate_items():
-
+            captions = None
             if isinstance(item, DocItem):
 
+                # first handle any merging needed
                 if self.merge_list_items:
                     if isinstance(
                         item, ListItem
@@ -136,14 +162,24 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
                     (not self.merge_list_items) and isinstance(item, ListItem)
                 ):
                     text = item.text
+                elif isinstance(item, TableItem):
+                    table_df = item.export_to_dataframe()
+                    if table_df.shape[0] < 1 or table_df.shape[1] < 2:
+                        # at least two cols needed, as first column contains row headers
+                        continue
+                    text = self._triplet_serialize(table_df=table_df)
+                    captions = [
+                        c.text for c in [r.resolve(dl_doc) for r in item.captions]
+                    ] or None
                 else:
-                    continue  # TODO refine to ignore some cases & raise otherwise?
+                    continue
                 c = Chunk(
                     text=text,
                     meta=ChunkMeta(
                         doc_items=[item],
                         headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                         or None,
+                        captions=captions,
                     ),
                 )
                 yield c
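End to end, table items now yield chunks whose text is the triplet serialization and whose metadata carries any resolved captions. A rough usage sketch, assuming `HierarchicalChunker` is importable from this module and `dl_doc` is an already-parsed `DLDocument`:

    chunker = HierarchicalChunker(merge_list_items=True)
    for chunk in chunker.chunk(dl_doc=dl_doc):
        print(chunk.text)  # for tables: the "row, col = value" triplets
        if chunk.meta.captions:
            print("captions:", chunk.meta.captions)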