Skip to content

Commit 081779c

Browse files
committed
feat: add table support in chunker, incl. captions
Signed-off-by: Panos Vagenas <[email protected]>
1 parent c118afe commit 081779c

File tree

3 files changed

+334
-8
lines changed

3 files changed

+334
-8
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import logging
1111
from typing import Any, ClassVar, Iterator, Optional
1212

13+
from pandas import DataFrame
1314
from pydantic import BaseModel, Field, conlist
1415

1516
from docling_core.transforms.chunker import BaseChunker
@@ -20,15 +21,14 @@
2021
LevelNumber,
2122
ListItem,
2223
SectionHeaderItem,
24+
TableItem,
2325
TextItem,
2426
)
2527
from docling_core.types.doc.labels import DocItemLabel
2628

27-
_KEY_PATHS = "paths"
28-
_KEY_PROVS = "provs"
29-
_KEY_HEADINGS = "headings"
30-
3129
_KEY_DOC_ITEMS = "doc_items"
30+
_KEY_HEADINGS = "headings"
31+
_KEY_CAPTIONS = "captions"
3232

3333
_logger = logging.getLogger(__name__)
3434

@@ -38,13 +38,16 @@ class ChunkMeta(BaseModel):
3838

3939
# TODO align paths type with _JSON_POINTER_REGEX
4040
doc_items: conlist(DocItem, min_length=1) = Field( # type: ignore
41-
default=None,
4241
alias=_KEY_DOC_ITEMS,
4342
)
4443
headings: Optional[conlist(str, min_length=1)] = Field( # type: ignore
4544
default=None,
4645
alias=_KEY_HEADINGS,
4746
)
47+
captions: Optional[conlist(str, min_length=1)] = Field( # type: ignore
48+
default=None,
49+
alias=_KEY_CAPTIONS,
50+
)
4851

4952
excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
5053
excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
@@ -78,6 +81,28 @@ class HierarchicalChunker(BaseChunker):
7881
merge_list_items: bool = True
7982
delim: str = "\n"
8083

84+
@classmethod
85+
def _triplet_serialize(cls, table_df: DataFrame) -> str:
86+
87+
# copy header as first row and shift all rows by one
88+
table_df.loc[-1] = table_df.columns # type: ignore[call-overload]
89+
table_df.index = table_df.index + 1
90+
table_df = table_df.sort_index()
91+
92+
rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
93+
cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
94+
95+
nrows = table_df.shape[0]
96+
ncols = table_df.shape[1]
97+
texts = [
98+
f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
99+
for i in range(1, nrows)
100+
for j in range(1, ncols)
101+
]
102+
output_text = ". ".join(texts)
103+
104+
return output_text
105+
81106
def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
82107
r"""Chunk the provided document.
83108
@@ -90,9 +115,10 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
90115
heading_by_level: dict[LevelNumber, str] = {}
91116
list_items: list[TextItem] = []
92117
for item, level in dl_doc.iterate_items():
93-
118+
captions = None
94119
if isinstance(item, DocItem):
95120

121+
# first handle any merging needed
96122
if self.merge_list_items:
97123
if isinstance(
98124
item, ListItem
@@ -136,14 +162,24 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
136162
(not self.merge_list_items) and isinstance(item, ListItem)
137163
):
138164
text = item.text
165+
elif isinstance(item, TableItem):
166+
table_df = item.export_to_dataframe()
167+
if table_df.shape[0] < 1 or table_df.shape[1] < 2:
168+
# at least two cols needed, as first column contains row headers
169+
continue
170+
text = self._triplet_serialize(table_df=table_df)
171+
captions = [
172+
c.text for c in [r.resolve(dl_doc) for r in item.captions]
173+
] or None
139174
else:
140-
continue # TODO refine to ignore some cases & raise otherwise?
175+
continue
141176
c = Chunk(
142177
text=text,
143178
meta=ChunkMeta(
144179
doc_items=[item],
145180
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
146181
or None,
182+
captions=captions,
147183
),
148184
)
149185
yield c

0 commit comments

Comments
 (0)