Skip to content

Commit 081779c

Browse files
committed
feat: add table support in chunker, incl. captions
Signed-off-by: Panos Vagenas <[email protected]>
1 parent c118afe commit 081779c

File tree

3 files changed

+334
-8
lines changed

3 files changed

+334
-8
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import logging
1111
from typing import Any, ClassVar, Iterator, Optional
1212

13+
from pandas import DataFrame
1314
from pydantic import BaseModel, Field, conlist
1415

1516
from docling_core.transforms.chunker import BaseChunker
@@ -20,15 +21,14 @@
2021
LevelNumber,
2122
ListItem,
2223
SectionHeaderItem,
24+
TableItem,
2325
TextItem,
2426
)
2527
from docling_core.types.doc.labels import DocItemLabel
2628

27-
_KEY_PATHS = "paths"
28-
_KEY_PROVS = "provs"
29-
_KEY_HEADINGS = "headings"
30-
3129
_KEY_DOC_ITEMS = "doc_items"
30+
_KEY_HEADINGS = "headings"
31+
_KEY_CAPTIONS = "captions"
3232

3333
_logger = logging.getLogger(__name__)
3434

@@ -38,13 +38,16 @@ class ChunkMeta(BaseModel):
3838

3939
# TODO align paths type with _JSON_POINTER_REGEX
4040
doc_items: conlist(DocItem, min_length=1) = Field( # type: ignore
41-
default=None,
4241
alias=_KEY_DOC_ITEMS,
4342
)
4443
headings: Optional[conlist(str, min_length=1)] = Field( # type: ignore
4544
default=None,
4645
alias=_KEY_HEADINGS,
4746
)
47+
captions: Optional[conlist(str, min_length=1)] = Field( # type: ignore
48+
default=None,
49+
alias=_KEY_CAPTIONS,
50+
)
4851

4952
excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
5053
excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
@@ -78,6 +81,28 @@ class HierarchicalChunker(BaseChunker):
7881
merge_list_items: bool = True
7982
delim: str = "\n"
8083

84+
@classmethod
85+
def _triplet_serialize(cls, table_df: DataFrame) -> str:
86+
87+
# copy header as first row and shift all rows by one
88+
table_df.loc[-1] = table_df.columns # type: ignore[call-overload]
89+
table_df.index = table_df.index + 1
90+
table_df = table_df.sort_index()
91+
92+
rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
93+
cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
94+
95+
nrows = table_df.shape[0]
96+
ncols = table_df.shape[1]
97+
texts = [
98+
f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
99+
for i in range(1, nrows)
100+
for j in range(1, ncols)
101+
]
102+
output_text = ". ".join(texts)
103+
104+
return output_text
105+
81106
def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
82107
r"""Chunk the provided document.
83108
@@ -90,9 +115,10 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
90115
heading_by_level: dict[LevelNumber, str] = {}
91116
list_items: list[TextItem] = []
92117
for item, level in dl_doc.iterate_items():
93-
118+
captions = None
94119
if isinstance(item, DocItem):
95120

121+
# first handle any merging needed
96122
if self.merge_list_items:
97123
if isinstance(
98124
item, ListItem
@@ -136,14 +162,24 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
136162
(not self.merge_list_items) and isinstance(item, ListItem)
137163
):
138164
text = item.text
165+
elif isinstance(item, TableItem):
166+
table_df = item.export_to_dataframe()
167+
if table_df.shape[0] < 1 or table_df.shape[1] < 2:
168+
# at least two cols needed, as first column contains row headers
169+
continue
170+
text = self._triplet_serialize(table_df=table_df)
171+
captions = [
172+
c.text for c in [r.resolve(dl_doc) for r in item.captions]
173+
] or None
139174
else:
140-
continue # TODO refine to ignore some cases & raise otherwise?
175+
continue
141176
c = Chunk(
142177
text=text,
143178
meta=ChunkMeta(
144179
doc_items=[item],
145180
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
146181
or None,
182+
captions=captions,
147183
),
148184
)
149185
yield c

0 commit comments

Comments
 (0)