Skip to content

Commit 70608f8

Browse files
committed
use Field constraints instead of conlist, refactor chunking types
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 081779c commit 70608f8

File tree

9 files changed

+63
-190
lines changed

9 files changed

+63
-190
lines changed

docling_core/transforms/chunker/__init__.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,5 @@
55

66
"""Define the chunker types."""
77

8-
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker # noqa
9-
from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
10-
Chunk,
11-
HierarchicalChunker,
12-
)
8+
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
9+
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker

docling_core/transforms/chunker/base.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,45 @@
55

66
"""Define base classes for chunking."""
77
from abc import ABC, abstractmethod
8-
from typing import Any, Iterator
8+
from typing import Any, ClassVar, Iterator
99

1010
from pydantic import BaseModel
1111

1212
from docling_core.types.doc import DoclingDocument as DLDocument
1313

1414

15+
class BaseMeta(BaseModel):
16+
"""Metadata base class."""
17+
18+
excluded_embed: ClassVar[list[str]] = []
19+
excluded_llm: ClassVar[list[str]] = []
20+
21+
def export_json_dict(self) -> dict[str, Any]:
22+
"""Helper method for exporting non-None keys to JSON mode.
23+
24+
Returns:
25+
dict[str, Any]: The exported dictionary.
26+
"""
27+
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
28+
29+
1530
class BaseChunk(BaseModel):
16-
"""Data model for base chunk."""
31+
"""Chunk base class."""
1732

1833
text: str
19-
meta: Any = None
34+
meta: BaseMeta
35+
36+
def export_json_dict(self) -> dict[str, Any]:
37+
"""Helper method for exporting non-None keys to JSON mode.
38+
39+
Returns:
40+
dict[str, Any]: The exported dictionary.
41+
"""
42+
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
2043

2144

2245
class BaseChunker(BaseModel, ABC):
23-
"""Base class for Chunker."""
46+
"""Chunker base class."""
2447

2548
@abstractmethod
2649
def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 27 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,9 @@
1111
from typing import Any, ClassVar, Iterator, Optional
1212

1313
from pandas import DataFrame
14-
from pydantic import BaseModel, Field, conlist
14+
from pydantic import Field
1515

16-
from docling_core.transforms.chunker import BaseChunker
17-
from docling_core.transforms.chunker.base import BaseChunk
16+
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
1817
from docling_core.types.doc import DoclingDocument as DLDocument
1918
from docling_core.types.doc.document import (
2019
DocItem,
@@ -33,50 +32,42 @@
3332
_logger = logging.getLogger(__name__)
3433

3534

36-
class ChunkMeta(BaseModel):
37-
"""Data model for specific chunk metadata."""
35+
class DocMeta(BaseMeta):
36+
"""Data model for Hierarchical Chunker metadata."""
3837

39-
# TODO align paths type with _JSON_POINTER_REGEX
40-
doc_items: conlist(DocItem, min_length=1) = Field( # type: ignore
38+
doc_items: list[DocItem] = Field(
4139
alias=_KEY_DOC_ITEMS,
40+
min_length=1,
4241
)
43-
headings: Optional[conlist(str, min_length=1)] = Field( # type: ignore
42+
headings: Optional[list[str]] = Field(
4443
default=None,
4544
alias=_KEY_HEADINGS,
45+
min_length=1,
4646
)
47-
captions: Optional[conlist(str, min_length=1)] = Field( # type: ignore
47+
captions: Optional[list[str]] = Field(
4848
default=None,
4949
alias=_KEY_CAPTIONS,
50+
min_length=1,
5051
)
5152

5253
excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
5354
excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
5455

55-
def export_json_dict(self) -> dict[str, Any]:
56-
"""Helper method for exporting non-None keys to JSON mode.
57-
58-
Returns:
59-
dict[str, Any]: The exported dictionary.
60-
"""
61-
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
6256

57+
class DocChunk(BaseChunk):
58+
"""Data model for Hierarchical Chunker chunks."""
6359

64-
class Chunk(BaseChunk):
65-
"""Data model for specific chunk."""
66-
67-
meta: ChunkMeta
68-
69-
def export_json_dict(self) -> dict[str, Any]:
70-
"""Helper method for exporting non-None keys to JSON mode.
71-
72-
Returns:
73-
dict[str, Any]: The exported dictionary.
74-
"""
75-
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
60+
meta: DocMeta
7661

7762

7863
class HierarchicalChunker(BaseChunker):
79-
"""Chunker implementation leveraging the document layout."""
64+
r"""Chunker implementation leveraging the document layout.
65+
66+
Args:
67+
merge_list_items (bool): Whether to merge successive list items.
68+
Defaults to True.
69+
delim (str): Delimiter to use for merging text. Defaults to "\n".
70+
"""
8071

8172
merge_list_items: bool = True
8273
delim: str = "\n"
@@ -129,9 +120,9 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
129120
list_items.append(item)
130121
continue
131122
elif list_items: # need to yield
132-
yield Chunk(
123+
yield DocChunk(
133124
text=self.delim.join([i.text for i in list_items]),
134-
meta=ChunkMeta(
125+
meta=DocMeta(
135126
doc_items=list_items,
136127
headings=[
137128
heading_by_level[k]
@@ -148,7 +139,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
148139
isinstance(item, TextItem)
149140
and item.label == DocItemLabel.SECTION_HEADER
150141
):
151-
# TODO second branch not needed after cleanup above:
142+
# TODO second branch not needed once cleanup above complete:
152143
level = item.level if isinstance(item, SectionHeaderItem) else 1
153144
heading_by_level[level] = item.text
154145

@@ -173,9 +164,9 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
173164
] or None
174165
else:
175166
continue
176-
c = Chunk(
167+
c = DocChunk(
177168
text=text,
178-
meta=ChunkMeta(
169+
meta=DocMeta(
179170
doc_items=[item],
180171
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
181172
or None,
@@ -185,9 +176,9 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
185176
yield c
186177

187178
if self.merge_list_items and list_items: # need to yield
188-
yield Chunk(
179+
yield DocChunk(
189180
text=self.delim.join([i.text for i in list_items]),
190-
meta=ChunkMeta(
181+
meta=DocMeta(
191182
doc_items=list_items,
192183
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
193184
or None,

docling_core/transforms/id_generator/__init__.py

Lines changed: 0 additions & 9 deletions
This file was deleted.

docling_core/transforms/id_generator/base.py

Lines changed: 0 additions & 30 deletions
This file was deleted.

docling_core/transforms/id_generator/uuid_generator.py

Lines changed: 0 additions & 34 deletions
This file was deleted.

docling_core/transforms/metadata_extractor/__init__.py

Lines changed: 0 additions & 10 deletions
This file was deleted.

docling_core/transforms/metadata_extractor/base.py

Lines changed: 0 additions & 59 deletions
This file was deleted.

test/test_hierarchical_chunker.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import json
77

88
from docling_core.transforms.chunker import HierarchicalChunker
9-
from docling_core.transforms.chunker.hierarchical_chunker import Chunk
9+
from docling_core.transforms.chunker.hierarchical_chunker import DocChunk
1010
from docling_core.types.doc import DoclingDocument as DLDocument
1111

1212

@@ -18,7 +18,9 @@ def test_chunk_merge_list_items():
1818
merge_list_items=True,
1919
)
2020
chunks = chunker.chunk(dl_doc=dl_doc)
21-
act_data = dict(root=[Chunk.model_validate(n).export_json_dict() for n in chunks])
21+
act_data = dict(
22+
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
23+
)
2224
with open("test/data/chunker/0_out_chunks.json") as f:
2325
exp_data = json.load(fp=f)
2426
assert exp_data == act_data
@@ -32,7 +34,9 @@ def test_chunk_no_merge_list_items():
3234
merge_list_items=False,
3335
)
3436
chunks = chunker.chunk(dl_doc=dl_doc)
35-
act_data = dict(root=[Chunk.model_validate(n).export_json_dict() for n in chunks])
37+
act_data = dict(
38+
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
39+
)
3640
with open("test/data/chunker/1_out_chunks.json") as f:
3741
exp_data = json.load(fp=f)
3842
assert exp_data == act_data

0 commit comments

Comments
 (0)