Skip to content

Commit 564bac5

Browse files
committed
feat: adapt hierarchical chunker to v2 DoclingDocument
[skip-ci] Signed-off-by: Panos Vagenas <[email protected]>
1 parent c28a040 commit 564bac5

20 files changed

+25275
-1082
lines changed

docling_core/transforms/chunker/__init__.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -5,11 +5,8 @@
55

66
"""Define the chunker types."""
77

8-
from docling_core.transforms.chunker.base import ( # noqa
9-
BaseChunker,
10-
Chunk,
11-
ChunkWithMetadata,
12-
)
8+
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker # noqa
139
from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
10+
Chunk,
1411
HierarchicalChunker,
1512
)

docling_core/transforms/chunker/base.py

Lines changed: 9 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -4,71 +4,35 @@
44
#
55

66
"""Define base classes for chunking."""
7-
import re
87
from abc import ABC, abstractmethod
9-
from typing import Final, Iterator, Optional
8+
from typing import Any, Iterator
109

11-
from pydantic import BaseModel, Field, field_validator
10+
from pydantic import BaseModel
1211

13-
from docling_core.types import BoundingBox, Document
14-
from docling_core.types.base import _JSON_POINTER_REGEX
12+
from docling_core.types.doc import DoclingDocument as DLDocument
1513

16-
# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
17-
_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
1814

15+
class BaseChunk(BaseModel):
16+
"""Data model for base chunk."""
1917

20-
def _create_path(pos: int, path_prefix: str = "main-text") -> str:
21-
return f"#/{path_prefix}/{pos}"
22-
23-
24-
class Chunk(BaseModel):
25-
"""Data model for Chunk."""
26-
27-
path: str = Field(pattern=_JSON_POINTER_REGEX)
2818
text: str
29-
heading: Optional[str] = None
30-
31-
@field_validator("path", mode="before")
32-
@classmethod
33-
def _json_pointer_from_json_path(cls, path: str):
34-
if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
35-
groups = match.groups()
36-
if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
37-
return _create_path(
38-
pos=int(groups[1]),
39-
path_prefix=groups[0],
40-
)
41-
return path
42-
43-
44-
class ChunkWithMetadata(Chunk):
45-
"""Data model for Chunk including metadata."""
46-
47-
page: Optional[int] = None
48-
bbox: Optional[BoundingBox] = None
19+
meta: Any = None
4920

5021

5122
class BaseChunker(BaseModel, ABC):
5223
"""Base class for Chunker."""
5324

5425
@abstractmethod
55-
def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
26+
def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
5627
"""Chunk the provided document.
5728
5829
Args:
59-
dl_doc (Document): document to chunk
30+
dl_doc (DLDocument): document to chunk
6031
6132
Raises:
6233
NotImplementedError: in this abstract implementation
6334
6435
Yields:
65-
Iterator[Chunk]: iterator over extracted chunks
36+
Iterator[BaseChunk]: iterator over extracted chunks
6637
"""
6738
raise NotImplementedError()
68-
69-
@classmethod
70-
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
71-
return _create_path(
72-
pos=pos,
73-
path_prefix=path_prefix,
74-
)

0 commit comments

Comments
 (0)