|
4 | 4 | # |
5 | 5 |
|
6 | 6 | """Define base classes for chunking.""" |
7 | | -import re |
8 | 7 | from abc import ABC, abstractmethod |
9 | | -from typing import Final, Iterator, Optional |
| 8 | +from typing import Any, Iterator |
10 | 9 |
|
11 | | -from pydantic import BaseModel, Field, field_validator |
| 10 | +from pydantic import BaseModel |
12 | 11 |
|
13 | | -from docling_core.types import BoundingBox, Document |
14 | | -from docling_core.types.base import _JSON_POINTER_REGEX |
| 12 | +from docling_core.types.doc import DoclingDocument as DLDocument |
15 | 13 |
|
16 | | -# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes) |
17 | | -_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$") |
18 | 14 |
|
| 15 | +class BaseChunk(BaseModel): |
| 16 | + """Data model for base chunk.""" |
19 | 17 |
|
20 | | -def _create_path(pos: int, path_prefix: str = "main-text") -> str: |
21 | | - return f"#/{path_prefix}/{pos}" |
22 | | - |
23 | | - |
24 | | -class Chunk(BaseModel): |
25 | | - """Data model for Chunk.""" |
26 | | - |
27 | | - path: str = Field(pattern=_JSON_POINTER_REGEX) |
28 | 18 | text: str |
29 | | - heading: Optional[str] = None |
30 | | - |
31 | | - @field_validator("path", mode="before") |
32 | | - @classmethod |
33 | | - def _json_pointer_from_json_path(cls, path: str): |
34 | | - if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None: |
35 | | - groups = match.groups() |
36 | | - if len(groups) == 2 and groups[0] is not None and groups[1] is not None: |
37 | | - return _create_path( |
38 | | - pos=int(groups[1]), |
39 | | - path_prefix=groups[0], |
40 | | - ) |
41 | | - return path |
42 | | - |
43 | | - |
44 | | -class ChunkWithMetadata(Chunk): |
45 | | - """Data model for Chunk including metadata.""" |
46 | | - |
47 | | - page: Optional[int] = None |
48 | | - bbox: Optional[BoundingBox] = None |
| 19 | + meta: Any = None |
49 | 20 |
|
50 | 21 |
|
51 | 22 | class BaseChunker(BaseModel, ABC): |
52 | 23 | """Base class for Chunker.""" |
53 | 24 |
|
54 | 25 | @abstractmethod |
55 | | - def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]: |
| 26 | + def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]: |
56 | 27 | """Chunk the provided document. |
57 | 28 |
|
58 | 29 | Args: |
59 | | - dl_doc (Document): document to chunk |
| 30 | + dl_doc (DLDocument): document to chunk |
60 | 31 |
|
61 | 32 | Raises: |
62 | 33 | NotImplementedError: in this abstract implementation |
63 | 34 |
|
64 | 35 | Yields: |
65 | | - Iterator[Chunk]: iterator over extracted chunks |
| 36 | + Iterator[BaseChunk]: iterator over extracted chunks |
66 | 37 | """ |
67 | 38 | raise NotImplementedError() |
68 | | - |
69 | | - @classmethod |
70 | | - def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str: |
71 | | - return _create_path( |
72 | | - pos=pos, |
73 | | - path_prefix=path_prefix, |
74 | | - ) |
0 commit comments