Merged · 34 commits
5aa67df
Fix area method of BoundingBox
cau-git Oct 9, 2024
9e1bc5b
add image placeholder
dolfim-ibm Oct 10, 2024
089cdb9
enable picture label
dolfim-ibm Oct 10, 2024
ba7f063
refactor captions and markdown
dolfim-ibm Oct 10, 2024
2b185e0
add logic to skip repeated caption
dolfim-ibm Oct 10, 2024
ba79b4a
use DocItemLabel
dolfim-ibm Oct 10, 2024
e42a1dd
Extend default export labels, add convenience mehtods
cau-git Oct 10, 2024
baceeae
Introduce ListItem API, with marker and enumerated properties
cau-git Oct 11, 2024
8223654
add classification and description in PictureData
dolfim-ibm Oct 13, 2024
f2b3afa
add molecule picture data
dolfim-ibm Oct 13, 2024
7322553
Fixes for DoclingDocument and aligned methods on legacy doc
cau-git Oct 14, 2024
cb56fbf
add advanced picture data content
dolfim-ibm Oct 14, 2024
63395bd
Many markdown export fixes, renaming BaseTableData
cau-git Oct 14, 2024
4ddecf8
Merge branch 'cau/improvements' of github.com:DS4SD/docling-core into…
cau-git Oct 14, 2024
6fee533
Rename module paths doc->legacy_doc, experimental->doc
cau-git Oct 15, 2024
7c104d6
feat: imageref with pil_image
dolfim-ibm Oct 15, 2024
0c4d3e1
Small fixes
cau-git Oct 15, 2024
1b30a74
Merge branch 'cau/improvements' of github.com:DS4SD/docling-core into…
cau-git Oct 15, 2024
d26dcf6
docs: remove documentation in markdown to support python 3.13 (#43)
ceberam Oct 15, 2024
c28a040
Merge branch 'cau/improvements' of github.com:DS4SD/docling-core into…
cau-git Oct 15, 2024
cb7e597
Fix TableCell model validator
cau-git Oct 16, 2024
33aa214
store list of classes in classification
dolfim-ibm Oct 16, 2024
18cb9f4
Fixes for DocumentOrigin mimetype validation
cau-git Oct 16, 2024
5fb2f34
Merge branch 'cau/improvements' of github.com:DS4SD/docling-core into…
cau-git Oct 16, 2024
002f784
introduce picturedata as list of annotations
dolfim-ibm Oct 16, 2024
4369d20
feat: adapt hierarchical chunker to v2 DoclingDocument
vagenas Oct 16, 2024
eef8685
feat: add table support in chunker, incl. captions
vagenas Oct 16, 2024
8e7f444
use Field constraints instead of conlist, refactor chunking types
vagenas Oct 16, 2024
599d0b2
revert unnecessary doc module change
vagenas Oct 16, 2024
aab8f53
align test data with upstream changes
vagenas Oct 16, 2024
c34d0f3
Merge pull request #45 from DS4SD/v2-chunk
cau-git Oct 16, 2024
0e622a5
Update __init__.py on docling_core.types.doc
cau-git Oct 16, 2024
6fd5759
Merge branch 'cau/improvements' of github.com:DS4SD/docling-core into…
cau-git Oct 16, 2024
60f9f74
Remove DescriptionItem
cau-git Oct 16, 2024
2 changes: 1 addition & 1 deletion .github/workflows/checks.yml

@@ -6,7 +6,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
     steps:
       - uses: actions/checkout@v3
       - uses: ./.github/actions/setup-poetry
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml

@@ -52,7 +52,7 @@ repos:
     hooks:
       - id: docs
         name: Docs
-        entry: poetry run ds_generate_docs docs
+        entry: poetry run generate_docs docs
         pass_filenames: false
         language: system
         files: '\.py$'
16 changes: 8 additions & 8 deletions README.md

@@ -1,7 +1,7 @@
 # Docling Core

 [![PyPI version](https://img.shields.io/pypi/v/docling-core)](https://pypi.org/project/docling-core/)
-![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)
+![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%20%203.11%20%7C%203.12%20%7C%203.13-blue)
 [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)

@@ -21,7 +21,7 @@ pip install docling-core

 ### Development setup

-To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
+To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
 ```bash
 poetry install
 ```

@@ -45,14 +45,14 @@ poetry run pytest test
   Document.model_validate_json(data_str)
   ```

-- You can generate the JSON schema of a model with the script `ds_generate_jsonschema`.
+- You can generate the JSON schema of a model with the script `generate_jsonschema`.

   ```py
   # for the `Document` type
-  ds_generate_jsonschema Document
+  generate_jsonschema Document

   # for the use `Record` type
-  ds_generate_jsonschema Record
+  generate_jsonschema Record
   ```

@@ -61,12 +61,12 @@ Docling supports 3 main data types:

 - **Document** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
   The Document type also models the metadata that may be attached to the converted document.
-  Check [Document](docs/Document.md) for the full JSON schema.
+  Check [Document](docs/Document.json) for the full JSON schema.
 - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
   Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
-  Check [Record](docs/Record.md) for the full JSON schema.
+  Check [Record](docs/Record.json) for the full JSON schema.
 - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
-  Check [Generic](docs/Generic.md) for the full JSON schema.
+  Check [Generic](docs/Generic.json) for the full JSON schema.

 The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
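The README snippet above validates a document with `Document.model_validate_json(data_str)`, and `generate_jsonschema` emits the model's JSON schema. A hedged illustration of the same pydantic mechanics, using a stand-in model rather than the actual `Document` type:

```python
import json

from pydantic import BaseModel


class Doc(BaseModel):  # stand-in for the real Document model
    name: str
    pages: int = 0


# Validate a JSON string against the model, as shown in the README.
data_str = json.dumps({"name": "report.pdf", "pages": 3})
doc = Doc.model_validate_json(data_str)
print(doc.name)

# The generate_jsonschema script boils down to this pydantic call:
schema = Doc.model_json_schema()
print(sorted(schema["properties"]))
```

Invalid input (wrong types, missing required fields) raises `pydantic.ValidationError` instead of silently producing a partial object.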
10 changes: 2 additions & 8 deletions docling_core/transforms/chunker/__init__.py

@@ -5,11 +5,5 @@

 """Define the chunker types."""

-from docling_core.transforms.chunker.base import (  # noqa
-    BaseChunker,
-    Chunk,
-    ChunkWithMetadata,
-)
-from docling_core.transforms.chunker.hierarchical_chunker import (  # noqa
-    HierarchicalChunker,
-)
+from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
+from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
67 changes: 27 additions & 40 deletions docling_core/transforms/chunker/base.py

@@ -4,71 +4,58 @@
 #

 """Define base classes for chunking."""
-import re
 from abc import ABC, abstractmethod
-from typing import Final, Iterator, Optional
+from typing import Any, ClassVar, Iterator

-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel

-from docling_core.types import BoundingBox, Document
-from docling_core.types.base import _JSON_POINTER_REGEX
+from docling_core.types.doc import DoclingDocument as DLDocument

-# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
-_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")

+class BaseMeta(BaseModel):
+    """Metadata base class."""

-def _create_path(pos: int, path_prefix: str = "main-text") -> str:
-    return f"#/{path_prefix}/{pos}"
+    excluded_embed: ClassVar[list[str]] = []
+    excluded_llm: ClassVar[list[str]] = []

+    def export_json_dict(self) -> dict[str, Any]:
+        """Helper method for exporting non-None keys to JSON mode.

-class Chunk(BaseModel):
-    """Data model for Chunk."""
+        Returns:
+            dict[str, Any]: The exported dictionary.
+        """
+        return self.model_dump(mode="json", by_alias=True, exclude_none=True)

-    path: str = Field(pattern=_JSON_POINTER_REGEX)
-    text: str
-    heading: Optional[str] = None

-    @field_validator("path", mode="before")
-    @classmethod
-    def _json_pointer_from_json_path(cls, path: str):
-        if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
-            groups = match.groups()
-            if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
-                return _create_path(
-                    pos=int(groups[1]),
-                    path_prefix=groups[0],
-                )
-        return path
+class BaseChunk(BaseModel):
+    """Chunk base class."""

+    text: str
+    meta: BaseMeta

-class ChunkWithMetadata(Chunk):
-    """Data model for Chunk including metadata."""
+    def export_json_dict(self) -> dict[str, Any]:
+        """Helper method for exporting non-None keys to JSON mode.

-    page: Optional[int] = None
-    bbox: Optional[BoundingBox] = None
+        Returns:
+            dict[str, Any]: The exported dictionary.
+        """
+        return self.model_dump(mode="json", by_alias=True, exclude_none=True)


 class BaseChunker(BaseModel, ABC):
-    """Base class for Chunker."""
+    """Chunker base class."""

     @abstractmethod
-    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
+    def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
         """Chunk the provided document.

         Args:
-            dl_doc (Document): document to chunk
+            dl_doc (DLDocument): document to chunk

         Raises:
             NotImplementedError: in this abstract implementation

         Yields:
-            Iterator[Chunk]: iterator over extracted chunks
+            Iterator[BaseChunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
-
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return _create_path(
-            pos=pos,
-            path_prefix=path_prefix,
-        )