feat: #641 - first draft implementation, python only #642

Status: Open. Wants to merge 5 commits into base: dev.
4 changes: 3 additions & 1 deletion transforms/language/doc_chunk/python/README.md
@@ -29,10 +29,12 @@ The transform can be tuned with the following parameters.

| Parameter | Default | Description |
|------------|----------|--------------|
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling). |
| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), and `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-size windows of tokens. |
| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
| `dl_min_chunk_len` | `None` | Minimum number of characters for the chunk in the `dl_json` chunker. Setting it to `None` uses the library default, i.e. `min_chunk_len=64`. |
| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. |
| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. |
| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
| `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
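For orientation, here is a minimal sketch of how the new token-text parameters could be supplied through the Python runtime's parameter dictionary. The `doc_chunk_` prefix and the values mirror the sample script further down in this PR; the dictionary itself is illustrative and not part of the diff.

```python
# Illustrative only: token-text chunking enabled via the prefixed
# transform parameters documented in the README table above.
doc_chunk_params = {
    "doc_chunk_chunking_type": "li_token_text",           # select the token text splitter
    "doc_chunk_output_chunk_column_name": "chunk_text",   # column holding the chunk text
    "doc_chunk_chunk_size_tokens": 128,                    # window size in tokens
    "doc_chunk_chunk_overlap_tokens": 30,                  # overlap between consecutive windows
}
```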
73 changes: 72 additions & 1 deletion transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
@@ -11,9 +11,10 @@
################################################################################

from abc import ABCMeta, abstractmethod
from typing import Iterator, Optional
from typing import Iterator, Optional, Dict, List

from docling_core.types import Document as DLDocument
from llama_index.core.node_parser.text.token import TokenTextSplitter
from llama_index.core import Document as LIDocument
from llama_index.core.node_parser import MarkdownNodeParser
from docling_core.transforms.chunker import HierarchicalChunker
@@ -66,3 +67,73 @@ def chunk(self, content: str) -> Iterator[dict]:
yield {
self.output_chunk_column_name: node.text,
}


class LITokenTextSplitter(ChunkingExecutor):
"""
A text chunker that leverages Llama Index's token-based text splitter. This splitter breaks input text into
fixed-window chunks, with each chunk measured in tokens rather than characters.

The chunking process ensures that each chunk contains a specific number of tokens, and an optional overlap between
chunks (also measured in tokens) can be specified to preserve context between the chunks.

Args:
output_chunk_column_name (str): Name of the output column containing the text of each chunk.
output_chunk_column_id (str): Name of the output column containing the ID of each chunk.
chunk_size_tokens (int): Length of each chunk in number of tokens.
chunk_overlap_tokens (int): Number of tokens overlapping between consecutive chunks.

Attributes:
output_chunk_column_name (str)
output_chunk_column_id (str)
chunk_size_tokens (int)
chunk_overlap_tokens (int)
"""

def __init__(
self,
output_chunk_column_name: str,
output_chunk_column_id: str,
chunk_size_tokens: int,
chunk_overlap_tokens: int
):
self.output_chunk_column_name = output_chunk_column_name
self.output_chunk_column_id = output_chunk_column_id
self.chunk_size = chunk_size_tokens
self.chunk_overlap = chunk_overlap_tokens


def _chunk_text(self, text: str) -> List[str]:
"""
Internal method to chunk text using TokenTextSplitter.

Args:
text (str): Input text to be chunked.

Returns:
List[str]: List of chunked text.
"""
text_splitter = TokenTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap
)
return text_splitter.split_text(text)


def chunk(self, text: str) -> Iterator[Dict]:
"""
Chunks input text into fixed-window lengths with token overlap.

Args:
text (str): Input text to be chunked.

Yields:
Dict: Chunked text with ID.
"""
chunk_id = 0
for chunk in self._chunk_text(text):
yield {
self.output_chunk_column_id: chunk_id,
self.output_chunk_column_name: chunk,
}
chunk_id += 1
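A minimal usage sketch of the new chunker, assuming only what the class above defines (the input string is illustrative): each yielded dict is keyed by the configured chunk-ID and chunk-text column names.

```python
# Sketch: instantiate the new splitter and iterate its chunks.
from doc_chunk_chunkers import LITokenTextSplitter

chunker = LITokenTextSplitter(
    output_chunk_column_name="chunk_text",
    output_chunk_column_id="chunk_id",
    chunk_size_tokens=128,
    chunk_overlap_tokens=30,
)
for row in chunker.chunk("some long document text ..."):
    print(row["chunk_id"], row["chunk_text"])
```

Because the overlap is counted in tokens, consecutive chunks share the trailing 30 tokens of the previous window, which preserves local context across chunk boundaries.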
@@ -17,11 +17,12 @@
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from doc_chunk_transform_python import DocChunkPythonTransformConfiguration

from doc_chunk_transform import chunking_types

# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_md"))
# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input_token_text"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
local_conf = {
"input_folder": input_folder,
@@ -39,6 +40,11 @@
# doc_chunk params
# "doc_chunk_chunking_type": "li_markdown",
"doc_chunk_chunking_type": "dl_json",
# "doc_chunk_chunking_type": chunking_types.LI_TOKEN_TEXT,
# fixed-size params
# "doc_chunk_output_chunk_column_name": "chunk_text",
# "doc_chunk_chunk_size_tokens": 128,
# "doc_chunk_chunk_overlap_tokens": 30
}
if __name__ == "__main__":
# Set the simulated command line args
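The commented-out parameters above can be flipped on to exercise the token-text path end to end. The launch boilerplate below is a sketch following the usual data-prep-kit sample pattern; the `ParamsUtils.dict_to_req` / `convert_to_ast` calls and the local-config wiring are assumed from the repo's other samples rather than shown in this diff.

```python
# Hypothetical end-to-end run of the sample with the token-text chunker.
import sys

params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),  # assumed wiring
    "doc_chunk_chunking_type": "li_token_text",
    "doc_chunk_output_chunk_column_name": "chunk_text",
    "doc_chunk_chunk_size_tokens": 128,
    "doc_chunk_chunk_overlap_tokens": 30,
}
sys.argv = ParamsUtils.dict_to_req(d=params)  # assumed helper
PythonTransformLauncher(runtime_config=DocChunkPythonTransformConfiguration()).launch()
```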
37 changes: 34 additions & 3 deletions transforms/language/doc_chunk/python/src/doc_chunk_transform.py
@@ -18,7 +18,7 @@
import pyarrow as pa
from data_processing.transform import AbstractTableTransform, TransformConfiguration
from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown
from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, LITokenTextSplitter


short_name = "doc_chunk"
@@ -27,7 +27,10 @@
doc_id_column_name_key = "doc_id_column_name"
chunking_type_key = "chunking_type"
dl_min_chunk_len_key = "dl_min_chunk_len"
chunk_size_tokens_key = "chunk_size_tokens"
chunk_overlap_tokens_key = "chunk_overlap_tokens"
output_chunk_column_name_key = "output_chunk_column_name"
output_chunk_column_id_key = "output_chunk_column_id"
output_source_doc_id_column_name_key = "output_source_doc_id_column_name"
output_jsonpath_column_name_key = "output_jsonpath_column_name"
output_pageno_column_name_key = "output_pageno_column_name"
@@ -41,11 +44,13 @@
output_jsonpath_column_name_cli_param = f"{cli_prefix}{output_jsonpath_column_name_key}"
output_pageno_column_name_cli_param = f"{cli_prefix}{output_pageno_column_name_key}"
output_bbox_column_name_cli_param = f"{cli_prefix}{output_bbox_column_name_key}"

chunk_size_tokens_cli_param = f"{cli_prefix}{chunk_size_tokens_key}"
chunk_overlap_tokens_cli_param = f"{cli_prefix}{chunk_overlap_tokens_key}"

class chunking_types(str, enum.Enum):
LI_MARKDOWN = "li_markdown"
DL_JSON = "dl_json"
LI_TOKEN_TEXT = "li_token_text"

def __str__(self):
return str(self.value)
@@ -56,11 +61,13 @@ def __str__(self):
default_chunking_type = chunking_types.DL_JSON
default_dl_min_chunk_len = None
default_output_chunk_column_name = "contents"
default_output_chunk_column_id = "chunk_id"
default_output_source_doc_id_column_name = "source_document_id"
default_output_jsonpath_column_name = "doc_jsonpath"
default_output_pageno_column_name = "page_number"
default_output_bbox_column_name = "bbox"

default_chunk_size_tokens = 128
default_chunk_overlap_tokens = 30

class DocChunkTransform(AbstractTableTransform):
"""
@@ -84,6 +91,7 @@ def __init__(self, config: dict[str, Any]):
self.content_column_name = config.get(content_column_name_key, default_content_column_name)
self.doc_id_column_name = config.get(doc_id_column_name_key, default_doc_id_column_name)
self.output_chunk_column_name = config.get(output_chunk_column_name_key, default_output_chunk_column_name)
self.output_chunk_column_id = config.get(output_chunk_column_id_key, default_output_chunk_column_id)
self.output_source_doc_id_column_name = config.get(output_source_doc_id_column_name_key, default_output_source_doc_id_column_name)

# Parameters for Docling JSON chunking
@@ -96,6 +104,10 @@ def __init__(self, config: dict[str, Any]):
)
self.output_bbox_column_name_key = config.get(output_bbox_column_name_key, default_output_bbox_column_name)

# Parameters for Fixed-size with overlap chunking
self.chunk_size_tokens = config.get(chunk_size_tokens_key, default_chunk_size_tokens)
self.chunk_overlap_tokens = config.get(chunk_overlap_tokens_key, default_chunk_overlap_tokens)

# Initialize chunker

self.chunker: ChunkingExecutor
@@ -111,6 +123,13 @@ def __init__(self, config: dict[str, Any]):
self.chunker = LIMarkdown(
output_chunk_column_name=self.output_chunk_column_name,
)
elif self.chunking_type == chunking_types.LI_TOKEN_TEXT:
self.chunker = LITokenTextSplitter(
output_chunk_column_name=self.output_chunk_column_name,
output_chunk_column_id=self.output_chunk_column_id,
chunk_size_tokens=self.chunk_size_tokens,
chunk_overlap_tokens=self.chunk_overlap_tokens
)
else:
raise RuntimeError(f"{self.chunking_type=} is not valid.")

@@ -213,6 +232,18 @@ def add_input_params(self, parser: ArgumentParser) -> None:
default=default_output_bbox_column_name,
help="Column name to store the bbox of the chunk",
)
parser.add_argument(
f"--{chunk_size_tokens_cli_param}",
default=default_chunk_size_tokens,
type=int,
help="Size of the chunk in tokens for the fixed-sized chunker",
)
parser.add_argument(
f"--{chunk_overlap_tokens_cli_param}",
default=default_chunk_overlap_tokens,
type=int,
help="Number of tokens overlapping between chunks for the fixed-sized chunker.",
)

def apply_input_params(self, args: Namespace) -> bool:
"""
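As a quick illustration of the selection logic above: constructing the transform with `chunking_type` set to the new enum value routes chunking through `LITokenTextSplitter`, and any keys omitted fall back to the defaults defined in this file. This is a sketch only, not part of the diff.

```python
# Sketch: the config dict drives which chunker the transform builds.
from doc_chunk_transform import DocChunkTransform, chunking_types

transform = DocChunkTransform({
    "chunking_type": chunking_types.LI_TOKEN_TEXT,
    "output_chunk_column_name": "chunk_text",
    "chunk_size_tokens": 256,      # override the 128-token default
    "chunk_overlap_tokens": 30,
})
```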
@@ -0,0 +1,56 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-04 14:00:40",
"end_time": "2024-10-04 14:00:41",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"chunking_type": "li_token_text",
"content_column_name": "contents",
"doc_id_column_name": "document_id",
"dl_min_chunk_len": null,
"output_chunk_column_name": "chunk_text",
"output_source_doc_id_column_name": "source_document_id",
"output_jsonpath_column_name": "doc_jsonpath",
"output_pageno_column_name": "page_number",
"output_bbox_column_name": "bbox",
"chunk_size_tokens": 128,
"chunk_overlap_tokens": 30,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".parquet"
],
"num_processors": 0
},
"job_output_stats": {
"source_files": 1,
"source_size": 17749,
"result_files": 1,
"result_size": 8827,
"processing_time": 0.194,
"nfiles": 1,
"nrows": 10,
"source_doc_count": 2,
"result_doc_count": 10
},
"source": {
"name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/test-data/input_token_text",
"type": "path"
},
"target": {
"name": "/Users/jmcappi/git/ai-foundation/data-prep-kit/transforms/language/doc_chunk/python/output",
"type": "path"
}
}
Binary file not shown.
Binary file not shown.
@@ -16,7 +16,11 @@
from data_processing.test_support.launch.transform_test import (
AbstractTransformLauncherTest,
)
from doc_chunk_transform import chunking_type_cli_param, chunking_types
from doc_chunk_transform import (
chunking_type_cli_param,
output_chunk_column_name_cli_param,
chunking_types
)
from doc_chunk_transform_python import DocChunkPythonTransformConfiguration


@@ -55,4 +59,17 @@ def get_test_transform_fixtures(self) -> list[tuple]:
basedir + "/expected_md",
)
)

# Run with fixed size token chunker
fixtures.append(
(
launcher,
{
chunking_type_cli_param: chunking_types.LI_TOKEN_TEXT,
output_chunk_column_name_cli_param: "chunk_text"
},
basedir + "/input_token_text",
basedir + "/expected_token_text",
)
)
return fixtures