Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b34fb57
fix: introduce functional chunking for source files
danielfrey63 Dec 29, 2025
e6306c4
register CodeAwareSplitter and improve its log messages
danielfrey63 Dec 30, 2025
66f42c4
docs: reverted README.md and api/README.md back to original
danielfrey63 Jan 1, 2026
73fa53d
fix: reverted repo.json to original state
danielfrey63 Jan 1, 2026
67a5b34
fix: clear separation - revert partial changes
danielfrey63 Jan 1, 2026
c025575
fix: cleanup directories
danielfrey63 Jan 1, 2026
fb299fc
fix: cleanup files
danielfrey63 Jan 1, 2026
17243fa
fix: refactor gemini code assist suggestions
danielfrey63 Jan 1, 2026
78b01ce
feat: optimize code_splitter with performance improvements
danielfrey63 Jan 1, 2026
0900aea
fix: removed risky module loading
danielfrey63 Jan 1, 2026
4215e41
fix: hierarchical splitting
danielfrey63 Jan 3, 2026
a2e0b4d
fix: clean hierarchical chunking
danielfrey63 Jan 3, 2026
40bab77
Update api/code_splitter.py
danielfrey63 Jan 3, 2026
0000098
feat(api): implement robust recursive syntax-aware code splitting wit…
danielfrey63 Jan 3, 2026
12b2a4e
Update tests/unit/test_code_splitter.py
danielfrey63 Jan 3, 2026
b58ff0f
fix: unused variable removal
danielfrey63 Jan 3, 2026
a1df9aa
Merge branch 'feature/functional-chunker' of github.com:danielfrey63/…
danielfrey63 Jan 3, 2026
e20482c
fix: added test coverage
danielfrey63 Jan 3, 2026
a216766
refactor(api): simplify tree-sitter import and enhance config tests
danielfrey63 Jan 3, 2026
c51b99d
refactor(api): remove un-necessary import
danielfrey63 Jan 3, 2026
dcf795e
setup workflow actions
danielfrey63 Jan 3, 2026
daa8e62
chore: debug settings
danielfrey63 Jan 3, 2026
5776401
test(api): add edge case tests for code splitter coverage
danielfrey63 Jan 4, 2026
15a7e7f
test(api): added exception testing reaching coverage of 98%
danielfrey63 Jan 4, 2026
76b5a33
test(api): added full AST/CST based node identification
danielfrey63 Jan 4, 2026
fb914c0
fix(api): updates based several feedbacks
danielfrey63 Jan 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
320 changes: 320 additions & 0 deletions api/code_splitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,320 @@
from __future__ import annotations

import importlib
import logging
from dataclasses import asdict, dataclass
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple

from adalflow.components.data_process import TextSplitter
from adalflow.core.component import DataComponent
from adalflow.core.types import Document

logger = logging.getLogger(__name__)

# Keywords that mark a tree-sitter node type as a "definition" worth chunking
# on. Node types are split into words before matching, so "function_definition"
# matches "function" and "class_declaration" matches "class" without partial
# substring hits.
_DEFINITION_TYPE_KEYWORDS = (
    "function",
    "method",
    "class",
    "interface",
    "struct",
    "enum",
    "trait",
    "impl",
    "module",
    "namespace",
    "type",
)


# Maps a lowercased, dot-less file extension to the tree-sitter grammar name
# expected by tree_sitter_languages.get_parser. Extensions missing from this
# table are tried verbatim as a grammar name (see _get_language_name_candidates).
_EXT_TO_LANGUAGE: Dict[str, str] = {
    "py": "python",
    "js": "javascript",
    "jsx": "javascript",
    "ts": "typescript",
    "tsx": "tsx",
    "java": "java",
    "c": "c",
    "h": "c",
    "cpp": "cpp",
    "hpp": "cpp",
    "cc": "cpp",
    "cs": "c_sharp",
    "go": "go",
    "rs": "rust",
    "php": "php",
    "rb": "ruby",
    "swift": "swift",
    "kt": "kotlin",
    "kts": "kotlin",
    "scala": "scala",
    "lua": "lua",
    "sh": "bash",
    "bash": "bash",
    "html": "html",
    "css": "css",
    "json": "json",
    "yml": "yaml",
    "yaml": "yaml",
    "toml": "toml",
    "md": "markdown",
}


@dataclass(frozen=True)
class CodeSplitterConfig:
    """Immutable configuration for syntax-aware code splitting.

    Frozen so a splitter's configuration cannot drift after construction.
    """

    # Maximum number of lines per emitted chunk.
    chunk_size_lines: int = 200
    # Lines repeated between consecutive chunks of an oversized block.
    chunk_overlap_lines: int = 20
    # Pieces shorter than this many lines are dropped.
    min_chunk_lines: int = 5
    # When False, documents pass through unsplit.
    enabled: bool = True


def _safe_import_tree_sitter() -> Optional[Callable[..., Any]]:
"""Safely import and return the `get_parser` function from tree_sitter_languages."""
module_candidates = [
"tree_sitter_languages", # module name used by tree-sitter-languages on most installs
]

for module_name in module_candidates:
try:
mod = importlib.import_module(module_name)
get_parser = getattr(mod, "get_parser", None)
if callable(get_parser):
return get_parser
except ImportError:
continue

return None


def _iter_definition_like_nodes(root_node: Any) -> Iterable[Any]:
    """Yield the named direct children of *root_node* whose type looks like a definition.

    A child matches when any whole word of its (lowercased, underscore-split)
    node type appears in _DEFINITION_TYPE_KEYWORDS, e.g. "function_definition"
    matches via "function".
    """
    for node in getattr(root_node, "children", []):
        if not getattr(node, "is_named", False):
            continue
        type_words = set(getattr(node, "type", "").lower().replace("_", " ").split())
        if not type_words.isdisjoint(_DEFINITION_TYPE_KEYWORDS):
            yield node


def _split_lines_with_overlap(
lines: List[str], *, chunk_size_lines: int, chunk_overlap_lines: int
) -> List[Tuple[List[str], int]]:
if chunk_size_lines <= 0:
return [(lines, 0)]

overlap = max(0, min(chunk_overlap_lines, chunk_size_lines - 1))
chunks: List[Tuple[List[str], int]] = []
start = 0
n = len(lines)

while start < n:
end = min(n, start + chunk_size_lines)
chunks.append((lines[start:end], start))
if end >= n:
break
start = end - overlap

return chunks


def _slice_text_by_bytes_preencoded(text_bytes: bytes, start_byte: int, end_byte: int) -> str:
return text_bytes[start_byte:end_byte].decode("utf-8", errors="replace")


def _byte_offset_to_line_preencoded(text_bytes: bytes, byte_offset: int) -> int:
prefix = text_bytes[:max(0, byte_offset)]
return prefix.count(b"\n") + 1


class TreeSitterCodeSplitter:
    """Split code documents into chunks aligned with syntactic definitions.

    When the optional ``tree_sitter_languages`` dependency is available,
    documents are chunked on definition boundaries (functions, classes, ...).
    Oversized definitions are split recursively on their own child
    definitions (e.g. methods inside a large class) before falling back to
    plain line-based splitting, so chunking stays syntax-aware at deeper
    levels of the code structure.
    """

    def __init__(
        self,
        *,
        chunk_size_lines: int = 200,
        chunk_overlap_lines: int = 20,
        min_chunk_lines: int = 5,
        enabled: bool = True,
    ) -> None:
        self.config = CodeSplitterConfig(
            chunk_size_lines=chunk_size_lines,
            chunk_overlap_lines=chunk_overlap_lines,
            min_chunk_lines=min_chunk_lines,
            enabled=enabled,
        )
        # None when the optional tree-sitter dependency is unavailable.
        self._get_parser = _safe_import_tree_sitter()

    def is_available(self) -> bool:
        """Return True when a tree-sitter parser factory could be imported."""
        return self._get_parser is not None

    def split_document(self, doc: Document) -> List[Document]:
        """Split *doc* when it is flagged as code; otherwise pass it through unchanged."""
        if not self.config.enabled:
            return [doc]

        meta = getattr(doc, "meta_data", {}) or {}
        if not meta.get("is_code"):
            return [doc]

        file_type = (meta.get("type") or "").lower().lstrip(".")
        return self._split_code_text(doc.text or "", meta, file_type)

    def _get_language_name_candidates(self, file_type: str) -> List[str]:
        """Return grammar names to try for *file_type*: the mapped name, then the raw extension."""
        candidates: List[str] = []
        mapped = _EXT_TO_LANGUAGE.get(file_type)
        if mapped:
            candidates.append(mapped)
        if file_type and file_type not in candidates:
            candidates.append(file_type)
        return candidates

    def _try_get_parser(self, file_type: str) -> Any:
        """Return a parser for *file_type*, or None when no grammar candidate works."""
        if self._get_parser is None:
            return None

        for name in self._get_language_name_candidates(file_type):
            try:
                return self._get_parser(name)
            except Exception as e:
                logger.debug("Failed to get parser for language '%s': %s", name, e)
        return None

    def _split_code_text(self, text: str, meta: Dict[str, Any], file_type: str) -> List[Document]:
        """Syntax-aware split of *text*; falls back to line splitting on any failure."""
        parser = self._try_get_parser(file_type)
        if parser is None:
            return self._fallback_line_split(text, meta)

        text_bytes = text.encode("utf-8", errors="replace")
        try:
            tree = parser.parse(text_bytes)
        except Exception:
            return self._fallback_line_split(text, meta)

        root = getattr(tree, "root_node", None)
        if root is None:
            return self._fallback_line_split(text, meta)

        # Collect (snippet, start_line) pieces, recursing into oversized
        # nodes so that e.g. a 500-line class is split on its methods
        # instead of on arbitrary line boundaries.
        pieces: List[Tuple[str, int]] = []
        for node in _iter_definition_like_nodes(root):
            self._collect_pieces(node, text_bytes, pieces)

        if not pieces:
            return self._fallback_line_split(text, meta)

        docs: List[Document] = []
        for snippet, start_line in pieces:
            snippet_lines = snippet.splitlines(True)
            if len(snippet_lines) < self.config.min_chunk_lines:
                continue

            if len(snippet_lines) <= self.config.chunk_size_lines:
                docs.append(self._make_chunk_doc(snippet, meta, start_line))
                continue

            # Last resort for pieces that are still too large (no finer
            # syntactic structure was found): line-based split with overlap.
            for sub, sub_start_idx in _split_lines_with_overlap(
                snippet_lines,
                chunk_size_lines=self.config.chunk_size_lines,
                chunk_overlap_lines=self.config.chunk_overlap_lines,
            ):
                sub_text = "".join(sub)
                docs.append(self._make_chunk_doc(sub_text, meta, start_line + sub_start_idx))

        if not docs:
            return self._fallback_line_split(text, meta)
        return self._add_chunk_metadata(docs)

    def _node_span(self, node: Any) -> Optional[Tuple[int, int]]:
        """Return the (start_byte, end_byte) span of *node*, or None when unreadable."""
        try:
            return int(node.start_byte), int(node.end_byte)
        except (AttributeError, ValueError, TypeError) as e:
            logger.debug("Could not read byte span from a tree-sitter node: %s", e)
            return None

    def _collect_pieces(
        self, node: Any, text_bytes: bytes, pieces: List[Tuple[str, int]]
    ) -> None:
        """Append (snippet, start_line) pieces covering *node* to *pieces*.

        Nodes that fit within ``chunk_size_lines`` are emitted whole.
        Oversized nodes are split recursively on their own definition-like
        children (methods inside a class, ...); code between those children
        (class headers, fields, ...) is emitted as separate gap pieces so
        no source text is lost.
        """
        span = self._node_span(node)
        if span is None:
            return
        start_b, end_b = span
        snippet = _slice_text_by_bytes_preencoded(text_bytes, start_b, end_b)

        if len(snippet.splitlines()) <= self.config.chunk_size_lines:
            pieces.append((snippet, _byte_offset_to_line_preencoded(text_bytes, start_b)))
            return

        children = [
            c for c in _iter_definition_like_nodes(node) if self._node_span(c) is not None
        ]
        if not children:
            # No finer structure; emit as-is and let the caller line-split it.
            pieces.append((snippet, _byte_offset_to_line_preencoded(text_bytes, start_b)))
            return

        cursor = start_b
        for child in children:
            c_start, c_end = self._node_span(child)
            if c_start > cursor:
                gap = _slice_text_by_bytes_preencoded(text_bytes, cursor, c_start)
                if gap.strip():
                    pieces.append((gap, _byte_offset_to_line_preencoded(text_bytes, cursor)))
            self._collect_pieces(child, text_bytes, pieces)
            cursor = max(cursor, c_end)
        if cursor < end_b:
            tail = _slice_text_by_bytes_preencoded(text_bytes, cursor, end_b)
            if tail.strip():
                pieces.append((tail, _byte_offset_to_line_preencoded(text_bytes, cursor)))

    def _add_chunk_metadata(self, docs: List[Document]) -> List[Document]:
        """Stamp each doc with its position (chunk_index) and the total count (chunk_total)."""
        for i, d in enumerate(docs):
            d.meta_data["chunk_index"] = i
            d.meta_data["chunk_total"] = len(docs)
        return docs

    def _fallback_line_split(self, text: str, meta: Dict[str, Any]) -> List[Document]:
        """Plain line-based splitting with overlap, used when parsing is unavailable."""
        lines = text.splitlines(True)
        docs: List[Document] = []
        for sub, start_idx in _split_lines_with_overlap(
            lines,
            chunk_size_lines=self.config.chunk_size_lines,
            chunk_overlap_lines=self.config.chunk_overlap_lines,
        ):
            sub_text = "".join(sub)
            if len(sub) < self.config.min_chunk_lines:
                continue
            docs.append(self._make_chunk_doc(sub_text, meta, 1 + start_idx))

        # Unified return path: even the single-document case goes through
        # _add_chunk_metadata so every returned doc consistently carries
        # chunk_index/chunk_total.
        if not docs:
            docs = [Document(text=text, meta_data=dict(meta))]
        return self._add_chunk_metadata(docs)

    def _make_chunk_doc(self, chunk_text: str, meta: Dict[str, Any], start_line: int) -> Document:
        """Build a chunk Document carrying a copy of *meta* plus its start line and title."""
        new_meta = dict(meta)
        new_meta["chunk_start_line"] = start_line
        file_path = new_meta.get("file_path")
        if file_path:
            new_meta["title"] = str(file_path)
        return Document(text=chunk_text, meta_data=new_meta)


class CodeAwareSplitter(DataComponent):
    """Routes documents to a syntax-aware code splitter or a plain text splitter.

    Documents whose metadata carries a truthy ``is_code`` flag go through the
    tree-sitter based code splitter; all other documents use the configured
    ``TextSplitter``.
    """

    def __init__(
        self,
        *,
        text_splitter: TextSplitter,
        code_splitter: TreeSitterCodeSplitter,
    ) -> None:
        super().__init__()
        self._text_splitter = text_splitter
        self._code_splitter = code_splitter

    def __call__(self, documents: Sequence[Document]) -> Sequence[Document]:
        """Split every document with the splitter appropriate for its type."""
        output: List[Document] = []
        for doc in documents:
            meta = getattr(doc, "meta_data", {}) or {}
            file_path = meta.get("file_path") or meta.get("title") or "<unknown>"
            is_code = bool(meta.get("is_code"))
            logger.info("Splitting document: %s (is_code=%s)", file_path, is_code)
            if is_code:
                chunks = self._code_splitter.split_document(doc)
                logger.info("Split result: %s -> %d chunks (code)", file_path, len(chunks))
            else:
                logger.info("TextSplitter start: %s", file_path)
                chunks = list(self._text_splitter([doc]))
                logger.info("TextSplitter result: %s -> %d chunks", file_path, len(chunks))
            output.extend(chunks)
        return output

    def to_dict(self) -> Dict[str, Any]:
        """Serialize both splitters into a plain dictionary."""
        return {
            "text_splitter": self._text_splitter.to_dict() if hasattr(self._text_splitter, "to_dict") else None,
            # asdict() mirrors the CodeSplitterConfig dataclass automatically,
            # so new config fields never need to be listed here by hand.
            "code_splitter_config": asdict(self._code_splitter.config),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "CodeAwareSplitter":
        """Rebuild a splitter pair from the structure produced by ``to_dict``."""
        # TextSplitter is already imported at module level; no local re-import needed.
        text_splitter_data = data.get("text_splitter")
        text_splitter = TextSplitter.from_dict(text_splitter_data) if text_splitter_data else TextSplitter()
        code_splitter = TreeSitterCodeSplitter(**data.get("code_splitter_config", {}))
        return cls(text_splitter=text_splitter, code_splitter=code_splitter)
2 changes: 1 addition & 1 deletion api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def load_lang_config():

# Update embedder configuration
if embedder_config:
for key in ["embedder", "embedder_ollama", "embedder_google", "embedder_bedrock", "retriever", "text_splitter"]:
for key in ["embedder", "embedder_ollama", "embedder_google", "embedder_bedrock", "retriever", "text_splitter", "code_splitter"]:
if key in embedder_config:
configs[key] = embedder_config[key]

Expand Down
6 changes: 6 additions & 0 deletions api/config/embedder.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@
"retriever": {
"top_k": 20
},
"code_splitter": {
"enabled": true,
"chunk_size_lines": 200,
"chunk_overlap_lines": 20,
"min_chunk_lines": 5
},
"text_splitter": {
"split_by": "word",
"chunk_size": 350,
Expand Down
1 change: 1 addition & 0 deletions api/config/lang.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"zh": "Mandarin Chinese (中文)",
"zh-tw": "Traditional Chinese (繁體中文)",
"es": "Spanish (Español)",
"de": "Deutsch (German)",
"kr": "Korean (한국어)",
"vi": "Vietnamese (Tiếng Việt)",
"pt-br": "Brazilian Portuguese (Português Brasileiro)",
Expand Down
12 changes: 10 additions & 2 deletions api/data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from requests.exceptions import RequestException

from api.tools.embedder import get_embedder
from api.code_splitter import CodeAwareSplitter, TreeSitterCodeSplitter

# Configure logging
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -402,7 +403,9 @@ def prepare_data_pipeline(embedder_type: str = None, is_ollama_embedder: bool =
if embedder_type is None:
embedder_type = get_embedder_type()

splitter = TextSplitter(**configs["text_splitter"])
text_splitter = TextSplitter(**configs["text_splitter"])
code_splitter = TreeSitterCodeSplitter(**configs.get("code_splitter", {}))
splitter = CodeAwareSplitter(text_splitter=text_splitter, code_splitter=code_splitter)
embedder_config = get_embedder_config()

embedder = get_embedder(embedder_type=embedder_type)
Expand Down Expand Up @@ -890,8 +893,13 @@ def _embedding_vector_length(doc: Document) -> int:
)
else:
return documents
except (AttributeError, KeyError, TypeError) as e:
logger.warning(
"Existing database could not be loaded due to incompatible schema or missing components. Rebuilding embeddings... (%s)",
e,
)
except Exception as e:
logger.error(f"Error loading existing database: {e}")
logger.warning("Error loading existing database. Rebuilding embeddings... (%s)", e)
# Continue to create a new database

# prepare the database
Expand Down
Loading