-
Notifications
You must be signed in to change notification settings - Fork 1.7k
feat: implement syntax-aware code chunking with Tree-sitter #434
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 10 commits
b34fb57
e6306c4
66f42c4
73fa53d
67a5b34
c025575
fb299fc
17243fa
78b01ce
0900aea
4215e41
a2e0b4d
40bab77
0000098
12b2a4e
b58ff0f
a1df9aa
e20482c
a216766
c51b99d
dcf795e
daa8e62
5776401
15a7e7f
76b5a33
fb914c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,320 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from dataclasses import dataclass | ||
| import importlib | ||
| import logging | ||
| from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple | ||
|
|
||
| from adalflow.core.component import DataComponent | ||
| from adalflow.components.data_process import TextSplitter | ||
| from adalflow.core.types import Document | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
# Words that mark a Tree-sitter node as a definition worth chunking on its own.
# `_iter_definition_like_nodes` splits a node's `type` string on underscores
# (e.g. "function_definition" -> {"function", "definition"}) and checks each
# word against this tuple, so matching is whole-word, never substring.
_DEFINITION_TYPE_KEYWORDS = (
    "function",
    "method",
    "class",
    "interface",
    "struct",
    "enum",
    "trait",
    "impl",
    "module",
    "namespace",
    "type",
)
|
|
||
|
|
||
# Maps a lowercase file extension (without the leading dot) to the language
# name expected by tree_sitter_languages.get_parser().  Extensions absent from
# this table fall back to trying the raw extension as the language name (see
# TreeSitterCodeSplitter._get_language_name_candidates).
_EXT_TO_LANGUAGE: Dict[str, str] = {
    "py": "python",
    "js": "javascript",
    "jsx": "javascript",
    "ts": "typescript",
    "tsx": "tsx",
    "java": "java",
    "c": "c",
    "h": "c",
    "cpp": "cpp",
    "hpp": "cpp",
    "cc": "cpp",
    "cs": "c_sharp",
    "go": "go",
    "rs": "rust",
    "php": "php",
    "rb": "ruby",
    "swift": "swift",
    "kt": "kotlin",
    "kts": "kotlin",
    "scala": "scala",
    "lua": "lua",
    "sh": "bash",
    "bash": "bash",
    "html": "html",
    "css": "css",
    "json": "json",
    "yml": "yaml",
    "yaml": "yaml",
    "toml": "toml",
    "md": "markdown",
}
|
|
||
|
|
||
@dataclass(frozen=True)
class CodeSplitterConfig:
    """Immutable tuning parameters for :class:`TreeSitterCodeSplitter`."""

    # Maximum number of lines per emitted chunk before overlap splitting kicks in.
    chunk_size_lines: int = 200
    # Number of trailing lines repeated at the start of the next chunk.
    chunk_overlap_lines: int = 20
    # Chunks with fewer lines than this are skipped (or trigger the fallback).
    min_chunk_lines: int = 5
    # When False, split_document() returns the input document untouched.
    enabled: bool = True
|
|
||
|
|
||
| def _safe_import_tree_sitter() -> Optional[Callable[..., Any]]: | ||
| """Safely import and return the `get_parser` function from tree_sitter_languages.""" | ||
| module_candidates = [ | ||
| "tree_sitter_languages", # module name used by tree-sitter-languages on most installs | ||
| ] | ||
|
|
||
| for module_name in module_candidates: | ||
| try: | ||
| mod = importlib.import_module(module_name) | ||
| get_parser = getattr(mod, "get_parser", None) | ||
| if callable(get_parser): | ||
| return get_parser | ||
| except ImportError: | ||
| continue | ||
|
|
||
| return None | ||
|
|
||
|
|
||
def _iter_definition_like_nodes(root_node: Any) -> Iterable[Any]:
    """Yield named direct children of *root_node* whose node type looks like a definition."""
    keywords = set(_DEFINITION_TYPE_KEYWORDS)
    for node in getattr(root_node, "children", []):
        if not getattr(node, "is_named", False):
            continue
        # Tokenize e.g. "function_definition" -> {"function", "definition"} so
        # keyword matching never fires on substrings of a node type.
        type_words = set(getattr(node, "type", "").lower().replace("_", " ").split())
        if type_words & keywords:
            yield node
danielfrey63 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
| def _split_lines_with_overlap( | ||
| lines: List[str], *, chunk_size_lines: int, chunk_overlap_lines: int | ||
| ) -> List[Tuple[List[str], int]]: | ||
| if chunk_size_lines <= 0: | ||
| return [(lines, 0)] | ||
|
|
||
| overlap = max(0, min(chunk_overlap_lines, chunk_size_lines - 1)) | ||
| chunks: List[Tuple[List[str], int]] = [] | ||
| start = 0 | ||
| n = len(lines) | ||
|
|
||
| while start < n: | ||
| end = min(n, start + chunk_size_lines) | ||
| chunks.append((lines[start:end], start)) | ||
| if end >= n: | ||
| break | ||
| start = end - overlap | ||
|
|
||
| return chunks | ||
|
|
||
|
|
||
| def _slice_text_by_bytes_preencoded(text_bytes: bytes, start_byte: int, end_byte: int) -> str: | ||
| return text_bytes[start_byte:end_byte].decode("utf-8", errors="replace") | ||
|
|
||
|
|
||
| def _byte_offset_to_line_preencoded(text_bytes: bytes, byte_offset: int) -> int: | ||
| prefix = text_bytes[:max(0, byte_offset)] | ||
| return prefix.count(b"\n") + 1 | ||
|
|
||
|
|
||
class TreeSitterCodeSplitter:
    """Split source-code documents into syntax-aware chunks via Tree-sitter.

    Documents flagged with ``meta_data["is_code"]`` are parsed with a
    language-specific Tree-sitter grammar; top-level definition-like nodes
    (functions, classes, ...) become individual chunks.  Whenever parsing is
    unavailable or yields nothing usable, a plain line-based split with
    overlap is used instead.
    """

    def __init__(
        self,
        *,
        chunk_size_lines: int = 200,
        chunk_overlap_lines: int = 20,
        min_chunk_lines: int = 5,
        enabled: bool = True,
    ) -> None:
        self.config = CodeSplitterConfig(
            chunk_size_lines=chunk_size_lines,
            chunk_overlap_lines=chunk_overlap_lines,
            min_chunk_lines=min_chunk_lines,
            enabled=enabled,
        )
        # None when tree-sitter-languages is not installed; see is_available().
        self._get_parser = _safe_import_tree_sitter()

    def is_available(self) -> bool:
        """Return True when the Tree-sitter language bundle could be imported."""
        return self._get_parser is not None

    def split_document(self, doc: Document) -> List[Document]:
        """Split *doc* into chunk documents; non-code documents pass through unchanged."""
        if not self.config.enabled:
            return [doc]

        meta = getattr(doc, "meta_data", {}) or {}
        if not meta.get("is_code"):
            return [doc]

        file_type = (meta.get("type") or "").lower().lstrip(".")
        return self._split_code_text(doc.text or "", meta, file_type)

    def _get_language_name_candidates(self, file_type: str) -> List[str]:
        """Return Tree-sitter language names to try for *file_type*, best guess first."""
        mapped = _EXT_TO_LANGUAGE.get(file_type)
        candidates: List[str] = []
        if mapped:
            candidates.append(mapped)
        # Fall back to the raw extension; several grammars use it as their name.
        if file_type and file_type not in candidates:
            candidates.append(file_type)
        return candidates

    def _try_get_parser(self, file_type: str) -> Any:
        """Return a parser for *file_type*, or None when no grammar is usable."""
        if self._get_parser is None:
            return None

        for name in self._get_language_name_candidates(file_type):
            try:
                return self._get_parser(name)
            except Exception as e:
                logger.debug("Failed to get parser for language '%s': %s", name, e)
                continue
        return None

    def _split_code_text(self, text: str, meta: Dict[str, Any], file_type: str) -> List[Document]:
        """Chunk *text* by definition nodes, falling back to line splitting on any failure."""
        parser = self._try_get_parser(file_type)
        if parser is None:
            return self._fallback_line_split(text, meta)

        text_bytes = text.encode("utf-8", errors="replace")
        try:
            tree = parser.parse(text_bytes)
        except Exception:
            return self._fallback_line_split(text, meta)

        root = getattr(tree, "root_node", None)
        if root is None:
            return self._fallback_line_split(text, meta)

        nodes = list(_iter_definition_like_nodes(root))
        if not nodes:
            return self._fallback_line_split(text, meta)

        # Collect (snippet, 1-based start line) pairs for every definition node.
        pieces: List[Tuple[str, int]] = []
        for node in nodes:
            try:
                # Plain attribute access: getattr with no default raised the
                # same AttributeError anyway, so spell it directly.
                start_b = int(node.start_byte)
                end_b = int(node.end_byte)
            except (AttributeError, ValueError, TypeError) as e:
                logger.debug("Could not process a tree-sitter node for file type '%s': %s", file_type, e)
                continue
            snippet = _slice_text_by_bytes_preencoded(text_bytes, start_b, end_b)
            start_line = _byte_offset_to_line_preencoded(text_bytes, start_b)
            pieces.append((snippet, start_line))

        if not pieces:
            return self._fallback_line_split(text, meta)

        docs: List[Document] = []
        for snippet, start_line in pieces:
            snippet_lines = snippet.splitlines(True)
            # Skip trivially small definitions (e.g. one-line stubs).
            if len(snippet_lines) < self.config.min_chunk_lines:
                continue

            if len(snippet_lines) <= self.config.chunk_size_lines:
                docs.append(self._make_chunk_doc(snippet, meta, start_line))
                continue

            # NOTE(review): oversized definitions are split by raw lines here.
            # A future improvement could recurse into child definition nodes
            # (e.g. methods of a large class) to stay syntax-aware at depth.
            for sub, sub_start_idx in _split_lines_with_overlap(
                snippet_lines,
                chunk_size_lines=self.config.chunk_size_lines,
                chunk_overlap_lines=self.config.chunk_overlap_lines,
            ):
                sub_text = "".join(sub)
                docs.append(self._make_chunk_doc(sub_text, meta, start_line + sub_start_idx))

        if not docs:
            return self._fallback_line_split(text, meta)
        return self._add_chunk_metadata(docs)

    def _add_chunk_metadata(self, docs: List[Document]) -> List[Document]:
        """Stamp each chunk with its position and the total chunk count (in place)."""
        for i, d in enumerate(docs):
            d.meta_data["chunk_index"] = i
            d.meta_data["chunk_total"] = len(docs)
        return docs

    def _fallback_line_split(self, text: str, meta: Dict[str, Any]) -> List[Document]:
        """Line-based splitting with overlap, used when syntax-aware splitting fails."""
        lines = text.splitlines(True)
        docs: List[Document] = []
        for sub, start_idx in _split_lines_with_overlap(
            lines,
            chunk_size_lines=self.config.chunk_size_lines,
            chunk_overlap_lines=self.config.chunk_overlap_lines,
        ):
            sub_text = "".join(sub)
            if len(sub) < self.config.min_chunk_lines:
                continue
            start_line = 1 + start_idx
            docs.append(self._make_chunk_doc(sub_text, meta, start_line))

        if not docs:
            # Fix: route the single-document fallback through
            # _add_chunk_metadata as well, so every returned chunk carries
            # chunk_index/chunk_total consistently with the other paths.
            docs = [Document(text=text, meta_data=dict(meta))]
        return self._add_chunk_metadata(docs)

    def _make_chunk_doc(self, chunk_text: str, meta: Dict[str, Any], start_line: int) -> Document:
        """Build a chunk Document copying *meta* and recording its 1-based start line."""
        new_meta = dict(meta)
        new_meta["chunk_start_line"] = start_line
        file_path = new_meta.get("file_path")
        if file_path:
            new_meta["title"] = str(file_path)
        return Document(text=chunk_text, meta_data=new_meta)
|
|
||
|
|
||
class CodeAwareSplitter(DataComponent):
    """Route documents to a syntax-aware code splitter or a plain text splitter.

    Documents flagged with ``meta_data["is_code"]`` are handled by the
    Tree-sitter based splitter; everything else is delegated to the wrapped
    ``TextSplitter``.
    """

    def __init__(
        self,
        *,
        text_splitter: TextSplitter,
        code_splitter: TreeSitterCodeSplitter,
    ) -> None:
        super().__init__()
        self._text_splitter = text_splitter
        self._code_splitter = code_splitter

    def __call__(self, documents: Sequence[Document]) -> Sequence[Document]:
        """Split every document, dispatching on its ``is_code`` metadata flag."""
        output: List[Document] = []
        for doc in documents:
            meta = getattr(doc, "meta_data", {}) or {}
            file_path = meta.get("file_path") or meta.get("title") or "<unknown>"
            is_code = bool(meta.get("is_code"))
            logger.info("Splitting document: %s (is_code=%s)", file_path, is_code)
            if is_code:
                chunks = self._code_splitter.split_document(doc)
                logger.info("Split result: %s -> %d chunks (code)", file_path, len(chunks))
            else:
                logger.info("TextSplitter start: %s", file_path)
                chunks = list(self._text_splitter([doc]))
                logger.info("TextSplitter result: %s -> %d chunks", file_path, len(chunks))
            # Fix: extend once for both branches instead of duplicating the call.
            output.extend(chunks)
        return output

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the component configuration into a plain dictionary."""
        return {
            "text_splitter": self._text_splitter.to_dict() if hasattr(self._text_splitter, "to_dict") else None,
            "code_splitter_config": {
                "chunk_size_lines": self._code_splitter.config.chunk_size_lines,
                "chunk_overlap_lines": self._code_splitter.config.chunk_overlap_lines,
                "min_chunk_lines": self._code_splitter.config.min_chunk_lines,
                "enabled": self._code_splitter.config.enabled,
            },
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "CodeAwareSplitter":
        """Rebuild a :class:`CodeAwareSplitter` from the mapping produced by :meth:`to_dict`."""
        # Fix: TextSplitter is already imported at module level; the redundant
        # function-local re-import has been removed.
        text_splitter_data = data.get("text_splitter")
        text_splitter = TextSplitter.from_dict(text_splitter_data) if text_splitter_data else TextSplitter()
        code_config = data.get("code_splitter_config", {})
        code_splitter = TreeSitterCodeSplitter(**code_config)
        return cls(text_splitter=text_splitter, code_splitter=code_splitter)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To improve maintainability, you can also import
asdicthere. It can be used later to simplify the serialization of theCodeSplitterConfigdataclass into a dictionary.