speedyk-005
diff --git a/‎CHANGELOG.md‎
Lines changed: 8 additions & 3 deletions b/‎CHANGELOG.md‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎demo.py‎
Lines changed: 47 additions & 19 deletions b/‎demo.py‎
Lines changed: 47 additions & 19 deletions
diff --git a/‎src/chunklet/base_chunker.py‎
Lines changed: 46 additions & 0 deletions b/‎src/chunklet/base_chunker.py‎
Lines changed: 46 additions & 0 deletions
@@ -10,11 +10,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [2.1.0] - 2025-12-11
 
 ### Changed
-- Changed default `include_comments` to True
+- **Default `include_comments`:** Changed the default value of the `include_comments` parameter of `CodeChunker.chunk()` to `True`, aligning with most developers' expectations for comprehensive code processing.
+- **Code Chunker Modularization:** Refactored the `CodeChunker` class for better maintainability.
+    - Split into two files: `code_chunker.py` (main chunker logic) and `_code_structure_extractor.py` (structure extraction).
+    - Modularized the complex `extract_code_structure` method by extracting helper functions to reduce cognitive load.
+- **Base Chunker Inheritance:** Introduced a new `BaseChunker` abstract base class in `base_chunker.py` to standardize the interface for all chunkers.
 
 ### Fixed
-- Fixed late-binding issue in code chunker by modifying lambda pattern substitution
-- Fixed duplicate line de-annotation logic in code chunker
+- **Late-Binding Closure Bug:** Fixed a classic Python closure bug in the code annotation loop of `CodeChunker`. The original `pattern.sub(lambda match: self._annotate_block(tag, match), code)` caused the lambda to reference the final value of `tag` after the loop completed. Resolved by changing to `pattern.sub(lambda match, tag=tag: self._annotate_block(tag, match), code)`, using the default argument trick to capture the current `tag` value at definition time.
+- **Duplicate Line De-annotation:** Removed redundant string slicing logic in `CodeChunker`'s internal processing. Line de-annotation was being performed twice—once during regex substitution and again via manual slicing—creating ambiguity and potential "ghost slicing" where lines could be misinterpreted. The chunker now relies solely on regex substitution for de-annotation, simplifying the control flow.
+- **Decorator Separation Bug:** Fixed an issue in `CodeChunker` where decorators (e.g., `@property`) were incorrectly separated from their associated functions into different chunks. Added a flush condition in `extract_code_structure` to handle the first decorator/attribute (`len(buffer["META"]) == 1`) and non-consecutive DOC lines, ensuring decorators group with their functions for better semantic chunking.
 
 ---
 
 
@@ -1,29 +1,57 @@
-from chunklet.plain_text_chunker import PlainTextChunker
-from chunklet.common.token_utils import count_tokens
+from chunklet.code_chunker import CodeChunker
 
 
-def simple_token_counter(text: str) -> int:
-    """A simple token counter that splits by spaces."""
-    return len(text.split())
+# Python code sample with decorators
+code_sample = '''
+"""Module docstring for demo."""
 
+import os
 
-# Text from the example
-haystack = "I am writing a letter ! Sometimes, I forget to put spaces and do weird stuff with punctuation ?"
+class Calculator:
+    """A simple calculator class."""
 
-# Instantiate the chunker with a simple token counter
-chunker = PlainTextChunker(token_counter=simple_token_counter)
+    def __init__(self):
+        self._value = 0
+        self._verbose = True
 
-# Chunk the text with a max_tokens limit that will likely split the text
-# The goal is to see if the span of the second chunk is correctly identified.
-chunk_boxes = chunker.chunk(text=haystack, max_tokens=12)
+    @property
+    def current_value(self):
+        """Get the current value."""
+        return self._value
+
+    @current_value.setter
+    def current_value(self, value):
+        """Set the current value."""
+        self._value = value
+
+    def add(self, x, y):
+        """Add two numbers."""
+        result = x + y
+        return result
+
+    def multiply(self, x, y):
+        """Multiply two numbers."""
+        return x * y
+
+def standalone_function():
+    """A standalone function."""
+    return True
+'''
+
+# Instantiate the chunker
+chunker = CodeChunker(verbose=True)
+
+# Chunk the code with max_functions=1 to see splitting
+chunk_boxes = chunker.chunk(source=code_sample, max_functions=1)
 
 # Print the results
-print(f"Original Text: '{haystack}'")
-print("-" * 20)
+print("=" * 50)
 for i, chunk_box in enumerate(chunk_boxes):
     print(f"Chunk #{i+1}:")
-    print(f"  Content: '{chunk_box.content}'")
-    print(f"  Metadata Span: {chunk_box.metadata.span}")
-    start, end = chunk_box.metadata.span
-    print(f"  Span in Original: '{haystack[start:end]}'")
-    print("-" * 20)
+    print(f"  Content:\n{chunk_box.content}")
+    print(f"  Tree: {chunk_box.metadata.tree}")
+    print(f"  Start Line: {chunk_box.metadata.start_line}")
+    print(f"  End Line: {chunk_box.metadata.end_line}")
+    print(f"  Span: {chunk_box.metadata.span}")
+    print(f"  Source: {chunk_box.metadata.source}")
+    print("=" * 50)
@@ -0,0 +1,46 @@
+"""
+Base Chunker Abstract Class
+
+Defines the interface for chunkers.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Generator
+from box import Box
+from loguru import logger
+
+
+class BaseChunker(ABC):
+    """
+    Abstract base class for chunkers.
+
+    Subclasses must implement `chunk` and `batch_chunk`; `log_info` is shared.
+    """
+
+    def __init__(self, verbose: bool = False) -> None:
+        self.verbose = verbose  # read by log_info to decide whether to log
+
+    @abstractmethod
+    def chunk(self, *args, **kwargs) -> list[Box]:
+        """
+        Extract chunks; the accepted arguments are left to each subclass.
+
+        Returns:
+            list[Box]: List of chunks with content and metadata.
+        """
+        pass  # abstract: no default implementation
+
+    @abstractmethod
+    def batch_chunk(self, *args, **kwargs) -> Generator[Box, None, None]:
+        """
+        Process multiple items in parallel (arguments are subclass-defined).
+
+        Yields:
+            Box: `Box` object, representing a chunk with its content and metadata.
+        """
+        pass  # abstract: no default implementation
+
+    def log_info(self, *args, **kwargs) -> None:
+        """Pass the arguments to `logger.info` only when `self.verbose` is truthy."""
+        if self.verbose:
+            logger.info(*args, **kwargs)