speedyk-005
diff --git a/‎README.md‎
Lines changed: 3 additions & 3 deletions b/‎README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎audit_migration.py‎
Lines changed: 125 additions & 103 deletions b/‎audit_migration.py‎
Lines changed: 125 additions & 103 deletions
diff --git a/‎docs/getting-started/cli.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/getting-started/cli.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/getting-started/installation.md‎
Lines changed: 11 additions & 2 deletions b/‎docs/getting-started/installation.md‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎docs/getting-started/programmatic/document_chunker.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/getting-started/programmatic/document_chunker.md‎
Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ chunklet --version
 > [!TIP]
 > <b>Termux (Android)</b>
 >
-> No rust toolchain on Termux (especially python 3.13) ? Install pydantic-core pre-built wheels first then retry installing chunklet-py:
+> No Rust toolchain on Termux (especially with Python 3.13)? Install the pre-built pydantic-core wheels first, then retry installing chunklet-py:
 >
 > ```bash
 > pip install typing-extensions
@@ -121,11 +121,11 @@ That's it! You're all set to start chunking.
 
 Want to unlock more Chunklet-py superpowers? Add these optional dependencies based on what you need:
 
-*   **Document Processing:** For handling `.pdf`, `.docx`, `.epub`, and other document formats:
+*   **Structured Documents:** For handling `.pdf`, `.docx`, `.epub`, and other document formats:
     ```bash
     pip install "chunklet-py[structured-document]"
     ```
-*   **Code Chunking:** For advanced code analysis and chunking features:
+*   **Code Chunking:** For language-agnostic code chunking features:
     ```bash
     pip install "chunklet-py[code]"
     ```
 
@@ -10,9 +10,9 @@
 
 console = Console()
 
-CHUNKER_CLASSES = {"Chunklet", "PlainTextChunker", "DocumentChunker", "CodeChunker"}
+DEPRECATED_ARGUMENTS = {"use_cache": "Remove it.", "custom_splitters": "Remove it."}
 
-V1_ARGS = {"use_cache": "Remove it.", "custom_splitters": "Remove it."}
+CHUNKER_CLASS_NAMES = {"Chunklet", "PlainTextChunker", "DocumentChunker", "CodeChunker"}
 
 LEGACY_IMPORTS = {
     "chunklet.utils.detect_text_language": "Use 'SentenceSplitter.detected_top_language()' instead.",
@@ -36,21 +36,27 @@ class MigrationAuditor:
     """
 
     def __init__(self):
-        self._found_any = False
+        self._has_legacy_issues = False
         self._console = console
 
-    def audit(self, directory="."):
+    def audit(self, path="."):
         """
-        Public method to audit a directory for legacy v1 patterns.
+        Public method to audit a file or directory for legacy v1 patterns.
         """
-        self._found_any = False
-        directory = Path(directory)
+        self._has_legacy_issues = False
+        path = Path(path)
 
         self._print_header()
 
-        for file_path in directory.rglob("*.py"):
+        # Handle single file or directory
+        targets = [path] if path.is_file() else path.rglob("*.py")
+
+        for file_path in targets:
+            # Skip the audit script itself and virtual environments
             if file_path.name == SCRIPT_NAME:
                 continue
+            if ".venv" in file_path.parts or "site-packages" in file_path.parts:
+                continue
 
             try:
                 content = file_path.read_text(encoding="utf-8")
@@ -71,7 +77,7 @@ def _print_header(self):
         )
 
     def _print_summary(self):
-        if not self._found_any:
+        if not self._has_legacy_issues:
             self._console.print(
                 "[bold green]✓ No legacy v1 patterns found. Code is up to date![/bold green]"
             )
@@ -86,37 +92,51 @@ def _audit_file(self, file_path: Path, content: str):
         except SyntaxError:
             return
 
-        tracked_instances = self._get_chunklet_instances(tree)
+        lines = content.splitlines()
+        tracked_instances = self._get_chunker_instances(tree)
 
-        self._audit_imports(file_path, tree)
-        self._audit_class_instantiation(file_path, tree, tracked_instances)
-        self._audit_calls(file_path, tree, tracked_instances)
-        self._audit_exceptions(file_path, tree)
+        self._audit_imports(file_path, tree, lines)
+        self._audit_class_instantiation(file_path, tree, tracked_instances, lines)
+        self._audit_calls(file_path, tree, tracked_instances, lines)
+        self._audit_exceptions(file_path, tree, lines)
 
-    def _get_chunklet_instances(self, tree: ast.AST) -> set:
-        instances = set()
-        for node in ast.walk(tree):
-            if isinstance(node, ast.Assign):
-                for target in node.targets:
-                    if isinstance(target, ast.Name):
-                        if isinstance(node.value, ast.Call):
-                            if isinstance(node.value.func, ast.Name):
-                                if node.value.func.id in CHUNKER_CLASSES:
-                                    instances.add(target.id)
-        return instances
-
-    def _audit_imports(self, file_path: Path, tree: ast.AST):
+    def _get_line(self, lines: list[str], line_no: int) -> str:
+        try:
+            return lines[line_no - 1]
+        except IndexError:
+            return ""
+
+    def _is_chunker_call(self, node: ast.AST) -> bool:
+        """Check if a node is a call to a known chunker class."""
+        if not isinstance(node, ast.Call):
+            return False
+        if not isinstance(node.func, ast.Name):
+            return False
+        return node.func.id in CHUNKER_CLASS_NAMES
+
+    def _get_chunker_instances(self, tree: ast.AST) -> set:
+        """Find all chunker class instantiations in the AST."""
+        return {
+            target.id
+            for node in ast.walk(tree)
+            if isinstance(node, ast.Assign)
+            and isinstance(node.value, ast.Call)
+            and self._is_chunker_call(node.value)
+            for target in node.targets
+            if isinstance(target, ast.Name)
+        }
+
+    def _audit_imports(self, file_path: Path, tree: ast.AST, lines: list[str]):
         for node in ast.walk(tree):
             if isinstance(node, ast.ImportFrom):
                 if node.module:
                     for legacy_module, fix_msg in LEGACY_IMPORTS.items():
                         if legacy_module in node.module:
-                            self._found_any = True
-                            line_no = node.lineno
+                            self._has_legacy_issues = True
                             self._print_issue(
                                 file_path,
-                                line_no,
-                                self._get_line(file_path, line_no),
+                                node.lineno,
+                                self._get_line(lines, node.lineno),
                                 f"Import '{node.module}'. {fix_msg}",
                                 "bold red",
                             )
@@ -125,96 +145,98 @@ def _audit_imports(self, file_path: Path, tree: ast.AST):
                 for alias in node.names:
                     for legacy_module, fix_msg in LEGACY_IMPORTS.items():
                         if legacy_module.split(".")[-1] == alias.name:
-                            self._found_any = True
-                            line_no = node.lineno
+                            self._has_legacy_issues = True
                             self._print_issue(
                                 file_path,
-                                line_no,
-                                self._get_line(file_path, line_no),
+                                node.lineno,
+                                self._get_line(lines, node.lineno),
                                 f"Import '{alias.name}'. {fix_msg}",
                                 "bold red",
                             )
 
     def _audit_class_instantiation(
-        self, file_path: Path, tree: ast.AST, tracked_instances: set
+        self, file_path: Path, tree: ast.AST, tracked_instances: set, lines: list[str]
     ):
+        # Find "Chunklet" instantiations specifically (not all chunker classes)
         for node in ast.walk(tree):
-            if isinstance(node, ast.Assign):
-                for target in node.targets:
-                    if isinstance(target, ast.Name):
-                        if isinstance(node.value, ast.Call):
-                            if isinstance(node.value.func, ast.Name):
-                                if node.value.func.id == "Chunklet":
-                                    tracked_instances.add(target.id)
-                                    self._found_any = True
-                                    line_no = node.lineno
-                                    self._print_issue(
-                                        file_path,
-                                        line_no,
-                                        self._get_line(file_path, line_no),
-                                        "Rename 'Chunklet' to 'DocumentChunker'.",
-                                        "bold red",
-                                    )
-
-    def _audit_calls(self, file_path: Path, tree: ast.AST, tracked_instances: set):
-        for node in ast.walk(tree):
-            if isinstance(node, ast.Call):
-                if isinstance(node.func, ast.Attribute):
-                    if isinstance(node.func.value, ast.Name):
-                        inst_name = node.func.value.id
+            if not isinstance(node, ast.Assign):
+                continue
+            if not isinstance(node.value, ast.Call):
+                continue
+            if not isinstance(node.value.func, ast.Name):
+                continue
+            if node.value.func.id != "Chunklet":
+                continue
+            if not node.targets or not isinstance(node.targets[0], ast.Name):
+                continue
 
-                        if inst_name not in tracked_instances:
-                            continue
+            target = node.targets[0].id
+            tracked_instances.add(target)
+            self._has_legacy_issues = True
+            self._print_issue(
+                file_path,
+                node.lineno,
+                self._get_line(lines, node.lineno),
+                "Rename 'Chunklet' to 'DocumentChunker'.",
+                "bold red",
+            )
 
-                        method_name = node.func.attr
+    def _audit_calls(self, file_path: Path, tree: ast.AST, tracked_instances: set, lines: list[str]):
+        for node in ast.walk(tree):
+            if not isinstance(node, ast.Call):
+                continue
+            if not isinstance(node.func, ast.Attribute):
+                continue
+            if not isinstance(node.func.value, ast.Name):
+                continue
 
-                        if method_name in LEGACY_METHODS:
-                            style = (
-                                "bold red"
-                                if method_name in ("preview_sentences",)
-                                else "yellow"
-                            )
-                            self._found_any = True
-                            self._print_issue(
-                                file_path,
-                                node.lineno,
-                                self._get_line(file_path, node.lineno),
-                                f"'{inst_name}.{method_name}()' - {LEGACY_METHODS[method_name]}",
-                                style,
-                            )
+            inst_name = node.func.value.id
+            if inst_name not in tracked_instances:
+                continue
 
-                        for arg, fix_msg in V1_ARGS.items():
-                            for kw in node.keywords:
-                                if kw.arg == arg:
-                                    self._found_any = True
-                                    self._print_issue(
-                                        file_path,
-                                        node.lineno,
-                                        self._get_line(file_path, node.lineno),
-                                        f"Argument '{arg}' is no longer supported. {fix_msg}",
-                                        "bold red",
-                                    )
-
-    def _audit_exceptions(self, file_path: Path, tree: ast.AST):
-        for node in ast.walk(tree):
-            if isinstance(node, ast.Name):
-                for old_name, fix_msg in LEGACY_EXCEPTIONS.items():
-                    if node.id == old_name:
-                        self._found_any = True
-                        line_no = node.lineno
+            method_name = node.func.attr
+
+            if method_name in LEGACY_METHODS:
+                style = (
+                    "bold red"
+                    if method_name in ("preview_sentences",)
+                    else "yellow"
+                )
+                self._has_legacy_issues = True
+                self._print_issue(
+                    file_path,
+                    node.lineno,
+                    self._get_line(lines, node.lineno),
+                    f"'{inst_name}.{method_name}()' - {LEGACY_METHODS[method_name]}",
+                    style,
+                )
+
+            for arg, fix_msg in DEPRECATED_ARGUMENTS.items():
+                for kw in node.keywords:
+                    if kw.arg == arg:
+                        self._has_legacy_issues = True
                         self._print_issue(
                             file_path,
-                            line_no,
-                            self._get_line(file_path, line_no),
-                            f"'{old_name}' - {fix_msg}",
+                            node.lineno,
+                            self._get_line(lines, node.lineno),
+                            f"Argument '{arg}' is no longer supported. {fix_msg}",
                             "bold red",
                         )
 
-    def _get_line(self, file_path: Path, line_no: int) -> str:
-        try:
-            return file_path.read_text(encoding="utf-8").splitlines()[line_no - 1]
-        except (IndexError, IOError):
-            return ""
+    def _audit_exceptions(self, file_path: Path, tree: ast.AST, lines: list[str]):
+        for node in ast.walk(tree):
+            if not isinstance(node, ast.Name):
+                continue
+            if node.id not in LEGACY_EXCEPTIONS:
+                continue
+            self._has_legacy_issues = True
+            self._print_issue(
+                file_path,
+                node.lineno,
+                self._get_line(lines, node.lineno),
+                f"'{node.id}' - {LEGACY_EXCEPTIONS[node.id]}",
+                "bold red",
+            )
 
     def _print_issue(self, file_path, line_no, line_content, message, style):
         msg = Text()
 
@@ -69,7 +69,7 @@ The `chunk` command is where the real magic happens! It's your versatile tool fo
 | `--max-tokens` | Maximum number of tokens per chunk. Applies to all chunking strategies. (Must be >= 12) | None |
 | `--max-sentences` | Maximum number of sentences per chunk. Applies to DocumentChunker. (Must be >= 1) | None |
 | `--max-section-breaks` | Maximum number of section breaks per chunk. Section breaks include Markdown headings (# to ######), horizontal rules (---, ***, ___), and <details> tags. Applies to DocumentChunker. (Must be >= 1) | None |
-| `--overlap-percent` | Percentage of overlap between chunks (0-85). Applies to DocumentChunker. | 20.0 |
+| `--overlap-percent` | Percentage of overlap between chunks (0-75). Applies to DocumentChunker. | 20.0 |
 | `--offset` | Starting sentence offset for chunking. Applies to DocumentChunker. | 0 |
 | `--lang` | Language of the text (e.g., 'en', 'fr', 'auto'). | auto |
 | `--metadata` | Include rich metadata (source, span, chunk num, etc.) in the output. If `--destination` is a directory, metadata is saved as separate `.json` files; otherwise, it's included inline in the output. | False |
 
@@ -3,7 +3,7 @@
 Ready to get Chunklet-py up and running? Fantastic! This guide will walk you through the installation process, making it as smooth as possible.
 
 !!! info "Requirements"
-    Chunklet-py requires **Python 3.10 or newer**. We recommend using Python 3.11+ for the best experience.
+    Chunklet-py requires **Python 3.11 or newer**. We recommend using Python 3.12+ for the best experience.
 
 !!! note "chunklet-py (aka chunklet)"
     The old `chunklet` package is no longer maintained. Use `chunklet-py` to get the latest version.
@@ -18,6 +18,15 @@ pip install chunklet-py
 chunklet --version
 ```
 
+!!! tip "Termux (Android)"
+    No Rust toolchain on Termux (especially with Python 3.13)? Install the pre-built pydantic-core wheels first, then retry installing chunklet-py:
+    
+    ```bash
+    pip install typing-extensions
+    pip install pydantic-core --index-url https://termux-user-repository.github.io/pypi/
+    pip install "pydantic>=2.12.4,<2.13"
+    ```
+
 And that's all there is to it! You're now ready to start using Chunklet-py.
 
 ## Optional Dependencies
@@ -51,7 +60,7 @@ cd chunklet-py
 pip install .[all]
 ```
 
-But why would you want to do that? The easy way is so much easier.
+But why would you want to do that? The pip way is so much easier.
 
 ## Contributing to Chunklet-py
 
 
@@ -411,7 +411,7 @@ for i, chunk in enumerate(chunks):
 !!! note "Special Handling for Streaming Processors"
     Some processors work differently due to their streaming nature - they yield content page by page or in blocks rather than all at once. This means they require special care:
 
-    **Streaming processors** (PDF, EPUB, DOCX, ODT): These beauties process content as they go, so they're designed for `chunk_files` method. Using them with `chunk_file` will throw a [`FileProcessingError`](../../exceptions-and-warnings.md#fileprocessingerror) since `chunk_file` expects all content upfront.
+    **Streaming processors** (PDF, EPUB, DOCX, ODT): These beauties process content as they go, so they're designed for the `chunk_files` method. Using them with `chunk_file` will raise an [`UnsupportedFileTypeError`](../../exceptions-and-warnings.md#unsupportedfiletypeerror) since `chunk_file` expects all content upfront.
 
     **Regular processors** work fine with both `chunk_file` and `chunk_files` methods.