Commit 6dc0b3e

feat: insights, RAG improvements, scoring config
Enhance ctxeng info with size/import-graph insights and a markdown pie chart.
Improve RAG with Python structure-aware chunking and a chunk context window.
Add scoring-weights config support and multi-language AST symbols (JS/TS/Go).

Made-with: Cursor
1 parent 07dd9d8 commit 6dc0b3e

15 files changed

Lines changed: 584 additions & 52 deletions

README.md

Lines changed: 23 additions & 2 deletions
@@ -8,6 +8,7 @@
 <p align="center">
   <a href="https://pypi.org/project/ctxeng/"><img src="https://img.shields.io/pypi/v/ctxeng?color=blue&label=pypi&cacheSeconds=3600" alt="PyPI"></a>
   <a href="https://github.com/sayeem3051/python-context-engineer/actions"><img src="https://github.com/sayeem3051/python-context-engineer/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
+  <a href="https://codecov.io/gh/Sayeem3051/python-context-engineer"><img src="https://codecov.io/gh/Sayeem3051/python-context-engineer/branch/main/graph/badge.svg" alt="Coverage"></a>
   <a href="https://pypi.org/project/ctxeng/"><img src="https://img.shields.io/pypi/pyversions/ctxeng?cacheSeconds=3600" alt="Python"></a>
   <img src="https://img.shields.io/github/license/sayeem3051/python-context-engineer?cacheSeconds=3600" alt="License">
   <a href="https://pepy.tech/project/ctxeng"><img src="https://static.pepy.tech/badge/ctxeng/month" alt="Downloads"></a>
@@ -29,6 +30,11 @@ The quality of your LLM's output depends almost entirely on *what you put in the
 - **Fits the budget** — smart truncation keeps the best parts within any model's token limit
 - **Ships ready to paste** — XML, Markdown, or plain text output that works with Claude, GPT-4o, Gemini, and every other model
 
+Docs:
+- [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md)
+- [`docs/PERFORMANCE.md`](docs/PERFORMANCE.md)
+- [`docs/FAQ.md`](docs/FAQ.md)
+
 One small dependency ([pathspec](https://pypi.org/project/pathspec/)) powers `.ctxengignore` (gitignore-style patterns). Works with any LLM.
 
 ---
@@ -59,6 +65,8 @@ For semantic similarity scoring (optional local embeddings):
 pip install "ctxeng[semantic]"
 ```
 
+Default semantic model is `all-mpnet-base-v2`. Override with `--semantic-model` when building context.
+
 For one-line LLM calls:
 
 ```bash
@@ -222,6 +230,7 @@ For large repositories, `--rag` switches from whole-file inclusion to **chunk-le
 
 - Uses **embeddings** when `sentence-transformers` is installed
 - Falls back to **lexical retrieval** when embeddings aren't available
+- Chunks Python files by **function/class** boundaries when possible
 
 ```bash
 ctxeng build "Explain the login flow" --rag
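The lexical fallback mentioned in the bullets above can be sketched as a token-overlap scorer: rank each chunk by how many query tokens it contains. This is an illustrative sketch, not ctxeng's actual retrieval code; `lexical_score` and the sample chunks are made up for the example.

```python
import re
from collections import Counter

def lexical_score(query: str, chunk: str) -> float:
    """Fraction of query tokens that also appear in the chunk."""
    def tokens(s: str) -> list[str]:
        return re.findall(r"[a-z0-9_]+", s.lower())
    q, c = Counter(tokens(query)), Counter(tokens(chunk))
    overlap = sum(min(q[t], c[t]) for t in q)
    return overlap / (sum(q.values()) or 1)

chunks = [
    "def render_chart(data): ...",
    "def login(user, password): check_password(user, password)",
]
# Rank chunks against the query; the login chunk should score highest
ranked = sorted(chunks, key=lambda ch: lexical_score("Explain the login flow", ch), reverse=True)
print(ranked[0])
```

Embedding retrieval replaces this overlap score with cosine similarity between dense vectors, but the ranking step is the same.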
@@ -514,7 +523,7 @@ build options:
   --show-cost / --no-show-cost
                     Include estimated input cost in stderr summary (default: on)
   --semantic        Enable semantic similarity scoring (requires sentence-transformers)
-  --semantic-model  Semantic model name (default: all-MiniLM-L6-v2)
+  --semantic-model  Semantic model name (default: all-mpnet-base-v2)
   --gitignore / --no-gitignore
                     Respect .gitignore in addition to .ctxengignore (default: on)
   --allow           Allowlist path prefixes; only these paths may be included
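Under `--semantic`, files are ranked by the similarity of their embedding to the query's embedding; the core comparison is cosine similarity. A toy sketch with hand-made vectors (real embeddings come from a sentence-transformers model such as `all-mpnet-base-v2`; `cosine` here is illustrative):

```python
import math

def cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity: dot product divided by the product of vector norms."""
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b))
    return dot / norm if norm else 0.0

print(cosine([1.0, 0.0], [1.0, 0.0]))  # → 1.0 (same direction)
print(cosine([1.0, 0.0], [0.0, 1.0]))  # → 0.0 (orthogonal, unrelated)
```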
@@ -541,7 +550,7 @@ build options:
 watch options:
   --interval S      Polling interval in seconds (default: 1.0)
   --semantic        Enable semantic similarity scoring (requires sentence-transformers)
-  --semantic-model  Semantic model name (default: all-MiniLM-L6-v2)
+  --semantic-model  Semantic model name (default: all-mpnet-base-v2)
   --gitignore / --no-gitignore
                     Respect .gitignore in addition to .ctxengignore (default: on)
   --allow           Allowlist path prefixes; only these paths may be included
@@ -648,6 +657,18 @@ You could. But you'll hit these problems immediately:
 
 PRs welcome! See [CONTRIBUTING.md](CONTRIBUTING.md).
 
+## Test coverage
+
+We track coverage with `pytest-cov` in CI and upload `coverage.xml` to Codecov.
+
+- **Goal**: keep coverage **above 80%**
+- **Local run**:
+
+```bash
+pip install -e ".[dev]"
+pytest --cov=ctxeng --cov-report=term-missing --cov-report=xml
+```
+
 ```bash
 git clone https://github.com/sayeem3051/python-context-engineer
 cd python-context-engineer

ctxeng/builder.py

Lines changed: 13 additions & 2 deletions
@@ -43,7 +43,7 @@ def __init__(self, root: str | Path = ".") -> None:
         self._use_import_graph = True
         self._import_graph_depth = 1
         self._use_semantic = False
-        self._semantic_model = "all-MiniLM-L6-v2"
+        self._semantic_model = "all-mpnet-base-v2"
         self._respect_gitignore = True
         self._allow_paths: list[str | Path] = []
         self._deny_paths: list[str | Path] = []
@@ -54,12 +54,14 @@ def __init__(self, root: str | Path = ".") -> None:
         self._rag_max_chunks = 20
         self._rag_chunk_max_lines = 120
         self._rag_chunk_overlap = 20
+        self._rag_chunk_context_lines = 3
         self._rag_embedding_model = "all-MiniLM-L6-v2"
         self._skeleton = False
         self._redact = True
         self._fewshot = False
         self._fewshot_dir: str | Path = ".ctxeng/examples"
         self._fewshot_max_files = 5
+        self._scoring_config: str | Path | None = None
 
     def for_model(self, model: str) -> ContextBuilder:
         """Set the target model (determines token budget)."""
@@ -118,7 +120,7 @@ def no_import_graph(self) -> ContextBuilder:
         self._use_import_graph = False
         return self
 
-    def use_semantic(self, model: str = "all-MiniLM-L6-v2") -> ContextBuilder:
+    def use_semantic(self, model: str = "all-mpnet-base-v2") -> ContextBuilder:
         """Enable semantic similarity scoring (requires `sentence-transformers`)."""
         self._use_semantic = True
         self._semantic_model = model
@@ -153,13 +155,15 @@ def rag(
         max_chunks: int = 20,
         chunk_max_lines: int = 120,
         chunk_overlap: int = 20,
+        chunk_context_lines: int = 3,
         embedding_model: str = "all-MiniLM-L6-v2",
     ) -> ContextBuilder:
         """Enable chunk-level retrieval (RAG)."""
         self._rag = enabled
         self._rag_max_chunks = max_chunks
         self._rag_chunk_max_lines = chunk_max_lines
         self._rag_chunk_overlap = chunk_overlap
+        self._rag_chunk_context_lines = chunk_context_lines
         self._rag_embedding_model = embedding_model
         return self
 
@@ -186,6 +190,11 @@ def fewshot(
         self._fewshot_max_files = max_files
         return self
 
+    def scoring_config(self, path: str | Path) -> ContextBuilder:
+        """Load scoring weights from a config file."""
+        self._scoring_config = path
+        return self
+
     def build(self, query: str = "") -> Context:
         """
         Build and return the optimized Context.
@@ -229,10 +238,12 @@ def _build_engine(self) -> ContextEngine:
             rag_max_chunks=self._rag_max_chunks,
             rag_chunk_max_lines=self._rag_chunk_max_lines,
             rag_chunk_overlap=self._rag_chunk_overlap,
+            rag_chunk_context_lines=self._rag_chunk_context_lines,
             rag_embedding_model=self._rag_embedding_model,
             skeleton=self._skeleton,
             redact=self._redact,
             fewshot=self._fewshot,
             fewshot_dir=self._fewshot_dir,
             fewshot_max_files=self._fewshot_max_files,
+            scoring_config=self._scoring_config,
         )
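The new options thread through the builder's fluent interface: each setter stores state and returns `self` so calls chain. A stripped-down standalone sketch of that pattern (`MiniBuilder` is illustrative, not the real `ContextBuilder`; `"weights.toml"` is a made-up path):

```python
from __future__ import annotations

from pathlib import Path

class MiniBuilder:
    """Toy fluent builder mirroring the rag()/scoring_config() additions."""

    def __init__(self) -> None:
        self._rag = False
        self._rag_chunk_context_lines = 3
        self._scoring_config: str | Path | None = None

    def rag(self, enabled: bool = True, *, chunk_context_lines: int = 3) -> MiniBuilder:
        self._rag = enabled
        self._rag_chunk_context_lines = chunk_context_lines
        return self  # returning self is what makes the calls chainable

    def scoring_config(self, path: str | Path) -> MiniBuilder:
        self._scoring_config = path
        return self

b = MiniBuilder().rag(chunk_context_lines=5).scoring_config("weights.toml")
print(b._rag, b._rag_chunk_context_lines, b._scoring_config)  # → True 5 weights.toml
```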

ctxeng/chunking.py

Lines changed: 85 additions & 1 deletion
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import ast
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -17,12 +18,35 @@ def id(self) -> str:
         return f"{self.path.as_posix()}:{self.start_line}-{self.end_line}"
 
 
+def chunk_file(
+    path: Path,
+    text: str,
+    *,
+    max_lines: int = 120,
+    overlap: int = 20,
+    context_lines: int = 3,
+) -> list[Chunk]:
+    """
+    Structure-aware chunking:
+    - Python: chunk by class/function using AST line spans.
+    - Other: fall back to line chunking.
+
+    `context_lines` expands each chunk with surrounding lines for better local context.
+    """
+    if path.suffix.lower() == ".py":
+        chunks = _chunk_python_ast(path, text, max_lines=max_lines, context_lines=context_lines)
+        if chunks:
+            return chunks
+    return chunk_text(path, text, max_lines=max_lines, overlap=overlap, context_lines=context_lines)
+
+
 def chunk_text(
     path: Path,
     text: str,
     *,
     max_lines: int = 120,
     overlap: int = 20,
+    context_lines: int = 0,
 ) -> list[Chunk]:
     """
     Split file content into overlapping line chunks.
@@ -44,13 +68,73 @@ def chunk_text(
     n = len(lines)
     while i < n:
         j = min(n, i + max_lines)
-        chunk_lines = lines[i:j]
         start_line = i + 1
         end_line = j
+
+        # Expand with surrounding context
+        start0 = max(1, start_line - context_lines)
+        end0 = min(n, end_line + context_lines)
+        chunk_lines = lines[start0 - 1 : end0]
+        start_line = start0
+        end_line = end0
         chunks.append(Chunk(path=path, start_line=start_line, end_line=end_line, text="\n".join(chunk_lines)))
         if j >= n:
             break
         i = max(0, j - overlap)
 
     return chunks
 
+
+def _chunk_python_ast(
+    path: Path,
+    text: str,
+    *,
+    max_lines: int,
+    context_lines: int,
+) -> list[Chunk]:
+    lines = text.splitlines()
+    if not lines:
+        return []
+    try:
+        tree = ast.parse(text)
+    except SyntaxError:
+        return []
+
+    spans: list[tuple[int, int]] = []
+    for node in ast.walk(tree):
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+            start = getattr(node, "lineno", None)
+            end = getattr(node, "end_lineno", None)
+            if isinstance(start, int) and isinstance(end, int) and end >= start:
+                spans.append((start, end))
+
+    if not spans:
+        return []
+
+    # Dedupe + sort
+    spans = sorted(set(spans))
+
+    out: list[Chunk] = []
+    n = len(lines)
+    for start, end in spans:
+        # Expand context window
+        s = max(1, start - context_lines)
+        e = min(n, end + context_lines)
+        if e - s + 1 > max_lines:
+            # Too big: fall back to line chunking inside span (no overlap; keep boundaries)
+            segment = "\n".join(lines[s - 1 : e])
+            for ch in chunk_text(path, segment, max_lines=max_lines, overlap=0, context_lines=0):
+                # Rebase line numbers
+                out.append(
+                    Chunk(
+                        path=path,
+                        start_line=s + (ch.start_line - 1),
+                        end_line=s + (ch.end_line - 1),
+                        text=ch.text,
+                    )
+                )
+        else:
+            out.append(Chunk(path=path, start_line=s, end_line=e, text="\n".join(lines[s - 1 : e])))
+
+    return out
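The heart of the structure-aware path above is collecting function/class line spans from the AST via `lineno`/`end_lineno`. A self-contained sketch of just that step (`python_spans` is illustrative; the real code wraps the spans into `Chunk` objects):

```python
import ast

def python_spans(source: str) -> list[tuple[int, int]]:
    """Collect (start, end) line spans of every function/class definition."""
    spans = set()
    for node in ast.walk(ast.parse(source)):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            if node.end_lineno is not None:
                spans.add((node.lineno, node.end_lineno))
    return sorted(spans)

src = "def a():\n    return 1\n\nclass B:\n    def m(self):\n        return 2\n"
print(python_spans(src))  # → [(1, 2), (4, 6), (5, 6)]
```

Note that `ast.walk` visits nested definitions too, so a method yields its own span overlapping its class's span, matching the diff's behavior.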
