# Embedding pipeline chunking #38
Base: `main`
```
@@ -0,0 +1 @@
notes/
```
```
@@ -0,0 +1,84 @@
```

# Smart Notes - Local Q&A (RAG MVP)

This is a minimal, local-first MVP that allows users to ask natural-language questions over their markdown notes.

## Features (Current MVP)

- Loads markdown files from a local `notes/` directory
- Supports natural-language questions (e.g., "what is AI", "where is AI used")
- Returns sentence-level answers from notes
- Shows the source note filename
- Interactive CLI loop (type `exit` to quit)

This is a starter implementation intended to be extended with embeddings and vector search in future iterations.

---

## How it works

1. Notes are loaded from the local `notes/` directory.
2. Question words (what, where, who, when, etc.) are filtered out.
3. Notes are split into sentences.
4. Relevant sentences are returned based on keyword matching.

---
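The four steps above can be sketched end to end. This is a minimal illustration, not the module's exact API; the stopword list is abridged and the function name is illustrative:

```python
import re

# Abridged stopword list for illustration; the real CLI filters more words.
QUESTION_WORDS = {"what", "where", "who", "when", "is", "are", "the", "a"}

def search(query: str, note_text: str) -> list:
    # Steps 1-2: drop question/stop words from the query.
    keywords = [w.lower() for w in query.split() if w.lower() not in QUESTION_WORDS]
    # Step 3: split the note into sentences on terminal punctuation.
    sentences = re.split(r"(?<=[.!?])\s+", note_text)
    # Step 4: keep sentences containing any remaining keyword.
    return [s for s in sentences if any(k in s.lower() for k in keywords)]

note = "AI is the simulation of human intelligence. Bread rises with yeast."
print(search("what is AI", note))  # ['AI is the simulation of human intelligence.']
```

Note that matching is plain substring containment, so short keywords can match inside unrelated words.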
## How to run

```bash
python smart-notes/rag_mvp/qa_cli.py
```

Example session:

```
>> what is AI

[1] From test.md:
Artificial Intelligence (AI) is the simulation of human intelligence in machines.
```

Other example queries:

- what is machine learning
- how is machine learning used
- difference between AI and ML
# Smart Notes - RAG MVP (Embeddings & FAISS)

This project is a simple **Retrieval-Augmented Generation (RAG)** pipeline for Smart Notes.
It allows users to store notes, convert them into embeddings, and search relevant notes using vector similarity.

---

## Features

- Convert notes into embeddings using Sentence Transformers
- Store and search embeddings using FAISS (CPU)
- CLI tool to ask questions about your notes
- Simple chunking for text files
- Works fully offline after model download

---

## Tech Stack

- Python 3.10+
- sentence-transformers
- FAISS (faiss-cpu)
- HuggingFace Transformers

---

## Project Structure

```bash
smart-notes/
├── rag_mvp/
│   ├── embed.py      # Embedding logic
│   ├── index.py      # FAISS index creation
│   ├── qa_cli.py     # CLI for asking questions
│   └── utils.py      # Helper functions
├── notes/            # Put your .txt notes here
├── requirements.txt
└── README.md
```
**Comment on lines +75 to +84** (Contributor)

Project structure in the README doesn't match the actual file layout. The documented structure references … Please update the project structure to reflect the real file layout and close the code fence.
```
@@ -0,0 +1,31 @@
```

```python
"""
Chunking utilities for splitting long notes into overlapping chunks.
This helps embeddings capture local context.
"""

from typing import List


def chunk_text(text: str, max_length: int = 500, overlap: int = 50) -> List[str]:
    if not text:
        return []

    chunks = []
    start = 0
    text = text.strip()

    while start < len(text):
        end = start + max_length
        chunk = text[start:end].strip()

        if chunk:
            chunks.append(chunk)

        if end >= len(text):
            break

        start = end - overlap
        if start < 0:
            start = 0

    return chunks
```

**Comment on lines +9 to +29** (Contributor)

Infinite loop when `overlap >= max_length`: if `end < len(text)`, the window never advances, because `start = end - overlap` moves backwards or stays put, so the same chunk is appended forever. Proposed fix:

```diff
 def chunk_text(text: str, max_length: int = 500, overlap: int = 50) -> List[str]:
     if not text:
         return []
+    if overlap >= max_length:
+        raise ValueError("overlap must be less than max_length")
     chunks = []
     start = 0
```
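To make the overlap behaviour concrete, here is a small self-contained run of the same sliding-window logic (the function is re-declared in condensed form so the snippet stands alone):

```python
def chunk_text(text: str, max_length: int = 500, overlap: int = 50) -> list:
    # Same sliding-window logic as the module above, condensed for the demo.
    if not text:
        return []
    text = text.strip()
    chunks, start = [], 0
    while start < len(text):
        end = start + max_length
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= len(text):
            break
        start = max(end - overlap, 0)
    return chunks

text = "0123456789" * 10  # 100 characters
chunks = chunk_text(text, max_length=40, overlap=10)
print(len(chunks))     # 3 windows: [0:40], [30:70], [60:100]
print(chunks[1][:3])   # '012' - the second chunk starts at offset 30
```

With these arguments each chunk shares its last `overlap` characters with the next one, so the window advances by `max_length - overlap` characters per step.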
```
@@ -0,0 +1,30 @@
```

```python
"""
Embedding wrapper for converting text chunks into vectors.
Supports pluggable embedding backends later (Ollama, OpenAI, SentenceTransformers).
"""

from typing import List
import numpy as np

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    SentenceTransformer = None


class Embedder:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        if SentenceTransformer is None:
            raise ImportError(
                "sentence-transformers not installed. Run: pip install sentence-transformers"
            )

        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def embed(self, texts: List[str]) -> np.ndarray:
        if not texts:
            return np.array([])

        embeddings = self.model.encode(texts, convert_to_numpy=True)
        return embeddings
```
```
@@ -0,0 +1,41 @@
```

```python
"""
Simple vector indexer using FAISS for similarity search.
"""

from typing import List
import numpy as np

try:
    import faiss
except ImportError:
    faiss = None


class VectorIndexer:
    def __init__(self, dim: int):
        if faiss is None:
            raise ImportError("faiss not installed. Run: pip install faiss-cpu")

        self.dim = dim
        self.index = faiss.IndexFlatL2(dim)
        self.texts: List[str] = []

    def add(self, embeddings: np.ndarray, chunks: List[str]):
        if len(embeddings) == 0:
            return

        self.index.add(embeddings)
        self.texts.extend(chunks)

    def search(self, query_embedding: np.ndarray, k: int = 3):
        if self.index.ntotal == 0:
            return []

        distances, indices = self.index.search(query_embedding.reshape(1, -1), k)
        results = []

        for idx in indices[0]:
            if idx < len(self.texts):
                results.append(self.texts[idx])

        return results
```

**Comment on lines +34 to +39** (Contributor)

Bug: FAISS returns `-1` for missing neighbours. When the index has fewer vectors than `k`, `search` pads `indices` with `-1`; since `-1 < len(self.texts)` is true, `self.texts[-1]` (the last chunk) is returned incorrectly. Proposed fix:

```diff
-        distances, indices = self.index.search(query_embedding.reshape(1, -1), k)
+        _distances, indices = self.index.search(query_embedding.reshape(1, -1), k)
         results = []
         for idx in indices[0]:
-            if idx < len(self.texts):
+            if 0 <= idx < len(self.texts):
                 results.append(self.texts[idx])
```

Ruff (0.15.0), RUF059: unpacked variable `distances` is never used; prefix it with an underscore or any other dummy variable pattern.
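For readers without `faiss-cpu` installed, the exhaustive L2 lookup that `IndexFlatL2` performs can be sketched in pure Python. This toy version (illustrative names, toy 2-D vectors) also shows why a plain top-k over the stored vectors never needs a `-1` sentinel:

```python
def l2_topk(vectors: list, query: list, k: int = 3) -> list:
    """Return indices of the k nearest vectors by squared L2 distance."""
    order = sorted(
        range(len(vectors)),
        key=lambda i: sum((a - b) ** 2 for a, b in zip(vectors[i], query)),
    )
    # At most len(vectors) results, so no -1 padding is ever produced.
    return order[:k]

vecs = [[0.0, 0.0], [1.0, 0.0], [5.0, 5.0]]
print(l2_topk(vecs, [0.9, 0.1], k=2))  # [1, 0]
print(l2_topk(vecs, [0.9, 0.1], k=5))  # [1, 0, 2] - k capped at index size
```

FAISS instead always returns exactly `k` slots and pads the missing ones with `-1`, which is the behaviour the review comment above guards against.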
```
@@ -0,0 +1,47 @@
```

```python
# rag_mvp/pipelines/embedding_pipeline.py

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


class EmbeddingPipeline:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name, cache_folder="D:/models_cache")
        self.index = None
        self.chunks = []

    def chunk_text(self, text, max_length=300, overlap=50):
        chunks = []
        start = 0

        while start < len(text):
            end = start + max_length
            chunk = text[start:end]
            chunks.append(chunk)
            start = end - overlap

        return chunks

    def build_index(self, chunks):
        embeddings = self.model.encode(chunks)
        embeddings = np.array(embeddings).astype("float32")

        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings)

        return embeddings

    def process_notes(self, text):
        self.chunks = self.chunk_text(text)
        embeddings = self.build_index(self.chunks)
        return self.chunks, embeddings

    def semantic_search(self, query, top_k=3):
        query_vec = self.model.encode([query])
        query_vec = np.array(query_vec).astype("float32")

        distances, indices = self.index.search(query_vec, top_k)
        results = [self.chunks[i] for i in indices[0]]
        return results
```

**Comment on lines +3 to +4** (Contributor, refactor suggestion, major)

Direct imports without graceful error handling, unlike the sibling modules (`embed.py` and `index.py` wrap their imports in `try/except ImportError`). More broadly, this pipeline class reimplements functionality already provided by those modules.

**Comment** (Contributor)

Hardcoded Windows-specific cache path breaks portability. Proposed fix:

```diff
-        self.model = SentenceTransformer(model_name, cache_folder="D:/models_cache")
+        self.model = SentenceTransformer(model_name)
```

**Comment on lines +14 to +24** (Contributor, refactor suggestion, major)

Duplicate `chunk_text`: this is a copy of the logic in the chunking utility module, minus its empty-input and whitespace safeguards, and with the same infinite-loop risk when `overlap >= max_length`. Reuse the shared helper instead.

**Comment on lines +41 to +47** (Contributor)

Same `-1` padding issue as in `index.py`: when the index holds fewer than `top_k` vectors, FAISS pads `indices` with `-1`, so `self.chunks[-1]` is returned incorrectly. Proposed fix:

```diff
     def semantic_search(self, query, top_k=3):
         query_vec = self.model.encode([query])
         query_vec = np.array(query_vec).astype("float32")
-        distances, indices = self.index.search(query_vec, top_k)
-        results = [self.chunks[i] for i in indices[0]]
+        _distances, indices = self.index.search(query_vec, top_k)
+        results = [self.chunks[i] for i in indices[0] if 0 <= i < len(self.chunks)]
         return results
```

Ruff (0.15.0), RUF059: unpacked variable `distances` is never used; prefix it with an underscore.
```
@@ -0,0 +1,109 @@
```

```python
import os
import re

# ------------------- embedding-pipeline chunking concept
from rag_mvp.pipelines.embedding_pipeline import EmbeddingPipeline
```
**Comment** (Contributor, repository: AOSSIE-Org/Info)

Absolute import will fail when the script is run as documented (`python smart-notes/rag_mvp/qa_cli.py`): `rag_mvp` is not importable from that invocation, so this line raises `ModuleNotFoundError`. Use relative imports (…)
```python
def demo_embeddings_pipeline():
    pipeline = EmbeddingPipeline()

    note_text = """
    Python is a programming language.
    It is widely used in AI and machine learning projects.
    Smart Notes helps users organize knowledge using embeddings.
    """

    chunks, embeddings = pipeline.process_notes(note_text)

    print("\n--- Chunks Created ---")
    for i, c in enumerate(chunks):
        print(f"[{i}] {c}")

    query = "What is Python used for?"
    results = pipeline.semantic_search(query)

    print("\n--- Search Results ---")
    for r in results:
        print("-", r)
# -------------------------------------------------


QUESTION_WORDS = {
    "what", "where", "who", "when", "which",
    "is", "are", "was", "were", "the", "a", "an",
    "of", "to", "in", "on", "for"
}

NOTES_DIR = "notes"
```
**Comment** (Contributor)

`NOTES_DIR = "notes"` is resolved relative to the current working directory, so the CLI only finds notes when launched from the right directory. Proposed fix: resolve relative to the script location.

```diff
-NOTES_DIR = "notes"
+NOTES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "notes")
```
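To see what the proposed script-relative fix resolves to, here is a small stand-alone check (the helper name and the example path are hypothetical):

```python
import os

def notes_dir(script_file: str) -> str:
    # Mirrors the proposed fix: notes/ is looked up two directories
    # above the script, regardless of the current working directory.
    return os.path.normpath(
        os.path.join(os.path.dirname(os.path.abspath(script_file)), "..", "..", "notes")
    )

# With a hypothetical layout /repo/smart-notes/rag_mvp/qa_cli.py:
print(notes_dir("/repo/smart-notes/rag_mvp/qa_cli.py"))  # /repo/notes on POSIX
```

Note the two `".."` components place `notes/` beside `smart-notes/`, one level above where the README's project tree shows it; whether that is intended is worth confirming.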
```python
def load_notes():
    notes = []
    if not os.path.exists(NOTES_DIR):
        print(f"Notes directory '{NOTES_DIR}' not found.")
        return notes

    for file in os.listdir(NOTES_DIR):
        if file.endswith(".md"):
            path = os.path.join(NOTES_DIR, file)
            with open(path, "r", encoding="utf-8") as f:
                notes.append({
                    "filename": file,
                    "content": f.read()
                })
    return notes


def split_sentences(text):
    return re.split(r'(?<=[.!?])\s+', text)


def search_notes(query, notes):
    results = []

    query_words = [
        word.lower()
        for word in query.split()
        if word.lower() not in QUESTION_WORDS
    ]

    for note in notes:
        sentences = split_sentences(note["content"])
        for sentence in sentences:
            sentence_lower = sentence.lower()
            if any(word in sentence_lower for word in query_words):
                results.append({
                    "filename": note["filename"],
                    "sentence": sentence.strip()
                })

    return results


if __name__ == "__main__":

    demo_embeddings_pipeline()  # Temporary demo for embeddings pipeline
```
**Comment on lines +85 to +87** (Contributor)

This forces the SentenceTransformer model to load (and potentially download) every time a user launches the CLI, even if they only want the keyword-based Q&A. This adds significant startup latency. Consider making the embedding demo opt-in (e.g., via a CLI flag) or removing it from the default flow.
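One way to make the demo opt-in, as the comment suggests, is a boolean flag; the flag name is illustrative, not part of the PR:

```python
import argparse

parser = argparse.ArgumentParser(description="Smart Notes Q&A CLI")
parser.add_argument(
    "--demo-embeddings",
    action="store_true",
    help="run the embedding pipeline demo (downloads/loads the model)",
)
# An empty argv is parsed here for illustration; real code would call parse_args().
args = parser.parse_args([])
print(args.demo_embeddings)  # False - the model is only loaded when requested
```

The unconditional `demo_embeddings_pipeline()` call would then run only when the flag is passed, keeping the default keyword-based CLI fast to start.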
```python
    notes = load_notes()

    print("Ask questions about your notes (type 'exit' to quit)\n")

    while True:
        query = input(">> ").strip()

        if query.lower() == "exit":
            print("Goodbye")
            break

        matches = search_notes(query, notes)

        if not matches:
            print("No relevant notes found.\n")
        else:
            print("\n--- Answers ---\n")
            for i, m in enumerate(matches, 1):
                print(f"[{i}] From {m['filename']}:")
                print(m["sentence"])
                print()
```
**Comment** (Contributor)

Malformed Markdown: an unclosed code block bleeds into the rest of the document. The code block opened at line 28 is never properly closed, so the example CLI output (lines 33-43) and everything after it gets swallowed into the code fence, making the second half of the README render as a raw code block rather than formatted documentation. Close the code block after the CLI example and before the second section heading.