From 564741c74ff1601740df8e84eb0d07e5fd7974f7 Mon Sep 17 00:00:00 2001
From: Tuba Javed
Date: Mon, 9 Feb 2026 00:18:11 +0530
Subject: [PATCH 1/2] Add local Q&A CLI MVP for markdown notes

---
 .gitignore                    |  1 +
 smart-notes/rag_mvp/README.md | 42 ++++++++++++++++
 smart-notes/rag_mvp/qa_cli.py | 77 +++++++++++++++++++++++++++++++++++
 3 files changed, 120 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 smart-notes/rag_mvp/README.md
 create mode 100644 smart-notes/rag_mvp/qa_cli.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e61812f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+notes/
diff --git a/smart-notes/rag_mvp/README.md b/smart-notes/rag_mvp/README.md
new file mode 100644
index 0000000..2ce2968
--- /dev/null
+++ b/smart-notes/rag_mvp/README.md
@@ -0,0 +1,42 @@
+# Smart Notes – Local Q&A (RAG MVP)
+
+This is a minimal, local-first MVP that allows users to ask natural-language questions over their markdown notes.
+
+## Features (Current MVP)
+
+- Loads markdown files from a local `notes/` directory
+- Supports natural-language questions (e.g., "what is AI", "where is AI used")
+- Returns sentence-level answers from notes
+- Shows the source note filename
+- Interactive CLI loop (type `exit` to quit)
+
+This is a starter implementation intended to be extended with embeddings and vector search in future iterations.
+
+---
+
+## How it works
+
+1. Notes are loaded from the local `notes/` directory.
+2. Question words (what, where, who, when, etc.) are filtered out of the query.
+3. Notes are split into sentences.
+4. Sentences containing any of the remaining keywords are returned, along with their source filename.
+
+---
+
+## How to run
+
+```bash
+python smart-notes/rag_mvp/qa_cli.py
+```
+
+Example session (other queries to try are listed at the end):
+
+```
+>> what is AI
+
+[1] From test.md:
+Artificial Intelligence (AI) is the simulation of human intelligence in machines.
+
+>> what is machine learning
+how is machine learning used
+difference between AI and ML
+```
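As a reviewer aid, here is a minimal, self-contained sketch of steps 2–4 of "How it works" (question-word filtering, sentence splitting, keyword matching). It is illustrative only and not part of the patch; the stop-word set below is a reduced stand-in for the `QUESTION_WORDS` defined in `qa_cli.py` in the next file.

```python
# Illustrative sketch only (not part of the patch); mirrors the matching logic
# implemented in qa_cli.py, with a reduced stop-word list.
import re

STOP_WORDS = {"what", "is", "the", "a"}  # subset of QUESTION_WORDS, for illustration

note = ("Artificial Intelligence (AI) is the simulation of human intelligence in machines. "
        "AI is widely used in search engines.")
query = "what is AI"

keywords = [w.lower() for w in query.split() if w.lower() not in STOP_WORDS]  # -> ['ai']
sentences = re.split(r"(?<=[.!?])\s+", note)
matches = [s for s in sentences if any(k in s.lower() for k in keywords)]

for s in matches:
    print(s)  # both sentences contain the keyword "ai"
```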
diff --git a/smart-notes/rag_mvp/qa_cli.py b/smart-notes/rag_mvp/qa_cli.py
new file mode 100644
index 0000000..210d56a
--- /dev/null
+++ b/smart-notes/rag_mvp/qa_cli.py
@@ -0,0 +1,77 @@
+import os
+import re
+
+QUESTION_WORDS = {
+    "what", "where", "who", "when", "which",
+    "is", "are", "was", "were", "the", "a", "an",
+    "of", "to", "in", "on", "for"
+}
+
+NOTES_DIR = "notes"
+
+
+def load_notes():
+    notes = []
+    if not os.path.exists(NOTES_DIR):
+        print(f"Notes directory '{NOTES_DIR}' not found.")
+        return notes
+
+    for file in os.listdir(NOTES_DIR):
+        if file.endswith(".md"):
+            path = os.path.join(NOTES_DIR, file)
+            with open(path, "r", encoding="utf-8") as f:
+                notes.append({
+                    "filename": file,
+                    "content": f.read()
+                })
+    return notes
+
+
+def split_sentences(text):
+    return re.split(r'(?<=[.!?])\s+', text)
+
+
+def search_notes(query, notes):
+    results = []
+
+    query_words = [
+        word.lower()
+        for word in query.split()
+        if word.lower() not in QUESTION_WORDS
+    ]
+
+    for note in notes:
+        sentences = split_sentences(note["content"])
+        for sentence in sentences:
+            sentence_lower = sentence.lower()
+            if any(word in sentence_lower for word in query_words):
+                results.append({
+                    "filename": note["filename"],
+                    "sentence": sentence.strip()
+                })
+
+    return results
+
+
+if __name__ == "__main__":
+    notes = load_notes()
+
+    print("Ask questions about your notes (type 'exit' to quit)\n")
+
+    while True:
+        query = input(">> ").strip()
+
+        if query.lower() == "exit":
+            print("Goodbye 👋")
+            break
+
+        matches = search_notes(query, notes)
+
+        if not matches:
+            print("No relevant notes found.\n")
+        else:
+            print("\n--- Answers ---\n")
+            for i, m in enumerate(matches, 1):
+                print(f"[{i}] From {m['filename']}:")
+                print(m["sentence"])
+                print()
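A quick way to see both the behaviour and the limitation of this keyword approach (and the motivation for the embedding work in the next commit) is to call `search_notes` directly. This is a hypothetical snippet, not part of the patch; it assumes it is run from the repository root so that `smart-notes/rag_mvp` can be put on `sys.path`.

```python
# Hypothetical usage sketch (not part of the patch).
import sys
sys.path.insert(0, "smart-notes/rag_mvp")  # assumed to be run from the repository root

from qa_cli import search_notes

notes = [{
    "filename": "ml.md",
    "content": "Machine learning lets computers improve from data without explicit programming.",
}]

# Shared keywords -> the sentence is found.
print(search_notes("what is machine learning", notes))

# Same topic, no shared keywords -> nothing is found; the embeddings added in
# the next commit are meant to cover this case.
print(search_notes("how does AI get smarter over time", notes))  # []
```

Note that the matching is substring-based, so very short keywords can over-match; that trade-off is acceptable for this MVP.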
From 523813198988c614702b2a3a9afcc56855700d5a Mon Sep 17 00:00:00 2001
From: Tuba Javed
Date: Wed, 11 Feb 2026 01:14:57 +0530
Subject: [PATCH 2/2] Add embedding pipeline with chunking and FAISS search

---
 smart-notes/rag_mvp/README.md                 | 42 ++++++++++++++++
 .../__pycache__/qa_cli.cpython-313.pyc        | Bin 0 -> 4226 bytes
 smart-notes/rag_mvp/embeddings/__init__.py    |  0
 smart-notes/rag_mvp/embeddings/chunker.py     | 31 ++++++++++++
 smart-notes/rag_mvp/embeddings/embedder.py    | 30 +++++++++++
 smart-notes/rag_mvp/embeddings/indexer.py     | 41 +++++++++++++++
 smart-notes/rag_mvp/pipelines/__init__.py     |  0
 .../__pycache__/__init__.cpython-313.pyc      | Bin 0 -> 203 bytes
 .../embedding_pipeline.cpython-313.pyc        | Bin 0 -> 2897 bytes
 .../rag_mvp/pipelines/embedding_pipeline.py   | 47 ++++++++++++++++++
 smart-notes/rag_mvp/qa_cli.py                 | 32 ++++++++++++
 11 files changed, 223 insertions(+)
 create mode 100644 smart-notes/rag_mvp/__pycache__/qa_cli.cpython-313.pyc
 create mode 100644 smart-notes/rag_mvp/embeddings/__init__.py
 create mode 100644 smart-notes/rag_mvp/embeddings/chunker.py
 create mode 100644 smart-notes/rag_mvp/embeddings/embedder.py
 create mode 100644 smart-notes/rag_mvp/embeddings/indexer.py
 create mode 100644 smart-notes/rag_mvp/pipelines/__init__.py
 create mode 100644 smart-notes/rag_mvp/pipelines/__pycache__/__init__.cpython-313.pyc
 create mode 100644 smart-notes/rag_mvp/pipelines/__pycache__/embedding_pipeline.cpython-313.pyc
 create mode 100644 smart-notes/rag_mvp/pipelines/embedding_pipeline.py

diff --git a/smart-notes/rag_mvp/README.md b/smart-notes/rag_mvp/README.md
index 2ce2968..fd51419 100644
--- a/smart-notes/rag_mvp/README.md
+++ b/smart-notes/rag_mvp/README.md
@@ -40,3 +40,45 @@ Artificial Intelligence (AI) is the simulation of human intelligence in machines
 how is machine learning used
 difference between AI and ML
 ```
+
+
+# Smart Notes – RAG MVP (Embeddings & FAISS)
+
+This iteration adds a simple **Retrieval-Augmented Generation (RAG)** pipeline to Smart Notes.
+It allows users to store notes, convert them into embeddings, and search relevant notes using vector similarity.
+
+---
+
+## 🚀 Features
+
+- Convert notes into embeddings using Sentence Transformers
+- Store and search embeddings using FAISS (CPU)
+- CLI tool to ask questions about your notes
+- Simple character-based chunking with overlap
+- Works fully offline after the initial model download
+
+---
+
+## 🧠 Tech Stack
+
+- Python 3.10+
+- sentence-transformers
+- FAISS (faiss-cpu)
+- HuggingFace Transformers
+
+---
+
+## 📁 Project Structure
+
+```text
+smart-notes/
+└── rag_mvp/
+    ├── embeddings/
+    │   ├── chunker.py             # Overlapping text chunking
+    │   ├── embedder.py            # SentenceTransformer wrapper
+    │   └── indexer.py             # FAISS index wrapper
+    ├── pipelines/
+    │   └── embedding_pipeline.py  # Chunk, embed, index, search
+    ├── qa_cli.py                  # CLI for asking questions
+    └── README.md
+```
+
+The `notes/` directory (git-ignored) holds the `.md` notes and is resolved relative to the working directory.
diff --git a/smart-notes/rag_mvp/__pycache__/qa_cli.cpython-313.pyc b/smart-notes/rag_mvp/__pycache__/qa_cli.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df45ac456f685d62f0378214e1ce3c930d2f57f0
GIT binary patch
literal 4226 (compiled bytecode payload omitted)

literal 0
HcmV?d00001

diff --git a/smart-notes/rag_mvp/embeddings/__init__.py b/smart-notes/rag_mvp/embeddings/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/smart-notes/rag_mvp/embeddings/chunker.py b/smart-notes/rag_mvp/embeddings/chunker.py
new file mode 100644
--- /dev/null
+++ b/smart-notes/rag_mvp/embeddings/chunker.py
@@ -0,0 +1,31 @@
+"""
+Simple text chunker for Smart Notes.
+Splits note text into overlapping character windows for embedding.
+"""
+
+from typing import List
+
+
+def chunk_text(text: str, max_length: int = 300, overlap: int = 50) -> List[str]:
+    if not text:
+        return []
+
+    chunks = []
+    start = 0
+    text = text.strip()
+
+    while start < len(text):
+        end = start + max_length
+        chunk = text[start:end].strip()
+
+        if chunk:
+            chunks.append(chunk)
+
+        if end >= len(text):
+            break
+
+        start = end - overlap
+        if start < 0:
+            start = 0
+
+    return chunks
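A small, hypothetical example (not part of the patch) of how `chunk_text` windows a string, with tiny limits so the character-level overlap is visible; it assumes `smart-notes/rag_mvp` is on `sys.path`.

```python
# Hypothetical example (not part of the patch): character windows with overlap.
from embeddings.chunker import chunk_text

text = "abcdefghijkl"
print(chunk_text(text, max_length=5, overlap=2))
# ['abcde', 'defgh', 'ghijk', 'jkl'] – each window restarts 2 characters
# before the previous one ended.
```

Since the windows are character-based rather than sentence- or token-based, chunks can cut words in half; that is acceptable for an MVP but worth keeping in mind for later iterations.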
diff --git a/smart-notes/rag_mvp/embeddings/embedder.py b/smart-notes/rag_mvp/embeddings/embedder.py
new file mode 100644
index 0000000..1f296c0
--- /dev/null
+++ b/smart-notes/rag_mvp/embeddings/embedder.py
@@ -0,0 +1,30 @@
+"""
+Embedding wrapper for converting text chunks into vectors.
+Supports pluggable embedding backends later (Ollama, OpenAI, SentenceTransformers).
+"""
+
+from typing import List
+import numpy as np
+
+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None
+
+
+class Embedder:
+    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
+        if SentenceTransformer is None:
+            raise ImportError(
+                "sentence-transformers not installed. Run: pip install sentence-transformers"
+            )
+
+        self.model_name = model_name
+        self.model = SentenceTransformer(model_name)
+
+    def embed(self, texts: List[str]) -> np.ndarray:
+        if not texts:
+            return np.array([])
+
+        embeddings = self.model.encode(texts, convert_to_numpy=True)
+        return embeddings
diff --git a/smart-notes/rag_mvp/embeddings/indexer.py b/smart-notes/rag_mvp/embeddings/indexer.py
new file mode 100644
index 0000000..d1dc7d6
--- /dev/null
+++ b/smart-notes/rag_mvp/embeddings/indexer.py
@@ -0,0 +1,41 @@
+"""
+Simple vector indexer using FAISS for similarity search.
+"""
+
+from typing import List
+import numpy as np
+
+try:
+    import faiss
+except ImportError:
+    faiss = None
+
+
+class VectorIndexer:
+    def __init__(self, dim: int):
+        if faiss is None:
+            raise ImportError("faiss not installed. Run: pip install faiss-cpu")
+
+        self.dim = dim
+        self.index = faiss.IndexFlatL2(dim)
+        self.texts: List[str] = []
+
+    def add(self, embeddings: np.ndarray, chunks: List[str]):
+        if len(embeddings) == 0:
+            return
+
+        self.index.add(embeddings)
+        self.texts.extend(chunks)
+
+    def search(self, query_embedding: np.ndarray, k: int = 3):
+        if self.index.ntotal == 0:
+            return []
+
+        distances, indices = self.index.search(query_embedding.reshape(1, -1), k)
+        results = []
+
+        for idx in indices[0]:
+            # FAISS pads with -1 when k exceeds the number of stored vectors.
+            if 0 <= idx < len(self.texts):
+                results.append(self.texts[idx])
+
+        return results
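`Embedder` and `VectorIndexer` are not wired together anywhere in this patch (the CLI demo later in the diff uses `EmbeddingPipeline` instead), so here is a hypothetical sketch of how they compose. It assumes `sentence-transformers` and `faiss-cpu` are installed and that `smart-notes/rag_mvp` is on `sys.path`.

```python
# Hypothetical sketch (not part of the patch): composing Embedder and VectorIndexer.
from embeddings.embedder import Embedder
from embeddings.indexer import VectorIndexer

chunks = [
    "Python is a widely used programming language.",
    "FAISS performs fast similarity search over dense vectors.",
]

embedder = Embedder()             # downloads all-MiniLM-L6-v2 on first use
vectors = embedder.embed(chunks)  # shape (2, 384) for this model

index = VectorIndexer(dim=vectors.shape[1])
index.add(vectors, chunks)

query_vec = embedder.embed(["what is FAISS used for?"])[0]
for hit in index.search(query_vec, k=2):
    print("-", hit)
```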
diff --git a/smart-notes/rag_mvp/pipelines/__init__.py b/smart-notes/rag_mvp/pipelines/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/smart-notes/rag_mvp/pipelines/__pycache__/__init__.cpython-313.pyc b/smart-notes/rag_mvp/pipelines/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2915ec340c53b3d79803691f9787f54060cc7779
GIT binary patch
literal 203 (compiled bytecode payload omitted)

literal 0
HcmV?d00001

diff --git a/smart-notes/rag_mvp/pipelines/__pycache__/embedding_pipeline.cpython-313.pyc b/smart-notes/rag_mvp/pipelines/__pycache__/embedding_pipeline.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..480e783f13ab60d9ccf36eeaa3b40c9da0a6e708
GIT binary patch
literal 2897 (compiled bytecode payload omitted)

literal 0
HcmV?d00001

diff --git a/smart-notes/rag_mvp/pipelines/embedding_pipeline.py b/smart-notes/rag_mvp/pipelines/embedding_pipeline.py
new file mode 100644
index 0000000..2163bcd
--- /dev/null
+++ b/smart-notes/rag_mvp/pipelines/embedding_pipeline.py
@@ -0,0 +1,47 @@
+# rag_mvp/pipelines/embedding_pipeline.py
+
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+
+
+class EmbeddingPipeline:
+    def __init__(self, model_name="all-MiniLM-L6-v2", cache_folder=None):
+        # cache_folder is optional; by default the model is cached in the
+        # standard sentence-transformers cache directory.
+        self.model = SentenceTransformer(model_name, cache_folder=cache_folder)
+        self.index = None
+        self.chunks = []
+
+    def chunk_text(self, text, max_length=300, overlap=50):
+        chunks = []
+        start = 0
+
+        while start < len(text):
+            end = start + max_length
+            chunk = text[start:end]
+            chunks.append(chunk)
+            start = end - overlap
+
+        return chunks
+
+    def build_index(self, chunks):
+        embeddings = self.model.encode(chunks)
+        embeddings = np.array(embeddings).astype("float32")
+
+        dim = embeddings.shape[1]
+        self.index = faiss.IndexFlatL2(dim)
+        self.index.add(embeddings)
+
+        return embeddings
+
+    def process_notes(self, text):
+        self.chunks = self.chunk_text(text)
+        embeddings = self.build_index(self.chunks)
+        return self.chunks, embeddings
+
+    def semantic_search(self, query, top_k=3):
+        query_vec = self.model.encode([query])
+        query_vec = np.array(query_vec).astype("float32")
+
+        distances, indices = self.index.search(query_vec, top_k)
+        # Skip the -1 padding FAISS returns when top_k exceeds the chunk count.
+        results = [self.chunks[i] for i in indices[0] if 0 <= i < len(self.chunks)]
+        return results
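For context, a hypothetical sketch (not part of the patch) of how `EmbeddingPipeline` could be fed the actual `notes/` directory rather than a hard-coded string, assuming it is run from the repository root with `smart-notes/rag_mvp` on `sys.path`:

```python
# Hypothetical usage sketch (not part of the patch): index the notes/ directory.
import os
import sys

sys.path.insert(0, "smart-notes/rag_mvp")  # assumed to be run from the repository root
from pipelines.embedding_pipeline import EmbeddingPipeline

texts = []
for name in os.listdir("notes"):
    if name.endswith(".md"):
        with open(os.path.join("notes", name), encoding="utf-8") as f:
            texts.append(f.read())

pipeline = EmbeddingPipeline()
chunks, _ = pipeline.process_notes("\n".join(texts))
print(f"Indexed {len(chunks)} chunks")

for hit in pipeline.semantic_search("what is AI", top_k=3):
    print("-", hit)
```

Everything stays in memory here; persisting the FAISS index is left for a later iteration.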
diff --git a/smart-notes/rag_mvp/qa_cli.py b/smart-notes/rag_mvp/qa_cli.py
index 210d56a..4b3f19d 100644
--- a/smart-notes/rag_mvp/qa_cli.py
+++ b/smart-notes/rag_mvp/qa_cli.py
@@ -1,6 +1,35 @@
 import os
 import re
+# ------------------- embedding-pipeline-chunking concept -------------------
+from pipelines.embedding_pipeline import EmbeddingPipeline
+
+def demo_embeddings_pipeline():
+    pipeline = EmbeddingPipeline()
+
+    note_text = """
+    Python is a programming language.
+    It is widely used in AI and machine learning projects.
+    Smart Notes helps users organize knowledge using embeddings.
+    """
+
+    chunks, embeddings = pipeline.process_notes(note_text)
+
+    print("\n--- Chunks Created ---")
+    for i, c in enumerate(chunks):
+        print(f"[{i}] {c}")
+
+    query = "What is Python used for?"
+    results = pipeline.semantic_search(query)
+
+    print("\n--- Search Results ---")
+    for r in results:
+        print("-", r)
+# ---------------------------------------------------------------------------
+
+
+
 
 QUESTION_WORDS = {
     "what", "where", "who", "when", "which",
     "is", "are", "was", "were", "the", "a", "an",
     "of", "to", "in", "on", "for"
 }
@@ -54,6 +83,9 @@ def search_notes(query, notes):
 
 
 if __name__ == "__main__":
+
+    demo_embeddings_pipeline()  # Temporary demo for the embedding pipeline
+
     notes = load_notes()
 
     print("Ask questions about your notes (type 'exit' to quit)\n")
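One possible follow-up, sketched here rather than included in the patch: `demo_embeddings_pipeline()` runs on every start, and the module-level pipeline import pulls in `sentence-transformers` and `faiss`, so the keyword-only CLI from the first commit no longer starts without those dependencies. Deferring the import and gating the demo keeps it optional; the `SMART_NOTES_EMBED_DEMO` variable below is an assumed name for this sketch.

```python
# Hypothetical refinement (not part of the patch): keep the embedding demo optional.
import os


def maybe_run_embedding_demo() -> None:
    # SMART_NOTES_EMBED_DEMO is an assumed environment variable for this sketch.
    if os.environ.get("SMART_NOTES_EMBED_DEMO") != "1":
        return
    try:
        # Deferred import so missing optional dependencies do not break the CLI.
        from pipelines.embedding_pipeline import EmbeddingPipeline
    except ImportError as exc:
        print(f"Embedding demo skipped ({exc}); keyword search is still available.")
        return

    pipeline = EmbeddingPipeline()
    pipeline.process_notes("Python is widely used in AI and machine learning projects.")
    print(pipeline.semantic_search("What is Python used for?", top_k=1))


if __name__ == "__main__":
    maybe_run_embedding_demo()
```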