11"""
2- Script to load documents from text files into Milvus Lite vector store.
2+ Script to load documents from text files into a vector store via LlamaStack.
33
4- This script reads text files from the data directory, splits them into chunks,
5- creates embeddings, and stores them in a Milvus Lite vector database.
4+ If VECTOR_STORE_ID is set, documents are added to the existing store.
5+ Otherwise a new vector store is created using VECTOR_STORE_NAME,
6+ its ID is printed and written back into the .env file.
67"""
78
89import uuid
10+ from pathlib import Path
911
1012from dotenv import load_dotenv
1113from langchain_community .document_loaders import TextLoader
1820load_dotenv (verbose = True )
1921
2022
def update_env_file(key: str, value: str, env_path: Path = None):
    """Update or add a ``key=value`` pair in a .env file.

    Every existing line that defines ``key`` is rewritten to ``key=value``;
    if no line defines it, ``key=value`` is appended. All other lines
    (comments, blank lines, other keys) are kept as they are. The file is
    created when it does not exist.

    A line is treated as defining ``key`` when, after stripping surrounding
    whitespace and an optional leading ``export ``, the text before the first
    ``=`` equals ``key`` (so ``KEY=v``, ``KEY = v``, bare ``KEY`` and
    ``export KEY=v`` all match). An ``export `` prefix is preserved.

    Args:
        key: Variable name to set.
        value: Value to store; written verbatim, without quoting.
        env_path: File to update. Defaults to ``.env`` in the parent of the
            directory containing this script.
    """
    if env_path is None:
        # data/ -> langgraph_agentic_rag/.env
        env_path = Path(__file__).resolve().parent.parent / ".env"
    env_path = Path(env_path)
    assignment = f"{key}={value}"

    try:
        # Explicit encoding so the result does not depend on the platform default.
        lines = env_path.read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        lines = []

    updated = []
    found = False
    for line in lines:
        candidate = line.strip()
        prefix = ""
        if candidate.startswith("export "):
            prefix = "export "
            candidate = candidate[len(prefix):].lstrip()
        if candidate.partition("=")[0].rstrip() == key:
            # Rewrite every definition, not only the first: a loader that lets
            # later definitions win would otherwise still see a stale value.
            # NOTE(review): python-dotenv is presumed to be last-wins - confirm.
            updated.append(prefix + assignment)
            found = True
        else:
            updated.append(line)

    if not found:
        updated.append(assignment)

    env_path.write_text("\n".join(updated) + "\n", encoding="utf-8")
42+
43+
2144def load_and_index_documents (
2245 docs_to_load : str = None ,
2346 embedding_model : str = None ,
2447 base_url : str = None ,
2548 api_key : str = None ,
26- chunk_size : int = 512 , # Increased from 64 to 512 for better context
27- chunk_overlap : int = 128 , # Increased from 32 to 128 for better overlap
49+ chunk_size : int = 512 ,
50+ chunk_overlap : int = 128 ,
2851):
2952 """
30- Load documents from directory and index them in Milvus Lite .
53+ Load documents from directory and index them in a vector store .
3154
3255 Args:
3356 docs_to_load: Directory containing text files to load
@@ -54,28 +77,30 @@ def load_and_index_documents(
5477 api_key = api_key ,
5578 )
5679
80+ vector_store_id = getenv ("VECTOR_STORE_ID" )
5781 vector_store_name = getenv ("VECTOR_STORE_NAME" ) or "my_vector_store"
5882 provider_id = "milvus"
5983 embedding_dimension = 768
6084
61- # Delete any existing vector stores with the same name, then create a fresh one
62- vector_store_list = client .vector_stores .list ()
63-
64- for vs in vector_store_list .data :
65- if vs .name == vector_store_name :
66- print (f"Deleting existing vector store: { vs .id } ({ vs .name } )" )
67- client .vector_stores .delete (vector_store_id = vs .id )
68-
69- vector_store = client .vector_stores .create (
70- name = vector_store_name ,
71- extra_body = {
72- "provider_id" : provider_id ,
73- "embedding_model" : embedding_model ,
74- "embedding_dimension" : embedding_dimension ,
75- },
76- )
85+ if vector_store_id :
86+ # Use existing vector store
87+ print (f"Using existing vector store: { vector_store_id } " )
88+ else :
89+ # Create a new vector store
90+ vector_store = client .vector_stores .create (
91+ name = vector_store_name ,
92+ extra_body = {
93+ "provider_id" : provider_id ,
94+ "embedding_model" : embedding_model ,
95+ "embedding_dimension" : embedding_dimension ,
96+ },
97+ )
98+ vector_store_id = vector_store .id
99+ print (f"Vector store created: id={ vector_store_id } name={ vector_store_name } " )
77100
78- print (f"Vector store created: { vector_store .id } ({ vector_store_name } )" )
101+ # Persist the new ID to .env
102+ update_env_file ("VECTOR_STORE_ID" , vector_store_id )
103+ print (f"Updated .env with VECTOR_STORE_ID={ vector_store_id } " )
79104
80105 print ("Loading documents from directory..." )
81106 loader = TextLoader (docs_to_load )
@@ -131,9 +156,11 @@ def load_and_index_documents(
131156 print ("\n Loading chunks to Vector Store..." )
132157 client .vector_io .insert (
133158 chunks = formatted_chunks ,
134- vector_store_id = vector_store . id ,
159+ vector_store_id = vector_store_id ,
135160 )
136161
162+ print (f"Done! { len (formatted_chunks )} chunks inserted into vector store { vector_store_id } " )
163+
137164
138165if __name__ == "__main__" :
139- load_and_index_documents ()
166+ load_and_index_documents ()
0 commit comments