-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
139 lines (114 loc) · 5.03 KB
/
main.py
File metadata and controls
139 lines (114 loc) · 5.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
from datetime import datetime
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader, UnstructuredFileLoader, UnstructuredHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
# Directory that uploaded files are written into (see upload_file).
resource_directory = 'resources/'
# Directory the FAISS index is saved to and loaded from.
vector_db_directory = 'data'
# Embedding model passed to FAISS when building, loading and extending the index.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")
# LLM that the prompt is piped into in question_pdf.
model = OllamaLLM(model="llama3.2")
# Prompt text; the {question} and {context} placeholders are filled in by question_pdf.
template = """
You are a helpful assistant that answers questions. Using the following retrieved information, answer the question provided. If you don't know the answer, say that you don't know.
Question: {question}
Context: {context}
"""
def create_vector_store_from_directory(directory):
    """Build a FAISS vector store from the supported files in ``directory``.

    Files ending in .pdf, .doc/.docx, .txt or .htm/.html (matched
    case-insensitively) are loaded, split into overlapping chunks, embedded
    with the module-level ``embeddings`` and saved to ``vector_db_directory``.
    Other files are reported and skipped.

    A timestamp line is appended to ``vector_store_log.txt`` only after the
    store has been saved, so the log never claims a store exists when the
    build failed or produced nothing.

    Args:
        directory: Path of the directory whose files should be indexed.

    Returns:
        None. When no supported content is found the function reports it and
        returns without creating a store or touching the log.
    """
    print(f"Creating vector store from files in {directory}")

    documents = []
    # Sorted so the index is built in a reproducible order across runs.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Lower-case once so "REPORT.PDF" is handled like "report.pdf".
        lowered = file_name.lower()
        if lowered.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(('.docx', '.doc')):
            loader = UnstructuredWordDocumentLoader(file_path)
        elif lowered.endswith('.txt'):
            loader = UnstructuredFileLoader(file_path)
        elif lowered.endswith(('.html', '.htm')):
            loader = UnstructuredHTMLLoader(file_path)
        else:
            print(f"Unsupported file type: {file_name}, skipping.")
            continue
        documents.extend(loader.load())

    # FAISS cannot build an index from zero documents, so stop early instead
    # of crashing inside from_documents.
    if not documents:
        print(f"No supported documents found in {directory}; vector store not created.")
        return

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=300,
        add_start_index=True,
    )
    chunked_docs = text_splitter.split_documents(documents)
    if not chunked_docs:
        print(f"No text could be extracted from files in {directory}; vector store not created.")
        return

    vector_db = FAISS.from_documents(chunked_docs, embeddings)
    # Save the vector database locally
    vector_db.save_local(vector_db_directory)

    # Record the creation time only once the store is actually on disk.
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open("vector_store_log.txt", "a") as log_file:
        log_file.write(f"Vector store created on: {current_time}\n")
    print("Vector store created and saved successfully.")
def upload_file(file):
    """Save an uploaded file under ``resource_directory`` and index it.

    Args:
        file: Uploaded-file object exposing a ``name`` attribute and a
            ``getbuffer()`` method that returns the file's bytes.
    """
    # Keep only the final path component so a crafted name such as
    # "../../etc/passwd" cannot write outside the resource directory.
    safe_name = os.path.basename(file.name)
    # The resource directory may not exist yet on a fresh checkout.
    os.makedirs(resource_directory, exist_ok=True)
    destination = os.path.join(resource_directory, safe_name)
    with open(destination, "wb") as f:
        f.write(file.getbuffer())
    add_document_to_vector_store(destination)
def load_vector_store():
    """Load the saved FAISS index from ``vector_db_directory`` and return it."""
    print("Loading vector store...")
    # NOTE(review): allow_dangerous_deserialization=True opts in to
    # deserializing whatever is stored in that directory; only point this at
    # an index this application wrote itself.
    store = FAISS.load_local(
        vector_db_directory,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    print("Vector store loaded successfully.")
    return store
def check_if_vector_store_exists():
    """Return the contents of ``vector_store_log.txt``, or None if unavailable.

    The log file is used as the marker that a vector store has been created.
    None is returned both when the file does not exist and when reading it
    fails; in the latter case the error is printed.
    """
    log_path = "vector_store_log.txt"
    try:
        # Guard clause: no log file means no store has been recorded.
        if not os.path.exists(log_path):
            return None
        with open(log_path, "r") as handle:
            contents = handle.read()
        return contents
    except Exception as e:
        print(f"Error loading vector store: {e}")
        return None
def retrieve_docs(query, k=4):
    """Return the ``k`` stored chunks most similar to ``query``.

    The vector store is loaded from disk, searched once, and the same result
    list is both printed for debugging and returned, so the printed output
    always matches what the caller receives.

    Args:
        query: Text to search for.
        k: Number of documents to retrieve.

    Returns:
        The list of documents returned by the store's similarity search.
    """
    db = load_vector_store()
    print()
    print("==========================")
    print(f"Query: {query}")
    # Search once and reuse the result: a second search would embed the query
    # again and, with a non-default k, print different hits than are returned.
    results = db.similarity_search(query, k=k)
    print(f"Vector db search results: {results}")
    return results
def question_pdf(question, documents):
    """Answer ``question`` with the LLM, using ``documents`` as context.

    Args:
        question: The user's question.
        documents: Retrieved documents; each one's ``page_content`` is joined
            with blank lines to form the prompt context.

    Returns:
        Whatever the prompt-to-model chain returns from ``invoke``.
    """
    page_texts = (doc.page_content for doc in documents)
    context = "\n\n".join(page_texts)
    # Pipe the filled-in prompt straight into the module-level model.
    chain = ChatPromptTemplate.from_template(template) | model
    inputs = {"question": question, "context": context}
    return chain.invoke(inputs)
def add_document_to_vector_store(file_path):
    """Load one file, chunk it and add it to the saved FAISS vector store.

    Supported extensions (matched case-insensitively) are .pdf, .doc/.docx,
    .txt and .htm/.html. If the existing store cannot be loaded, a new store
    is created from this document's chunks instead.

    Args:
        file_path: Path of the document to add.

    Returns:
        None. Unsupported files, and files that yield no chunks, are reported
        and leave the store unchanged.
    """
    print(f"Adding document {file_path} to the vector store")

    # Determine the loader based on file type; lower-case once so an
    # upper-case extension such as ".PDF" is still recognised.
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    elif lowered.endswith(('.docx', '.doc')):
        loader = UnstructuredWordDocumentLoader(file_path)
    elif lowered.endswith('.txt'):
        loader = UnstructuredFileLoader(file_path)
    elif lowered.endswith(('.html', '.htm')):
        loader = UnstructuredHTMLLoader(file_path)
    else:
        print(f"Unsupported file type: {file_path}, skipping.")
        return

    # Load the document
    documents = loader.load()

    # Split the document into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=300,
        add_start_index=True,
    )
    chunked_docs = text_splitter.split_documents(documents)
    if not chunked_docs:
        # Nothing to embed; avoid rewriting (or failing to create) the store.
        print(f"No content extracted from {file_path}; vector store left unchanged.")
        return

    # Load the existing vector store.
    # NOTE(review): allow_dangerous_deserialization=True opts in to
    # deserializing whatever is stored in that directory; only point this at
    # an index this application wrote itself.
    try:
        vector_db = FAISS.load_local(vector_db_directory, embeddings, allow_dangerous_deserialization=True)
    except Exception as e:
        print(f"Error loading vector store: {e}")
        print("Creating a new vector store instead.")
        # An index cannot be built from an empty document list, so seed the
        # new store directly with this document's chunks.
        vector_db = FAISS.from_documents(chunked_docs, embeddings)
    else:
        # Add the new document to the existing vector store
        vector_db.add_documents(chunked_docs)

    # Save the updated vector store
    vector_db.save_local(vector_db_directory)
    print(f"Document {file_path} added to the vector store successfully.")