-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.py
More file actions
128 lines (98 loc) · 3.83 KB
/
script.py
File metadata and controls
128 lines (98 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader,PyPDFDirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from config import CONNECTION_STRING
from dotenv import load_dotenv
load_dotenv()

# The key is looked up under the name ``api_key`` and exported as
# ``OPENAI_API_KEY``, which the functions below read from ``os.environ``.
# Assigning ``None`` into ``os.environ`` raises an opaque ``TypeError``, so a
# missing key is reported explicitly; an already-exported ``OPENAI_API_KEY``
# is left in place when ``api_key`` is absent.
_api_key = os.getenv('api_key')
if _api_key:
    os.environ['OPENAI_API_KEY'] = _api_key
elif not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OpenAI API key not found: set 'api_key' (or 'OPENAI_API_KEY') "
        "in the environment or in the .env file"
    )

# Configuration variables
user_id = "2"
folder = "/home/bitcot/Desktop/Task-folder/demo"
def filter_embed(user_id):
    """Embed every PDF in ``folder`` whose file name contains ``user_id``.

    Each matching file is loaded with ``PyMuPDFLoader``, split with
    ``CharacterTextSplitter`` and stored in the PGVector collection
    ``"data_of_EMcode"``; the resulting retriever is printed per file.
    Prints ``"user_id not found"`` when no file name matches.

    Args:
        user_id: Identifier to look for. It is converted with ``str`` so
            both ``"2"`` and ``2`` work. Matching is a plain substring
            test, so ``"2"`` also matches a name such as ``"12.pdf"``.

    Returns:
        None. Results are only printed and written to the vector store.
    """
    needle = str(user_id)
    matching_files = [name for name in os.listdir(folder) if needle in name]
    if not matching_files:
        print("user_id not found")
        return

    # The splitter and the embeddings client do not depend on the file, so
    # they are built once rather than once per file.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    embeddings = OpenAIEmbeddings()

    for name in matching_files:
        documents = PyMuPDFLoader(os.path.join(folder, name)).load()
        docs = text_splitter.split_documents(documents)
        db = PGVector.from_documents(
            documents=docs,
            embedding=embeddings,
            collection_name="data_of_EMcode",
            connection_string=CONNECTION_STRING,
            openai_api_key=os.environ['OPENAI_API_KEY'],
            # Keep what earlier files/runs already stored in the collection.
            pre_delete_collection=False,
        )
        print(db.as_retriever())
def store_embeddings():
    """Store embeddings for the PDFs of user ids 1 through 5.

    For each id, every file in ``folder`` whose name contains the id (as a
    substring) is loaded with ``PyMuPDFLoader`` and split into chunks. All
    chunks of that user are then written to the PGVector collection
    ``"data_of_storecode"`` in one call, and a summary dict with the keys
    ``"user_id"``, ``"filtered_files"`` and ``"embeddings"`` is printed.

    Users with no documents are skipped with a short message.

    Returns:
        None. Results are only printed and written to the vector store.
    """
    file_list = os.listdir(folder)

    # Loop-invariant helpers: build once, reuse for every user and file.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    embeddings = OpenAIEmbeddings()

    for user_id in range(1, 6):
        filtered_files = [file for file in file_list if str(user_id) in file]

        # Accumulate chunks in a dedicated list. Reusing one name for both
        # the loader output and the accumulator would keep only the last
        # file and store its raw pages alongside its chunks.
        user_docs = []
        for file in filtered_files:
            pages = PyMuPDFLoader(os.path.join(folder, file)).load()
            user_docs.extend(text_splitter.split_documents(pages))

        if not user_docs:
            # Nothing to embed; avoid a pointless vector-store call.
            print(f"no documents found for user_id {user_id}")
            continue

        db = PGVector.from_documents(
            documents=user_docs,
            embedding=embeddings,
            collection_name="data_of_storecode",
            connection_string=CONNECTION_STRING,
            openai_api_key=os.environ['OPENAI_API_KEY'],
        )
        response = {
            "user_id": user_id,
            "filtered_files": filtered_files,
            "embeddings": db.as_retriever(),
        }
        print(response)
# store_embeddings()
# Text generator function for multiple files
def pdf_text_generator(filtered_files):
    """Return the concatenated page text of the given PDF files.

    Args:
        filtered_files: Iterable of file names, each resolved relative to
            the module-level ``folder`` and loaded with ``PyMuPDFLoader``.

    Returns:
        A single string made of every loaded page's ``page_content``, in
        file order and page order, with no separator inserted. An empty
        iterable yields ``""``.
    """
    page_texts = []
    for file in filtered_files:
        loader = PyMuPDFLoader(os.path.join(folder, file))
        page_texts.extend(page.page_content for page in loader.load())
    # Join once at the end instead of growing a string with ``+=``.
    return "".join(page_texts)
# Chunk generator function for multiple files
def get_text_chunks(total_text, chunk_size=1000, chunk_overlap=0):
    """Split ``total_text`` into chunks with ``CharacterTextSplitter``.

    Args:
        total_text: The text to split.
        chunk_size: Passed to the splitter as ``chunk_size``; defaults to
            the previously hard-coded 1000.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``;
            defaults to no overlap.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for the text
        (the text chunks).
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(total_text)
# create embeddings and store in vectorstore
def new_store_embeddings():
    """Store text embeddings for the PDFs of user ids 1 through 5.

    For each id, the files in ``folder`` whose names contain the id (as a
    substring) are read into one string via ``pdf_text_generator``, split
    via ``get_text_chunks``, and written to the PGVector collection
    ``"data_of_storecode"``. A summary dict with the keys ``"user_id"``,
    ``"filtered_files"`` and ``"embeddings"`` is printed per user.

    Users that yield no text chunks are skipped with a short message.

    Returns:
        None. Results are only printed and written to the vector store.
    """
    file_list = os.listdir(folder)

    # The embeddings client is the same for every user; build it once.
    embeddings = OpenAIEmbeddings()

    for user_id in range(1, 6):
        filtered_files = [file for file in file_list if str(user_id) in file]
        docs = get_text_chunks(pdf_text_generator(filtered_files))

        if not docs:
            # Nothing to embed; avoid a pointless vector-store call.
            print(f"no text found for user_id {user_id}")
            continue

        db = PGVector.from_texts(
            texts=docs,
            embedding=embeddings,
            collection_name="data_of_storecode",
            connection_string=CONNECTION_STRING,
            openai_api_key=os.environ['OPENAI_API_KEY'],
        )
        response = {
            "user_id": user_id,
            "filtered_files": filtered_files,
            "embeddings": db.as_retriever(),
        }
        print(response)
# Script entry point: this call sits at module level with no
# ``if __name__ == "__main__"`` guard, so it runs whenever the file is
# executed or imported.
new_store_embeddings()