HateSlop · jhgarry · Apr 1, 2025 · Beanssssssss · Apr 2, 2025 · Beanssssssss
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 .env
-__pycache__
+__pycache__
+myenv
diff --git a/build_vector_db.py b/build_vector_db.py
@@ -4,35 +4,86 @@
 import chromadb 
 from chromadb.config import Settings 
 from dotenv import load_dotenv
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
 
 # 환경 변수 Load해서 api_key 가져오고 OpenAI 클라이언트(객체) 초기화
-# do it
+load_dotenv()
+api_key=os.getenv("OPENAI_API_KEY")
+client = OpenAI(api_key=api_key)
 
 # 매 실행 시 DB 폴더를 삭제 후 새로 생성
 def init_db(db_path="./chroma_db"):
-    # do it
+    dbclient = chromadb.PersistentClient(path=db_path)
+    collection = dbclient.create_collection(name="rag_collection", get_or_create=True)
+    return dbclient, collection
 
 # 텍스트 로딩 함수
 def load_text_files(folder_path):
-    # do it
-
+    docs = []
+    for filename in os.listdir(folder_path):
+
+        file_path = os.path.join(folder_path, filename)
+        if file_path.endswith(".txt"):
+            with open(file_path, "r", encoding="utf-8") as f:
+                text = f.read()
+                docs.append((filename, text))
+    return docs
 # OpenAI Embeddings 생성 함수 
-def get_embedding(text, model="text-embedding-3-large"):
-    # do it
+# def get_embedding(text, model="text-embedding-3-large"):
+#     response = client.embeddings.create(input=[text], model=model)
+#     embedding = response.data[0].embedding
+#     return embedding
 
 
 # 문서 청크 단위로 나누기
-def chunk_text(text, chunk_size=400, chunk_overlap=50):
-    # do it 
+# def chunk_text(text, chunk_size=400, chunk_overlap=50):
+#     chunks= []
+#     start = 0
+#     while start < len(text):
+#         end = start + chunk_size
+#         chunk = text[start:end] #적당히 자름
+#         chunks.append(chunk)
+#         start = end - chunk_overlap
+
+#         if start < 0:
+#             start = 0
+#         if start >= len(text): # 종료 시그널
+#             break
+
+#     return chunks
 
+embedder = OpenAIEmbeddings(
+    model="text-embedding-3-large"
+)
+
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=400,
+    chunk_overlap=50
+)
 
 # 문서로드 -> 청크 나누고 -> 임베딩 생성 후 DB 삽입
 if __name__ == "__main__":
-    # db 초기화, 경로 지정
-    # load_text_files 함수로 처리할 문서 데이터 불러오기기
-    # do it
+    # db 초기화
+    dbclient, collection = init_db("./chroma_db")
+
+    folder_path = "./source_data" # 데이터 가져다 쓸 경로 지정
+    docs = load_text_files(folder_path) # 처리할 문서 데이터 메모리로 불러오기
 
-    # 전처리 과정 
-    # do it
+    doc_id = 0
+    for filename, text in docs: 
+        # chunks = chunk_text(text, chunk_size=400, chunk_overlap=50) # chunking
+        chunks = text_splitter.split_text(text)
+        for idx, chunk in enumerate(chunks): # 각 청크와 해당 청크의 인덱스 가져옴
+            doc_id += 1 # 인덱스 하나씩 증가 시키면서
+            embedding = embedder.embed_query(chunk) # 각 청크 임베딩 벡터 생성
+            # vectorDB에 다음 정보 추가
+            collection.add(
+                documents=[chunk], # 실제 청크 text
+                embeddings=[embedding], # 생성된 임베딩 벡터
+                metadatas=[{"filename": filename, "chunk_index": idx}], # 파일 이름과 청크 인덱스를 포함하는 메타데이터
+                ids=[str(doc_id)] # 각 청크의 Unique한 id 저장
+                # 이 고유 id를 통해 db에서 업데이트, 삭제 등의 작업 가능
+            )
 
     print("모든 문서 벡터DB에 저장 완료")
diff --git a/chroma_db/ce74dad2-199b-4919-bf4f-0ae6a1763b47/data_level0.bin b/chroma_db/ce74dad2-199b-4919-bf4f-0ae6a1763b47/data_level0.bin
diff --git a/chroma_db/ce74dad2-199b-4919-bf4f-0ae6a1763b47/header.bin b/chroma_db/ce74dad2-199b-4919-bf4f-0ae6a1763b47/header.bin
diff --git a/chroma_db/ce74dad2-199b-4919-bf4f-0ae6a1763b47/length.bin b/chroma_db/ce74dad2-199b-4919-bf4f-0ae6a1763b47/length.bin
diff --git a/chroma_db/ce74dad2-199b-4919-bf4f-0ae6a1763b47/link_lists.bin b/chroma_db/ce74dad2-199b-4919-bf4f-0ae6a1763b47/link_lists.bin
diff --git a/chroma_db/chroma.sqlite3 b/chroma_db/chroma.sqlite3
diff --git a/rag_chatbot.py b/rag_chatbot.py
@@ -1,6 +1,6 @@
 import os
 from openai import OpenAI
-from build_vector_db import get_embedding
+from build_vector_db import embedder
 from chromadb import Client
 import chromadb 
 from chromadb.config import Settings 
@@ -11,7 +11,15 @@
 
 # query를 임베딩해 chroma에서 가장 유사도가 높은 top-k개의 문서 가져오는 함수 
 def retrieve(query, top_k=3):
-    # do it
+    query_embedding = embedder.embed_query(query) # query에 대한 임베딩 생성
+    # collection.query 함수로 저장된 문서 임베딩들 중에서
+    # query 임베딩과 가장 유사한 항목들 검색
+    results = collection.query(
+        query_embeddings=[query_embedding],
+        n_results=top_k
+    ) 
+    # 이때 results에는 해당 query 임베딩에 대한 텍스트, 메타데이터, id 등이 전부 포함됨
+    return results
 
 
 # 1) query에 대해 벡터 DB에서 top_k개 문서 retrieval
@@ -21,10 +29,17 @@ def retrieve(query, top_k=3):
 def generate_answer_with_context(query, top_k=3):
     # retrieve 함수로 결과 얻고 
     # top_k에 대한 documents와 metadatas 리스트로 추출
-    # do it
+    results = retrieve(query, top_k)
+    found_docs = results["documents"][0]
+    found_metadatas = results["metadatas"][0]
 
     # context 구성
-    # do it
+    context_texts = []
+    # zip을 이용해 두 리스트의 같은 인덱스에 있는 값들을 한 쌍으로 묶음
+    for doc_text, meta in zip(found_docs, found_metadatas): 
+        context_texts.append(f"<<filename: {meta['filename']}>>\n{doc_text}")
+    # context_texts 리스트에 있는 모든 문자열이 \n\n으로 이어 붙여짐
+    context_str = "\n\n".join(context_texts)
 
     # 프롬프트 작성
     system_prompt = """
@@ -53,7 +68,7 @@ def generate_answer_with_context(query, top_k=3):
     client = OpenAI(api_key=api_key)
 
     response = client.chat.completions.create(
-        model="gpt-4o",
+        model="gpt-4o-mini",
         messages = [{"role":"system", "content": system_prompt},
         {"role":"user", "content": user_prompt}]
     )

diff --git a/requirements.txt b/requirements.txt
@@ -10,6 +10,7 @@ pandas
 datasets
 chromadb
 langchain
+langchain_openai
 tiktoken
 python-dotenv
 unstructured