
Commit e503b87

Update: Updated the qdrant pipeline
1 parent 9ed9a46 commit e503b87

File tree

6 files changed: +37 -34 lines changed

server.py

Lines changed: 27 additions & 29 deletions
@@ -6,38 +6,37 @@
 from typing import Dict, Any, List, Optional
 
 from src.config.config import Config
+from src.qdrant.qdrant_utils import QdrantWrapper
 from src.embedder.embedder_llama_index import EmbeddingWrapper
-from llama_index.core.retrievers import VectorIndexRetriever
+from src.parser.csv_parser import CsvParser
 from llama_index.core import Settings
 Settings.llm = None
 
-from src.qdrant.qdrant_manager import QdrantManager
 from src.utils.connections_manager import ConnectionManager
 from src.chatbot.rag_chat_bot import RAGChatBot
 from src.reranker.re_ranking import RerankDocuments
 
-import os
-
 app = FastAPI()
 
 chatbot = RAGChatBot()
+file_processor = CsvParser(data_dir=Config.DATA_DIRECTORY)
 
 collection_name = Config.COLLECTION_NAME
-qdrantManager = QdrantManager(Config.QDRANT_HOST, Config.QDRANT_PORT, collection_name)
-
+qdrant_client = QdrantWrapper()
 embedding_client = EmbeddingWrapper()
 
 
-data_dir = Config.CAPEC_DATA_DIR
+try:
 
-reranker = RerankDocuments()
+    processed_chunks = file_processor.process_directory()
+    qdrant_client.ingest_embeddings(processed_chunks)
 
-index = qdrantManager.load_index(persist_dir=Config.PERSIST_DIR, embed_model=embedding_client)
+    logger.info("Successfully ingested data")
 
-retriever = VectorIndexRetriever(
-    index=index,
-    similarity_top_k=5
-)
+except Exception as e:
+    logger.error(f"Error in data ingestion: {str(e)}")
+
+reranker = RerankDocuments()
 
 # Manually added file names of the CAPEC dataset. In production, these files will be fetched from the database.
 database_files = ["333.csv", "658.csv", "659.csv", "1000.csv", "3000.csv"]
@@ -66,27 +65,26 @@ async def handle_search(websocket: WebSocket, query: str) -> None:
 
     filename = find_file_names(query, database_files)
 
-    if filename:
-        logger.info("Searching for file names...")
+    query_embeddings = embedding_client.generate_embeddings(query)
 
-        filters = MetadataFilters(filters=[ExactMatchFilter(key="source_file", value=filename)])
-        relevant_nodes = index.as_retriever(filters=filters).retrieve(query)
-        if not relevant_nodes:
-            logger.info("Searching without file name filter....")
-            relevant_nodes = retriever.retrieve(query)
-    else:
-        logger.info("Searching without file names....")
-        relevant_nodes = retriever.retrieve(query)
+    top_5_results = qdrant_client.search(query_embeddings, 5)
+    logger.info("Retrieved top 5 results")
 
-
-    context = [node.text for node in relevant_nodes]
-
-    reranked_docs = reranker.rerank_docs(query, context)
+    if not top_5_results:
+        logger.warning("No results found in database")
+        await websocket.send_json({
+            "result": "The database is empty. Please ingest some data first before searching."
+        })
+        return
 
-    # only top 2 documents are passing as a context
-    response, conversation_id = chatbot.chat(query, reranked_docs[:2])
 
+    reranked_docs = reranker.rerank_docs(query, top_5_results)
+    reranked_top_5_list = [item['content'] for item in reranked_docs]
 
+    context = reranked_top_5_list[:2]
+
+    # only the top 2 documents are passed as context
+    response, conversation_id = chatbot.chat(query, context)
 
     logger.info("Generating response from Groq")
 
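The commit replaces the LlamaIndex VectorIndexRetriever with a thin Qdrant wrapper: at startup the CSV directory is parsed into chunks and ingested, and at query time the query embedding is searched directly against the collection. The wrapper's internals are not part of this diff; the following is a minimal sketch of what ingest_embeddings and search could look like on top of the official qdrant-client package (collection name, vector size, chunk attributes, and payload keys are all assumptions):

# Hypothetical sketch of the QdrantWrapper used above; the real
# implementation lives in src/qdrant/qdrant_utils.py and is not
# shown in this commit. Vector size and payload keys are assumptions.
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams


class QdrantWrapperSketch:
    def __init__(self, host: str = "localhost", port: int = 6333,
                 collection: str = "capec", vector_size: int = 384):
        self.client = QdrantClient(host=host, port=port)
        self.collection = collection
        # Create the collection on first use.
        if not self.client.collection_exists(self.collection):
            self.client.create_collection(
                collection_name=self.collection,
                vectors_config=VectorParams(size=vector_size,
                                            distance=Distance.COSINE),
            )

    def ingest_embeddings(self, chunks) -> None:
        # Each chunk is assumed to carry .embedding and .text attributes.
        points = [
            PointStruct(id=i, vector=chunk.embedding,
                        payload={"content": chunk.text})
            for i, chunk in enumerate(chunks)
        ]
        self.client.upsert(collection_name=self.collection, points=points)

    def search(self, query_vector, top_k: int = 5):
        # Return the payload dicts of the nearest neighbours; these are
        # the {"content": ...} items that rerank_docs indexes downstream.
        hits = self.client.search(collection_name=self.collection,
                                  query_vector=query_vector, limit=top_k)
        return [hit.payload for hit in hits]

Returning bare payload dicts is what lets handle_search pass the hits straight into rerank_docs, which now reads doc["content"] (see the re_ranking.py diff below).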
src/config/config.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ class Config:
     GRADIO_SERVER_NAME = "0.0.0.0"
     GRADIO_SERVER_PORT = int(7860)
     WEBSOCKET_URI = "ws://rag-server:8000/ws"
-    DATA_DIRECTORY = "data/"
+    DATA_DIRECTORY = "capec-dataset/"
     WEBSOCKET_TIMEOUT = 300  # 5 minutes
     HEARTBEAT_INTERVAL = 30  # 30 seconds
     MAX_CONNECTIONS = 100

src/parser/csv_parser.py

Lines changed: 1 addition & 1 deletion
@@ -139,5 +139,5 @@ def process_directory(self) -> List[Document]:
                 logger.error(f"Skipping file {file_path} due to error: {str(e)}")
                 continue
 
-        logger.info("All .csv files indexed....")
+        logger.info("All .csv files processed. Returning chunks...")
         return all_documents
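Only the final log message changes here, but server.py now depends on process_directory() for its startup ingestion. A hedged sketch of the shape this method plausibly has (the row-to-text mapping and metadata key are assumptions; the real parser lives in src/parser/csv_parser.py):

# Standalone sketch of a CSV-to-Document parser; field handling and
# metadata are assumptions, not the repository's actual implementation.
import csv
from pathlib import Path
from typing import List

from llama_index.core import Document
from loguru import logger


def process_directory(data_dir: str) -> List[Document]:
    all_documents: List[Document] = []
    for file_path in Path(data_dir).glob("*.csv"):
        try:
            with open(file_path, newline="", encoding="utf-8") as f:
                for row in csv.DictReader(f):
                    # One chunk per row, tagged with its source file so
                    # results can be traced back to a CAPEC CSV.
                    text = " | ".join(f"{k}: {v}" for k, v in row.items())
                    all_documents.append(Document(
                        text=text,
                        metadata={"source_file": file_path.name},
                    ))
        except Exception as e:
            logger.error(f"Skipping file {file_path} due to error: {str(e)}")
            continue
    logger.info("All .csv files processed. Returning chunks...")
    return all_documents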

src/qdrant/qdrant_utils.py

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ def _connect_with_retry(self) -> None:
                 self.client.get_collections()
                 logger.info("Successfully connected to Qdrant")
                 self._create_collection_if_not_exists()
+                self.clear_collection()
                 break
             except Exception as e:
                 logger.error(f"Connection attempt {attempt + 1} failed: {str(e)}")
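The single added line clears the collection right after a successful connection, so each service start re-ingests from the CSV directory instead of appending duplicate points. For context, a minimal standalone sketch of the retry loop this call slots into (attempt count and backoff policy are assumptions):

# Hedged sketch of a connect-with-retry helper; the repository's
# _connect_with_retry method is only partially visible in this diff.
import time

from loguru import logger
from qdrant_client import QdrantClient


def connect_with_retry(host: str, port: int, max_retries: int = 5,
                       delay_s: float = 2.0) -> QdrantClient:
    for attempt in range(max_retries):
        try:
            client = QdrantClient(host=host, port=port)
            client.get_collections()  # cheap round trip to verify the server
            logger.info("Successfully connected to Qdrant")
            return client
        except Exception as e:
            logger.error(f"Connection attempt {attempt + 1} failed: {str(e)}")
            time.sleep(delay_s * (attempt + 1))  # linear backoff
    raise ConnectionError("Could not reach Qdrant after retries")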

src/reranker/re_ranking.py

Lines changed: 3 additions & 2 deletions
@@ -27,10 +27,11 @@ def rerank_docs(self,
         """
         # Re-ranking using cross-encoder
         # Prepare pairs for reranking
-        pairs = [[query, doc] for doc in top_5_results]
+        # Prepare pairs for reranking
+        pairs = [[query, doc["content"]] for doc in top_5_results]
 
         # Get relevance scores
-        scores = self.reranker.predict(pairs)
+        scores = self.reranker.predict(pairs)
 
         # Sort by new scores
         reranked_results = [
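The pair construction now reads doc["content"], matching the payload dicts returned by the Qdrant search instead of bare strings. A self-contained illustration of the cross-encoder reranking pattern, using sentence-transformers (the model checkpoint and sample data are assumptions; the commit does not show which model RerankDocuments loads):

# Minimal cross-encoder reranking example; model name and inputs are
# illustrative, not taken from the repository.
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "SQL injection attack patterns"
top_5_results = [{"content": "CAPEC-66: SQL Injection ..."},
                 {"content": "CAPEC-112: Brute Force ..."}]

# Score every (query, document) pair jointly, then sort by score.
pairs = [[query, doc["content"]] for doc in top_5_results]
scores = reranker.predict(pairs)
reranked = [doc for _, doc in sorted(zip(scores, top_5_results),
                                     key=lambda t: t[0], reverse=True)]
print(reranked[:2])  # the two documents passed to the chatbot as context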

src/utils/utils.py

Lines changed: 4 additions & 1 deletion
@@ -2,6 +2,7 @@
 import re
 from loguru import logger
 
+
 def match_file_names(filename, database_files):
     if filename in database_files:
         return filename
@@ -25,4 +26,6 @@ def find_file_names(query: str, database_files: List) -> str:
         else:
             return ""
     else:
-        logger.info("No filename found.")
+        logger.info("No filename found.")
+
+
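This commit only adjusts whitespace and logging here. For readability, a hedged sketch of how the two helpers plausibly fit together (the regex is an assumption; the diff shows only the fallback branches):

# Illustrative reconstruction of the filename helpers; the pattern used
# to spot names like "333.csv" in a query is a guess.
import re
from typing import List

from loguru import logger


def match_file_names(filename: str, database_files: List[str]) -> str:
    # Exact match against the known CAPEC dataset files.
    return filename if filename in database_files else ""


def find_file_names(query: str, database_files: List[str]) -> str:
    match = re.search(r"\b(\d+\.csv)\b", query)
    if match:
        return match_file_names(match.group(1), database_files)
    logger.info("No filename found.")
    return ""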