|
@@ -6,38 +6,37 @@
 from typing import Dict, Any, List, Optional

 from src.config.config import Config
+from src.qdrant.qdrant_utils import QdrantWrapper
 from src.embedder.embedder_llama_index import EmbeddingWrapper
-from llama_index.core.retrievers import VectorIndexRetriever
+from src.parser.csv_parser import CsvParser
 from llama_index.core import Settings
 Settings.llm = None

-from src.qdrant.qdrant_manager import QdrantManager
 from src.utils.connections_manager import ConnectionManager
 from src.chatbot.rag_chat_bot import RAGChatBot
 from src.reranker.re_ranking import RerankDocuments

-import os
-
 app = FastAPI()

 chatbot = RAGChatBot()
+file_processor = CsvParser(data_dir=Config.DATA_DIRECTORY)

 collection_name = Config.COLLECTION_NAME
-qdrantManager = QdrantManager(Config.QDRANT_HOST, Config.QDRANT_PORT, collection_name)
-
+qdrant_client = QdrantWrapper()
 embedding_client = EmbeddingWrapper()


-data_dir = Config.CAPEC_DATA_DIR
+try:

-reranker = RerankDocuments()
+    processed_chunks = file_processor.process_directory()
+    qdrant_client.ingest_embeddings(processed_chunks)

-index = qdrantManager.load_index(persist_dir=Config.PERSIST_DIR, embed_model=embedding_client)
+    logger.info("Successfully ingested Data")

-retriever = VectorIndexRetriever(
-    index=index,
-    similarity_top_k=5
-    )
+except Exception as e:
+    logger.error(f"Error in data ingestion: {str(e)}")
+
+reranker = RerankDocuments()

 # Manually added file names of the CAPEC dataset. In production, these files will be fetched from the database.
 database_files = ["333.csv", "658.csv", "659.csv", "1000.csv", "3000.csv"]
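
Note on the new startup path: the diff swaps the LlamaIndex VectorIndexRetriever for a CsvParser-to-QdrantWrapper ingestion step, but neither src/qdrant/qdrant_utils.py nor src/parser/csv_parser.py is part of this change. The sketch below is only one plausible shape for the QdrantWrapper used above, written against the real qdrant-client API; the chunk format (dicts with "content" and "embedding" keys), the 384-dimension vector size, the "source_file" payload field, and the collection-creation guard are assumptions, not the project's actual implementation.

# Hypothetical sketch of src/qdrant/qdrant_utils.py -- the real module is not shown in this diff.
# Assumes each processed chunk is a dict like {"content": str, "embedding": list[float]}.
import uuid
from typing import Any, Dict, List

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

from src.config.config import Config


class QdrantWrapper:
    def __init__(self, vector_size: int = 384) -> None:
        # Connection details come from Config, mirroring the removed QdrantManager call.
        self.client = QdrantClient(host=Config.QDRANT_HOST, port=Config.QDRANT_PORT)
        self.collection_name = Config.COLLECTION_NAME
        if not self.client.collection_exists(self.collection_name):
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
            )

    def ingest_embeddings(self, chunks: List[Dict[str, Any]]) -> None:
        # Upsert one point per chunk; the payload keeps the raw text for retrieval.
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=chunk["embedding"],
                payload={"content": chunk["content"], "source_file": chunk.get("source_file")},
            )
            for chunk in chunks
        ]
        self.client.upsert(collection_name=self.collection_name, points=points)

    def search(self, query_embedding: List[float], top_k: int) -> List[Dict[str, Any]]:
        # Return plain dicts so callers are not coupled to qdrant-client result types.
        hits = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding,
            limit=top_k,
        )
        return [{"content": hit.payload.get("content"), "score": hit.score} for hit in hits]

Returning plain dicts here is a deliberate choice in the sketch: it lets the WebSocket handler and the reranker consume search results without importing Qdrant types.
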
@@ -66,27 +65,26 @@ async def handle_search(websocket: WebSocket, query: str) -> None:

     filename = find_file_names(query, database_files)

-    if filename:
-        logger.info("Searching for file names...")
+    query_embeddings = embedding_client.generate_embeddings(query)

-        filters = MetadataFilters(filters=[ExactMatchFilter(key="source_file", value=filename)])
-        relevant_nodes = index.as_retriever(filters=filters).retrieve(query)
-        if not relevant_nodes:
-            logger.info("Searching without file name filter....")
-            relevant_nodes = retriever.retrieve(query)
-    else:
-        logger.info("Searching without file names....")
-        relevant_nodes = retriever.retrieve(query)
+    top_5_results = qdrant_client.search(query_embeddings, 5)
+    logger.info("Retrieved top 5 results")

-
-    context = [node.text for node in relevant_nodes]
-
-    reranked_docs = reranker.rerank_docs(query, context)
+    if not top_5_results:
+        logger.warning("No results found in database")
+        await websocket.send_json({
+            "result": "The database is empty. Please ingest some data first before searching."
+        })
+        return


-    # only top 2 documents are passing as a context
-    response, conversation_id = chatbot.chat(query, reranked_docs[:2])

+    reranked_docs = reranker.rerank_docs(query, top_5_results)
+    reranked_top_5_list = [item['content'] for item in reranked_docs]

+    context = reranked_top_5_list[:2]
+
+    # only the top 2 reranked documents are passed as context
+    response, conversation_id = chatbot.chat(query, context)

     logger.info("Generating response from Groq")

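
Note on the handler's reranking contract: the new code assumes reranker.rerank_docs(query, top_5_results) returns a list of dicts exposing a "content" key, from which only the two strongest passages are passed to the chatbot. src/reranker/re_ranking.py is also outside this diff, so the sketch below shows one conventional cross-encoder implementation that satisfies that contract; the model name, the "rerank_score" field, and the expected hit shape are assumptions rather than the project's actual code.

# Hypothetical sketch of src/reranker/re_ranking.py -- the real module is not shown in this diff.
# Assumes retrieved hits are dicts with a "content" key, matching the search sketch above.
from typing import Any, Dict, List

from sentence_transformers import CrossEncoder


class RerankDocuments:
    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2") -> None:
        # A small cross-encoder scores (query, document) pairs jointly.
        self.model = CrossEncoder(model_name)

    def rerank_docs(self, query: str, hits: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Score every hit against the query and sort best-first, so the handler's
        # [:2] slice picks the two most relevant passages as chat context.
        pairs = [(query, hit["content"]) for hit in hits]
        scores = self.model.predict(pairs)
        ranked = sorted(zip(hits, scores), key=lambda item: item[1], reverse=True)
        return [{"content": hit["content"], "rerank_score": float(score)} for hit, score in ranked]

Limiting the context to the top two reranked documents keeps the prompt to the chat model short; if answers look truncated or under-informed, widening that slice is the first knob to try.
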
|
|