Commit 913db03

Merge pull request #8 from cyber-evangelists/dev-branch
Removed index with qdrant
2 parents f5d0d59 + 58d438b commit 913db03

File tree: 12 files changed (+95, −203 lines)

client.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -98,7 +98,7 @@ async def record_feedback(feedback, msg ) -> gr.Info:
 
 
 with gr.Blocks(
-    title="CAPEC RAG Chatbot",
+    title="EASM RAG Chatbot",
     theme=gr.themes.Soft(),
     css="""
     .gradio-container {
@@ -171,7 +171,7 @@ async def record_feedback(feedback, msg ) -> gr.Info:
 
     # Header
     gr.Markdown(
-        "<div id='header'>CAPEC RAG Application</div>"
+        "<div id='header'>EASM RAG Application</div>"
    )
 
    # Chatbot Component
```
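The change is purely cosmetic: it retitles the Gradio browser tab and the page header from CAPEC to EASM. A minimal sketch of the retitled shell, assuming the surrounding layout of client.py (the CSS body and chat wiring here are placeholders, not the real ones):

```python
import gradio as gr

# Minimal sketch of the retitled UI shell; the real client.py also wires up
# the chatbot component and feedback handlers, omitted here.
with gr.Blocks(
    title="EASM RAG Chatbot",  # browser-tab title (was "CAPEC RAG Chatbot")
    theme=gr.themes.Soft(),
    css="#header { font-size: 24px; font-weight: bold; }",  # placeholder CSS
) as demo:
    gr.Markdown("<div id='header'>EASM RAG Application</div>")  # was CAPEC

if __name__ == "__main__":
    demo.launch()
```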

requirements.txt

Lines changed: 0 additions & 3 deletions
```diff
@@ -7,10 +7,7 @@ langchain-core==0.3.18
 gradio
 loguru==0.7.2
 python-dotenv==1.0.1
-llama-index==0.11.21
-llama-index-vector-stores-qdrant==0.3.3
 langchain-groq==0.2.1
-llama-index-embeddings-huggingface==0.3.1
 langchain==0.3.7
 transformers==4.46.2
 torch==2.5.1
```
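With llama-index, its Qdrant vector-store adapter, and its HuggingFace embedding plugin dropped, embedding and chunk handling are done directly by the project's own EmbeddingWrapper (src/embedder/embedder.py, added below) together with the existing QdrantWrapper.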

server.py

Lines changed: 8 additions & 5 deletions
```diff
@@ -1,16 +1,13 @@
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from loguru import logger
 from src.utils.utils import find_file_names
-from llama_index.core.vector_stores.types import MetadataFilters, ExactMatchFilter
 
 from typing import Dict, Any, List, Optional
 
 from src.config.config import Config
 from src.qdrant.qdrant_utils import QdrantWrapper
-from src.embedder.embedder_llama_index import EmbeddingWrapper
+from src.embedder.embedder import EmbeddingWrapper
 from src.parser.csv_parser import CsvParser
-from llama_index.core import Settings
-Settings.llm = None
 
 from src.utils.connections_manager import ConnectionManager
 from src.chatbot.rag_chat_bot import RAGChatBot
@@ -28,6 +25,10 @@
 
 try:
 
+    qdrant_client.delete_collection(collection_name=collection_name)
+    logger.info("collection deleted...")
+    qdrant_client._create_collection_if_not_exists()
+    logger.info("Collection created....")
    processed_chunks = file_processor.process_directory()
    qdrant_client.ingest_embeddings(processed_chunks)
 
@@ -63,10 +64,12 @@ async def handle_search(websocket: WebSocket, query: str) -> None:
    try:
        logger.info(f"Processing search query")
 
-        filename = find_file_names(query, database_files)
+        # filename = find_file_names(query, database_files)
 
        query_embeddings = embedding_client.generate_embeddings(query)
 
+
+        logger.info("Searching for top 5 results....")
        top_5_results = qdrant_client.search(query_embeddings, 5)
        logger.info("Retrieved top 5 results")
```
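The new startup block wipes and recreates the collection on every boot, which guarantees the index carries no llama-index-era payloads but also means all embeddings are re-ingested at startup. QdrantWrapper's internals are not part of this diff; below is a minimal sketch of the same wipe-and-recreate pattern against the raw qdrant-client API, with the connection settings, collection name, and vector size as assumptions (384 matches all-MiniLM-L6-v2):

```python
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

client = QdrantClient(host="localhost", port=6333)  # assumed connection settings
collection_name = "easm_chunks"                     # hypothetical collection name

# Drop any existing collection so stale payload schemas disappear...
if client.collection_exists(collection_name=collection_name):
    client.delete_collection(collection_name=collection_name)

# ...then recreate it sized for all-MiniLM-L6-v2's 384-dimensional embeddings.
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)
```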

src/chatbot/rag_chat_bot.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -35,6 +35,7 @@ def __init__(self):
             model_name="llama-3.1-8b-instant",
             temperature=0,
             max_tokens=4096,
+            frequency_penalty=0.9
         )
 
         # Initialize memory
```
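frequency_penalty is an OpenAI-style sampling parameter that Groq's chat API also accepts: it penalizes tokens in proportion to how often they have already appeared, so 0.9 strongly discourages verbatim repetition in answers. A sketch of the updated constructor call, assuming the surrounding __init__ from rag_chat_bot.py:

```python
from langchain_groq import ChatGroq

# Only the frequency_penalty line is new in this commit; the other
# parameters are unchanged from the existing constructor.
llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    temperature=0,
    max_tokens=4096,
    frequency_penalty=0.9,  # penalize already-seen tokens to curb repetition
)
```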

src/docker-files/Dockerfile.client

Lines changed: 1 addition & 1 deletion
```diff
@@ -9,7 +9,7 @@ COPY client-requirements.txt .
 
 # Update pip and install dependencies
 RUN pip install --upgrade pip && \
-    pip install -r client-requirements.txt
+    pip install --default-timeout=5000 -r client-requirements.txt
 
 # Copy only the required files for the application
 COPY client.py .
```
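pip's --default-timeout flag is a socket timeout in seconds (the default is 15), so 5000 gives slow mirrors ample time to deliver large wheels such as torch before the build aborts; the same flag is added to the server image below.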

src/docker-files/Dockerfile.server

Lines changed: 2 additions & 1 deletion
```diff
@@ -7,8 +7,9 @@ COPY requirements.txt .
 
 # Update pip and install dependencies
 RUN pip install --upgrade pip && \
-    pip install -r requirements.txt
+    pip install --default-timeout=5000 -r requirements.txt
 
+
 COPY server.py .
 
 # Set Python to run in unbuffered mode
```

src/embedder/embedder.py

Lines changed: 21 additions & 0 deletions
```diff
@@ -0,0 +1,21 @@
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from src.config.config import Config
+
+
+class EmbeddingWrapper:
+    def __init__(self, model_name='all-MiniLM-L6-v2'):
+        self.model = SentenceTransformer(Config.EMBEDDING_MODEL_PATH)
+
+    def generate_embeddings(self, texts):
+        """
+        Generate embeddings for a list of texts.
+
+        Args:
+            texts (list): A list of strings to generate embeddings for.
+
+        Returns:
+            numpy.ndarray: A 2D array of embeddings, where each row corresponds to a text input.
+        """
+        embeddings = self.model.encode(texts)
+        return np.array(embeddings)
```
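Note that the model_name parameter is accepted but never used: the constructor always loads Config.EMBEDDING_MODEL_PATH. A hypothetical usage sketch, assuming that path resolves to a sentence-transformers model such as all-MiniLM-L6-v2:

```python
from src.embedder.embedder import EmbeddingWrapper

embedder = EmbeddingWrapper()
vectors = embedder.generate_embeddings(
    ["What is external attack surface management?"]
)
print(vectors.shape)  # (1, 384) when the model is all-MiniLM-L6-v2
```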

src/embedder/embedder_llama_index.py

Lines changed: 0 additions & 57 deletions
This file was deleted.

src/parser/csv_parser.py

Lines changed: 37 additions & 30 deletions
```diff
@@ -1,37 +1,44 @@
 
 import pandas as pd
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, TypedDict
 from pathlib import Path
 import numpy as np
-from llama_index.core.schema import Document
-from llama_index.core.node_parser import SentenceSplitter
+from src.embedder.embedder import EmbeddingWrapper
 
 from datetime import datetime
 from dataclasses import dataclass
-
 from loguru import logger
-
 from src.config.config import Config
 
+
 @dataclass
-class DocumentMetadata:
+class DocumentMetadata(TypedDict):
     """Class to store document metadata"""
     source_file: str
     ingestion_timestamp: str
     last_updated_timestamp: str
     embedding_version: str
-    embedding_model: str
+    embedding_model_name: str
     processing_status: str
 
 
+
+class ProcessedChunk(TypedDict):
+    """Type definition for processed file chunks."""
+    embeddings: List[float]
+    text: str
+    metadata: str
+
+
+
 class CsvParser:
 
-    def __init__(self, data_dir: str, embedding_version: str = Config.EMBEDDING_VERSION_NUMBER, embedding_model: str = Config.EMBEDDING_MODEL) -> None:
+    def __init__(self, data_dir: str, embedding_version: str = Config.EMBEDDING_VERSION_NUMBER, embedding_model_name: str = Config.EMBEDDING_MODEL) -> None:
        self.data_dir = Path(data_dir)
        self.embedding_version = embedding_version
-        self.embedding_model = embedding_model
-        self.node_parser = SentenceSplitter(chunk_size=1200, chunk_overlap=200)
-
+        self.embedding_model_name = embedding_model_name
+        self.embedder = EmbeddingWrapper()
+        self.chunks: List[ProcessedChunk] = []
 
    def create_document_metadata(self, row: pd.Series, file_name: str,) -> DocumentMetadata:
        """Create comprehensive document metadata"""
@@ -42,7 +49,7 @@ def create_document_metadata(self, row: pd.Series, file_name: str,) -> DocumentM
            ingestion_timestamp=current_time,
            last_updated_timestamp=current_time,
            embedding_version=self.embedding_version,
-            embedding_model=self.embedding_model,  # In practice, this might be different
+            embedding_model_name=self.embedding_model_name,  # In practice, this might be different
            processing_status="processed",
        )
 
@@ -68,35 +75,35 @@ def read_file(self, file_path: Path) -> pd.DataFrame:
        return df
 
 
-    def process_file(self, file_path: Path) -> List[Document]:
+    def process_file(self, file_path: Path) -> None:
        """Process a single CSV file with enhanced metadata and version control"""
        try:
            logger.info(f"Processing file: {file_path}")
 
            # Read CSV file
            df = self.read_file(file_path)
 
-            documents = []
            for _, row in df.iterrows():
                # Combine text fields
                text_content = self.get_text(row)
 
                # Create comprehensive metadata
                metadata = self.create_document_metadata(row, file_path.name)
-
+                embeddings = self.embedder.generate_embeddings(text_content)
+
+
                # Create Document object with enhanced metadata
-                doc = Document(
-                    text=text_content,
-                    metadata=metadata.__dict__
-                )
-
-                nodes = self.node_parser.get_nodes_from_documents([doc])
-                documents.extend(
-                    [Document(text=node.text, metadata=node.metadata) for node in nodes]
-                )
+                doc : ProcessedChunk = {
+                    "embeddings": embeddings,
+                    "text": text_content,
+                    "metadata": "metadata"
+                }
+
+
+                self.chunks.append(doc)
 
-            logger.info(f"Successfully processed {len(documents)} documents from {file_path}")
-            return documents
+            logger.info(f"Successfully processed all documents from {file_path}")
+
 
        except Exception as e:
            logger.error(f"Error processing file {file_path}: {str(e)}")
@@ -126,18 +133,18 @@ def get_text(self, row: pd.Series) -> str:
        return " | ".join(text_parts)
 
 
-    def process_directory(self) -> List[Document]:
+    def process_directory(self) -> List[Dict[str, Any]]:
        """Process all CSV files in directory"""
        all_documents = []
 
        logger.info("Attempting to read all .csv files and indexing....")
        for file_path in self.data_dir.glob('*.csv'):
            try:
-                documents = self.process_file(file_path)
-                all_documents.extend(documents)
+                self.process_file(file_path)
            except Exception as e:
                logger.error(f"Skipping file {file_path} due to error: {str(e)}")
                continue
+
 
        logger.info("All .csv files processed. Returning chunks...")
-        return all_documents
+        return self.chunks
```
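process_directory() now returns the accumulated ProcessedChunk dicts (note that the metadata field currently stores the literal string "metadata" rather than the DocumentMetadata built two lines earlier), which server.py hands to qdrant_client.ingest_embeddings. That wrapper method is not shown in this diff; below is a hypothetical sketch of how such chunks could be upserted with the raw qdrant-client API, with the function and collection names as assumptions:

```python
from uuid import uuid4
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct

# Hypothetical ingest_embeddings-style upsert for the ProcessedChunk dicts
# produced above; QdrantWrapper's real implementation is not in this diff.
def ingest_embeddings(client: QdrantClient, collection_name: str, chunks) -> None:
    points = [
        PointStruct(
            id=str(uuid4()),
            vector=chunk["embeddings"].tolist(),  # numpy array -> plain list
            payload={"text": chunk["text"], "metadata": chunk["metadata"]},
        )
        for chunk in chunks
    ]
    client.upsert(collection_name=collection_name, points=points)
```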
