cyber-evangelists
diff --git a/‎capec-dataset/test.py‎
Lines changed: 30 additions & 0 deletions b/‎capec-dataset/test.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎client.py‎
Lines changed: 76 additions & 44 deletions b/‎client.py‎
Lines changed: 76 additions & 44 deletions
diff --git a/‎server.py‎
Lines changed: 30 additions & 4 deletions b/‎server.py‎
Lines changed: 30 additions & 4 deletions
diff --git a/‎src/index/index/collection/capec-collection-v1/storage.sqlite‎
-57 MB b/‎src/index/index/collection/capec-collection-v1/storage.sqlite‎
-57 MB
diff --git a/‎src/index/index/default__vector_store.json‎
Lines changed: 1 addition & 0 deletions b/‎src/index/index/default__vector_store.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/index/index/docstore.json‎
Lines changed: 1 addition & 1 deletion b/‎src/index/index/docstore.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/index/index/index_store.json‎
Lines changed: 1 addition & 1 deletion b/‎src/index/index/index_store.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/index/index/metadata.json‎
Lines changed: 5 additions & 0 deletions b/‎src/index/index/metadata.json‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/parser/csv_parser.py‎
Lines changed: 25 additions & 6 deletions b/‎src/parser/csv_parser.py‎
Lines changed: 25 additions & 6 deletions
@@ -0,0 +1,30 @@
+
+import pandas as pd
+from pathlib import Path
+
+
+
+# Read a CSV, unquote its header labels, move the index into the first column and drop the last column.
+def read_file( file_path: Path) -> pd.DataFrame:
+    df = pd.read_csv(file_path, 
+            sep=',',
+            encoding='utf-8',
+            skipinitialspace=True, index_col=None)
+    # Strip leading/trailing single and double quote characters from every column label.
+    df.columns = df.columns.map(lambda x: x.strip("'\"")) 
+    df_reset = df.reset_index(drop=False)
+    # df_reset now carries the former index as an extra leading column; df itself is unchanged.
+    col_names = df.columns
+    # NOTE(review): the next assignment writes back the labels just read from df, so it looks like a no-op.
+    df.columns = col_names
+    # Discard the last column so the column count matches the number of header labels again.
+    df = df_reset.iloc[:, :-1]
+    # Relabel with the header; presumably this realigns rows that end in a trailing delimiter - TODO confirm.
+    df.columns = col_names
+    
+    return df
+
+
+df = read_file("333.csv")
+# Print the resulting column labels; note that a plain str path is passed although read_file annotates Path.
+print(df.columns)
@@ -1,4 +1,3 @@
-# client.py
 import gradio as gr
 import websockets
 import json
@@ -117,7 +116,7 @@ async def handle_request(
         try:
 
             logger.info("Ensuring Connection....")
-            await ws_client.ensure_connection()
+            await self.ensure_connection()
 
             result = await self._handle_websocket_communication(action, payload)
             return  result
@@ -176,10 +175,9 @@ async def _handle_websocket_communication(
         except Exception as e:
             logger.error(f"Communication error: {e}")
             return "", [(payload.get("query", ""), f"Communication error: {str(e)}")]
-        
 
 
-# Create WebSocket client instance
+
 ws_client = WebSocketClient(Config.WEBSOCKET_URI)
 
 
@@ -216,50 +214,84 @@ def clear_chat() -> Optional[List[Tuple[str, str]]]:
 
 # Create Gradio interface
 with gr.Blocks(
-            title="CAPEC Chatbot",
-            theme=gr.themes.Soft(),
-            css=".gradio-container {max-width: 800px; margin: auto}"
-        ) as demo:
-            gr.Markdown("""
-            # ASM Chatbot
-            Ask questions about CAPEC Dataset and get detailed responses.
-            """)
-
-            with gr.Row():
-                msg = gr.Textbox(
-                    label="Type your message here...",
-                    placeholder="Enter your query",
-                    show_label=True,
-                    container=True,
-                    scale=8
-                )
-
-            with gr.Row():
-                search_btn = gr.Button("Search", variant="primary", scale=2)
-                clear_btn = gr.Button("Clear", variant="secondary", scale=1)
-                status_box = gr.Textbox(visible=False)
+        title="Capec Chatbot",
+        theme=gr.themes.Soft(),
+        css="""
+            .gradio-container {
+                max-width: 700px; 
+                margin: auto; 
+                font-family: Arial, sans-serif;
+            }
+            #header {
+                text-align: center; 
+                font-size: 1.5rem; 
+                font-weight: bold; 
+                color: #008080; 
+                padding: 0.125rem;
+            }
+            #input-container {
+                display: flex; 
+                align-items: center;
+                background-color: #f7f7f8;
+                padding: 0.25rem; 
+                border-radius: 8px;
+                margin-top: 0.25rem;
+            }
+            #chatbot {
+                border: 1px solid #E5E7EB;
+                border-radius: 8px;
+                background-color: #FFFFFF;
+                box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
+            }
+            .gr-button-primary {
+                background-color: #008080;
+                border-color: #008080;
+            }
+            .gr-button-primary:hover {
+                background-color: #006666;
+            }
+        """
+    ) as demo:
 
+    # Header
+    gr.Markdown(
+        "<div id='header'>CAPEC RAG Application</div>"
+    )
 
-            chatbot = gr.Chatbot(
-                height=400,
-                show_label=False,
-                container=True,
-                elem_id="chatbot"
-            )
+    # Chatbot Component
+    chatbot = gr.Chatbot(
+        height=450,
+        show_label=False,
+        container=True,
+        elem_id="chatbot"
+    )
 
-            search_btn.click(
-                fn=search_click,
-                inputs=[msg, chatbot],
-                outputs=[msg, chatbot]
-            )
+    # Chat Input Row
+    with gr.Row(elem_id="input-container"):
+        msg = gr.Textbox(
+            placeholder="Type a message...",
+            show_label=False,
+            container=False,
+            lines=1,
+            scale=10,
+        )
+        send_button = gr.Button("Send", variant="primary", scale=1)
+        clear_button = gr.Button("Clear Chat", variant="secondary")
+
+    # Button Functionality
+    send_button.click(
+        fn=search_click,
+        inputs=[msg, chatbot],
+        outputs=[msg, chatbot]
+    )
+    clear_button.click(
+        fn=clear_chat,
+        inputs=[],
+        outputs=[chatbot]
+    )
 
-            clear_btn.click(
-                fn=clear_chat,
-                inputs=[],
-                outputs=[chatbot]
-            )
 
-            
+    
 
 if __name__ == "__main__":
     server_name = Config.GRADIO_SERVER_NAME
@@ -269,4 +301,4 @@ def clear_chat() -> Optional[List[Tuple[str, str]]]:
         server_port=server_port,
         share=False,
         debug=True,
-        show_error=True,)
+        show_error=True,)
@@ -1,5 +1,7 @@
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Depends
 from loguru import logger
+from src.utils.utils import find_file_names
+from llama_index.core.vector_stores.types import MetadataFilters, ExactMatchFilter
 
 
 import asyncio
@@ -22,6 +24,8 @@
 from src.utils.connections_manager import ConnectionManager
 from src.config.config import Config
 
+from llama_index.core import  StorageContext
+
 import os
 
 app = FastAPI()
@@ -35,13 +39,23 @@
 
 embedding_client = EmbeddingWrapper()
 
+# data_dir = Config.CAPEC_DATA_DIR
+
+# storage_context = StorageContext.from_defaults()
+
+# csvParser = CsvParser(data_dir)
+# documents = csvParser.process_directory()
+# index = qdrantManager.create_and_persist_index(documents, storage_context, embedding_client, Config.PERSIST_DIR)
+
 index = qdrantManager.load_index(persist_dir=Config.PERSIST_DIR, embed_model=embedding_client)
 
 retriever = VectorIndexRetriever(
             index=index,
             similarity_top_k=5
         )
 
+# Manually added file names of the CAPEC dataset. In production, these files will be fetched from the database.
+database_files = ["333.csv", "658.csv", "659.csv", "1000.csv", "3000.csv"]
 
 # Create the connection manager instance
 connection_manager = ConnectionManager(max_connections=Config.MAX_CONNECTIONS)
@@ -65,13 +79,23 @@ async def handle_search(websocket: WebSocket, query: str) -> None:
     try:
         logger.info(f"Processing search query: {query}")
 
-        # Generate embeddings
-        logger.info("Retrieving Relevant nodes")
-        relevant_nodes = retriever.retrieve(query)
+        filename = find_file_names(query, database_files)
+
+        if filename:
+            logger.info("Searching for file names...")
+
+            filters = MetadataFilters(filters=[ExactMatchFilter(key="source_file", value=filename)])
+            relevant_nodes =  index.as_retriever(filters=filters).retrieve(query)
+            if not relevant_nodes:
+                logger.info("Searching without file name filter....")
+                relevant_nodes = retriever.retrieve(query)
+        else:
+            logger.info("Searching without file names....")
+            relevant_nodes = retriever.retrieve(query)
 
         context = [node.text for node in relevant_nodes]
 
-        # Only attaching top 2 results
+        logger.info(context[:2])
         prompt = prepare_prompt(query, context[:2])
 
         # Generate response using Groq
@@ -130,3 +154,5 @@ async def websocket_endpoint(websocket: WebSocket) -> None:
     finally:
         connection_manager.disconnect(websocket)
 
+
+
@@ -0,0 +1,5 @@
+{
+  "saved_at": "2024-11-08T13:47:52.662402",
+  "index_name": "CAPEC-INDEX",
+  "num_nodes": 1794
+}
@@ -9,12 +9,10 @@
 from datetime import datetime
 from dataclasses import dataclass
 
-
 from loguru import logger
 
 from src.config.config import Config
 
-
 @dataclass
 class DocumentMetadata:
     """Class to store document metadata"""
@@ -49,13 +47,34 @@ def create_document_metadata(self, row: pd.Series, file_name: str,) -> DocumentM
         )
 
 
+    # Read a CSV, unquote its header labels, move the index into the first column and drop the last column.
+    def read_file(self, file_path: Path) -> pd.DataFrame:
+        df = pd.read_csv(file_path, 
+                sep=',',
+                encoding='utf-8',
+                skipinitialspace=True, index_col=None)
+        # Strip leading/trailing single and double quote characters from every column label.
+        df.columns = df.columns.map(lambda x: x.strip("'\"")) 
+        df_reset = df.reset_index(drop=False)
+        # df_reset now carries the former index as an extra leading column; df itself is unchanged.
+        col_names = df.columns
+        # NOTE(review): the next assignment writes back the labels just read from df, so it looks like a no-op.
+        df.columns = col_names
+        # Discard the last column so the column count matches the number of header labels again.
+        df = df_reset.iloc[:, :-1]
+        # Relabel with the header; presumably this realigns rows that end in a trailing delimiter - TODO confirm.
+        df.columns = col_names
+        
+        return df
+
+
     def process_file(self, file_path: Path) -> List[Document]:
         """Process a single CSV file with enhanced metadata and version control"""
         try:
             logger.info(f"Processing file: {file_path}")
 
             # Read CSV file
-            df = pd.read_csv(file_path)
+            df = self.read_file(file_path)
 
             documents = []
             for _, row in df.iterrows():
@@ -98,11 +117,11 @@ def get_text(self, row: pd.Series) -> str:
         text_parts = []
 
         # Process each column in the row
-        for col in row.index:
-            cleaned_text = str(row[col]).strip() if pd.notna(row[col]) else ""
+        for col, value in row.items():  # Change here to access both col and value
+            cleaned_text = str(value).strip() if pd.notna(value) else ""
             if cleaned_text:  # Only include non-empty values
                 text_parts.append(f"{col}: {cleaned_text}")
-        
+
         # Join all parts with a separator
         return " | ".join(text_parts)