291291
# ------------- CONSTANTS -------------

# Cortex LLM models the user can choose from in the UI.
LLM_OPTIONS = ["mistral-7b", "snowflake-arctic", "mixtral-8x7b"]

# Fully qualified table names: qualifying every table with database and
# schema avoids depending on the session's current database/schema, which
# may differ between connections (the root cause of "table not found"
# errors this change fixes).
DB_SCHEMA = "AIFAQ_VERSION1_DB.APP_SCHEMA"
DOCS_TBL = f"{DB_SCHEMA}.DOCUMENTS"
CHUNKS_TBL = f"{DB_SCHEMA}.CHUNKS"
EMB_TBL = f"{DB_SCHEMA}.EMBEDDINGS"
CHAT_TBL = f"{DB_SCHEMA}.CHAT_HISTORY"
298301
299302# Supported file extensions
300303SUPPORTED_EXTENSIONS = {
303306 "csv" : "CSV" ,
304307 "xlsx" : "Excel" ,
305308 "xls" : "Excel" ,
309+ "txt" : "TEXT" ,
306310}
307311
308312# Performance tuning
@@ -349,10 +353,10 @@ def sanitize(txt: str) -> str:
349353
350354# ------------- FULL EXTRACTION -------------
351355def extract_pdf_full (f ):
352- """Full PDF extraction"""
356+ """Full PDF extraction using pypdf (Snowflake compatible) """
353357 try :
354- from PyPDF2 import PdfReader
355-
358+ from pypdf import PdfReader
359+
356360 reader = PdfReader (f )
357361 text_parts = []
358362 # Read ALL pages
@@ -411,7 +415,6 @@ def extract_excel_full(f):
411415 df = pd .read_excel (excel_file , sheet_name = sheet_name )
412416
413417 # Convert the entire DataFrame to string *without* truncation
414- # This ensures every character from the Excel file is read
415418 with pd .option_context (
416419 "display.max_rows" ,
417420 None ,
@@ -430,6 +433,23 @@ def extract_excel_full(f):
430433 return ""
431434
432435
def extract_txt_full(f):
    """Full TXT file extraction.

    Reads the entire uploaded file-like object and returns its text.
    Bytes content is decoded as UTF-8 first, falling back to latin-1
    (which cannot fail, since every byte maps to a latin-1 code point).
    Text-mode streams are returned as-is instead of tripping the broad
    exception handler and silently yielding "".

    Returns "" (after showing a Streamlit error) on any other failure.
    """
    try:
        # Reset the pointer in case the stream was read earlier.
        f.seek(0)
        content = f.read()
        # Only decode when we actually got bytes; some file-like objects
        # (e.g. text-mode streams) already yield str.
        if isinstance(content, bytes):
            try:
                content = content.decode("utf-8")
            except UnicodeDecodeError:
                content = content.decode("latin-1")
        return content
    except Exception as e:
        st.error(f"TXT extraction error: {e}")
        return ""
451+
452+
433453def extract_file_content (file , file_type ):
434454 """Extract content based on file type - FULL versions"""
435455 if file_type .upper () == "PDF" :
@@ -440,6 +460,8 @@ def extract_file_content(file, file_type):
440460 return extract_csv_full (file )
441461 elif file_type .upper () == "EXCEL" :
442462 return extract_excel_full (file )
463+ elif file_type .upper () == "TEXT" :
464+ return extract_txt_full (file )
443465 else :
444466 return ""
445467
@@ -469,25 +491,21 @@ def store_document_fast(
469491) -> bool :
470492 """
471493 OPTIMIZED: Store document, chunks, and embeddings using server-side processing.
472- NOW WITH IMMEDIATE COMMIT for instant knowledge base update.
473494 """
474495 doc_id = str (uuid .uuid4 ())
475496 try :
476- # 1. Insert document (single operation) with immediate commit
497+ # 1. Insert document
477498 session .sql (
478499 f"INSERT INTO { DOCS_TBL } (DOC_ID,FILENAME,FILE_TYPE,FILE_SIZE,IS_PUBLIC,UPLOADED_AT) "
479500 f"VALUES ('{ sanitize (doc_id )} ','{ sanitize (filename )} ','{ src_type } ',{ len (content )} ,"
480501 f"{ public } ,CURRENT_TIMESTAMP())"
481502 ).collect ()
482503
483- # FORCE COMMIT - Critical for immediate visibility
484- session .sql ("COMMIT" ).collect ()
485-
486504 # 2. Create and batch insert chunks
487505 chunks = chunk_text (content )
488506 if not chunks :
489507 st .warning (f"⚠️ { filename } : Document stored but no content to chunk" )
490- return True # Document stored, but no content to chunk
508+ return True
491509
492510 # Prepare chunk data
493511 chunk_data = []
@@ -508,15 +526,14 @@ def store_document_fast(
508526 chunks_df = pd .DataFrame (chunk_data )
509527 session .write_pandas (
510528 chunks_df ,
511- table_name = CHUNKS_TBL ,
529+ table_name = "CHUNKS" ,
530+ database = "AIFAQ_VERSION1_DB" ,
531+ schema = "APP_SCHEMA" ,
512532 auto_create_table = False ,
513533 overwrite = False ,
514534 quote_identifiers = False ,
515535 )
516536
517- # FORCE COMMIT after chunks
518- session .sql ("COMMIT" ).collect ()
519-
520537 # 3. Generate and store embeddings
521538 session .sql (
522539 f"""
@@ -531,9 +548,6 @@ def store_document_fast(
531548 """
532549 ).collect ()
533550
534- # FORCE COMMIT after embeddings
535- session .sql ("COMMIT" ).collect ()
536-
537551 # Verify the document was stored
538552 verify = session .sql (
539553 f"SELECT COUNT(*) as cnt FROM { DOCS_TBL } WHERE DOC_ID='{ sanitize (doc_id )} '"
@@ -561,7 +575,6 @@ def store_document_fast(
561575 session .sql (
562576 f"DELETE FROM { DOCS_TBL } WHERE DOC_ID='{ sanitize (doc_id )} '"
563577 ).collect ()
564- session .sql ("COMMIT" ).collect () # Commit cleanup
565578 except Exception as cleanup_e :
566579 st .error (f"❌ Cleanup failed for { filename } : { str (cleanup_e )} " )
567580 return False
@@ -677,7 +690,7 @@ def load_chat_history(session_id: str):
677690 ext = file .name .split ("." )[- 1 ].lower ()
678691 file_type = SUPPORTED_EXTENSIONS .get (ext , "TEXT" )
679692
680- # Extract content - this now reads 100% of all file types
693+ # Extract content
681694 content = extract_file_content (file , file_type )
682695
683696 if content :
@@ -687,12 +700,12 @@ def load_chat_history(session_id: str):
687700 st .success (f"✅ { file .name } uploaded successfully!" )
688701 success_count += 1
689702
690- # CRITICAL: Force immediate rerun after each successful upload
703+ # Force immediate rerun after each successful upload
691704 st .session_state .kb_refresh = True
692705 st .session_state .last_upload_time = time .time ()
693706 progress_bar .progress ((idx + 1 ) / len (files ))
694- time .sleep (0.3 ) # Brief pause to show success message
695- st .rerun () # Immediate refresh to update knowledge base
707+ time .sleep (0.3 )
708+ st .rerun ()
696709 else :
697710 with status_container :
698711 st .error (f"❌ { file .name } failed to upload" )
@@ -704,7 +717,6 @@ def load_chat_history(session_id: str):
704717
705718 progress_bar .progress ((idx + 1 ) / len (files ))
706719
707- # This code only runs if all files processed without triggering rerun
708720 progress_bar .empty ()
709721
710722 if success_count > 0 :
@@ -728,7 +740,7 @@ def load_chat_history(session_id: str):
728740 st .success ("✅ Knowledge base updated!" )
729741 st .session_state .kb_refresh = False
730742
731- # Force fresh database query - this ensures latest data
743+ # Force fresh database query
732744 df = get_user_docs ()
733745
734746 if df .empty :
@@ -778,7 +790,7 @@ def load_chat_history(session_id: str):
778790 chats = session .sql (
779791 f"SELECT DISTINCT SESSION_ID, LEFT(QUERY_TEXT,40) TITLE, MAX(QUERY_TIMESTAMP) TS "
780792 f"FROM { CHAT_TBL } "
781- f"GROUP BY SESSION_ID, QUERY_TEXT ORDER BY TS DESC LIMIT 15 "
793+ f"GROUP BY SESSION_ID, QUERY_TEXT ORDER BY TS DESC"
782794 ).to_pandas ()
783795
784796 if not chats .empty :
@@ -790,7 +802,6 @@ def load_chat_history(session_id: str):
790802 use_container_width = True ,
791803 ):
792804 st .session_state .current = c ["SESSION_ID" ]
793- # Load chat history from database
794805 loaded_messages = load_chat_history (c ["SESSION_ID" ])
795806 st .session_state .sessions [c ["SESSION_ID" ]] = {
796807 "title" : btn_label ,
@@ -854,7 +865,7 @@ def load_chat_history(session_id: str):
854865
855866messages = st .session_state .sessions [st .session_state .current ]["messages" ]
856867
857- # Display messages - Beautiful modern design with enhanced styling
868+ # Display messages
858869chat_container = st .container ()
859870with chat_container :
860871 if not messages :
@@ -937,9 +948,8 @@ def load_chat_history(session_id: str):
937948 unsafe_allow_html = True ,
938949 )
939950
940- # Input form - Beautiful modern design with enhanced styling
951+ # Input form
941952with st .form ("chat_form" , clear_on_submit = True ):
942- # Enhanced input styling
943953 st .markdown (
944954 f"""
945955 <style>
@@ -975,7 +985,7 @@ def load_chat_history(session_id: str):
975985 key = "chat_input" ,
976986 )
977987
978- # Action buttons with enhanced design
988+ # Action buttons
979989 col1 , col2 , col3 = st .columns ([6 , 1.5 , 1 ])
980990 with col1 :
981991 st .markdown (
@@ -1037,7 +1047,7 @@ def load_chat_history(session_id: str):
10371047 )
10381048 context = "\n \n ---\n \n " .join (context_parts )
10391049
1040- # Generate response using LLM with source tracking
1050+ # Generate response using LLM
10411051 prompt_llm = (
10421052 f"You are a helpful assistant that answers questions based on the provided knowledge base context. "
10431053 f"Answer the user's question using only the information from the provided knowledge base. "
@@ -1058,27 +1068,23 @@ def load_chat_history(session_id: str):
10581068 else "The LLM service didn't return a response."
10591069 )
10601070
1061- # Cite only the PRIMARY source (highest similarity score)
1062- # Group by filename and get the highest similarity for each document
1071+ # Cite sources
10631072 source_scores = (
10641073 rows .groupby ("FILENAME" )["SIMILARITY_SCORE" ]
10651074 .max ()
10661075 .sort_values (ascending = False )
10671076 )
10681077
1069- # Get top source(s) - only cite documents with similarity > 0.5 (strong match)
10701078 top_sources = []
10711079 for filename , score in source_scores .items ():
1072- if score > 0.5 : # Strong relevance threshold
1080+ if score > 0.5 :
10731081 top_sources .append (filename )
1074- if len (top_sources ) >= 2 : # Max 2 sources
1082+ if len (top_sources ) >= 2 :
10751083 break
10761084
1077- # If no strong matches, just use the top result
10781085 if not top_sources :
10791086 top_sources = [source_scores .index [0 ]]
10801087
1081- # Add source attribution - only the primary source(s) used
10821088 if len (top_sources ) == 1 :
10831089 sources_md = f"\n \n ---\n **Source:** { top_sources [0 ]} "
10841090 else :
@@ -1109,11 +1115,14 @@ def load_chat_history(session_id: str):
11091115 }
11101116 ]
11111117 ),
1112- table_name = CHAT_TBL ,
1118+ table_name = "CHAT_HISTORY" ,
1119+ database = "AIFAQ_VERSION1_DB" ,
1120+ schema = "APP_SCHEMA" ,
11131121 auto_create_table = False ,
11141122 overwrite = False ,
1123+ quote_identifiers = False ,
11151124 )
11161125 except Exception :
1117- pass # Chat history persistence is optional
1126+ pass
11181127
11191128 st .rerun ()
0 commit comments