 
 # ------------- CONSTANTS -------------
 LLM_OPTIONS = ["mistral-7b", "snowflake-arctic", "mixtral-8x7b"]
-DOCS_TBL = "DOCUMENTS"
-CHUNKS_TBL = "CHUNKS"
-EMB_TBL = "EMBEDDINGS"
-CHAT_TBL = "CHAT_HISTORY"
 
-# Supported file extensions - TXT ADDED
+# Fully qualified table names - the key fix: resolve tables explicitly instead of
+# relying on whatever database/schema the session currently has in scope
+DB_SCHEMA = "AIFAQ_VERSION1_DB.APP_SCHEMA"
+DOCS_TBL = f"{DB_SCHEMA}.DOCUMENTS"
+CHUNKS_TBL = f"{DB_SCHEMA}.CHUNKS"
+EMB_TBL = f"{DB_SCHEMA}.EMBEDDINGS"
+CHAT_TBL = f"{DB_SCHEMA}.CHAT_HISTORY"
+
+# Supported file extensions
 SUPPORTED_EXTENSIONS = {
     "pdf": "PDF",
     "pptx": "PowerPoint",
@@ -350,10 +353,10 @@ def sanitize(txt: str) -> str:
 
 # ------------- FULL EXTRACTION -------------
 def extract_pdf_full(f):
-    """Full PDF extraction"""
+    """Full PDF extraction using pypdf (Snowflake compatible)"""
     try:
-        from PyPDF2 import PdfReader
-
+        from pypdf import PdfReader
+
         reader = PdfReader(f)
         text_parts = []
         # Read ALL pages
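The import swap trades the deprecated PyPDF2 package for pypdf, its maintained successor; the reader API is unchanged. A condensed sketch of the page-by-page read the function performs (pdf_to_text is only an illustrative name; extract_text() can return None on image-only pages, hence the fallback):

    from pypdf import PdfReader

    def pdf_to_text(file_obj) -> str:
        reader = PdfReader(file_obj)
        # Walk every page; extract_text() may return None for image-only pages.
        return "\n".join(page.extract_text() or "" for page in reader.pages)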
@@ -412,7 +415,6 @@ def extract_excel_full(f):
         df = pd.read_excel(excel_file, sheet_name=sheet_name)
 
         # Convert the entire DataFrame to string *without* truncation
-        # This ensures every character from the Excel file is read
         with pd.option_context(
             "display.max_rows",
             None,
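For reference, the no-truncation pattern this Excel path relies on: pandas' display limits are lifted inside a context manager so to_string() emits every row and column instead of eliding them. A sketch with the options spelled out (the original call is partly cut off by the hunk, so the exact option list and the frame_to_full_text name are assumptions):

    import pandas as pd

    def frame_to_full_text(df: pd.DataFrame) -> str:
        # Raise pandas' display limits so nothing is elided from the output.
        with pd.option_context(
            "display.max_rows", None,
            "display.max_columns", None,
            "display.width", None,
            "display.max_colwidth", None,
        ):
            return df.to_string(index=False)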
@@ -489,25 +491,21 @@ def store_document_fast(
 ) -> bool:
     """
     OPTIMIZED: Store document, chunks, and embeddings using server-side processing.
-    NOW WITH IMMEDIATE COMMIT for instant knowledge base update.
     """
     doc_id = str(uuid.uuid4())
     try:
-        # 1. Insert document (single operation) with immediate commit
+        # 1. Insert document
         session.sql(
             f"INSERT INTO {DOCS_TBL} (DOC_ID,FILENAME,FILE_TYPE,FILE_SIZE,IS_PUBLIC,UPLOADED_AT) "
             f"VALUES ('{sanitize(doc_id)}','{sanitize(filename)}','{src_type}',{len(content)},"
             f"{public},CURRENT_TIMESTAMP())"
         ).collect()
 
-        # FORCE COMMIT - Critical for immediate visibility
-        session.sql("COMMIT").collect()
-
         # 2. Create and batch insert chunks
         chunks = chunk_text(content)
         if not chunks:
             st.warning(f"⚠️ {filename}: Document stored but no content to chunk")
-            return True  # Document stored, but no content to chunk
+            return True
 
         # Prepare chunk data
         chunk_data = []
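On the dropped COMMIT statements: Snowflake sessions default to AUTOCOMMIT = TRUE, so each statement executed through session.sql(...).collect() is committed as soon as it completes; the explicit COMMITs were redundant round trips rather than a visibility guarantee. If the setting is in doubt it can be checked directly (a one-off sketch, assuming the account default has not been overridden):

    # Under Snowflake's default, the "value" column of this SHOW output reads "true".
    print(session.sql("SHOW PARAMETERS LIKE 'AUTOCOMMIT' IN SESSION").collect())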
@@ -528,15 +526,14 @@ def store_document_fast(
         chunks_df = pd.DataFrame(chunk_data)
         session.write_pandas(
             chunks_df,
-            table_name=CHUNKS_TBL,
+            table_name="CHUNKS",
+            database="AIFAQ_VERSION1_DB",
+            schema="APP_SCHEMA",
             auto_create_table=False,
             overwrite=False,
             quote_identifiers=False,
         )
 
-        # FORCE COMMIT after chunks
-        session.sql("COMMIT").collect()
-
         # 3. Generate and store embeddings
         session.sql(
             f"""
@@ -551,9 +548,6 @@ def store_document_fast(
551548 """
552549 ).collect ()
553550
554- # FORCE COMMIT after embeddings
555- session .sql ("COMMIT" ).collect ()
556-
557551 # Verify the document was stored
558552 verify = session .sql (
559553 f"SELECT COUNT(*) as cnt FROM { DOCS_TBL } WHERE DOC_ID='{ sanitize (doc_id )} '"
@@ -581,7 +575,6 @@ def store_document_fast(
             session.sql(
                 f"DELETE FROM {DOCS_TBL} WHERE DOC_ID='{sanitize(doc_id)}'"
             ).collect()
-            session.sql("COMMIT").collect()  # Commit cleanup
         except Exception as cleanup_e:
             st.error(f"❌ Cleanup failed for {filename}: {str(cleanup_e)}")
         return False
@@ -697,7 +690,7 @@ def load_chat_history(session_id: str):
         ext = file.name.split(".")[-1].lower()
         file_type = SUPPORTED_EXTENSIONS.get(ext, "TEXT")
 
-        # Extract content - this now reads 100% of all file types including TXT
+        # Extract content
         content = extract_file_content(file, file_type)
 
         if content:
@@ -707,12 +700,12 @@ def load_chat_history(session_id: str):
                 st.success(f"✅ {file.name} uploaded successfully!")
                 success_count += 1
 
-                # CRITICAL: Force immediate rerun after each successful upload
+                # Force immediate rerun after each successful upload
                 st.session_state.kb_refresh = True
                 st.session_state.last_upload_time = time.time()
                 progress_bar.progress((idx + 1) / len(files))
-                time.sleep(0.3)  # Brief pause to show success message
-                st.rerun()  # Immediate refresh to update knowledge base
+                time.sleep(0.3)
+                st.rerun()
             else:
                 with status_container:
                     st.error(f"❌ {file.name} failed to upload")
@@ -724,7 +717,6 @@ def load_chat_history(session_id: str):
 
         progress_bar.progress((idx + 1) / len(files))
 
-    # This code only runs if all files processed without triggering rerun
     progress_bar.empty()
 
     if success_count > 0:
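A behavioural note on the loop above: st.rerun() stops the script immediately, so after the first successful file the rest of the batch and the trailing progress/summary code only run on later passes, and files may be re-processed on those passes unless they are tracked elsewhere (the deleted comment used to hint at this). If one rerun per file is not essential, a sketch of the simpler shape is to refresh once after the whole batch (process_and_store is a hypothetical helper standing in for the extract-and-store calls above):

    for idx, file in enumerate(files):
        if process_and_store(file):        # hypothetical per-file helper
            success_count += 1
        progress_bar.progress((idx + 1) / len(files))

    if success_count:
        st.session_state.kb_refresh = True
        st.rerun()                         # one refresh after the whole batch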
@@ -748,7 +740,7 @@ def load_chat_history(session_id: str):
         st.success("✅ Knowledge base updated!")
         st.session_state.kb_refresh = False
 
-    # Force fresh database query - this ensures latest data
+    # Force fresh database query
     df = get_user_docs()
 
     if df.empty:
@@ -810,7 +802,6 @@ def load_chat_history(session_id: str):
             use_container_width=True,
         ):
             st.session_state.current = c["SESSION_ID"]
-            # Load chat history from database
             loaded_messages = load_chat_history(c["SESSION_ID"])
             st.session_state.sessions[c["SESSION_ID"]] = {
                 "title": btn_label,
@@ -874,7 +865,7 @@ def load_chat_history(session_id: str):
 
 messages = st.session_state.sessions[st.session_state.current]["messages"]
 
-# Display messages - Beautiful modern design with enhanced styling
+# Display messages
 chat_container = st.container()
 with chat_container:
     if not messages:
@@ -957,9 +948,8 @@ def load_chat_history(session_id: str):
         unsafe_allow_html=True,
     )
 
-# Input form - Beautiful modern design with enhanced styling
+# Input form
 with st.form("chat_form", clear_on_submit=True):
-    # Enhanced input styling
     st.markdown(
         f"""
         <style>
@@ -995,7 +985,7 @@ def load_chat_history(session_id: str):
         key="chat_input",
     )
 
-    # Action buttons with enhanced design
+    # Action buttons
     col1, col2, col3 = st.columns([6, 1.5, 1])
     with col1:
         st.markdown(
@@ -1057,7 +1047,7 @@ def load_chat_history(session_id: str):
             )
             context = "\n\n---\n\n".join(context_parts)
 
-            # Generate response using LLM with source tracking
+            # Generate response using LLM
             prompt_llm = (
                 f"You are a helpful assistant that answers questions based on the provided knowledge base context. "
                 f"Answer the user's question using only the information from the provided knowledge base. "
@@ -1078,27 +1068,23 @@ def load_chat_history(session_id: str):
                 else "The LLM service didn't return a response."
             )
 
-            # Cite only the PRIMARY source (highest similarity score)
-            # Group by filename and get the highest similarity for each document
+            # Cite sources
            source_scores = (
                 rows.groupby("FILENAME")["SIMILARITY_SCORE"]
                 .max()
                 .sort_values(ascending=False)
             )
 
-            # Get top source(s) - only cite documents with similarity > 0.5 (strong match)
             top_sources = []
             for filename, score in source_scores.items():
-                if score > 0.5:  # Strong relevance threshold
+                if score > 0.5:
                     top_sources.append(filename)
-                    if len(top_sources) >= 2:  # Max 2 sources
+                    if len(top_sources) >= 2:
                         break
 
-            # If no strong matches, just use the top result
             if not top_sources:
                 top_sources = [source_scores.index[0]]
 
-            # Add source attribution - only the primary source(s) used
             if len(top_sources) == 1:
                 sources_md = f"\n\n---\n**Source:** {top_sources[0]}"
             else:
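The citation step reduces the retrieved chunks to at most two primary documents: take each file's best similarity, keep files scoring above 0.5, and fall back to the single best file when nothing clears the bar. A toy example of what source_scores ends up holding (filenames invented, column names as above):

    import pandas as pd

    rows = pd.DataFrame({
        "FILENAME": ["faq.pdf", "faq.pdf", "handbook.pptx"],
        "SIMILARITY_SCORE": [0.82, 0.64, 0.41],
    })
    source_scores = (
        rows.groupby("FILENAME")["SIMILARITY_SCORE"].max().sort_values(ascending=False)
    )
    # faq.pdf          0.82
    # handbook.pptx    0.41
    # Only faq.pdf clears the 0.5 threshold, so it is cited alone.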
@@ -1129,11 +1115,14 @@ def load_chat_history(session_id: str):
                 }
             ]
         ),
-        table_name=CHAT_TBL,
+        table_name="CHAT_HISTORY",
+        database="AIFAQ_VERSION1_DB",
+        schema="APP_SCHEMA",
         auto_create_table=False,
         overwrite=False,
+        quote_identifiers=False,
     )
 except Exception:
-    pass  # Chat history persistence is optional
+    pass
 
-st.rerun()
+st.rerun()