Skip to content

Commit 8cb17c1

Browse files
committed
Fixed CI and updated the version1 code for CortexAI
Signed-off-by: Jayaram007 <jairam.kumar23@gmail.com>
1 parent b920df3 commit 8cb17c1

File tree

2 files changed

+34
-85
lines changed

2 files changed

+34
-85
lines changed

.github/workflows/CI.yml

Lines changed: 0 additions & 40 deletions
This file was deleted.

version_1/app.py

Lines changed: 34 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -291,12 +291,15 @@
291291

292292
# ------------- CONSTANTS -------------
293293
LLM_OPTIONS = ["mistral-7b", "snowflake-arctic", "mixtral-8x7b"]
294-
DOCS_TBL = "DOCUMENTS"
295-
CHUNKS_TBL = "CHUNKS"
296-
EMB_TBL = "EMBEDDINGS"
297-
CHAT_TBL = "CHAT_HISTORY"
298294

299-
# Supported file extensions - TXT ADDED
295+
# Fully qualified table names (DATABASE.SCHEMA.TABLE), built from DB_SCHEMA below
296+
DB_SCHEMA = "AIFAQ_VERSION1_DB.APP_SCHEMA"
297+
DOCS_TBL = f"{DB_SCHEMA}.DOCUMENTS"
298+
CHUNKS_TBL = f"{DB_SCHEMA}.CHUNKS"
299+
EMB_TBL = f"{DB_SCHEMA}.EMBEDDINGS"
300+
CHAT_TBL = f"{DB_SCHEMA}.CHAT_HISTORY"
301+
302+
# Supported file extensions
300303
SUPPORTED_EXTENSIONS = {
301304
"pdf": "PDF",
302305
"pptx": "PowerPoint",
@@ -350,10 +353,10 @@ def sanitize(txt: str) -> str:
350353

351354
# ------------- FULL EXTRACTION -------------
352355
def extract_pdf_full(f):
353-
"""Full PDF extraction"""
356+
"""Full PDF extraction using pypdf (Snowflake compatible)"""
354357
try:
355-
from PyPDF2 import PdfReader
356-
358+
from pypdf import PdfReader
359+
357360
reader = PdfReader(f)
358361
text_parts = []
359362
# Read ALL pages
@@ -412,7 +415,6 @@ def extract_excel_full(f):
412415
df = pd.read_excel(excel_file, sheet_name=sheet_name)
413416

414417
# Convert the entire DataFrame to string *without* truncation
415-
# This ensures every character from the Excel file is read
416418
with pd.option_context(
417419
"display.max_rows",
418420
None,
@@ -489,25 +491,21 @@ def store_document_fast(
489491
) -> bool:
490492
"""
491493
OPTIMIZED: Store document, chunks, and embeddings using server-side processing.
492-
NOW WITH IMMEDIATE COMMIT for instant knowledge base update.
493494
"""
494495
doc_id = str(uuid.uuid4())
495496
try:
496-
# 1. Insert document (single operation) with immediate commit
497+
# 1. Insert document
497498
session.sql(
498499
f"INSERT INTO {DOCS_TBL} (DOC_ID,FILENAME,FILE_TYPE,FILE_SIZE,IS_PUBLIC,UPLOADED_AT) "
499500
f"VALUES ('{sanitize(doc_id)}','{sanitize(filename)}','{src_type}',{len(content)},"
500501
f"{public},CURRENT_TIMESTAMP())"
501502
).collect()
502503

503-
# FORCE COMMIT - Critical for immediate visibility
504-
session.sql("COMMIT").collect()
505-
506504
# 2. Create and batch insert chunks
507505
chunks = chunk_text(content)
508506
if not chunks:
509507
st.warning(f"⚠️ {filename}: Document stored but no content to chunk")
510-
return True # Document stored, but no content to chunk
508+
return True
511509

512510
# Prepare chunk data
513511
chunk_data = []
@@ -528,15 +526,14 @@ def store_document_fast(
528526
chunks_df = pd.DataFrame(chunk_data)
529527
session.write_pandas(
530528
chunks_df,
531-
table_name=CHUNKS_TBL,
529+
table_name="CHUNKS",
530+
database="AIFAQ_VERSION1_DB",
531+
schema="APP_SCHEMA",
532532
auto_create_table=False,
533533
overwrite=False,
534534
quote_identifiers=False,
535535
)
536536

537-
# FORCE COMMIT after chunks
538-
session.sql("COMMIT").collect()
539-
540537
# 3. Generate and store embeddings
541538
session.sql(
542539
f"""
@@ -551,9 +548,6 @@ def store_document_fast(
551548
"""
552549
).collect()
553550

554-
# FORCE COMMIT after embeddings
555-
session.sql("COMMIT").collect()
556-
557551
# Verify the document was stored
558552
verify = session.sql(
559553
f"SELECT COUNT(*) as cnt FROM {DOCS_TBL} WHERE DOC_ID='{sanitize(doc_id)}'"
@@ -581,7 +575,6 @@ def store_document_fast(
581575
session.sql(
582576
f"DELETE FROM {DOCS_TBL} WHERE DOC_ID='{sanitize(doc_id)}'"
583577
).collect()
584-
session.sql("COMMIT").collect() # Commit cleanup
585578
except Exception as cleanup_e:
586579
st.error(f"❌ Cleanup failed for {filename}: {str(cleanup_e)}")
587580
return False
@@ -697,7 +690,7 @@ def load_chat_history(session_id: str):
697690
ext = file.name.split(".")[-1].lower()
698691
file_type = SUPPORTED_EXTENSIONS.get(ext, "TEXT")
699692

700-
# Extract content - this now reads 100% of all file types including TXT
693+
# Extract content
701694
content = extract_file_content(file, file_type)
702695

703696
if content:
@@ -707,12 +700,12 @@ def load_chat_history(session_id: str):
707700
st.success(f"✅ {file.name} uploaded successfully!")
708701
success_count += 1
709702

710-
# CRITICAL: Force immediate rerun after each successful upload
703+
# Force immediate rerun after each successful upload
711704
st.session_state.kb_refresh = True
712705
st.session_state.last_upload_time = time.time()
713706
progress_bar.progress((idx + 1) / len(files))
714-
time.sleep(0.3) # Brief pause to show success message
715-
st.rerun() # Immediate refresh to update knowledge base
707+
time.sleep(0.3)
708+
st.rerun()
716709
else:
717710
with status_container:
718711
st.error(f"❌ {file.name} failed to upload")
@@ -724,7 +717,6 @@ def load_chat_history(session_id: str):
724717

725718
progress_bar.progress((idx + 1) / len(files))
726719

727-
# This code only runs if all files processed without triggering rerun
728720
progress_bar.empty()
729721

730722
if success_count > 0:
@@ -748,7 +740,7 @@ def load_chat_history(session_id: str):
748740
st.success("✅ Knowledge base updated!")
749741
st.session_state.kb_refresh = False
750742

751-
# Force fresh database query - this ensures latest data
743+
# Force fresh database query
752744
df = get_user_docs()
753745

754746
if df.empty:
@@ -810,7 +802,6 @@ def load_chat_history(session_id: str):
810802
use_container_width=True,
811803
):
812804
st.session_state.current = c["SESSION_ID"]
813-
# Load chat history from database
814805
loaded_messages = load_chat_history(c["SESSION_ID"])
815806
st.session_state.sessions[c["SESSION_ID"]] = {
816807
"title": btn_label,
@@ -874,7 +865,7 @@ def load_chat_history(session_id: str):
874865

875866
messages = st.session_state.sessions[st.session_state.current]["messages"]
876867

877-
# Display messages - Beautiful modern design with enhanced styling
868+
# Display messages
878869
chat_container = st.container()
879870
with chat_container:
880871
if not messages:
@@ -957,9 +948,8 @@ def load_chat_history(session_id: str):
957948
unsafe_allow_html=True,
958949
)
959950

960-
# Input form - Beautiful modern design with enhanced styling
951+
# Input form
961952
with st.form("chat_form", clear_on_submit=True):
962-
# Enhanced input styling
963953
st.markdown(
964954
f"""
965955
<style>
@@ -995,7 +985,7 @@ def load_chat_history(session_id: str):
995985
key="chat_input",
996986
)
997987

998-
# Action buttons with enhanced design
988+
# Action buttons
999989
col1, col2, col3 = st.columns([6, 1.5, 1])
1000990
with col1:
1001991
st.markdown(
@@ -1057,7 +1047,7 @@ def load_chat_history(session_id: str):
10571047
)
10581048
context = "\n\n---\n\n".join(context_parts)
10591049

1060-
# Generate response using LLM with source tracking
1050+
# Generate response using LLM
10611051
prompt_llm = (
10621052
f"You are a helpful assistant that answers questions based on the provided knowledge base context. "
10631053
f"Answer the user's question using only the information from the provided knowledge base. "
@@ -1078,27 +1068,23 @@ def load_chat_history(session_id: str):
10781068
else "The LLM service didn't return a response."
10791069
)
10801070

1081-
# Cite only the PRIMARY source (highest similarity score)
1082-
# Group by filename and get the highest similarity for each document
1071+
# Cite sources
10831072
source_scores = (
10841073
rows.groupby("FILENAME")["SIMILARITY_SCORE"]
10851074
.max()
10861075
.sort_values(ascending=False)
10871076
)
10881077

1089-
# Get top source(s) - only cite documents with similarity > 0.5 (strong match)
10901078
top_sources = []
10911079
for filename, score in source_scores.items():
1092-
if score > 0.5: # Strong relevance threshold
1080+
if score > 0.5:
10931081
top_sources.append(filename)
1094-
if len(top_sources) >= 2: # Max 2 sources
1082+
if len(top_sources) >= 2:
10951083
break
10961084

1097-
# If no strong matches, just use the top result
10981085
if not top_sources:
10991086
top_sources = [source_scores.index[0]]
11001087

1101-
# Add source attribution - only the primary source(s) used
11021088
if len(top_sources) == 1:
11031089
sources_md = f"\n\n---\n**Source:** {top_sources[0]}"
11041090
else:
@@ -1129,11 +1115,14 @@ def load_chat_history(session_id: str):
11291115
}
11301116
]
11311117
),
1132-
table_name=CHAT_TBL,
1118+
table_name="CHAT_HISTORY",
1119+
database="AIFAQ_VERSION1_DB",
1120+
schema="APP_SCHEMA",
11331121
auto_create_table=False,
11341122
overwrite=False,
1123+
quote_identifiers=False,
11351124
)
11361125
except Exception:
1137-
pass # Chat history persistence is optional
1126+
pass
11381127

1139-
st.rerun()
1128+
st.rerun()

0 commit comments

Comments
 (0)