Commit 023d6bf

Fix minor Cortex AI issues; update app, SQL, and README
Signed-off-by: Jayaram007 <[email protected]>
1 parent bd7408a · commit 023d6bf

File tree: 3 files changed, +192 -87 lines changed

version_1/app.py

Lines changed: 53 additions & 44 deletions
@@ -291,10 +291,13 @@
 
 # ------------- CONSTANTS -------------
 LLM_OPTIONS = ["mistral-7b", "snowflake-arctic", "mixtral-8x7b"]
-DOCS_TBL = "DOCUMENTS"
-CHUNKS_TBL = "CHUNKS"
-EMB_TBL = "EMBEDDINGS"
-CHAT_TBL = "CHAT_HISTORY"
+
+# FULLY QUALIFIED TABLE NAMES - This is the key fix!
+DB_SCHEMA = "AIFAQ_VERSION1_DB.APP_SCHEMA"
+DOCS_TBL = f"{DB_SCHEMA}.DOCUMENTS"
+CHUNKS_TBL = f"{DB_SCHEMA}.CHUNKS"
+EMB_TBL = f"{DB_SCHEMA}.EMBEDDINGS"
+CHAT_TBL = f"{DB_SCHEMA}.CHAT_HISTORY"
 
 # Supported file extensions
 SUPPORTED_EXTENSIONS = {
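Editor's note: a minimal sketch (not part of the commit) of why the fully qualified names matter. A bare table name resolves against the session's current database and schema, so it breaks whenever the Streamlit session starts in a different context; `count_docs` is a hypothetical helper, assuming an active Snowpark `session`.

```python
# Illustrative sketch only; count_docs is not a function from app.py.
from snowflake.snowpark import Session

def count_docs(session: Session) -> int:
    db_schema = "AIFAQ_VERSION1_DB.APP_SCHEMA"   # schema name from the diff above
    docs_tbl = f"{db_schema}.DOCUMENTS"
    # "SELECT ... FROM DOCUMENTS" would resolve against the session's current
    # schema; the qualified name works regardless of USE DATABASE/SCHEMA state.
    return session.sql(f"SELECT COUNT(*) AS CNT FROM {docs_tbl}").collect()[0]["CNT"]
```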
@@ -303,6 +306,7 @@
     "csv": "CSV",
     "xlsx": "Excel",
     "xls": "Excel",
+    "txt": "TEXT",
 }
 
 # Performance tuning
@@ -349,10 +353,10 @@ def sanitize(txt: str) -> str:
 
 # ------------- FULL EXTRACTION -------------
 def extract_pdf_full(f):
-    """Full PDF extraction"""
+    """Full PDF extraction using pypdf (Snowflake compatible)"""
     try:
-        from PyPDF2 import PdfReader
-
+        from pypdf import PdfReader
+
         reader = PdfReader(f)
         text_parts = []
         # Read ALL pages
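For context, `pypdf` is the maintained successor to `PyPDF2` and keeps the same read API, which is why the swap above is a one-line change. A small stand-alone sketch of the pattern, assuming `pypdf` is installed:

```python
from io import BytesIO
from pypdf import PdfReader

def pdf_to_text(data: bytes) -> str:
    reader = PdfReader(BytesIO(data))  # accepts any binary file-like object
    # extract_text() can return None for pages without extractable text
    return "\n".join((page.extract_text() or "") for page in reader.pages)
```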
@@ -411,7 +415,6 @@ def extract_excel_full(f):
         df = pd.read_excel(excel_file, sheet_name=sheet_name)
 
         # Convert the entire DataFrame to string *without* truncation
-        # This ensures every character from the Excel file is read
         with pd.option_context(
             "display.max_rows",
             None,
@@ -430,6 +433,23 @@ def extract_excel_full(f):
         return ""
 
 
+def extract_txt_full(f):
+    """Full TXT file extraction"""
+    try:
+        # Reset file pointer and read as text
+        f.seek(0)
+        # Try UTF-8 first, fall back to latin-1 if that fails
+        try:
+            content = f.read().decode("utf-8")
+        except UnicodeDecodeError:
+            f.seek(0)
+            content = f.read().decode("latin-1")
+        return content
+    except Exception as e:
+        st.error(f"TXT extraction error: {e}")
+        return ""
+
+
 def extract_file_content(file, file_type):
     """Extract content based on file type - FULL versions"""
     if file_type.upper() == "PDF":
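A quick usage sketch of the encoding fallback in `extract_txt_full` above, using an in-memory file whose bytes are valid latin-1 but not valid UTF-8:

```python
from io import BytesIO

f = BytesIO("café".encode("latin-1"))  # b"caf\xe9" is not valid UTF-8
text = extract_txt_full(f)             # UTF-8 decode fails, latin-1 succeeds
assert text == "café"
```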
@@ -440,6 +460,8 @@ def extract_file_content(file, file_type):
         return extract_csv_full(file)
     elif file_type.upper() == "EXCEL":
         return extract_excel_full(file)
+    elif file_type.upper() == "TEXT":
+        return extract_txt_full(file)
     else:
         return ""
 
@@ -469,25 +491,21 @@ def store_document_fast(
 ) -> bool:
     """
     OPTIMIZED: Store document, chunks, and embeddings using server-side processing.
-    NOW WITH IMMEDIATE COMMIT for instant knowledge base update.
     """
     doc_id = str(uuid.uuid4())
     try:
-        # 1. Insert document (single operation) with immediate commit
+        # 1. Insert document
         session.sql(
             f"INSERT INTO {DOCS_TBL} (DOC_ID,FILENAME,FILE_TYPE,FILE_SIZE,IS_PUBLIC,UPLOADED_AT) "
             f"VALUES ('{sanitize(doc_id)}','{sanitize(filename)}','{src_type}',{len(content)},"
             f"{public},CURRENT_TIMESTAMP())"
         ).collect()
 
-        # FORCE COMMIT - Critical for immediate visibility
-        session.sql("COMMIT").collect()
-
         # 2. Create and batch insert chunks
         chunks = chunk_text(content)
         if not chunks:
             st.warning(f"⚠️ {filename}: Document stored but no content to chunk")
-            return True  # Document stored, but no content to chunk
+            return True
 
         # Prepare chunk data
         chunk_data = []
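The deleted `session.sql("COMMIT")` calls were redundant: Snowflake sessions run with AUTOCOMMIT enabled by default, so each DML statement commits as soon as it completes. An explicit transaction is only worth opening when several statements must succeed or fail together. A hedged sketch, not code from the commit:

```python
def run_atomically(session, statements: list[str]) -> None:
    """Hypothetical helper: execute statements as one all-or-nothing group."""
    session.sql("BEGIN").collect()
    try:
        for stmt in statements:
            session.sql(stmt).collect()
        session.sql("COMMIT").collect()   # single commit for the whole group
    except Exception:
        session.sql("ROLLBACK").collect()
        raise
```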
@@ -508,15 +526,14 @@ def store_document_fast(
         chunks_df = pd.DataFrame(chunk_data)
         session.write_pandas(
             chunks_df,
-            table_name=CHUNKS_TBL,
+            table_name="CHUNKS",
+            database="AIFAQ_VERSION1_DB",
+            schema="APP_SCHEMA",
             auto_create_table=False,
             overwrite=False,
             quote_identifiers=False,
         )
 
-        # FORCE COMMIT after chunks
-        session.sql("COMMIT").collect()
-
         # 3. Generate and store embeddings
         session.sql(
             f"""
@@ -531,9 +548,6 @@
             """
         ).collect()
 
-        # FORCE COMMIT after embeddings
-        session.sql("COMMIT").collect()
-
         # Verify the document was stored
         verify = session.sql(
             f"SELECT COUNT(*) as cnt FROM {DOCS_TBL} WHERE DOC_ID='{sanitize(doc_id)}'"
@@ -561,7 +575,6 @@
             session.sql(
                 f"DELETE FROM {DOCS_TBL} WHERE DOC_ID='{sanitize(doc_id)}'"
             ).collect()
-            session.sql("COMMIT").collect()  # Commit cleanup
         except Exception as cleanup_e:
             st.error(f"❌ Cleanup failed for {filename}: {str(cleanup_e)}")
     return False
@@ -677,7 +690,7 @@ def load_chat_history(session_id: str):
         ext = file.name.split(".")[-1].lower()
         file_type = SUPPORTED_EXTENSIONS.get(ext, "TEXT")
 
-        # Extract content - this now reads 100% of all file types
+        # Extract content
         content = extract_file_content(file, file_type)
 
         if content:
@@ -687,12 +700,12 @@ def load_chat_history(session_id: str):
                 st.success(f"✅ {file.name} uploaded successfully!")
             success_count += 1
 
-            # CRITICAL: Force immediate rerun after each successful upload
+            # Force immediate rerun after each successful upload
             st.session_state.kb_refresh = True
             st.session_state.last_upload_time = time.time()
             progress_bar.progress((idx + 1) / len(files))
-            time.sleep(0.3)  # Brief pause to show success message
-            st.rerun()  # Immediate refresh to update knowledge base
+            time.sleep(0.3)
+            st.rerun()
         else:
             with status_container:
                 st.error(f"❌ {file.name} failed to upload")
@@ -704,7 +717,6 @@ def load_chat_history(session_id: str):
 
         progress_bar.progress((idx + 1) / len(files))
 
-    # This code only runs if all files processed without triggering rerun
     progress_bar.empty()
 
     if success_count > 0:
@@ -728,7 +740,7 @@ def load_chat_history(session_id: str):
         st.success("✅ Knowledge base updated!")
         st.session_state.kb_refresh = False
 
-    # Force fresh database query - this ensures latest data
+    # Force fresh database query
     df = get_user_docs()
 
     if df.empty:
@@ -778,7 +790,7 @@ def load_chat_history(session_id: str):
     chats = session.sql(
         f"SELECT DISTINCT SESSION_ID, LEFT(QUERY_TEXT,40) TITLE, MAX(QUERY_TIMESTAMP) TS "
         f"FROM {CHAT_TBL} "
-        f"GROUP BY SESSION_ID, QUERY_TEXT ORDER BY TS DESC LIMIT 15"
+        f"GROUP BY SESSION_ID, QUERY_TEXT ORDER BY TS DESC"
     ).to_pandas()
 
     if not chats.empty:
@@ -790,7 +802,6 @@ def load_chat_history(session_id: str):
             use_container_width=True,
         ):
             st.session_state.current = c["SESSION_ID"]
-            # Load chat history from database
             loaded_messages = load_chat_history(c["SESSION_ID"])
             st.session_state.sessions[c["SESSION_ID"]] = {
                 "title": btn_label,
@@ -854,7 +865,7 @@ def load_chat_history(session_id: str):
 
 messages = st.session_state.sessions[st.session_state.current]["messages"]
 
-# Display messages - Beautiful modern design with enhanced styling
+# Display messages
 chat_container = st.container()
 with chat_container:
     if not messages:
@@ -937,9 +948,8 @@ def load_chat_history(session_id: str):
             unsafe_allow_html=True,
         )
 
-# Input form - Beautiful modern design with enhanced styling
+# Input form
 with st.form("chat_form", clear_on_submit=True):
-    # Enhanced input styling
     st.markdown(
         f"""
         <style>
@@ -975,7 +985,7 @@ def load_chat_history(session_id: str):
         key="chat_input",
     )
 
-    # Action buttons with enhanced design
+    # Action buttons
     col1, col2, col3 = st.columns([6, 1.5, 1])
    with col1:
         st.markdown(
@@ -1037,7 +1047,7 @@ def load_chat_history(session_id: str):
             )
             context = "\n\n---\n\n".join(context_parts)
 
-            # Generate response using LLM with source tracking
+            # Generate response using LLM
             prompt_llm = (
                 f"You are a helpful assistant that answers questions based on the provided knowledge base context. "
                 f"Answer the user's question using only the information from the provided knowledge base. "
@@ -1058,27 +1068,23 @@ def load_chat_history(session_id: str):
                 else "The LLM service didn't return a response."
             )
 
-            # Cite only the PRIMARY source (highest similarity score)
-            # Group by filename and get the highest similarity for each document
+            # Cite sources
             source_scores = (
                 rows.groupby("FILENAME")["SIMILARITY_SCORE"]
                 .max()
                 .sort_values(ascending=False)
             )
 
-            # Get top source(s) - only cite documents with similarity > 0.5 (strong match)
             top_sources = []
             for filename, score in source_scores.items():
-                if score > 0.5:  # Strong relevance threshold
+                if score > 0.5:
                     top_sources.append(filename)
-                    if len(top_sources) >= 2:  # Max 2 sources
+                    if len(top_sources) >= 2:
                         break
 
-            # If no strong matches, just use the top result
             if not top_sources:
                 top_sources = [source_scores.index[0]]
 
-            # Add source attribution - only the primary source(s) used
             if len(top_sources) == 1:
                 sources_md = f"\n\n---\n**Source:** {top_sources[0]}"
             else:
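A worked example (with made-up scores) of the citation logic above: take each file's best chunk score, sort descending, and cite up to two files that clear the 0.5 threshold:

```python
import pandas as pd

rows = pd.DataFrame({
    "FILENAME": ["faq.pdf", "faq.pdf", "notes.txt"],
    "SIMILARITY_SCORE": [0.82, 0.61, 0.47],
})
source_scores = (
    rows.groupby("FILENAME")["SIMILARITY_SCORE"].max().sort_values(ascending=False)
)
# faq.pdf -> 0.82 (cited), notes.txt -> 0.47 (below the 0.5 threshold)
top_sources = [name for name, score in source_scores.items() if score > 0.5][:2]
assert top_sources == ["faq.pdf"]
```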
@@ -1109,11 +1115,14 @@ def load_chat_history(session_id: str):
                         }
                     ]
                 ),
-                table_name=CHAT_TBL,
+                table_name="CHAT_HISTORY",
+                database="AIFAQ_VERSION1_DB",
+                schema="APP_SCHEMA",
                 auto_create_table=False,
                 overwrite=False,
+                quote_identifiers=False,
             )
         except Exception:
-            pass  # Chat history persistence is optional
+            pass
 
         st.rerun()

version_1/readme.md

Lines changed: 28 additions & 5 deletions
@@ -111,14 +111,37 @@ Snowflake will create a default file called: **streamlit_app.py**
 ## Step 6: Install Python Dependencies
 
 Snowflake Streamlit apps require **manual dependency installation**.
+There is **no automatic `requirements.txt` resolution**, so all libraries must be added explicitly.
 
-1. In the Streamlit app UI, locate the **Python Packages / Environment** section.
-2. Under the selected **Python version**, manually add all required libraries.
-3. Add them **one by one**, exactly as required by `app.py`.
+### Steps
 
-(There is no automatic `requirements.txt` resolution in Snowflake Streamlit.)
+1. In the Streamlit app UI, open **Python Packages / Environment**.
+2. Select the required **Python version**.
+3. Add the following libraries **one by one**:
 
-Once all dependencies are installed, Snowflake will reinitialize the app.
+### Required Python Libraries
+
+* `streamlit`
+* `snowflake-snowpark-python`
+* `pandas`
+* `pypdf`
+* `python-pptx`
+* `openpyxl`
+* `xlrd` *(required only for legacy `.xls` Excel files)*
+
+> ⚠️ **Do NOT add** built-in Python modules such as `uuid`, `re`, `datetime`, `io`, or `time`.
+
+4. After adding all libraries, **save** the environment.
+
+Snowflake will automatically **reinitialize the app** once the environment is updated.
+
+
+### Validation Checklist
+
+* PDF uploads work → `pypdf` installed
+* PowerPoint uploads work → `python-pptx` installed
+* Excel uploads work → `openpyxl` (and `xlrd` if `.xls`) installed
+* Snowflake Cortex / Snowpark works → `snowflake-snowpark-python` installed
 
 ---
 
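One way to run the validation checklist from inside the app itself: a hedged sketch you could paste into `streamlit_app.py`, not part of this commit (note that `python-pptx` imports as `pptx`):

```python
import importlib
import streamlit as st

for module in ["pandas", "pypdf", "pptx", "openpyxl", "snowflake.snowpark"]:
    try:
        importlib.import_module(module)
        st.write(f"✅ {module} is available")
    except ImportError:
        st.write(f"❌ {module} is missing - add it under Python Packages")
```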