diff --git a/version_1/readme.md b/version_1/readme.md
index 8d7d420..b5abdf2 100644
--- a/version_1/readme.md
+++ b/version_1/readme.md
@@ -111,14 +111,37 @@ Snowflake will create a default file called: **streamlit_app.py**
 ## Step 6: Install Python Dependencies
 
 Snowflake Streamlit apps require **manual dependency installation**.
+There is **no automatic `requirements.txt` resolution**, so all libraries must be added explicitly.
 
-1. In the Streamlit app UI, locate the **Python Packages / Environment** section.
-2. Under the selected **Python version**, manually add all required libraries.
-3. Add them **one by one**, exactly as required by `app.py`.
+### Steps
 
-(There is no automatic `requirements.txt` resolution in Snowflake Streamlit.)
+1. In the Streamlit app UI, open **Python Packages / Environment**.
+2. Select the required **Python version**.
+3. Add the libraries listed below **one by one**.
 
-Once all dependencies are installed, Snowflake will reinitialize the app.
+### Required Python Libraries
+
+* `streamlit`
+* `snowflake-snowpark-python`
+* `pandas`
+* `pypdf`
+* `python-pptx`
+* `openpyxl`
+* `xlrd` *(required only for legacy `.xls` Excel files)*
+
+> ⚠️ **Do NOT add** built-in Python modules such as `uuid`, `re`, `datetime`, `io`, or `time`.
+
+After adding all libraries, **save** the environment.
+
+Snowflake will automatically **reinitialize the app** once the environment is updated.
+
+
+### Validation Checklist
+
+* PDF uploads work → `pypdf` installed
+* PowerPoint uploads work → `python-pptx` installed
+* Excel uploads work → `openpyxl` (and `xlrd` if `.xls`) installed
+* Snowflake Cortex / Snowpark works → `snowflake-snowpark-python` installed
 
 ---
 
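One way to automate the validation checklist above is an import probe run once inside the app itself. A minimal sketch (the PyPI-name-to-module mapping is the standard one for these libraries; drop it into `streamlit_app.py` temporarily and remove it once everything resolves):

```python
# Minimal dependency probe for a Snowflake Streamlit environment.
import importlib

import streamlit as st

# PyPI package name -> importable module name
REQUIRED = {
    "streamlit": "streamlit",
    "snowflake-snowpark-python": "snowflake.snowpark",
    "pandas": "pandas",
    "pypdf": "pypdf",
    "python-pptx": "pptx",
    "openpyxl": "openpyxl",
    "xlrd": "xlrd",  # only needed for legacy .xls files
}

missing = []
for package, module in REQUIRED.items():
    try:
        importlib.import_module(module)
    except ImportError:
        missing.append(package)

if missing:
    st.error(f"Missing packages: {', '.join(missing)} — add them under Python Packages.")
else:
    st.success("All required packages are installed.")
```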
diff --git a/version_1/setup.sql b/version_1/setup.sql
index 9eeb521..f0fe987 100644
--- a/version_1/setup.sql
+++ b/version_1/setup.sql
@@ -1,53 +1,126 @@
--- Create database
-CREATE OR REPLACE DATABASE AI_FAQ_DB;
+-- ============================================================
+-- COMPLETE DATABASE SETUP FOR AI DOCUMENT ASSISTANT
+-- ============================================================
 
--- Create warehouse
-CREATE OR REPLACE WAREHOUSE AI_FAQ_WH
-    WAREHOUSE_SIZE = 'XSMALL'
-    WAREHOUSE_TYPE = 'STANDARD'
-    AUTO_SUSPEND = 300
-    AUTO_RESUME = TRUE;
+-- ============================================================
+-- STEP 1: CREATE DATABASE AND SCHEMA
+-- ============================================================
 
--- Use the new database and warehouse
-USE DATABASE AI_FAQ_DB;
-USE WAREHOUSE AI_FAQ_WH;
+-- Create database (skip if you already have one)
+CREATE DATABASE IF NOT EXISTS AIFAQ_VERSION1_DB;
 
--- Create DOCUMENTS table
+-- Use the database
+USE DATABASE AIFAQ_VERSION1_DB;
+
+-- Create schema
+CREATE SCHEMA IF NOT EXISTS APP_SCHEMA;
+
+-- Use the schema
+USE SCHEMA APP_SCHEMA;
+
+-- ============================================================
+-- STEP 2: CREATE WAREHOUSE (if you don't have one)
+-- ============================================================
+
+CREATE WAREHOUSE IF NOT EXISTS AIFAQ_WAREHOUSE
+    WAREHOUSE_SIZE = 'X-SMALL'
+    AUTO_SUSPEND = 60
+    AUTO_RESUME = TRUE
+    INITIALLY_SUSPENDED = FALSE;
+
+-- Use the warehouse
+USE WAREHOUSE AIFAQ_WAREHOUSE;
+
+-- ============================================================
+-- STEP 3: CREATE TABLES
+-- ============================================================
+
+-- 1. DOCUMENTS TABLE - Stores metadata about uploaded documents
 CREATE OR REPLACE TABLE DOCUMENTS (
-    DOC_ID VARCHAR(36) PRIMARY KEY,
-    FILENAME VARCHAR(255) NOT NULL,
-    FILE_TYPE VARCHAR(50),
-    FILE_SIZE NUMBER,
-    IS_PUBLIC BOOLEAN DEFAULT FALSE,
-    UPLOADED_AT TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP()
+    DOC_ID VARCHAR(255) NOT NULL PRIMARY KEY,
+    FILENAME VARCHAR(500) NOT NULL,
+    FILE_TYPE VARCHAR(50) NOT NULL,
+    FILE_SIZE INTEGER NOT NULL,
+    IS_PUBLIC BOOLEAN DEFAULT FALSE,
+    UPLOADED_AT TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()
 );
 
--- Create CHUNKS table
+-- 2. CHUNKS TABLE - Stores document text chunks for RAG
 CREATE OR REPLACE TABLE CHUNKS (
-    CHUNK_ID VARCHAR(50) PRIMARY KEY,
-    DOC_ID VARCHAR(36) NOT NULL,
-    CHUNK_INDEX NUMBER NOT NULL,
-    CHUNK_TEXT VARCHAR(10000) NOT NULL,
-    FOREIGN KEY (DOC_ID) REFERENCES DOCUMENTS(DOC_ID)
+    CHUNK_ID VARCHAR(255) NOT NULL PRIMARY KEY,
+    DOC_ID VARCHAR(255) NOT NULL,
+    CHUNK_INDEX INTEGER NOT NULL,
+    CHUNK_TEXT TEXT NOT NULL
 );
 
--- Create EMBEDDINGS table
+-- 3. EMBEDDINGS TABLE - Stores vector embeddings for semantic search
 CREATE OR REPLACE TABLE EMBEDDINGS (
-    CHUNK_ID VARCHAR(50) PRIMARY KEY,
-    EMBEDDING ARRAY,
-    FOREIGN KEY (CHUNK_ID) REFERENCES CHUNKS(CHUNK_ID)
+    CHUNK_ID VARCHAR(255) NOT NULL PRIMARY KEY,
+    EMBEDDING VECTOR(FLOAT, 768) NOT NULL
 );
 
--- Create CHAT_HISTORY table
+-- 4. CHAT_HISTORY TABLE - Stores conversation history
 CREATE OR REPLACE TABLE CHAT_HISTORY (
-    CHAT_ID VARCHAR(36) PRIMARY KEY,
-    SESSION_ID VARCHAR(36) NOT NULL,
-    QUERY_TEXT VARCHAR(10000) NOT NULL,
-    RESPONSE_TEXT VARCHAR(10000),
-    QUERY_TIMESTAMP TIMESTAMP_LTZ(9) DEFAULT CURRENT_TIMESTAMP()
+    CHAT_ID VARCHAR(255) NOT NULL PRIMARY KEY,
+    SESSION_ID VARCHAR(255) NOT NULL,
+    QUERY_TEXT TEXT NOT NULL,
+    RESPONSE_TEXT TEXT,
+    QUERY_TIMESTAMP TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()
 );
 
--- Grant privileges to SYSADMIN role
-GRANT ALL ON DATABASE AI_FAQ_DB TO ROLE SYSADMIN;
-GRANT ALL ON WAREHOUSE AI_FAQ_WH TO ROLE SYSADMIN;
-GRANT ALL ON ALL TABLES IN SCHEMA AI_FAQ_DB.PUBLIC TO ROLE SYSADMIN;
\ No newline at end of file
+-- ============================================================
+-- STEP 4: GRANT PERMISSIONS (Adjust role as needed)
+-- ============================================================
+
+-- Grant usage on database and schema
+GRANT USAGE ON DATABASE AIFAQ_VERSION1_DB TO ROLE PUBLIC;
+GRANT USAGE ON SCHEMA AIFAQ_VERSION1_DB.APP_SCHEMA TO ROLE PUBLIC;
+
+-- Grant permissions on tables
+GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA AIFAQ_VERSION1_DB.APP_SCHEMA TO ROLE PUBLIC;
+
+-- Grant permissions for future tables
+GRANT SELECT, INSERT, UPDATE, DELETE ON FUTURE TABLES IN SCHEMA AIFAQ_VERSION1_DB.APP_SCHEMA TO ROLE PUBLIC;
+
+-- Grant warehouse usage
+GRANT USAGE ON WAREHOUSE AIFAQ_WAREHOUSE TO ROLE PUBLIC;
+
+-- ============================================================
+-- STEP 5: VERIFY CORTEX FUNCTIONS ARE AVAILABLE
+-- ============================================================
+
+-- Test embedding function (should return a vector)
+SELECT SNOWFLAKE.CORTEX.EMBED_TEXT_768('snowflake-arctic-embed-m', 'Test text for embedding') AS test_embedding;
+
+-- Test LLM completion function
+SELECT SNOWFLAKE.CORTEX.COMPLETE('mistral-7b', 'Say hello in one word') AS test_completion;
+
+-- ============================================================
+-- STEP 6: VERIFY SETUP
+-- ============================================================
+
+-- Show all tables
+SHOW TABLES IN SCHEMA AIFAQ_VERSION1_DB.APP_SCHEMA;
+
+-- Describe each table
+DESCRIBE TABLE DOCUMENTS;
+DESCRIBE TABLE CHUNKS;
+DESCRIBE TABLE EMBEDDINGS;
+DESCRIBE TABLE CHAT_HISTORY;
+
+-- Show current context
+SELECT
+    CURRENT_DATABASE() AS current_database,
+    CURRENT_SCHEMA() AS current_schema,
+    CURRENT_WAREHOUSE() AS current_warehouse,
+    CURRENT_ROLE() AS current_role;
+
+-- ============================================================
+-- SETUP COMPLETE!
+-- ============================================================
+
+-- Summary of what was created:
+-- Database:  AIFAQ_VERSION1_DB
+-- Schema:    APP_SCHEMA
+-- Warehouse: AIFAQ_WAREHOUSE
+-- Tables:    DOCUMENTS, CHUNKS, EMBEDDINGS, CHAT_HISTORY
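The `VECTOR(FLOAT, 768)` column on EMBEDDINGS is what enables server-side semantic search. A retrieval sketch against the schema just created (assumes an active Snowpark session and populated CHUNKS/EMBEDDINGS tables; the question text is invented, and `VECTOR_COSINE_SIMILARITY` is Snowflake's built-in cosine function):

```python
# Sketch: top-k semantic retrieval over the version_1 tables.
from snowflake.snowpark.context import get_active_session

session = get_active_session()

question = "What is the refund policy?".replace("'", "''")  # escape single quotes

rows = session.sql(
    f"""
    SELECT c.CHUNK_TEXT,
           VECTOR_COSINE_SIMILARITY(
               e.EMBEDDING,
               SNOWFLAKE.CORTEX.EMBED_TEXT_768('snowflake-arctic-embed-m', '{question}')
           ) AS score
    FROM AIFAQ_VERSION1_DB.APP_SCHEMA.CHUNKS c
    JOIN AIFAQ_VERSION1_DB.APP_SCHEMA.EMBEDDINGS e
      ON c.CHUNK_ID = e.CHUNK_ID
    ORDER BY score DESC
    LIMIT 4
    """
).collect()

for row in rows:
    print(round(row["SCORE"], 3), row["CHUNK_TEXT"][:80])
```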
diff --git a/version_2/app_version2.py b/version_2/app_version2.py
index e69de29..c23b0b5 100644
--- a/version_2/app_version2.py
+++ b/version_2/app_version2.py
@@ -0,0 +1,2814 @@
+import streamlit as st
+from snowflake.snowpark.context import get_active_session
+import uuid
+import pandas as pd
+import re
+
+# ==========================================
+# PAGE CONFIG
+# ==========================================
+st.set_page_config(
+    layout="wide",
+    page_title="AIFAQ Pro",
+    page_icon="🏢",
+    initial_sidebar_state="expanded",
+)
+
+# ==========================================
+# ENHANCED UI STYLING (IMPROVED FOR ADMIN)
+# ==========================================
+st.markdown(
+    """
+
+""",
+    unsafe_allow_html=True,
+)
+
+# ==========================================
+# CONSTANTS
+# ==========================================
+DB_NAME = "AIFAQ_BETA_DB"
+SCHEMA_NAME = "APP_SCHEMA"
+MODEL_SMALL = "mistral-7b"
+MODEL_LARGE = "mixtral-8x7b"
+EMBED_MODEL = "snowflake-arctic-embed-m"  # SINGLE SOURCE OF TRUTH
+
+# ==========================================
+# ENHANCED CONSTANTS & CONFIGS
+# ==========================================
+RAG_CONFIG = {
+    "max_chunks": 6,
+    "similarity_threshold": 0.15,
+    "rerank_candidates": 25,
+    "max_chunk_text": 700,
+    "chunk_overlap": 100,
+    "line_numbers": True,
+}
+
+# Financial-specific RAG config
+FINANCIAL_RAG_CONFIG = {
+    "max_chunks": 10,  # More chunks for financial docs
+    "similarity_threshold": 0.12,  # Lower threshold for number matching
+    "rerank_candidates": 40,
+    "max_chunk_text": 500,  # Smaller chunks for precise numbers
+    "chunk_overlap": 150,
+    "line_numbers": True,
+    "number_boost_factor": 1.5,
+    "financial_keywords": [
+        "revenue",
+        "profit",
+        "loss",
+        "income",
+        "expense",
+        "cost",
+        "margin",
+        "balance",
+        "sheet",
+        "cash",
+        "flow",
+        "statement",
+        "equity",
+        "asset",
+        "liability",
+        "ebitda",
+        "eps",
+        "dividend",
+        "tax",
+        "depreciation",
+        "quarter",
+        "q1",
+        "q2",
+        "q3",
+        "q4",
+        "annual",
+        "fiscal",
+        "guidance",
+        "million",
+        "billion",
+        "percentage",
+        "growth",
+        "decline",
+        "increase",
+        "decrease",
+        "yoy",
+        "year over year",
+        "qtr",
+    ],
+}
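The two configs trade chunk size against recall: financial documents get smaller, more numerous chunks and a looser similarity threshold so exact figures survive retrieval. A toy illustration of the selection these configs feed (it mirrors the `keyword_count > 5` heuristic in `is_financial_document()` further down; the sample sentence is invented, and the two config dicts above are assumed in scope):

```python
# Toy illustration of how the two RAG configs above get selected.
sample = (
    "Q3 revenue grew 12% year over year; EBITDA margin improved "
    "while operating expenses declined."
).lower()

hits = sum(1 for kw in FINANCIAL_RAG_CONFIG["financial_keywords"] if kw in sample)
config = FINANCIAL_RAG_CONFIG if hits > 5 else RAG_CONFIG

print(
    f"{hits} keyword hits -> "
    f"{'financial' if config is FINANCIAL_RAG_CONFIG else 'standard'} config"
)
print("max_chunk_text:", config["max_chunk_text"], "| chunk_overlap:", config["chunk_overlap"])
```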
+
+
+# ==========================================
+# DATABASE SESSION
+# ==========================================
+@st.cache_resource
+def get_db_session():
+    return get_active_session()
+
+
+session = get_db_session()
+
+
+@st.cache_resource
+def init_schema():
+    tables = [
+        f"CREATE TABLE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.APP_USER_TEAMS (USERNAME VARCHAR, TEAM_NAME VARCHAR, UPDATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP())",
+        f"CREATE TABLE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING (ROLE_NAME VARCHAR, ALLOWED_CATEGORY VARCHAR, CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP())",
+        f"CREATE TABLE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS (USERNAME VARCHAR PRIMARY KEY, GRANTED_BY VARCHAR, GRANTED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP())",
+        f"CREATE TABLE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.TEAMS (TEAM_NAME VARCHAR PRIMARY KEY, DESCRIPTION VARCHAR, CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP())",
+        f"CREATE TABLE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.CHAT_SESSIONS (SESSION_ID VARCHAR PRIMARY KEY, USERNAME VARCHAR, TITLE VARCHAR, CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(), UPDATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP())",
+        f"CREATE TABLE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.CHAT_MESSAGES (MESSAGE_ID VARCHAR PRIMARY KEY, SESSION_ID VARCHAR, ROLE VARCHAR, CONTENT VARCHAR, SOURCES VARCHAR, CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP())",
+        f"CREATE TABLE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.DOCUMENTS (DOC_ID VARCHAR PRIMARY KEY, FILENAME VARCHAR, CATEGORY VARCHAR, UPLOADED_BY VARCHAR, UPLOAD_TS TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP())",
+        f"CREATE TABLE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.CHUNKS (CHUNK_ID VARCHAR PRIMARY KEY, DOC_ID VARCHAR, CHUNK_TEXT VARCHAR, CHUNK_VEC VECTOR(FLOAT, 768))",
+        f"CREATE TABLE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.APP_CATEGORIES (CATEGORY_NAME VARCHAR PRIMARY KEY, DESCRIPTION VARCHAR, CREATED_AT TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP())",
+    ]
+    for t in tables:
+        try:
+            session.sql(t).collect()
+        except:
+            pass
+    return True
+
+
+init_schema()
+
+
+# ==========================================
+# HELPERS
+# ==========================================
+def esc(t):
+    if t is None:
+        return ""
+    return str(t).replace("'", "''")
+
+
+def qry(sql, default=None):
+    try:
+        result = session.sql(sql).collect()
+        return result if result is not None else default
+    except Exception as e:
+        print(f"Query error: {e}")
+        return default
+
+
+def val(sql, default=None):
+    try:
+        r = qry(sql, None)
+        if r is None or not isinstance(r, (list, tuple)) or len(r) == 0:
+            return default
+        first_row = r[0]
+        if first_row is None:
+            return default
+        if hasattr(first_row, "__getitem__"):
+            try:
+                value = first_row[0]
+                return value if value is not None else default
+            except:
+                return default
+        return default
+    except Exception as e:
+        print(f"Value error: {e}")
+        return default
+
+
+def df_page(sql, pg=0, sz=50):
+    try:
+        return session.sql(f"{sql} LIMIT {sz} OFFSET {pg*sz}").to_pandas()
+    except Exception as e:
+        print(f"Dataframe error: {e}")
+        return None
+
+
+# ==========================================
+# USER FUNCTIONS
+# ==========================================
+def get_current_user():
+    r = val("SELECT CURRENT_USER()", "UNKNOWN")
+    return str(r).replace('"', "").upper() if r else "UNKNOWN"
+
+
+def check_is_admin(username):
+    if not username:
+        return False
+    result = qry(
+        f"SELECT 1 FROM {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS WHERE UPPER(USERNAME)='{esc(username.upper())}'"
+    )
+    if result and len(result) > 0:
+        return True
+    admin_count = val(f"SELECT COUNT(*) FROM {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS", 0)
+    if not admin_count or admin_count == 0:
+        try:
+            session.sql(
+                f"INSERT INTO {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS (USERNAME, GRANTED_BY) VALUES ('{esc(username.upper())}', 'SYSTEM_AUTO')"
+            ).collect()
+            return True
+        except:
+            return False
+    return False
+
+
+@st.cache_data(ttl=60)
+def get_team(_u):
+    if not _u:
+        return "GUEST"
+    r = val(
+        f"SELECT TEAM_NAME FROM {DB_NAME}.{SCHEMA_NAME}.APP_USER_TEAMS WHERE UPPER(USERNAME)='{esc(_u.upper())}' ORDER BY UPDATED_AT DESC LIMIT 1"
+    )
+    return str(r).upper() if r else "GUEST"
+
+
+@st.cache_data(ttl=60)
+def get_access(_t, _a):
+    if _a:
+        return "ALL"
+    if not _t or _t == 
"GUEST": + return "NONE" + result = val( + f"SELECT LISTAGG(DISTINCT ALLOWED_CATEGORY, ', ') FROM {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING WHERE UPPER(ROLE_NAME)='{esc(_t.upper())}'" + ) + return result if result else "NONE" + + +# ========================================== +# CATEGORY FUNCTIONS +# ========================================== +def get_all_categories(): + sql = f"SELECT DISTINCT CATEGORY_NAME FROM {DB_NAME}.{SCHEMA_NAME}.APP_CATEGORIES ORDER BY CATEGORY_NAME" + r = qry(sql, []) + if not r or not isinstance(r, list): + return [] + categories = [] + for row in r: + if row: + try: + cat_name = row[0] + if cat_name: + categories.append(str(cat_name)) + except: + continue + return sorted(categories) + + +def create_category_db(name): + if not name: + return False, "Category name is required" + try: + session.sql( + f"INSERT INTO {DB_NAME}.{SCHEMA_NAME}.APP_CATEGORIES (CATEGORY_NAME) VALUES ('{esc(name.upper())}')" + ).collect() + return True, f"Category '{name}' added" + except Exception as e: + return False, str(e) + + +def delete_category_db(name): + if not name: + return False, "Category name is required" + try: + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.APP_CATEGORIES WHERE UPPER(CATEGORY_NAME)='{esc(name.upper())}'" + ).collect() + return True, f"Category '{name}' deleted" + except Exception as e: + return False, str(e) + + +# ========================================== +# TEAM FUNCTIONS +# ========================================== +@st.cache_data(ttl=120) +def get_teams(): + sql = f"SELECT DISTINCT TEAM_NAME FROM (SELECT TEAM_NAME FROM {DB_NAME}.{SCHEMA_NAME}.TEAMS UNION SELECT TEAM_NAME FROM {DB_NAME}.{SCHEMA_NAME}.APP_USER_TEAMS WHERE TEAM_NAME!='GUEST' UNION SELECT ROLE_NAME FROM {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING) WHERE TEAM_NAME!='SUPER_ADMIN' AND TEAM_NAME IS NOT NULL" + r = qry(sql, []) + if not r or not isinstance(r, list): + return [] + teams = [] + for row in r: + if row: + try: + team_name = row[0] + if team_name: + teams.append(str(team_name)) + except: + continue + return sorted(teams) + + +@st.cache_data(ttl=30) +def get_categories(_t): + if not _t: + return [] + sql = f"SELECT ALLOWED_CATEGORY FROM {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING WHERE UPPER(ROLE_NAME)='{esc(_t.upper())}'" + r = qry(sql, []) + if not r or not isinstance(r, list): + return [] + categories = [] + for row in r: + if row: + try: + cat = row[0] + if cat: + categories.append(str(cat)) + except: + continue + return categories + + +# ========================================== +# DATA LOADERS +# ========================================== +def load_users(pg=0, sz=50): + sql = f"SELECT u.USERNAME, COALESCE(u.TEAM_NAME,'GUEST') as TEAM, CASE WHEN a.USERNAME IS NOT NULL THEN '✓' ELSE '' END as ADMIN, u.UPDATED_AT FROM {DB_NAME}.{SCHEMA_NAME}.APP_USER_TEAMS u LEFT JOIN {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS a ON UPPER(u.USERNAME)=UPPER(a.USERNAME) ORDER BY u.USERNAME" + return df_page(sql, pg, sz) + + +def load_admins(): + try: + return session.sql( + f"SELECT USERNAME, GRANTED_BY, GRANTED_AT FROM {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS ORDER BY USERNAME" + ).to_pandas() + except: + return pd.DataFrame(columns=["USERNAME", "GRANTED_BY", "GRANTED_AT"]) + + +def load_rules(pg=0, sz=50): + sql = f"SELECT ROLE_NAME as TEAM, ALLOWED_CATEGORY as CATEGORY, CREATED_AT FROM {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING ORDER BY ROLE_NAME, ALLOWED_CATEGORY" + return df_page(sql, pg, sz) + + +def load_docs(pg=0, sz=50): + sql = f"SELECT DOC_ID, FILENAME, CATEGORY, UPLOADED_BY, UPLOAD_TS FROM 
{DB_NAME}.{SCHEMA_NAME}.DOCUMENTS ORDER BY UPLOAD_TS DESC" + return df_page(sql, pg, sz) + + +def load_members(team, pg=0, sz=50): + if not team: + return None + sql = f"SELECT u.USERNAME, CASE WHEN a.USERNAME IS NOT NULL THEN '✓' ELSE '' END as ADMIN, u.UPDATED_AT as JOINED FROM {DB_NAME}.{SCHEMA_NAME}.APP_USER_TEAMS u LEFT JOIN {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS a ON UPPER(u.USERNAME)=UPPER(a.USERNAME) WHERE UPPER(u.TEAM_NAME)='{esc(team.upper())}' ORDER BY u.USERNAME" + return df_page(sql, pg, sz) + + +def count_tbl(tbl): + result = val(f"SELECT COUNT(*) FROM {DB_NAME}.{SCHEMA_NAME}.{tbl}", 0) + return result if result is not None else 0 + + +# ========================================== +# USER MANAGEMENT +# ========================================== +def create_user(username, password, team=None): + if not username: + return False, "Username is required" + u = "".join(c for c in username if c.isalnum() or c == "_").upper() + if not u or len(password) < 6: + return False, "Invalid username or password < 6 chars" + try: + session.sql("CREATE ROLE IF NOT EXISTS AIFAQ_VIEWER_ROLE").collect() + session.sql( + f"CREATE USER IF NOT EXISTS {u} PASSWORD='{esc(password)}' DEFAULT_ROLE=AIFAQ_VIEWER_ROLE MUST_CHANGE_PASSWORD=FALSE" + ).collect() + session.sql(f"GRANT ROLE AIFAQ_VIEWER_ROLE TO USER {u}").collect() + if team and team not in ("SUPER_ADMIN", "GUEST", ""): + assign_team(u, team) + get_teams.clear() + return True, f"User '{u}' created" + except Exception as e: + return False, str(e) + + +# ========================================== +# PASSWORD RESET FUNCTION +# ========================================== +def reset_user_password(username, new_password): + """Reset password for existing user""" + if not username or len(new_password) < 6: + return False, "Invalid username or password < 6 chars" + try: + # Use Snowflake's ALTER USER command + session.sql( + f"ALTER USER {username} SET PASSWORD = '{esc(new_password)}'" + ).collect() + return True, f"Password reset for user '{username}'" + except Exception as e: + return False, str(e) + + +def assign_team(username, team): + if not username or not team: + return False, "Username and team are required" + try: + sql = f"MERGE INTO {DB_NAME}.{SCHEMA_NAME}.APP_USER_TEAMS t USING (SELECT '{esc(username.upper())}' as U, '{esc(team.upper())}' as T) s ON UPPER(t.USERNAME)=s.U WHEN MATCHED THEN UPDATE SET TEAM_NAME=s.T, UPDATED_AT=CURRENT_TIMESTAMP() WHEN NOT MATCHED THEN INSERT (USERNAME, TEAM_NAME) VALUES (s.U, s.T)" + session.sql(sql).collect() + get_team.clear() + return True, f"Assigned '{username}' → '{team}'" + except Exception as e: + return False, str(e) + + +def bulk_assign(users, team): + if not users or not team: + return 0 + vals = ", ".join( + [f"('{esc(u.upper())}', '{esc(team.upper())}')" for u in users if u] + ) + if not vals: + return 0 + try: + sql = f"MERGE INTO {DB_NAME}.{SCHEMA_NAME}.APP_USER_TEAMS t USING (SELECT column1 as U, column2 as T FROM VALUES {vals}) s ON UPPER(t.USERNAME)=s.U WHEN MATCHED THEN UPDATE SET TEAM_NAME=s.T, UPDATED_AT=CURRENT_TIMESTAMP() WHEN NOT MATCHED THEN INSERT (USERNAME, TEAM_NAME) VALUES (s.U, s.T)" + session.sql(sql).collect() + get_team.clear() + return len(users) + except Exception as e: + print(f"Bulk assign error: {e}") + return 0 + + +def bulk_delete(users, sf=False): + if not users: + return 0 + lst = ", ".join([f"'{esc(u.upper())}'" for u in users if u]) + if not lst: + return 0 + try: + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.APP_USER_TEAMS WHERE UPPER(USERNAME) IN ({lst})" + 
).collect() + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS WHERE UPPER(USERNAME) IN ({lst})" + ).collect() + if sf: + for u in users: + if u: + session.sql(f"DROP USER IF EXISTS {u}").collect() + get_team.clear() + return len(users) + except Exception as e: + print(f"Bulk delete error: {e}") + return 0 + + +def create_team_db(name, desc=""): + if not name: + return False, "Team name is required" + try: + session.sql( + f"INSERT INTO {DB_NAME}.{SCHEMA_NAME}.TEAMS (TEAM_NAME, DESCRIPTION) VALUES ('{esc(name.upper())}', '{esc(desc)}')" + ).collect() + get_teams.clear() + return True, f"Team '{name}' created" + except Exception as e: + return False, str(e) + + +def delete_team_db(name): + if not name: + return False, "Team name is required" + n = esc(name.upper()) + try: + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.TEAMS WHERE UPPER(TEAM_NAME)='{n}'" + ).collect() + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING WHERE UPPER(ROLE_NAME)='{n}'" + ).collect() + session.sql( + f"UPDATE {DB_NAME}.{SCHEMA_NAME}.APP_USER_TEAMS SET TEAM_NAME='GUEST' WHERE UPPER(TEAM_NAME)='{n}'" + ).collect() + get_teams.clear() + get_team.clear() + return True, f"Team '{name}' deleted" + except Exception as e: + return False, str(e) + + +def add_admin_db(username, by): + if not username: + return False, "Username is required" + try: + session.sql( + f"INSERT INTO {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS (USERNAME, GRANTED_BY) VALUES ('{esc(username.upper())}', '{esc(by.upper() if by else 'SYSTEM')}')" + ).collect() + return True, f"'{username}' is now admin" + except Exception as e: + return False, str(e) + + +def remove_admin_db(username, curr): + if not username: + return False, "Username is required" + if curr and username.upper() == curr.upper(): + return False, "Cannot remove yourself" + try: + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.ADMIN_USERS WHERE UPPER(USERNAME)='{esc(username.upper())}'" + ).collect() + return True, f"Removed admin: '{username}'" + except Exception as e: + return False, str(e) + + +def grant_cat(team, cat): + if not team or not cat: + return False, "Team and category are required" + try: + sql = f"INSERT INTO {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING (ROLE_NAME, ALLOWED_CATEGORY) SELECT '{esc(team.upper())}', '{esc(cat.upper())}' WHERE NOT EXISTS (SELECT 1 FROM {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING WHERE UPPER(ROLE_NAME)='{esc(team.upper())}' AND UPPER(ALLOWED_CATEGORY)='{esc(cat.upper())}')" + session.sql(sql).collect() + get_access.clear() + return True, f"Granted '{cat}' → '{team}'" + except Exception as e: + return False, str(e) + + +def revoke_cat(team, cat): + if not team or not cat: + return False, "Team and category are required" + try: + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING WHERE UPPER(ROLE_NAME)='{esc(team.upper())}' AND UPPER(ALLOWED_CATEGORY)='{esc(cat.upper())}'" + ).collect() + get_access.clear() + return True, "Revoked" + except Exception as e: + return False, str(e) + + +def bulk_grant(team, cats): + if not cats or not team: + return 0 + vals = ", ".join( + [f"('{esc(team.upper())}', '{esc(c.upper())}')" for c in cats if c] + ) + if not vals: + return 0 + try: + sql = f"INSERT INTO {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING (ROLE_NAME, ALLOWED_CATEGORY) SELECT column1, column2 FROM VALUES {vals} WHERE NOT EXISTS (SELECT 1 FROM {DB_NAME}.{SCHEMA_NAME}.ROLE_ACCESS_MAPPING WHERE UPPER(ROLE_NAME)=column1 AND UPPER(ALLOWED_CATEGORY)=column2)" + session.sql(sql).collect() + 
get_access.clear()
+        return len(cats)
+    except Exception as e:
+        print(f"Bulk grant error: {e}")
+        return 0
+
+
+def bulk_revoke(rules):
+    """rules: list of tuples (team, category)"""
+    if not rules:
+        return 0
+    try:
+        for team, category in rules:
+            revoke_cat(team, category)
+        return len(rules)
+    except Exception as e:
+        print(f"Bulk revoke error: {e}")
+        return 0
+
+
+# ==========================================
+# ENHANCED DOCUMENT INGESTION
+# ==========================================
+def ingest_doc(file, cat):
+    """Enhanced ingestion with line numbers and financial-aware chunking"""
+    if not file or not cat:
+        return False, "File and category are required"
+
+    try:
+        name = "".join(c for c in file.name if c.isalnum() or c in "._-")
+        doc_id = str(uuid.uuid4())
+
+        # Ensure the upload stage exists (it is not created in init_schema)
+        session.sql(
+            f"CREATE STAGE IF NOT EXISTS {DB_NAME}.{SCHEMA_NAME}.DOC_STAGE"
+        ).collect()
+
+        # Upload to stage
+        session.file.put_stream(
+            file,
+            f"@{DB_NAME}.{SCHEMA_NAME}.DOC_STAGE/{name}",
+            auto_compress=False,
+            overwrite=True,
+        )
+
+        # Parse document
+        parse_sql = f"""
+        SELECT SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
+            '@{DB_NAME}.{SCHEMA_NAME}.DOC_STAGE',
+            '{name}'
+        ):content::STRING as content
+        """
+        content_result = session.sql(parse_sql).collect()
+
+        if not content_result or len(content_result) == 0:
+            return False, "Failed to parse document"
+
+        full_content = content_result[0]["CONTENT"]
+
+        # Enhanced chunking with line numbers
+        chunks = create_enhanced_chunks(full_content, doc_id, cat)
+
+        # Insert document record
+        session.sql(
+            f"""
+            INSERT INTO {DB_NAME}.{SCHEMA_NAME}.DOCUMENTS
+            (DOC_ID, FILENAME, CATEGORY, UPLOADED_BY)
+            VALUES ('{doc_id}', '{esc(name)}', '{esc(cat.upper())}', CURRENT_USER())
+            """
+        ).collect()
+
+        return True, f"Document indexed with {len(chunks)} chunks"
+
+    except Exception as e:
+        return False, str(e)
+
+
+def create_enhanced_chunks(content, doc_id, category):
+    """Create enhanced chunks with line numbers and financial document awareness"""
+    if not content:
+        return []
+
+    lines = content.split("\n")
+
+    # Determine if this is a financial document
+    is_financial = is_financial_document(content, category)
+    config = FINANCIAL_RAG_CONFIG if is_financial else RAG_CONFIG
+
+    # Enhanced chunking strategy
+    if is_financial:
+        # For financial docs, preserve table structures and statements
+        chunks = chunk_financial_document(lines, doc_id, config)
+    else:
+        # Standard chunking with line numbers
+        chunks = chunk_standard_document(lines, doc_id, config)
+
+    return chunks
+
+
+def is_financial_document(content, category):
+    """Detect if document contains financial content"""
+    content_lower = content.lower()
+    category_lower = category.lower()
+
+    # Check category first
+    if any(
+        kw in category_lower
+        for kw in ["finance", "financial", "accounting", "audit", "fiscal"]
+    ):
+        return True
+
+    # Check content for financial keywords
+    financial_keywords = FINANCIAL_RAG_CONFIG["financial_keywords"]
+    keyword_count = sum(1 for kw in financial_keywords if kw in content_lower)
+
+    # If more than 5 financial keywords, treat as financial doc
+    return keyword_count > 5
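Both chunkers below carry a tail of the previous buffer into the next chunk: `chunk_overlap` is a character budget, converted to a line count at an assumed ~50 characters per line. A standalone toy run of that reset arithmetic:

```python
# Toy run of the overlap reset used by both chunkers below.
chunk_overlap = 150  # the FINANCIAL_RAG_CONFIG value
current_chunk = [f"[L{n:04d}] line {n}" for n in range(1, 11)]  # 10 buffered lines

overlap_start = max(0, len(current_chunk) - chunk_overlap // 50)
carried = current_chunk[overlap_start:]

print(f"{len(carried)} lines carried into the next chunk: {carried}")
# 150 // 50 = 3, so the last three numbered lines seed the next chunk
```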
+def chunk_financial_document(lines, doc_id, config):
+    """Special chunking for financial documents"""
+    chunks = []
+    current_chunk = []
+    current_lines = []
+    chunk_size = 0
+    chunk_id = 0
+
+    for idx, line in enumerate(lines, 1):
+        line_len = len(line)
+
+        # Add line with number prefix
+        numbered_line = f"[L{idx:04d}] {line}"
+        current_chunk.append(numbered_line)
+        current_lines.append(idx)
+        chunk_size += line_len
+
+        # Check if we should chunk here (financial statement boundaries)
+        if should_chunk_financial_line(line, chunk_size, config["max_chunk_text"]):
+            if current_chunk:
+                chunk_text = "\n".join(current_chunk)
+                # Add metadata about line range
+                chunk_text = f"=== LINES {current_lines[0]}-{current_lines[-1]} ===\n{chunk_text}"
+
+                create_chunk_record(doc_id, chunk_text, chunk_id)
+                chunks.append(chunk_text)
+
+                # Reset with overlap
+                overlap_start = max(
+                    0, len(current_chunk) - config["chunk_overlap"] // 50
+                )
+                current_chunk = current_chunk[overlap_start:]
+                current_lines = current_lines[overlap_start:]
+                chunk_size = sum(len(l) for l in current_chunk)
+                chunk_id += 1
+
+    # Add remaining chunk
+    if current_chunk:
+        chunk_text = "\n".join(current_chunk)
+        chunk_text = (
+            f"=== LINES {current_lines[0]}-{current_lines[-1]} ===\n{chunk_text}"
+        )
+        create_chunk_record(doc_id, chunk_text, chunk_id)
+        chunks.append(chunk_text)
+
+    return chunks
+
+
+def chunk_standard_document(lines, doc_id, config):
+    """Standard chunking with line numbers"""
+    chunks = []
+    current_chunk = []
+    current_lines = []
+    chunk_size = 0
+    chunk_id = 0
+
+    for idx, line in enumerate(lines, 1):
+        line_len = len(line)
+
+        # Add line with number prefix
+        numbered_line = f"[L{idx:04d}] {line}"
+        current_chunk.append(numbered_line)
+        current_lines.append(idx)
+        chunk_size += line_len
+
+        # Simple size-based chunking
+        if chunk_size >= config["max_chunk_text"]:
+            if current_chunk:
+                chunk_text = "\n".join(current_chunk)
+                chunk_text = f"=== LINES {current_lines[0]}-{current_lines[-1]} ===\n{chunk_text}"
+
+                create_chunk_record(doc_id, chunk_text, chunk_id)
+                chunks.append(chunk_text)
+
+                # Reset with overlap
+                overlap_start = max(
+                    0, len(current_chunk) - config["chunk_overlap"] // 50
+                )
+                current_chunk = current_chunk[overlap_start:]
+                current_lines = current_lines[overlap_start:]
+                chunk_size = sum(len(l) for l in current_chunk)
+                chunk_id += 1
+
+    # Add remaining chunk
+    if current_chunk:
+        chunk_text = "\n".join(current_chunk)
+        chunk_text = (
+            f"=== LINES {current_lines[0]}-{current_lines[-1]} ===\n{chunk_text}"
+        )
+        create_chunk_record(doc_id, chunk_text, chunk_id)
+        chunks.append(chunk_text)
+
+    return chunks
+
+
+def should_chunk_financial_line(line, current_size, max_size):
+    """Determine if we should chunk at this line (financial doc logic)"""
+    if current_size >= max_size:
+        return True
+
+    line_upper = line.strip().upper()
+
+    # Chunk at financial statement boundaries
+    if any(
+        boundary in line_upper
+        for boundary in [
+            "BALANCE SHEET",
+            "INCOME STATEMENT",
+            "CASH FLOW",
+            "STATEMENT OF",
+            "QUARTER ENDED",
+            "YEAR ENDED",
+            "ASSETS",
+            "LIABILITIES",
+            "EQUITY",
+            "REVENUE",
+        ]
+    ):
+        return True
+
+    # Chunk at empty lines after tables
+    if line.strip() == "" and current_size > max_size * 0.7:
+        return True
+
+    return False
+
+
+def create_chunk_record(doc_id, chunk_text, chunk_id):
+    """✅ FIXED: Insert chunk into database using SELECT (not VALUES) with single EMBED_MODEL"""
+    chunk_uuid = str(uuid.uuid4())
+
+    # ✅ Use global EMBED_MODEL constant and SELECT syntax to fix SQL compilation error
+    session.sql(
+        f"""
+        INSERT INTO {DB_NAME}.{SCHEMA_NAME}.CHUNKS
+        (CHUNK_ID, DOC_ID, CHUNK_TEXT, CHUNK_VEC)
+        SELECT
+            '{chunk_uuid}',
+            '{doc_id}',
+            '{esc(chunk_text)}',
+            SNOWFLAKE.CORTEX.EMBED_TEXT_768('{EMBED_MODEL}', '{esc(chunk_text)}')
+        """
+    ).collect()
+
+    return chunk_uuid
+
+
+def delete_docs(ids):
+    if not ids:
+        return 0
+    lst = ", ".join([f"'{esc(d)}'" for d in ids if d])
+    if not lst:
+        return 0
+    try:
+        # Delete from chunks first (foreign key relationship)
+        
session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.CHUNKS WHERE DOC_ID IN ({lst})" + ).collect() + # Then delete from documents + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.DOCUMENTS WHERE DOC_ID IN ({lst})" + ).collect() + return len(ids) + except Exception as e: + print(f"Document delete error: {e}") + return 0 + + +# ========================================== +# FINANCIAL QUERY DETECTION +# ========================================== +def is_financial_query(query): + """Detect if query is financial in nature""" + if not query: + return False + + query_lower = query.lower() + financial_keywords = FINANCIAL_RAG_CONFIG["financial_keywords"] + + # Count financial keyword matches + keyword_matches = sum(1 for kw in financial_keywords if kw in query_lower) + + # Check for numbers/metrics + has_numbers = bool( + re.search(r"\d+(\.\d+)?\s*(million|billion|m|b|%|percent)?", query_lower) + ) + + # Check for financial questions + is_question = any( + phrase in query_lower + for phrase in [ + "how much", + "in quarter", + "revenue", + "profit", + "loss", + "income", + "tax rate", + ] + ) + + # Query is financial if it has multiple keywords or numbers + financial terms + return ( + (keyword_matches >= 2) or (has_numbers and keyword_matches >= 1) or is_question + ) + + +# ========================================== +# CHAT FUNCTIONS +# ========================================== +def new_chat(user, title="New Chat"): + if not user: + return None + sid = str(uuid.uuid4()) + try: + session.sql( + f"INSERT INTO {DB_NAME}.{SCHEMA_NAME}.CHAT_SESSIONS (SESSION_ID, USERNAME, TITLE) VALUES ('{sid}', '{esc(user.upper())}', '{esc(title)}')" + ).collect() + return sid + except Exception as e: + print(f"New chat error: {e}") + return None + + +def save_msg(sid, role, content, sources=None): + if not sid or not role: + return False + mid = str(uuid.uuid4()) + src = ",".join(sources) if sources else "" + content_safe = content if content else "" + try: + session.sql( + f"INSERT INTO {DB_NAME}.{SCHEMA_NAME}.CHAT_MESSAGES (MESSAGE_ID, SESSION_ID, ROLE, CONTENT, SOURCES) VALUES ('{mid}', '{esc(sid)}', '{esc(role)}', '{esc(content_safe)}', '{esc(src)}')" + ).collect() + session.sql( + f"UPDATE {DB_NAME}.{SCHEMA_NAME}.CHAT_SESSIONS SET UPDATED_AT=CURRENT_TIMESTAMP() WHERE SESSION_ID='{esc(sid)}'" + ).collect() + return True + except Exception as e: + print(f"Save message error: {e}") + return False + + +def get_chats(user, pg=0, sz=15): + if not user: + return pd.DataFrame(columns=["SESSION_ID", "TITLE", "UPDATED_AT"]) + sql = f"SELECT SESSION_ID, TITLE, UPDATED_AT FROM {DB_NAME}.{SCHEMA_NAME}.CHAT_SESSIONS WHERE UPPER(USERNAME)='{esc(user.upper())}' ORDER BY UPDATED_AT DESC" + return df_page(sql, pg, sz) + + +def get_msgs(sid): + if not sid: + return pd.DataFrame(columns=["ROLE", "CONTENT"]) + try: + return session.sql( + f"SELECT ROLE, CONTENT FROM {DB_NAME}.{SCHEMA_NAME}.CHAT_MESSAGES WHERE SESSION_ID='{esc(sid)}' ORDER BY CREATED_AT" + ).to_pandas() + except: + return pd.DataFrame(columns=["ROLE", "CONTENT"]) + + +def del_chat(sid): + if not sid: + return False + try: + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.CHAT_MESSAGES WHERE SESSION_ID='{esc(sid)}'" + ).collect() + session.sql( + f"DELETE FROM {DB_NAME}.{SCHEMA_NAME}.CHAT_SESSIONS WHERE SESSION_ID='{esc(sid)}'" + ).collect() + return True + except Exception as e: + print(f"Delete chat error: {e}") + return False + + +def load_chat(sid): + if not sid: + return + st.session_state.sid = sid + df = get_msgs(sid) + if df is not None and not 
df.empty:
+        st.session_state.msgs = [
+            {"role": r["ROLE"], "content": r["CONTENT"]} for _, r in df.iterrows()
+        ]
+    else:
+        st.session_state.msgs = []
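Together, these helpers form a small persistence API for chat sessions. A usage sketch (the Snowpark session, tables, and helpers above must be in scope; titles and message text are invented):

```python
# Sketch of the chat persistence flow using the helpers above.
user = get_current_user()

sid = new_chat(user, title="Quarterly results Q&A")
if sid:
    save_msg(sid, "user", "What was Q3 revenue?")
    save_msg(sid, "assistant", "Q3 revenue was ...", sources=["report_q3.pdf"])

    load_chat(sid)  # hydrates st.session_state.sid and st.session_state.msgs
    for m in st.session_state.msgs:
        print(m["role"], "->", m["content"][:60])
```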
+
+
+# ==========================================
+# MARKDOWN TO HTML CONVERTER (ENHANCED)
+# ==========================================
+def md_to_html(text):
+    """Convert markdown to clean, well-formatted HTML"""
+    if not text:
+        return ""
+
+    text = str(text)
+
+    # Code blocks first (before other processing)
+    def replace_code_block(match):
+        code = match.group(1).strip()
+        code = code.replace("<", "&lt;").replace(">", "&gt;")
+        return f"<pre><code>{code}</code></pre>"
+
+    text = re.sub(r"```(?:\w+)?\n?(.*?)```", replace_code_block, text, flags=re.DOTALL)
+
+    # Inline code
+    text = re.sub(r"`([^`]+)`", r"<code>\1</code>", text)
+
+    # Headers with better spacing
+    text = re.sub(r"^### (.+)$", r"<h3>\1</h3>", text, flags=re.MULTILINE)
+    text = re.sub(r"^## (.+)$", r"<h2>\1</h2>", text, flags=re.MULTILINE)
+    text = re.sub(r"^# (.+)$", r"<h1>\1</h1>", text, flags=re.MULTILINE)
+
+    # Bold and italic
+    text = re.sub(r"\*\*\*(.+?)\*\*\*", r"<strong><em>\1</em></strong>", text)
+    text = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", text)
+    text = re.sub(r"\*(.+?)\*", r"<em>\1</em>", text)
+
+    # Lists with proper structure
+    lines = text.split("\n")
+    result = []
+    in_list = False
+    list_type = None
+
+    for line in lines:
+        stripped = line.strip()
+
+        # Handle unordered lists
+        if stripped.startswith("- ") or stripped.startswith("* "):
+            if not in_list or list_type != "ul":
+                if in_list:
+                    result.append(f"</{list_type}>")
+                result.append("