|
| 1 | +import streamlit as st |
| 2 | +import os |
| 3 | +from utils import load_yaml_file_with_db_prompts |
| 4 | +from dotenv import load_dotenv, find_dotenv |
| 5 | +from langchain_community.vectorstores import FAISS |
| 6 | +from langchain_mistralai.embeddings import MistralAIEmbeddings |
| 7 | +from menu import menu_with_redirect |
| 8 | + |
# Render the shared navigation menu and redirect unauthenticated users.
menu_with_redirect()

# Only admin users may view this page. Using .get() instead of attribute
# access avoids an AttributeError when "user_type" has not been set in the
# session yet (e.g. a direct deep link before login) — None simply fails
# the membership test and the user is bounced with a warning.
if st.session_state.get("user_type") not in ["admin"]:
    st.warning("You do not have permission to view this page.")
    st.stop()

st.markdown("# Vector DB Content (FAISS)")
| 17 | + |
# Load the app configuration and the environment variables from .env.
config_data = load_yaml_file_with_db_prompts("config.yaml")
load_dotenv(find_dotenv())

# Select the embeddings backend according to the configured LLM provider.
# NOTE(review): this must match the embeddings model used when the FAISS
# index was built, otherwise similarity lookups are meaningless — confirm.
if config_data["llm_provider"] == "mistral":
    embeddings = MistralAIEmbeddings(
        model=config_data["embedding_model"],
        mistral_api_key=os.getenv("MISTRALAI_API_KEY"),
    )
else:
    # Imported lazily so the OpenAI package is only needed when selected.
    from langchain_openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
| 33 | + |
# Load the persisted FAISS vector store from disk.
# allow_dangerous_deserialization=True is required because FAISS persists
# its docstore via pickle; this is only acceptable because the index is
# produced locally by this app, never taken from untrusted sources.
persist_dir = config_data["persist_directory"]
try:
    vectordb = FAISS.load_local(
        persist_dir,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    st.success(f"Loaded FAISS content from '{persist_dir}'")
except Exception as e:
    st.error(f"Could not load FAISS content: {e}")
    st.stop()

# Pull every stored document out of the in-memory docstore.
# NOTE(review): `docstore._dict` is a private LangChain attribute with no
# stability guarantee — re-verify after LangChain upgrades. Materialising
# the whole list may be slow for large indexes.
try:
    all_docs = list(vectordb.docstore._dict.values())
except Exception as e:
    st.error(f"Could not access documents in FAISS index: {e}")
    st.stop()
# Free-text filter box; an empty string matches every document.
search_term = st.text_input("Search in vector DB content:", "")
def doc_matches(doc, term):
    """Return True when every whitespace-separated word of *term* occurs
    (case-insensitively) in the document's page content or in any of its
    metadata values. An empty or blank search term matches every document.
    """
    if not term:
        return True
    words = [w.strip().lower() for w in term.split() if w.strip()]
    text = getattr(doc, 'page_content', '').lower()
    meta_values = [str(v).lower() for v in getattr(doc, 'metadata', {}).values()]
    # AND semantics: every word must be found somewhere in the document.
    return all(
        w in text or any(w in mv for mv in meta_values)
        for w in words
    )
| 67 | + |
# Apply the search filter to the full document list.
filtered_docs = [doc for doc in all_docs if doc_matches(doc, search_term)]

st.markdown(f"### Showing {len(filtered_docs)} / {len(all_docs)} documents")

# Render each matching document: metadata, a 500-char snippet, and the full
# text on demand. BUG FIX: the original placed an expander inside another
# expander, which Streamlit forbids (raises StreamlitAPIException
# "Expanders may not be nested"); a keyed per-document checkbox is used
# instead to reveal the full content.
for i, doc in enumerate(filtered_docs):
    with st.expander(f"Document {i+1}"):
        st.write("**Metadata:**", doc.metadata)
        snippet = doc.page_content[:500] + ("..." if len(doc.page_content) > 500 else "")
        st.write("**Content snippet:**")
        st.code(snippet)
        # Unique key per document so widget state does not collide.
        if st.checkbox("Show full content", key=f"show_full_{i}"):
            st.write(doc.page_content)
0 commit comments