ssciwr
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 9 additions & 1 deletion b/‎.github/workflows/ci.yml‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 8 additions & 4 deletions b/‎README.md‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎parzivai/app.py‎
Lines changed: 44 additions & 19 deletions b/‎parzivai/app.py‎
Lines changed: 44 additions & 19 deletions
diff --git a/‎parzivai/image_search.py‎
Lines changed: 8 additions & 14 deletions b/‎parzivai/image_search.py‎
Lines changed: 8 additions & 14 deletions
diff --git a/‎parzivai/input_output.py‎
Lines changed: 12 additions & 16 deletions b/‎parzivai/input_output.py‎
Lines changed: 12 additions & 16 deletions
@@ -26,7 +26,13 @@ jobs:
     steps:
     - name: Checkout repository
       uses: actions/checkout@v4
-
+      with:
+        path: .
+    - name: Checkout the mhg spacy model repo
+      uses: actions/checkout@v4
+      with:
+        repository: Middle-High-German-Conceptual-Database/Spacy-Model-for-Middle-High-German
+        path: mhg
     - name: Set up Python ${{ matrix.python }}
       uses: actions/setup-python@v5
       with:
@@ -37,6 +43,8 @@ jobs:
         python -m pip install -r requirements-dev.txt
         cd parzivai
         python -m pytest -svv --cov=. --cov-branch --cov-report=xml
+      env:
+        SPACY_MHG_MODEL_PATH: ${{ github.workspace }}/mhg
     - name: Upload coverage reports to Codecov
       uses: codecov/codecov-action@v5
       with:
 
@@ -27,13 +27,17 @@ parzivAI makes use of [spaCy](https://spacy.io/) under the hood. Download the
 ```bash
 python -m spacy download de_core_news_sm
 ```
-(*TODO: Download models on the fly if not found through the spacy cli*)
+Note that if the model is not found, it is downloaded on the fly through the spaCy CLI.
 
-For Middle High German, a specially trained model must be loaded, and its path needs to be integrated into the code. The model can be found [here](https://github.com/Middle-High-German-Conceptual-Database/Spacy-Model-for-Middle-High-German). Git clone the repository and place it in the same folder as the parzivAI repo:
+For Middle High German, a specially trained model must be loaded, and its path needs to be integrated into the code. The model can be found [here](https://github.com/Middle-High-German-Conceptual-Database/Spacy-Model-for-Middle-High-German). Clone the repository with Git and either set an environment variable that points to the model path:
 ```
-you-folder/
+export SPACY_MHG_MODEL_PATH=/path/to/Spacy-Model-for-Middle-High-German-repo
+```
+or place it in the same folder as the parzivAI repo:
+```
+your-folder/
 │
-├── parzivai                                 # parzivai
+├── parzivai                                 # parzivai repo
 ├── Spacy-Model-for-Middle-High-German       # spaCy model
 ```
 (*TODO: Make sure this is platform-agnostic and can also be done on-the-fly*)
 
@@ -11,13 +11,13 @@
 from langchain.schema import Document
 from langchain_community.tools.tavily_search import TavilySearchResults
 from langchain_core.messages import HumanMessage, AIMessage
-
-st.set_page_config(page_title="ParzivAI")
 from parzivai.input_output import get_vectorstore, load_embeddings_model
-from parzivai.image_search import display_images
+from parzivai.image_search import fetch_images_for_topic
 from parzivai.text_tagging import (
     check_attributes,
     POS_DESCRIPTIONS,
+    load_modern_model,
+    load_mhg_model,
     pos_tagging_mhg,
     pos_tagging_modern,
 )
@@ -30,10 +30,26 @@
     SIMPLE_INQUIRIES,
 )
 
+# Page configuration (must be first Streamlit command)
+st.set_page_config(page_title="ParzivAI")
+
 # avoid some torch incompatibility issues with newer Python versions
 # see https://github.com/SaiAkhil066/DeepSeek-RAG-Chatbot/issues/4
 torch.classes.__path__ = []
 
+
+# Keep the Streamlit caching wrappers for the retriever and spaCy models here in the app module, which makes unit testing easier
+@st.cache_resource
+def get_cached_retriever():
+    """Build the document retriever, cached by Streamlit via ``st.cache_resource``.
+
+    Loads the embeddings model with ``load_embeddings_model`` and passes it to
+    ``get_vectorstore``, returning whatever ``get_vectorstore`` returns.
+    """
+    embedding_model = load_embeddings_model()
+    return get_vectorstore(embedding_model)
+
+
+@st.cache_resource
+def get_models():
+    """Load both spaCy pipelines, cached by Streamlit via ``st.cache_resource``.
+
+    Returns:
+        A 2-tuple ``(modern, mhg)`` holding the results of
+        ``load_modern_model()`` and ``load_mhg_model()``, in that order.
+    """
+    return load_modern_model(), load_mhg_model()
+
+
 # Set API keys
 load_dotenv()  # TODO create a .env file in the root directory with TAVILY_API_KEY and delete initialization of TAVILY_API_KEY below
 if not os.getenv("TAVILY_API_KEY"):
@@ -47,7 +63,6 @@
 PKG = resources.files("parzivai")
 FILE_PATH = PKG / "data"
 AVATAR_IMAGE = str(FILE_PATH / "parzival.png")
-retriever = get_vectorstore()
 llm = instantiate_llm()
 EMOJI_MAP = {
     "Vectorstore": "📚",
@@ -79,7 +94,7 @@ def append_to_rendered_messages(role, content):
 
 
 @st.cache_data(ttl=3600)
-def retrieve(question) -> dict:
+def retrieve(question, retriever) -> dict:
+    """Fetch documents for ``question`` by calling ``retriever.invoke``.
+
+    Returns:
+        A dict with the keys ``"documents"`` (the retriever result) and
+        ``"question"`` (the input, passed through unchanged).
+    """
+    # NOTE(review): st.cache_data hashes every argument whose name does not
+    # start with an underscore, so ``retriever`` is now part of the cache key.
+    # Confirm that Streamlit can hash the retriever object; if not, the
+    # parameter would need an underscore prefix to be excluded from hashing.
     documents = retriever.invoke(question)
     return {"documents": documents, "question": question}
 
@@ -150,8 +165,8 @@ def web_search(question):
     }
 
 
-def decide_route(question):
-    documents = retrieve(question)["documents"]
+def decide_route(question, retriever):
+    documents = retrieve(question, retriever)["documents"]
     print("Documents retrieved from Vectorstore:")
     for doc in documents:
         print(doc if isinstance(doc, str) else doc.page_content)
@@ -208,7 +223,7 @@ def save_chat_history_and_messages(role: str, message: str):
     save_chat_to_history(role, message)
 
 
-def process_user_input(user_input):
+def process_user_input(user_input, retriever):
     save_chat_history_and_messages("User", user_input)
     st.session_state.state["question"] = user_input
     st.session_state.state["messages"] = st.session_state.messages
@@ -222,7 +237,7 @@ def process_user_input(user_input):
     elif contains_any(user_input, SIMPLE_INQUIRIES["simple_inquiries"]):
         handle_direct_response(user_input)
     else:
-        handle_routing_and_answer(user_input)
+        handle_routing_and_answer(user_input, retriever)
 
 
 def is_translation_request(text: str) -> bool:
@@ -266,8 +281,8 @@ def handle_direct_response(user_input: str):
     save_chat_history_and_messages("Assistant", response.content)
 
 
-def handle_routing_and_answer(user_input: str):
-    routing_info = decide_route(user_input)
+def handle_routing_and_answer(user_input: str, retriever):
+    routing_info = decide_route(user_input, retriever)
     st.session_state.state.update(routing_info)
 
     if routing_info["route_taken"] == "Vectorstore":
@@ -324,26 +339,25 @@ def build_final_response_message(route: str, result: dict) -> str:
     return message
 
 
-def show_pos_tagging_options(latest_response: str):
+def show_pos_tagging_options(latest_response: str, nlp_modern, nlp_mhg):
+    """Render two POS-tagging buttons for ``latest_response``.
+
+    The left button runs ``pos_tagging_modern`` with ``nlp_modern``; the right
+    button runs ``pos_tagging_mhg`` with ``nlp_mhg``. When the tagger returns a
+    truthy result, it is stored in ``st.session_state.linguistic_analysis`` as
+    a ``(label, doc)`` tuple and ``st.rerun()`` is called.
+
+    Args:
+        latest_response: Text to be tagged.
+        nlp_modern: Model object forwarded to ``pos_tagging_modern``.
+        nlp_mhg: Model object forwarded to ``pos_tagging_mhg``.
+    """
     st.markdown("### POS-Tagging Options")
     col1, col2 = st.columns(2)
     with col1:
         if st.button("POS-Tagging (Modernes Deutsch)"):
-            doc = pos_tagging_modern(latest_response)
+            doc = pos_tagging_modern(nlp_modern, latest_response)
             if doc:
                 st.session_state.linguistic_analysis = ("Modernes Deutsch", doc)
                 st.rerun()
     with col2:
         if st.button("POS-Tagging (Mittelhochdeutsch)"):
-            doc = pos_tagging_mhg(latest_response)
+            doc = pos_tagging_mhg(nlp_mhg, latest_response)
             if doc:
                 st.session_state.linguistic_analysis = ("Mittelhochdeutsch", doc)
-                st.experimental_update()
+                st.rerun()
 
 
 def main():
     # Main function to run the Streamlit app
-    # Page configuration (must be first Streamlit command)
     tab1, tab2, tab3, tab4 = st.tabs(
         [
             "ParzivAI Chatbot",
@@ -372,10 +386,12 @@ def main():
     st.sidebar.image(AVATAR_IMAGE, width=150)
     # function to initialize all session state variables
     initialize_session_state()
+    retriever = get_cached_retriever()
+    nlp_modern, nlp_mhg = get_models()
 
     user_input = st.chat_input("Ask ParzivAI a question:")
     if user_input:
-        process_user_input(user_input)
+        process_user_input(user_input, retriever)
 
     with st.sidebar.expander("Cached Data"):
         st.write("Embeddings:")
@@ -413,7 +429,7 @@ def main():
         None,
     )
     if assistant_response:
-        show_pos_tagging_options(assistant_response)
+        show_pos_tagging_options(assistant_response, nlp_modern, nlp_mhg)
 
     # Feedback collection
     feedback = streamlit_feedback(
@@ -479,7 +495,16 @@ def main():
 
         if "image_search_result" in st.session_state:
             st.write("Searching for images...")
-            asyncio.run(display_images(st.session_state.image_search_result))
+            image_data = asyncio.run(
+                fetch_images_for_topic(st.session_state.image_search_result)
+            )
+
+            for data in image_data:
+                st.image(
+                    data["url"],
+                    caption=f"Bildthema: {data['name']}, Archivnummer: {data['archiveNumber']}, URL: {data['url']}",
+                    use_container_width=True,
+                )
 
     with tab3:
         st.header("Linguistische Analyse")
 
@@ -1,4 +1,3 @@
-import streamlit as st
 from urllib.parse import quote
 import json
 from playwright.async_api import async_playwright
@@ -16,11 +15,12 @@
             "image_search_url"
         )  # it would be defined in the config file
 except FileNotFoundError:
-    st.error(f"Configuration file not found at {CONFIG_PATH}. Please ensure it exists.")
-    raise
+    raise RuntimeError(
+        f"Configuration file not found at {CONFIG_PATH}. Please ensure it exists."
+    )
+
 except json.JSONDecodeError as e:
-    st.error(f"Error decoding configuration file: {e}")
-    raise
+    raise RuntimeError(f"Error decoding configuration file: {e}") from e
 
 
 def adjust_image_url(url: str) -> str:
@@ -80,12 +80,6 @@ async def fetch_images(topic: str):
         return image_data
 
 
-async def display_images(topic: str):
-    """Display fetched images in Streamlit."""
-    image_data = await fetch_images(topic)
-    for data in image_data:
-        st.image(
-            data["url"],
-            caption=f"Bildthema: {data['name']}, Archivnummer: {data['archiveNumber']}, URL: {data['url']}",
-            use_container_width=True,
-        )
+async def fetch_images_for_topic(topic: str) -> list[dict]:
+    """Return image metadata for a given topic.
+
+    Awaits ``fetch_images(topic)`` and returns its result unchanged.
+    The app iterates over the result and reads the keys ``"url"``, ``"name"``
+    and ``"archiveNumber"`` from each entry.
+    """
+    return await fetch_images(topic)
@@ -1,7 +1,7 @@
 import os
 import json
+import warnings
 from importlib import resources
-import streamlit as st
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import (
@@ -27,14 +27,11 @@ def load_config(file):
         with open(FILE_PATH / file, "r") as file:
             return json.load(file)
     except FileNotFoundError:
-        st.error(f"Configuration file not found: {FILE_PATH / file}")
-        return {}
+        raise RuntimeError(f"Configuration file not found: {FILE_PATH / file}")
     except json.JSONDecodeError as e:
-        st.error(f"Error decoding configuration file: {e}")
-        return {}
+        raise RuntimeError(f"Error decoding configuration file: {e}")
 
 
-@st.cache_resource
 def load_embeddings_model():
     model_name_hf = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
     model_kwargs_hf = {"device": "cpu"}
@@ -46,7 +43,7 @@ def load_embeddings_model():
     )
 
 
-def load_documents_and_create_vectorstore():
+def load_documents_and_create_vectorstore(embedding_model):
     """Load documents from URLs and static files to create FAISS vector store."""
     # Load URLs
     urls_data = load_config(file="urls.json")
@@ -74,8 +71,9 @@ def load_documents_and_create_vectorstore():
             else:
                 continue
             static_docs.extend(loader.load())
-        except Exception as e:
-            print(f"Error loading file {file_name}: {e}")
+        except (IOError, ValueError) as e:
+            warnings.warn(f"Problem loading '{file_name}': {e}", UserWarning)
+            continue
 
     # Combine and process documents
     all_docs = web_docs + static_docs
@@ -86,28 +84,26 @@ def load_documents_and_create_vectorstore():
     print("Documents loaded and split successfully.")
 
     # Create and save FAISS vector store
-    vectorstore = FAISS.from_documents(doc_splits, load_embeddings_model())
+    vectorstore = FAISS.from_documents(doc_splits, embedding_model)
     vectorstore.save_local(persist_folder)
     print(f"FAISS index initialized and saved successfully in {persist_folder}.")
     return vectorstore
 
 
-def get_vectorstore():
+def get_vectorstore(embedding_model):
+    """Return a retriever backed by the FAISS vector store.
+
+    If ``index_path`` exists, the store is loaded from ``persist_folder`` using
+    ``embedding_model``; otherwise it is built with
+    ``load_documents_and_create_vectorstore(embedding_model)``.
+
+    Args:
+        embedding_model: Embeddings object passed to FAISS for loading or
+            building the store.
+
+    Returns:
+        The result of ``vectorstore.as_retriever()``.
+
+    Raises:
+        RuntimeError: If loading the existing FAISS index fails; the original
+            exception is chained as the cause.
+    """
     vectorstore_exists = os.path.exists(index_path)
     if vectorstore_exists:
         try:
+            # NOTE(review): allow_dangerous_deserialization=True opts in to
+            # loading pickled data; presumably safe only while persist_folder
+            # holds files written by this application -- confirm.
             vectorstore = FAISS.load_local(
                 persist_folder,
-                load_embeddings_model(),
+                embedding_model,
                 allow_dangerous_deserialization=True,
             )
             print(f"FAISS index loaded successfully from {persist_folder}.")
         except Exception as e:
-            print(f"Error loading existing FAISS index: {e}")
-            st.error(f"Error loading existing FAISS index: {e}")
-            raise e
+            raise RuntimeError(f"Error loading existing FAISS index: {e}") from e
     else:
-        vectorstore = load_documents_and_create_vectorstore()
+        vectorstore = load_documents_and_create_vectorstore(embedding_model)
 
     retriever = vectorstore.as_retriever()
     return retriever