Skip to content

Commit ae8f11a

Browse files
Merge pull request #136 from monarch-initiative/make_custom_app_targets
Add support for paperqa (and update app_alz.py to use it)
2 parents 525b7cb + 515d22b commit ae8f11a

File tree

11 files changed

+2494
-1645
lines changed

11 files changed

+2494
-1645
lines changed

.codespellrc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[codespell]
2+
ignore-words-list = aadd
3+
skip = tests/db

.github/workflows/qc.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ubuntu-latest
1212
strategy:
1313
matrix:
14-
python-version: [ "3.9", "3.10", "3.11" ]
14+
python-version: [ "3.11" ]
1515

1616
steps:
1717
- uses: actions/checkout@v3.0.2
@@ -24,7 +24,7 @@ jobs:
2424
- name: Install Poetry
2525
uses: snok/install-poetry@v1.3.1
2626
- name: Install dependencies
27-
run: poetry install --no-interaction
27+
run: poetry install --no-interaction --extras "paperqa"
2828

2929
- name: Check common spelling errors
3030
run: poetry run tox -e codespell

poetry.lock

Lines changed: 2078 additions & 1591 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ license = "BSD-3"
77
readme = "README.md"
88

99
[tool.poetry.dependencies]
10-
python = "^3.9, !=3.9.7"
10+
python = "^3.11"
1111
click = "^8.1.7"
1212
importlib-metadata = ">=6"
1313
oaklib = "^0.6.9"
@@ -50,10 +50,12 @@ click-default-group = "^1.2.4"
5050
venomx = "^0.1.1"
5151
duckdb = "~1.0.0"
5252
python-dotenv = "^1.0.1"
53+
langchain-community = {version = "*", optional = true}
5354
onnxruntime = [
5455
{version = "<=1.19.2", python = "<3.10"},
5556
{version = "^1.20.0", python = ">=3.10"}
5657
]
58+
paper-qa = {version = "^5.20.0", optional = true, python = ">=3.11"}
5759

5860
[tool.poetry.group.dev.dependencies]
5961
pytest = ">=7.1.2"
@@ -88,14 +90,18 @@ docs = [
8890
"sphinx-autodoc-typehints",
8991
"sphinx-click",
9092
"myst-parser"
91-
]
93+
]
9294
bioc = [
9395
"bioc"
9496
]
9597
gpt4all = [
9698
"gpt4all",
9799
"llm-gpt4all"
98100
]
101+
paperqa = [
102+
"paper-qa",
103+
"langchain-community"
104+
]
99105

100106
[tool.poetry-dynamic-versioning]
101107
enable = false
@@ -148,4 +154,3 @@ quiet-level = 3
148154
[build-system]
149155
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
150156
build-backend = "poetry_dynamic_versioning.backend"
151-

src/curategpt/agents/chat_agent.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,107 @@ def chat(
158158
uncited_references=uncited_references_dict,
159159
conversation_id=conversation_id,
160160
)
161+
162+
163+
@dataclass
class ChatAgentAlz(BaseAgent):
    """
    An agent that allows chat to a knowledge source.

    This implements a standard knowledgebase retrieval augmented generation pattern.
    The knowledge_source is queried for relevant objects (the source can be a local
    database or a remote source such as pubmed).
    The objects are provided as context to a LLM query
    """

    relevance_factor: float = 0.5
    """Relevance factor for diversifying search results using MMR."""

    # Optional identifier of an ongoing conversation; populated per-chat below.
    conversation_id: Optional[str] = None

    def chat(
        self,
        query: str,
        conversation: Optional[Any] = None,
        limit: int = 10,
        collection: Optional[str] = None,
        expand: bool = True,
        **kwargs,
    ) -> ChatResponse:
        """
        Answer a natural-language query using retrieval-augmented generation.

        Retrieves up to ``limit`` objects from the knowledge source, embeds them
        as numbered references in an Alzheimer's-focused prompt, and asks the
        extractor's LLM to answer with bracketed citations (e.g. ``[1]``).

        :param query: the user's question.
        :param conversation: optional conversation object; when provided, the
            prompt is sent through it (its ``model`` is set and its ``id`` is
            returned as ``conversation_id``).
        :param limit: maximum number of knowledge-base results to retrieve.
        :param collection: collection to search; defaults to
            ``self.knowledge_source_collection``.
        :param expand: whether to perform query expansion in the search.
        :param kwargs: passed through to the knowledge source's ``search``.
        :return: a :class:`ChatResponse` with the raw and formatted answer,
            the prompt used, and cited/uncited reference dictionaries.
        :raises ValueError: if no extractor is available, or if the prompt is
            still too long for the model after all results have been dropped.
        """
        # Fall back to the wrapper's own extractor when none was configured.
        if self.extractor is None:
            if isinstance(self.knowledge_source, BaseWrapper):
                self.extractor = self.knowledge_source.extractor
            else:
                raise ValueError("Extractor must be set.")

        logger.info(f"Chat: {query} on {self.knowledge_source} with limit: {limit}")
        if collection is None:
            collection = self.knowledge_source_collection
        kwargs["collection"] = collection

        # The search now returns dictionary results directly.
        kb_results = list(self.knowledge_source.search(
            query, relevance_factor=self.relevance_factor, limit=limit, expand=expand, **kwargs
        ))

        # Rebuild the prompt, dropping the least relevant result each pass,
        # until it fits within the model's token budget (with a 300-token
        # margin reserved for the response).
        while True:
            references = {}
            texts = []
            for i, result_tuple in enumerate(kb_results, start=1):
                # Extract the object from the standard tuple format (obj, distance, metadata)
                obj, _, _ = result_tuple

                # Serialize only truthy fields to keep the reference compact.
                obj_text = yaml.dump({k: v for k, v in obj.items() if v}, sort_keys=False)
                references[str(i)] = obj_text
                texts.append(f"## Reference {i}\n{obj_text}")

            model = self.extractor.model
            prompt = (
                "You are a specialized AI assistant for biomedical researchers and clinicians focused on "
                "Alzheimer's disease and related topics. I will provide relevant background information, then ask "
                "a question. Use this context to provide evidence-based answers with proper scientific citations.\n"
            )
            prompt += "---\nBackground facts:\n" + "\n".join(texts) + "\n\n"
            prompt += (
                "I will ask a question and you will answer as best as possible, citing the references above.\n"
                "Write references in square brackets, e.g. [1]. For any additional facts without a citation, write [?].\n"
            )
            prompt += f"---\nHere is the Question: {query}.\n"
            logger.debug(f"Candidate Prompt: {prompt}")
            estimated_length = estimate_num_tokens([prompt])
            logger.debug(f"Max tokens {model.model_id}: {max_tokens_by_model(model.model_id)}")

            if estimated_length + 300 < max_tokens_by_model(model.model_id):
                break
            else:
                logger.debug("Prompt too long, removing least relevant result.")
                # No results left to drop: the fixed prompt text alone exceeds the budget.
                if not kb_results:
                    raise ValueError(f"Prompt too long: {prompt}.")
                kb_results.pop()

        logger.info("Final prompt constructed for chat.")
        if conversation:
            conversation.model = model
            agent = conversation
            conversation_id = conversation.id
            logger.info(f"Using conversation context with ID: {conversation_id}")
        else:
            agent = model
            conversation_id = None

        response = agent.prompt(prompt, system="You are a scientist assistant.")
        response_text = response.text()
        # Citations appear as [N] or [?]; partition references into cited vs uncited.
        pattern = r"\[(\d+|\?)\]"
        used_references = re.findall(pattern, response_text)
        used_references_dict = {ref: references.get(ref, "NO REFERENCE") for ref in used_references}
        uncited_references_dict = {ref: ref_obj for ref, ref_obj in references.items() if ref not in used_references}
        formatted_text = replace_references_with_links(response_text)

        return ChatResponse(
            body=response_text,
            formatted_body=formatted_text,
            prompt=prompt,
            references=used_references_dict,
            uncited_references=uncited_references_dict,
            conversation_id=conversation_id,
        )

src/curategpt/app/app_alz.py

Lines changed: 84 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,24 @@
22

33
import json
44
import logging
5+
import os
56
from typing import List, Union
67

78
import streamlit as st
89
import yaml
910

1011
from curategpt import BasicExtractor
11-
from curategpt.agents.chat_agent import ChatAgent, ChatResponse
12+
from curategpt.agents.chat_agent import ChatAgentAlz, ChatResponse
1213
from curategpt.agents.evidence_agent import EvidenceAgent
13-
from curategpt.app.helper import get_applicable_examples
1414
from curategpt.app.state import get_state
1515
from curategpt.wrappers import BaseWrapper
1616
from curategpt.wrappers.literature import WikipediaWrapper
1717
from curategpt.wrappers.literature.pubmed_wrapper import PubmedWrapper
18+
from curategpt.wrappers.paperqa.paperqawrapper import PaperQAWrapper
1819

19-
PUBMED = "PubMed (via API)"
20-
WIKIPEDIA = "Wikipedia (via API)"
21-
# Removed JGI and ESS-Dive
22-
# JGI = "JGI (via API)"
23-
# ESSDIVE = "ESS-DeepDive (via API)"
20+
PUBMED = "PubMed"
21+
WIKIPEDIA = "Wikipedia"
22+
PAPERQA = "Alzheimers_Papers"
2423

2524
CHAT = "Chat"
2625
SEARCH = "Search"
@@ -57,15 +56,22 @@
5756
cart = state.cart


st.title("Alzheimer's AI Assistant")

# Warn when the PaperQA index location is not configured. The Alzheimer's
# Papers option is always offered in the sidebar, so the only condition that
# matters is whether PQA_HOME is set. (The previous guard
# `PAPERQA in [PUBMED, PAPERQA, WIKIPEDIA]` was a tautology — always True —
# so this simplification does not change behavior.)
if os.environ.get("PQA_HOME") is None:
    st.warning(
        "PQA_HOME environment variable is not set. To use the Alzheimer's Papers collection, "
        "you need to set PQA_HOME to the directory containing your indexed papers. "
        "Use 'curategpt paperqa index /path/to/papers' to create an index."
    )
if not db.list_collection_names():
    st.warning("No collections found. Please use command line to load one.")

# Pages offered in the sidebar navigation (Search was removed from this app).
PAGES = [
    CHAT,
    CITESEEK
]
7076

7177

@@ -92,12 +98,13 @@ def filtered_collection_names() -> List[str]:
9298

9399
collection = st.sidebar.selectbox(
94100
"Choose collection",
95-
[PUBMED, WIKIPEDIA] + filtered_collection_names(), # Removed JGI and ESSDIVE, put PubMed first
96-
index=0, # Set PubMed as default
101+
[PUBMED, PAPERQA, WIKIPEDIA] + filtered_collection_names() + ["No collection"],
102+
index=0, # Set PUBMED as default (index 0 since it's first in the list)
97103
help="""
98104
A collection is a knowledge base. It could be anything, but
99105
it's likely your instance has some bio-ontologies pre-loaded.
100-
Select 'About' to see details of each collection
106+
Select 'Alzheimer's Papers (via PaperQA)' for direct access to a trusted corpus of Alzheimer's research papers.
107+
Select 'No collection' to interact with the model directly without a knowledge base.
101108
""",
102109
)
103110

@@ -118,11 +125,12 @@ def filtered_collection_names() -> List[str]:
118125
# Add background_collection for CiteSeek functionality
119126
background_collection = st.sidebar.selectbox(
120127
"Background knowledge for CiteSeek",
121-
[NO_BACKGROUND_SELECTED, PUBMED, WIKIPEDIA],
128+
[NO_BACKGROUND_SELECTED, PUBMED, PAPERQA, WIKIPEDIA],
122129
index=1, # Set PubMed as default
123130
help="""
124131
Background databases provide evidence sources for CiteSeek.
125132
PubMed is recommended for verifying medical claims.
133+
Alzheimer's Papers provides specialized knowledge from trusted Alzheimer's research papers.
126134
""",
127135
)
128136

@@ -131,25 +139,43 @@ def filtered_collection_names() -> List[str]:
131139
st.sidebar.markdown("Developed by the Monarch Initiative")
132140

133141

134-
def get_chat_agent() -> Union[ChatAgentAlz, BaseWrapper]:
    """
    Build the chat agent appropriate for the currently selected collection.

    "No collection" yields a bare agent (direct model access); the named
    literature sources get their dedicated wrapper; anything else is treated
    as a collection in the local store.
    """
    # Direct-model mode: no knowledge source is attached at all.
    if collection == "No collection":
        return ChatAgentAlz(extractor=extractor)

    # Map each special source to a factory; unlisted collections use the local db.
    wrapper_factories = {
        PUBMED: lambda: PubmedWrapper(local_store=db, extractor=extractor),
        WIKIPEDIA: lambda: WikipediaWrapper(local_store=db, extractor=extractor),
        PAPERQA: lambda: PaperQAWrapper(extractor=extractor),
    }
    factory = wrapper_factories.get(collection)
    source = factory() if factory is not None else db

    agent = ChatAgentAlz(
        knowledge_source=source,
        knowledge_source_collection=collection,
        extractor=extractor,
    )

    # Sanity check: a retrieval-backed agent must have a knowledge source.
    if agent.knowledge_source is None:
        raise ValueError(f"Knowledge source is None for collection {collection}")

    return agent
164+
150165

151166
def ask_chatbot(query, expand=False) -> ChatResponse:
    """
    Route *query* to the appropriate chat path.

    With a knowledge base selected, delegate to the agent's RAG chat;
    otherwise prompt the model directly and wrap the reply in a
    reference-free ChatResponse.
    """
    agent = get_chat_agent()
    if collection != "No collection":
        return agent.chat(query, expand=expand)

    # Direct-model path: no retrieval, so there are no references to report.
    response = agent.extractor.model.prompt(
        query, system="You are a helpful Alzheimer's disease expert."
    )
    return ChatResponse(
        body=response.text(),
        formatted_body=response.text(),
        prompt=query,
        references={},
        uncited_references={}
    )
153179

154180

155181
def html_table(rows: List[dict]) -> str:
@@ -238,34 +264,44 @@ def _flat(obj: dict, limit=40) -> dict:
238264

239265
elif option == CHAT:
240266
page_state = state.get_page_state(CHAT)
241-
st.subheader("Chat with a knowledge base")
242-
query = st.text_area(
243-
f"Ask me anything (within the scope of {collection})!",
244-
help="You can query the current knowledge base using natural language.",
245-
)
267+
if collection == "No collection":
268+
st.subheader("Chat with the Alzheimer's AI assistant")
269+
query = st.text_area(
270+
"Ask me anything about Alzheimer's disease",
271+
help="Ask questions directly to the AI without using a knowledge base.",
272+
)
273+
else:
274+
query = st.text_area(
275+
f"Ask me anything about Alzheimer's disease (within the scope of {collection})",
276+
help="You can query the current knowledge base using natural language.",
277+
)
278+
279+
# Only show these controls if using a knowledge base
280+
if collection != "No collection":
281+
limit = st.slider(
282+
"Detail",
283+
min_value=0,
284+
max_value=30,
285+
value=10,
286+
step=1,
287+
help="""
288+
Behind the scenes, N entries are fetched from the knowledge base,
289+
and these are fed to the LLM. Selecting more examples may give more
290+
complete results, but may also exceed context windows for the model.
291+
""",
292+
)
293+
expand = st.checkbox(
294+
"Expand query",
295+
help="""
296+
If checked, perform query expansion (pubmed only).
297+
""",
298+
)
299+
else:
300+
# Set default values when not using a knowledge base
301+
limit = 0
302+
expand = False
246303

247-
limit = st.slider(
248-
"Detail",
249-
min_value=0,
250-
max_value=30,
251-
value=10,
252-
step=1,
253-
help="""
254-
Behind the scenes, N entries are fetched from the knowledge base,
255-
and these are fed to the LLM. Selecting more examples may give more
256-
complete results, but may also exceed context windows for the model.
257-
""",
258-
)
259-
expand = st.checkbox(
260-
"Expand query",
261-
help="""
262-
If checked, perform query expansion (pubmed only).
263-
""",
264-
)
265304
extractor.model_name = model_name
266-
examples = get_applicable_examples(collection, CHAT)
267-
st.write("Examples:")
268-
st.write(f"<details>{html_table(examples)}</details>", unsafe_allow_html=True)
269305

270306
if st.button(CHAT):
271307
response = ask_chatbot(query, expand=expand)

0 commit comments

Comments
 (0)