reactome · adamjohnwright · Mar 20, 2025 · Feb 11, 2025 · Feb 11, 2025 · Feb 12, 2025
diff --git a/.config.schema.yaml b/.config.schema.yaml
@@ -55,7 +55,7 @@ properties:
     type: array
     items:
       type: string
-      enum: ["React-to-Me"]
+      enum: ["React-to-Me", "Cross-Database Prototype"]
   usage_limits:
     type: object
     properties:

diff --git a/bin/chat-chainlit.py b/bin/chat-chainlit.py
@@ -1,5 +1,4 @@
 import os
-from typing import Any
 
 import chainlit as cl
 from chainlit.data.base import BaseDataLayer
@@ -10,6 +9,7 @@
 
 from agent.graph import AgentGraph
 from agent.profiles import ProfileName, get_chat_profiles
+from agent.profiles.base import OutputState
 from util.chainlit_helpers import (is_feature_enabled, message_rate_limited,
                                    save_openai_metrics, static_messages,
                                    update_search_results)
@@ -93,7 +93,7 @@ async def main(message: cl.Message) -> None:
     openai_cb = OpenAICallbackHandler()
 
     enable_postprocess: bool = is_feature_enabled(config, "postprocessing")
-    result: dict[str, Any] = await llm_graph.ainvoke(
+    result: OutputState = await llm_graph.ainvoke(
         message.content,
         chat_profile.lower(),
         callbacks=[chainlit_cb, openai_cb],

diff --git a/bin/embeddings_manager b/bin/embeddings_manager
@@ -14,6 +14,7 @@ from botocore.client import Config
 
 from data_generation.alliance import generate_alliance_embeddings
 from data_generation.reactome import generate_reactome_embeddings
+from data_generation.uniprot import generate_uniprot_embeddings
 from util.embedding_environment import EM_ARCHIVE, EmbeddingEnvironment
 
 S3_BUCKET = "download.reactome.org"
@@ -86,6 +87,8 @@ def make(
         os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_key
     if embedding.db == "reactome":
         generate_reactome_embeddings(str(embedding_path), hf_model=embedding.model, **kwargs)
+    elif embedding.db == "uniprot":
+        generate_uniprot_embeddings(embedding_path, hf_model=embedding.model, **kwargs)
     elif embedding.db == "alliance":
         generate_alliance_embeddings(str(embedding_path), hf_model=embedding.model, **kwargs)
     else:

diff --git a/mypy.ini b/mypy.ini
@@ -3,7 +3,9 @@ ignore_missing_imports = True
 allow_untyped_calls = True
 allow_untyped_defs = True
 allow_untyped_globals = True
+explicit_package_bases = True
 exclude = data/
+files = bin/,src/
 
 [mypy.plugins.pandas.*]
 init_forbid_dynamic = False
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -45,6 +45,7 @@ psycopg = {extras = ["binary"], version = "^3.2.3"}
 pydantic = "^2.10.5"
 pyyaml = "^6.0.2"
 tavily-python = "^0.5.0"
+openpyxl = "^3.1.5"
 
 [tool.poetry.group.dev.dependencies]
 ruff = "^0.7.1"

diff --git a/src/agent/graph.py b/src/agent/graph.py
@@ -15,6 +15,7 @@
 
 from agent.models import get_embedding, get_llm
 from agent.profiles import ProfileName, create_profile_graphs
+from agent.profiles.base import InputState, OutputState
 from util.logging import logging
 
 LANGGRAPH_DB_URI = f"postgresql://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@postgres:5432/{os.getenv('POSTGRES_LANGGRAPH_DB')}?sslmode=disable"
@@ -81,13 +82,13 @@ async def ainvoke(
         callbacks: Callbacks,
         thread_id: str,
         enable_postprocess: bool = True,
-    ) -> dict[str, Any]:
+    ) -> OutputState:
         if self.graph is None:
             self.graph = await self.initialize()
         if profile not in self.graph:
-            return {}
-        result: dict[str, Any] = await self.graph[profile].ainvoke(
-            {"user_input": user_input},
+            return OutputState()
+        result: OutputState = await self.graph[profile].ainvoke(
+            InputState(user_input=user_input),
             config=RunnableConfig(
                 callbacks=callbacks,
                 configurable={

diff --git a/src/agent/profiles/__init__.py b/src/agent/profiles/__init__.py
@@ -5,12 +5,14 @@
 from langchain_core.language_models.chat_models import BaseChatModel
 from langgraph.graph.state import StateGraph
 
-from agent.profiles.react_to_me import create_reacttome_graph
+from agent.profiles.cross_database import create_cross_database_graph
+from agent.profiles.react_to_me import create_reactome_graph
 
 
 class ProfileName(StrEnum):
     # These should exactly match names in .config.schema.yaml
     React_to_Me = "React-to-Me"
+    Cross_Database_Prototype = "Cross-Database Prototype"  # same string is added to the schema enum in this patch
 
 
 class Profile(NamedTuple):
@@ -23,7 +25,12 @@ class Profile(NamedTuple):
     ProfileName.React_to_Me.lower(): Profile(
         name=ProfileName.React_to_Me,
         description="An AI assistant specialized in exploring **Reactome** biological pathways and processes.",
-        graph_builder=create_reacttome_graph,
+        graph_builder=create_reactome_graph,
+    ),
+    ProfileName.Cross_Database_Prototype.lower(): Profile(
+        name=ProfileName.Cross_Database_Prototype,
+        description="Early version of an AI assistant with knowledge from multiple bio-databases (**Reactome** + **Uniprot**).",
+        graph_builder=create_cross_database_graph,
     ),
 }
 

diff --git a/src/agent/profiles/base.py b/src/agent/profiles/base.py
@@ -1,25 +1,49 @@
 from typing import Annotated, TypedDict
 
-from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.messages import BaseMessage
+from langchain_core.runnables import Runnable, RunnableConfig
 from langgraph.graph.message import add_messages
 
+from agent.tasks.rephrase import create_rephrase_chain
 from tools.external_search.state import WebSearchResult
 
 
-class AdditionalContent(TypedDict):
+class AdditionalContent(TypedDict, total=False):  # total=False: every key is optional
     search_results: list[WebSearchResult]
 
 
-class BaseState(TypedDict):
-    # (Everything the Chainlit layer uses should be included here)
-
+class InputState(TypedDict, total=False):  # what AgentGraph.ainvoke passes into a profile graph
     user_input: str  # User input text
-    chat_history: Annotated[list[BaseMessage], add_messages]
-    context: list[Document]
+
+
+class OutputState(TypedDict, total=False):  # what AgentGraph.ainvoke returns to the Chainlit layer
     answer: str  # primary LLM response that is streamed to the user
     additional_content: AdditionalContent  # sends on graph completion
 
 
+class BaseState(InputState, OutputState, total=False):  # input + output keys plus the fields below
+    rephrased_input: str  # LLM-generated query from user input
+    chat_history: Annotated[list[BaseMessage], add_messages]  # annotated with langgraph's add_messages
+
+
 class BaseGraphBuilder:
-    pass  # NOTE: Anything that is common to all graph builders goes here
+    # NOTE: Anything that is common to all graph builders goes here
+
+    def __init__(
+        self,
+        llm: BaseChatModel,
+        embedding: Embeddings,  # NOTE(review): not referenced in this constructor body
+    ) -> None:
+        self.rephrase_chain: Runnable = create_rephrase_chain(llm)  # awaited in preprocess()
+
+    async def preprocess(self, state: BaseState, config: RunnableConfig) -> BaseState:
+        rephrased_input: str = await self.rephrase_chain.ainvoke(
+            {
+                "user_input": state["user_input"],
+                "chat_history": state["chat_history"],
+            },
+            config,
+        )
+        return BaseState(rephrased_input=rephrased_input)  # partial state: only this key is set