Skip to content

Commit c1306e6

Browse files
authored
Merge pull request #89 from BillFarber/task/extendExamples
Added an example of a retriever with a contextual query
2 parents 94b2beb + 102a34c commit c1306e6

File tree

4 files changed

+138
-8
lines changed

4 files changed

+138
-8
lines changed

examples/langchain/README.md

+21-7
Original file line numberDiff line numberDiff line change
@@ -57,22 +57,36 @@ AZURE_LLM_DEPLOYMENT_MODEL=gpt-35-turbo
5757

5858
## Testing using a retriever with a basic query
5959

60-
You are now ready to test the example retriever. Run the following to ask a question with the
61-
results augmented via the `marklogic_retriever.py` module in this project; you will be
62-
prompted for an AzureOpenAI API key when you run this, which you can type or paste in:
60+
You are now ready to test the example retriever. Run the following to ask a question
61+
with the results augmented via the `marklogic_similar_query_retriever.py` module in this
62+
project:
6363

6464
python ask_similar_query.py "What is task decomposition?" posts
6565

66-
The retriever uses a [cts.similarQuery](https://docs.marklogic.com/cts.similarQuery) to select from the documents
67-
loaded via `load_data.py`. It defaults to a page length of 10. You can change this by providing a command line
68-
argument - e.g.:
66+
The retriever uses a [cts.similarQuery](https://docs.marklogic.com/cts.similarQuery) to
67+
select from the documents loaded via `load_data.py`. It defaults to a page length of 10.
68+
You can change this by providing a command line argument - e.g.:
6969

7070
python ask_similar_query.py "What is task decomposition?" posts 15
7171

7272
Example of a question for the "sotu" (State of the Union speech) collection:
7373

7474
python ask_similar_query.py "What are economic sanctions?" sotu 20
7575

76-
To use a word query instead of a similar query, along with a set of drop words, specify "word" as the 4th argument:
76+
To use a word query instead of a similar query, along with a set of drop words, specify
77+
"word" as the 4th argument:
7778

7879
python ask_similar_query.py "What are economic sanctions?" sotu 20 word
80+
81+
## Testing using a retriever with a contextual query
82+
83+
There may be times when your LangChain application needs to use both a question and a
84+
structured query during the document retrieval process. To see an example of this, run
85+
the following to ask a question. That question is combined with a hard-coded structured
86+
query using the `marklogic_contextual_query_retriever.py` module in this project.
87+
88+
python ask_contextual_query.py "What is task decomposition?" posts
89+
90+
This retriever builds a term-query using words from the question. Then the term-query is
91+
added to the structured query and the merged query is used to select from the documents
92+
loaded via `load_data.py`.
+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Based on example at
2+
# https://python.langchain.com/docs/use_cases/question_answering/quickstart .
3+
4+
import os
5+
import sys
6+
from dotenv import load_dotenv
7+
from langchain import hub
8+
from langchain_openai import AzureChatOpenAI
9+
from langchain.schema import StrOutputParser
10+
from langchain.schema.runnable import RunnablePassthrough
11+
from marklogic import Client
12+
from marklogic_contextual_query_retriever import (
13+
MarkLogicContextualQueryRetriever,
14+
)
15+
16+
17+
def format_docs(docs):
    """Concatenate each document's page content, separated by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
19+
20+
21+
# Command-line arguments: argv[1] is the user question, argv[2] is the
# MarkLogic collection to search, and optional argv[3] caps the result count.
question = sys.argv[1]

retriever = MarkLogicContextualQueryRetriever.create(
    Client("http://localhost:8003", digest=("langchain-user", "password"))
)
retriever.collections = [sys.argv[2]]
# Default to a page length of 10 documents unless one is supplied.
retriever.max_results = int(sys.argv[3]) if len(sys.argv) > 3 else 10
# NOTE(review): a 4th "query type" argument was previously assigned to
# retriever.query_type, copied from ask_similar_query.py. The contextual
# retriever declares no `query_type` field (only client/max_results/
# collections), so that assignment would fail on the pydantic-based model;
# the argument is intentionally not read here.

load_dotenv()

prompt = hub.pull("rlm/rag-prompt")
# Note that the Azure OpenAI API key, the Azure OpenAI Endpoint, and the OpenAI API
# Version, are all read from the environment automatically.
llm = AzureChatOpenAI(
    model_name=os.getenv("AZURE_LLM_DEPLOYMENT_NAME"),
    azure_deployment=os.getenv("AZURE_LLM_DEPLOYMENT_NAME"),
    temperature=0,
    max_tokens=None,
    timeout=None,
)

# Hard-coded structured query demonstrating the "contextual" part of the
# retrieval: only documents where "role" occurs near "assistant" can match.
# The retriever merges a term-query built from the question into this.
contextual_query = {
    "query": {
        "queries": [
            {
                "near-query": [
                    {"word-query": ["role"]},
                    {"word-query": ["assistant"]},
                ]
            },
        ]
    },
}
chat_context = {"question": question, "contextual_query": contextual_query}
57+
58+
59+
def get_question():
    """Return the user's question from the module-level ``chat_context``."""
    question_text = chat_context["question"]
    return question_text
61+
62+
63+
# Compose the RAG chain: fan the input out so the retriever supplies the
# "context" value while the original question is passed straight through,
# then run prompt -> LLM -> plain-string output parser.
input_map = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough().pick("question"),
}
rag_chain = input_map | prompt | llm | StrOutputParser()
print(rag_chain.invoke(input=chat_context))

examples/langchain/docker-compose.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
version: '3.8'
2-
name: marklogic_langchain
2+
name: marklogic_python_example_langchain
33

44
services:
55

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from typing import List
2+
from langchain_core.documents import Document
3+
from langchain_core.retrievers import (
4+
BaseRetriever,
5+
)
6+
from marklogic import Client
7+
8+
"""
9+
Modeled after
10+
https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/retrievers/elastic_search_bm25.py ,
11+
which uses a `create` method instead of __init__.
12+
"""
13+
14+
15+
class MarkLogicContextualQueryRetriever(BaseRetriever):
    """Retriever that merges a user question with a caller-supplied
    MarkLogic structured ("contextual") query.

    Words from the question are lower-cased, stripped of question marks,
    and folded into a term-query, which is combined with the contextual
    query before searching MarkLogic.
    """

    # MarkLogic REST client used to run the document search.
    client: Client
    # Page length passed to the search; defaults to 10 documents.
    max_results: int = 10
    # Collections to constrain the search to; empty means no constraint.
    collections: List[str] = []

    @classmethod
    def create(cls, client: Client):
        """Alternate constructor (mirrors the elastic_search_bm25 example)."""
        return cls(client=client)

    def _get_relevant_documents(
        self,
        chat_context: object,
    ) -> List[Document]:
        """Search MarkLogic using the question plus the contextual query.

        ``chat_context`` is expected to be a dict with a "question" string
        and a "contextual_query" structured query.
        """
        # Build a term-query from the question's words; dropping "?" lets
        # e.g. "decomposition?" match "decomposition".
        search_words = [
            word.lower().replace("?", "")
            for word in chat_context["question"].split()
        ]
        term_query = {"term-query": {"text": search_words}}

        contextual_query = chat_context["contextual_query"]
        print(f"contextual_query: {contextual_query}")

        # Merge without mutating the caller's query: the original appended
        # the term-query in place, so invoking the retriever repeatedly
        # accumulated stale term-queries in the shared dict.
        merged_query = {
            **contextual_query,
            "query": {
                **contextual_query["query"],
                "queries": contextual_query["query"]["queries"] + [term_query],
            },
        }

        print(f"Searching with query: {merged_query}")
        results = self.client.documents.search(
            query=merged_query,
            page_length=self.max_results,
            collections=self.collections,
        )
        print(f"Count of matching MarkLogic documents: {len(results)}")
        # Return a real list (the original returned a single-use `map`
        # iterator) so the declared List[Document] return type holds.
        return [Document(page_content=doc.content) for doc in results]

0 commit comments

Comments
 (0)