41 changes: 27 additions & 14 deletions chain_builder.py
@@ -1,19 +1,21 @@
"""Chain builder"""
from langchain.callbacks.base import AsyncCallbackManager
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.callbacks.tracers import LangChainTracer
from langchain.chains import ChatVectorDBChain
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.llm import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores.base import VectorStore

from langchain.prompts.prompt import PromptTemplate

def get_chat_chain(
vectorstore: VectorStore, question_handler, stream_handler, tracing: bool = False
) -> ChatVectorDBChain:
) -> ConversationalRetrievalChain:
condense_prompt = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
Chat History:
{chat_history}
@@ -28,23 +30,27 @@ def get_chat_chain(
 You are given the following extracted parts of a long meilisearch document from the official documentation and a question. ONLY provide EXISTING links to the official documentation hosted at https://docs.meilisearch.com/. DO NOT try to make up link that DO NOT exist. Replace the .md extension by .html
 You should only use links that are explicitly listed as a source in the context.
 If the question includes a request for code, provide a code block directly from the documentation.
-If you don't know the answer, just say "Hmm, I'm not sure." DO NOT try to make up an answer.
+DO NOT try to make up an answer.
 If you know the answer, remember that you are speaking to developers, so try to be as precise as possible.
 If the question is not about Meilisearch, politely inform them that you are tuned to only answer questions about Meilisearch.
+If you know the answer, DO NOT include cutted parts.
+DO NOT start the answer with <br> tags.
 
+Use the following context to answer the question
+
+CONTEXT: {context}
+----------
 QUESTION: {question}
-=========
-{context}
 
 =========
 MARKDOWN ANSWER:"""
 
     QA_PROMPT = PromptTemplate(
         template=qa_prompt, input_variables=["context", "question"]
     )
 
-    """Create a ChatVectorDBChain for question/answering."""
-    # Construct a ChatVectorDBChain with a streaming llm for combine docs
+    """Create a ConversationalRetrievalChain for question/answering."""
+    # Construct a ConversationalRetrievalChain with a streaming llm for combine docs
     # and a separate, non-streaming llm for question generation
     manager = AsyncCallbackManager([])
     question_manager = AsyncCallbackManager([question_handler])
@@ -57,28 +63,34 @@ def get_chat_chain(
         question_manager.add_handler(tracer)
         stream_manager.add_handler(tracer)
 
-    question_gen_llm = OpenAI(
+    question_gen_llm = ChatOpenAI(
         temperature=0,
         verbose=True,
         callback_manager=question_manager,
+        model_name="gpt-4"
     )
-    streaming_llm = OpenAI(
+    streaming_llm = ChatOpenAI(
         streaming=True,
         callback_manager=stream_manager,
         verbose=True,
         temperature=0.2,
-        max_tokens=500
+        max_tokens=500,
+        model_name="gpt-4"
     )
 
     question_generator = LLMChain(
         llm=question_gen_llm, prompt=CONDENSE_QUESTION_PROMPT, callback_manager=manager
    )
 
     doc_chain = load_qa_chain(
         streaming_llm, chain_type="stuff", prompt=QA_PROMPT, callback_manager=manager
     )
+    # doc_chain = load_qa_with_sources_chain(
+    #     streaming_llm, chain_type="stuff", prompt=QA_PROMPT, callback_manager=manager
+    # )
 
-    chat = ChatVectorDBChain(
-        vectorstore=vectorstore,
+    chat = ConversationalRetrievalChain(
+        retriever=vectorstore.as_retriever(),
         combine_docs_chain=doc_chain,
         question_generator=question_generator,
         callback_manager=manager,
@@ -109,7 +121,8 @@ def get_qa_chain(
         max_tokens=500,
         top_p=1,
         frequency_penalty=0.0,
-        presence_penalty=0.0
+        presence_penalty=0.0,
+        model="gpt-4"
     )
 
     doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=PROMPT_STUFF)
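Note: a minimal sketch of how the chain returned by `get_chat_chain` could be driven from an async handler. The `acall` inputs (`question`, `chat_history`) match the `ConversationalRetrievalChain` interface of this langchain era; the handler wiring and the `(question, answer)` history format are assumptions carried over from the upstream chat-langchain app, not part of this diff.

```python
# Sketch only: drives the chain built in chain_builder.py.
from chain_builder import get_chat_chain

async def answer(vectorstore, question_handler, stream_handler):
    qa_chain = get_chat_chain(vectorstore, question_handler, stream_handler)
    chat_history = []  # assumed format: list of (question, answer) tuples
    # acall returns the merged inputs and outputs, including "answer"
    result = await qa_chain.acall(
        {"question": "How do I create an index?", "chat_history": chat_history}
    )
    chat_history.append((result["question"], result["answer"]))
    return result["answer"]
```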
25 changes: 19 additions & 6 deletions ingest.py
@@ -7,18 +7,29 @@
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.docstore.document import Document
-from langchain.vectorstores.faiss import FAISS
+from langchain.vectorstores.meilisearch import Meilisearch
+import meilisearch
 
+import logging
+logger = logging.getLogger(__name__)
+
 def ingest_docs(org_name: str, repo_name: str):
     merged_sources = source_content(org_name, repo_name)
 
     source_chunks = []
-    splitter = CharacterTextSplitter(separator=" ", chunk_size=1000, chunk_overlap=0)
+    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     for source in merged_sources:
         for chunk in splitter.split_text(source.page_content):
             source_chunks.append(Document(page_content=chunk, metadata=source.metadata))
-    with open("vectorstore.pkl", "wb") as f:
-        pickle.dump(FAISS.from_documents(source_chunks, OpenAIEmbeddings()), f)
+
+    client = meilisearch.Client('http://127.0.0.1:7700')
+    index = client.index('langchain_demo')
+    index.delete()  # delete the index if it already exists to start from fresh data
+    embeddings = OpenAIEmbeddings()
+
+    print("Compute and add documents embeddings to Meilisearch vectorstore...")
+    vectorstore = Meilisearch(index, embeddings.embed_query, "text")
+    vectorstore.add_documents(source_chunks)
+    print("Done.")
 
 def source_content(repo_owner, repo_name):
     return list(get_github_content(repo_owner, repo_name))
@@ -36,7 +47,9 @@ def get_github_content(repo_owner, repo_name):
             .strip()
         )
         repo_path = pathlib.Path(d)
-        markdown_files = list(repo_path.glob("**/*.md")) + list(repo_path.glob("**/*.mdx"))
+
+        markdown_files = list(repo_path.glob("**/resources/**/*.mdx")) + list(repo_path.glob("**/learn/**/*.mdx")) + list(repo_path.glob("**/reference/**/*.mdx"))
+
         for markdown_file in markdown_files:
             with open(markdown_file, "r") as f:
                 relative_path = markdown_file.relative_to(repo_path)
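Note: a minimal sketch of querying the index that `ingest_docs` now builds. The `Meilisearch` vector store wrapper ships from the author's local langchain checkout (see requirements.txt), so only the constructor call shown in this diff is assumed; `similarity_search` is the generic langchain `VectorStore` method, and the sample query is illustrative.

```python
# Sketch only: reads back the 'langchain_demo' index populated by ingest.py.
import meilisearch
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.meilisearch import Meilisearch

client = meilisearch.Client("http://127.0.0.1:7700")
index = client.index("langchain_demo")
embeddings = OpenAIEmbeddings()

# Same wiring as ingest_docs: index handle, query embedder, text attribute
vectorstore = Meilisearch(index, embeddings.embed_query, "text")
docs = vectorstore.similarity_search("How do I configure ranking rules?", k=4)
for doc in docs:
    print(doc.metadata.get("source"), doc.page_content[:80])
```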
21 changes: 16 additions & 5 deletions main.py
@@ -20,6 +20,10 @@
 from fastapi.templating import Jinja2Templates
 from fastapi.middleware.cors import CORSMiddleware
 
+from langchain.vectorstores.meilisearch import Meilisearch
+from langchain.embeddings import OpenAIEmbeddings
+import meilisearch
+
 app = FastAPI()
 vectorstore: Optional[VectorStore] = None
 
@@ -46,11 +50,18 @@ def health():
 @app.on_event("startup")
 async def startup_event():
     logging.info("loading vectorstore")
-    if not Path("vectorstore.pkl").exists():
-        ingest_docs("meilisearch", "documentation")
-    with open("vectorstore.pkl", "rb") as f:
-        global vectorstore
-        vectorstore = pickle.load(f)
+    # if not Path("vectorstore.pkl").exists():
+    #     ingest_docs("meilisearch", "documentation")
+    # with open("vectorstore.pkl", "rb") as f:
+    #     global vectorstore
+    #     vectorstore = pickle.load(f)
+    global vectorstore
+    client = meilisearch.Client('http://127.0.0.1:7700')
+    index = client.index('langchain_demo')
+    embeddings = OpenAIEmbeddings()
+    vectorstore = Meilisearch(index, embeddings.embed_query, "text")
     logging.info("vectorstore loaded")
 
 
 @app.get("/chat")
 def chat():
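Note: a minimal sketch of the same startup wiring with a reachability check added, since the hard-coded `http://127.0.0.1:7700` would otherwise fail only at the first query. `is_healthy()` is the meilisearch-python client's health probe; the `load_vectorstore` helper is illustrative, not part of this diff.

```python
# Sketch only: startup wiring from main.py plus a fail-fast health check.
import meilisearch
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.meilisearch import Meilisearch

def load_vectorstore(url: str = "http://127.0.0.1:7700") -> Meilisearch:
    client = meilisearch.Client(url)
    if not client.is_healthy():  # ping Meilisearch before wiring the store
        raise RuntimeError(f"Meilisearch is not reachable at {url}")
    embeddings = OpenAIEmbeddings()
    return Meilisearch(client.index("langchain_demo"), embeddings.embed_query, "text")
```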
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ black
 isort
 websockets
 pydantic
-langchain
+/Users/guillaume/langchain
 uvicorn
 faiss-cpu
 bs4