41 changes: 27 additions & 14 deletions chain_builder.py
@@ -1,19 +1,21 @@
"""Chain builder"""
from langchain.callbacks.base import AsyncCallbackManager
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.callbacks.tracers import LangChainTracer
from langchain.chains import ChatVectorDBChain
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.llm import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores.base import VectorStore

from langchain.prompts.prompt import PromptTemplate

def get_chat_chain(
vectorstore: VectorStore, question_handler, stream_handler, tracing: bool = False
) -> ChatVectorDBChain:
) -> ConversationalRetrievalChain:
condense_prompt = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
Chat History:
{chat_history}
@@ -28,23 +30,27 @@ def get_chat_chain(
 You are given the following extracted parts of a long meilisearch document from the official documentation and a question. ONLY provide EXISTING links to the official documentation hosted at https://docs.meilisearch.com/. DO NOT try to make up link that DO NOT exist. Replace the .md extension by .html
 You should only use links that are explicitly listed as a source in the context.
 If the question includes a request for code, provide a code block directly from the documentation.
-If you don't know the answer, just say "Hmm, I'm not sure." DO NOT try to make up an answer.
+DO NOT try to make up an answer.
 If you know the answer, remember that you are speaking to developers, so try to be as precise as possible.
 If the question is not about Meilisearch, politely inform them that you are tuned to only answer questions about Meilisearch.
+If you know the answer, DO NOT include cutted parts.
+DO NOT start the answer with <br> tags.
 
+Use the following context to answer the question
+
+CONTEXT: {context}
+----------
 QUESTION: {question}
-=========
-{context}
 
 =========
 MARKDOWN ANSWER:"""
 
     QA_PROMPT = PromptTemplate(
         template=qa_prompt, input_variables=["context", "question"]
     )
 
-    """Create a ChatVectorDBChain for question/answering."""
-    # Construct a ChatVectorDBChain with a streaming llm for combine docs
+    """Create a ConversationalRetrievalChain for question/answering."""
+    # Construct a ConversationalRetrievalChain with a streaming llm for combine docs
     # and a separate, non-streaming llm for question generation
     manager = AsyncCallbackManager([])
     question_manager = AsyncCallbackManager([question_handler])
@@ -57,28 +63,34 @@ def get_chat_chain(
         question_manager.add_handler(tracer)
         stream_manager.add_handler(tracer)
 
-    question_gen_llm = OpenAI(
+    question_gen_llm = ChatOpenAI(
         temperature=0,
         verbose=True,
         callback_manager=question_manager,
+        model_name="gpt-4"
     )
-    streaming_llm = OpenAI(
+    streaming_llm = ChatOpenAI(
         streaming=True,
         callback_manager=stream_manager,
         verbose=True,
         temperature=0.2,
-        max_tokens=500
+        max_tokens=500,
+        model_name="gpt-4"
     )
 
     question_generator = LLMChain(
         llm=question_gen_llm, prompt=CONDENSE_QUESTION_PROMPT, callback_manager=manager
    )
 
     doc_chain = load_qa_chain(
         streaming_llm, chain_type="stuff", prompt=QA_PROMPT, callback_manager=manager
     )
+    # doc_chain = load_qa_with_sources_chain(
+    #     streaming_llm, chain_type="stuff", prompt=QA_PROMPT, callback_manager=manager
+    # )
 
-    chat = ChatVectorDBChain(
-        vectorstore=vectorstore,
+    chat = ConversationalRetrievalChain(
+        retriever=vectorstore.as_retriever(),
         combine_docs_chain=doc_chain,
         question_generator=question_generator,
         callback_manager=manager,
@@ -109,7 +121,8 @@ def get_qa_chain(
         max_tokens=500,
         top_p=1,
         frequency_penalty=0.0,
-        presence_penalty=0.0
+        presence_penalty=0.0,
+        model="gpt-4"
     )
 
     doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=PROMPT_STUFF)
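Note: a minimal sketch of how the chain returned by `get_chat_chain` could be driven from an async handler. The `acall` inputs (`question`, `chat_history`) match the `ConversationalRetrievalChain` interface of this langchain era; the handler wiring and the `(question, answer)` history format are assumptions carried over from the upstream chat-langchain app, not part of this diff.

```python
# Sketch only: drives the chain built in chain_builder.py.
from chain_builder import get_chat_chain

async def answer(vectorstore, question_handler, stream_handler):
    qa_chain = get_chat_chain(vectorstore, question_handler, stream_handler)
    chat_history = []  # assumed format: list of (question, answer) tuples
    # acall returns the merged inputs and outputs, including "answer"
    result = await qa_chain.acall(
        {"question": "How do I create an index?", "chat_history": chat_history}
    )
    chat_history.append((result["question"], result["answer"]))
    return result["answer"]
```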
25 changes: 19 additions & 6 deletions ingest.py
@@ -7,18 +7,29 @@
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.docstore.document import Document
-from langchain.vectorstores.faiss import FAISS
+from langchain.vectorstores.meilisearch import Meilisearch
+import meilisearch
 
+import logging
+logger = logging.getLogger(__name__)
+
 def ingest_docs(org_name: str, repo_name: str):
     merged_sources = source_content(org_name, repo_name)
 
     source_chunks = []
-    splitter = CharacterTextSplitter(separator=" ", chunk_size=1000, chunk_overlap=0)
+    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     for source in merged_sources:
         for chunk in splitter.split_text(source.page_content):
             source_chunks.append(Document(page_content=chunk, metadata=source.metadata))
-    with open("vectorstore.pkl", "wb") as f:
-        pickle.dump(FAISS.from_documents(source_chunks, OpenAIEmbeddings()), f)
+
+    client = meilisearch.Client('http://127.0.0.1:7700')
+    index = client.index('langchain_demo')
+    index.delete()  # delete the index if it already exists to start from fresh data
+    embeddings = OpenAIEmbeddings()
+
+    print("Compute and add documents embeddings to Meilisearch vectorstore...")
+    vectorstore = Meilisearch(index, embeddings.embed_query, "text")
+    vectorstore.add_documents(source_chunks)
+    print("Done.")
 
 def source_content(repo_owner, repo_name):
     return list(get_github_content(repo_owner, repo_name))
@@ -36,7 +47,9 @@ def get_github_content(repo_owner, repo_name):
             .strip()
         )
         repo_path = pathlib.Path(d)
-        markdown_files = list(repo_path.glob("**/*.md")) + list(repo_path.glob("**/*.mdx"))
+
+        markdown_files = list(repo_path.glob("**/resources/**/*.mdx")) + list(repo_path.glob("**/learn/**/*.mdx")) + list(repo_path.glob("**/reference/**/*.mdx"))
+
         for markdown_file in markdown_files:
             with open(markdown_file, "r") as f:
                 relative_path = markdown_file.relative_to(repo_path)
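Note: a minimal sketch of querying the index that `ingest_docs` now builds. The `Meilisearch` vector store wrapper ships from the author's local langchain checkout (see requirements.txt), so only the constructor call shown in this diff is assumed; `similarity_search` is the generic langchain `VectorStore` method, and the sample query is illustrative.

```python
# Sketch only: reads back the 'langchain_demo' index populated by ingest.py.
import meilisearch
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.meilisearch import Meilisearch

client = meilisearch.Client("http://127.0.0.1:7700")
index = client.index("langchain_demo")
embeddings = OpenAIEmbeddings()

# Same wiring as ingest_docs: index handle, query embedder, text attribute
vectorstore = Meilisearch(index, embeddings.embed_query, "text")
docs = vectorstore.similarity_search("How do I configure ranking rules?", k=4)
for doc in docs:
    print(doc.metadata.get("source"), doc.page_content[:80])
```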
21 changes: 16 additions & 5 deletions main.py
@@ -20,6 +20,10 @@
 from fastapi.templating import Jinja2Templates
 from fastapi.middleware.cors import CORSMiddleware
 
+from langchain.vectorstores.meilisearch import Meilisearch
+from langchain.embeddings import OpenAIEmbeddings
+import meilisearch
+
 app = FastAPI()
 vectorstore: Optional[VectorStore] = None
 
@@ -46,11 +50,18 @@ def health():
 @app.on_event("startup")
 async def startup_event():
     logging.info("loading vectorstore")
-    if not Path("vectorstore.pkl").exists():
-        ingest_docs("meilisearch", "documentation")
-    with open("vectorstore.pkl", "rb") as f:
-        global vectorstore
-        vectorstore = pickle.load(f)
+    # if not Path("vectorstore.pkl").exists():
+    #     ingest_docs("meilisearch", "documentation")
+    # with open("vectorstore.pkl", "rb") as f:
+    #     global vectorstore
+    #     vectorstore = pickle.load(f)
+    global vectorstore
+    client = meilisearch.Client('http://127.0.0.1:7700')
+    index = client.index('langchain_demo')
+    embeddings = OpenAIEmbeddings()
+    vectorstore = Meilisearch(index, embeddings.embed_query, "text")
     logging.info("vectorstore loaded")
 
 
 @app.get("/chat")
 def chat():
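Note: a minimal sketch of the same startup wiring with a reachability check added, since the hard-coded `http://127.0.0.1:7700` would otherwise fail only at the first query. `is_healthy()` is the meilisearch-python client's health probe; the `load_vectorstore` helper is illustrative, not part of this diff.

```python
# Sketch only: startup wiring from main.py plus a fail-fast health check.
import meilisearch
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.meilisearch import Meilisearch

def load_vectorstore(url: str = "http://127.0.0.1:7700") -> Meilisearch:
    client = meilisearch.Client(url)
    if not client.is_healthy():  # ping Meilisearch before wiring the store
        raise RuntimeError(f"Meilisearch is not reachable at {url}")
    embeddings = OpenAIEmbeddings()
    return Meilisearch(client.index("langchain_demo"), embeddings.embed_query, "text")
```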
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ black
 isort
 websockets
 pydantic
-langchain
+/Users/guillaume/langchain
 uvicorn
 faiss-cpu
 bs4