Integrate OpenAI and LangChain into the ChatUI

bergzain · bergzain · commit 0abb9953d903 · 2023-11-28T20:56:25.000+01:00
Signed-off-by: BergZain <50025962+bergzain@users.noreply.github.com>
diff --git a/src/ChatUI_streamlit/LLMModel.py b/src/ChatUI_streamlit/LLMModel.py
@@ -0,0 +1,55 @@
+"""Build the retrieval-augmented QA chain (``RAG``) for the Streamlit chat UI.
+
+Importing this module reads OPENAI_API_KEY, loads every readable file under
+``root_dir`` into a FAISS vector store and exposes the resulting chain as ``RAG``.
+"""
+import logging
+import os
+
+from dotenv import load_dotenv
+from langchain.chat_models import ChatOpenAI
+from langchain.vectorstores import FAISS
+from langchain.chains import RetrievalQA
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.document_loaders import TextLoader
+from langchain.prompts import PromptTemplate
+
+logger = logging.getLogger(__name__)
+
+# Load .env before reading the key so that a key defined only in .env is found;
+# reading os.environ first raised KeyError in that case.
+load_dotenv()
+
+# Still raises KeyError when the key is missing, rather than falling back to the
+# literal placeholder string 'YourAPIKey' and sending it to OpenAI.
+openai_api_key = os.environ['OPENAI_API_KEY']
+YourAPIKey = openai_api_key  # legacy alias, kept so existing references keep working
+
+llm = ChatOpenAI(model_name='gpt-3.5-turbo', openai_api_key=openai_api_key)
+
+embeddings = OpenAIEmbeddings(disallowed_special=(), openai_api_key=openai_api_key)
+
+# Directory whose files are indexed. Set RAG_ROOT_DIR to override; the default is
+# the original developer-machine path and is unlikely to exist elsewhere.
+root_dir = os.getenv(
+    'RAG_ROOT_DIR',
+    '/Users/zainhazzouri/projects/RAG-Playground/core/src/sdk/python/rtdip_sdk/pipelines',
+)
+docs = []
+
+# Walk every folder below root_dir and load each file as split documents.
+for dirpath, dirnames, filenames in os.walk(root_dir):
+    for file in filenames:
+        path = os.path.join(dirpath, file)
+        try:
+            loader = TextLoader(path, encoding='utf-8')
+            docs.extend(loader.load_and_split())
+        except Exception as e:
+            # Best effort: a file that fails to load is skipped, but no longer silently.
+            logger.warning("Skipping %s: %s", path, e)
+
+docsearch = FAISS.from_documents(docs, embeddings)
+
+# Get our retriever ready
+RAG = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())
diff --git a/src/ChatUI_streamlit/app.py b/src/ChatUI_streamlit/app.py
@@ -1,23 +1,25 @@
 import streamlit as st
 import replicate
 import os
+from LLMModel import RAG as RAG
 
 # App title
-st.set_page_config(page_title="RTDIP Chatbot")
+st.set_page_config(page_title="RTDIP PipeLine Chatbot")
 
 # Replicate Credentials
 with st.sidebar:
-    st.title('Pipeline Generation Chatbot')
-    if 'REPLICATE_API_TOKEN' in st.secrets:
-        st.success('API key already provided!', icon='✅')
-        replicate_api = st.secrets['REPLICATE_API_TOKEN']
-    else:
-        replicate_api = st.text_input('Enter Replicate API token:', type='password')
-        if not (replicate_api.startswith('r8_') and len(replicate_api) == 40):
-            st.warning('Please enter your credentials!', icon='⚠️')
-        else:
-            st.success('Proceed to entering your prompt message!', icon='👉')
-    os.environ['REPLICATE_API_TOKEN'] = replicate_api
+    st.title('RTDIP Pipeline Generation Chatbot')
+    # if 'REPLICATE_API_TOKEN' in st.secrets:
+    #     st.success('API key already provided!', icon='✅')
+    #     replicate_api = st.secrets['REPLICATE_API_TOKEN']
+    # else:
+    #     replicate_api = st.text_input('Enter Replicate API token:', type='password')
+    #     if not (replicate_api.startswith('r8_') and len(replicate_api) == 40):
+    #         st.warning('Please enter your credentials!', icon='⚠️')
+    #     else:
+    #         st.success('Proceed to entering your prompt message!', icon='👉')
+    # os.environ['REPLICATE_API_TOKEN'] = replicate_api
+    openai_api_key = os.getenv('OPENAI_API_KEY', 'YourAPIKey')
 
 # Store LLM generated responses
 if "conversations" not in st.session_state.keys():
@@ -48,29 +50,9 @@ def clear_chat_history():
     st.session_state.conversations = [{"title": "Default Conversation", "messages": [{"role": "assistant", "content": "How may I assist you today?"}]}]
 st.sidebar.button('Clear Chat History', on_click=clear_chat_history)
 
-# Function for generating LLaMA2 response. Refactored from https://github.com/a16z-infra/llama2-chatbot
-def generate_llama2_response(prompt_input):
-    string_dialogue = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
-    for conversation in st.session_state.conversations:
-        for dict_message in conversation["messages"]:
-            role = dict_message["role"]
-            content = dict_message["content"]
-            if role == "user":
-                string_dialogue += f"User: {content}\n\n"
-            else:
-                string_dialogue += f"Assistant: {content}\n\n"
-
-    output = replicate.run(
-        'a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5',
-        input={
-            "prompt": f"{string_dialogue} {prompt_input} Assistant: ",
-            "repetition_penalty": 1
-        }
-    )
-    return output
 
 # User-provided prompt
-if prompt := st.chat_input(disabled=not replicate_api):
+if prompt := st.chat_input(): #
     # Use the user's prompt as the title 
     title = prompt
     
@@ -83,15 +65,15 @@ def generate_llama2_response(prompt_input):
 # Generate a new response if the last message is not from the assistant
 if st.session_state.conversations[-1]["messages"][-1]["role"] != "assistant":
     with st.chat_message("assistant"):
-        with st.spinner("Thinking..."):
-            response = generate_llama2_response(prompt)
+        with st.spinner("Generating..."):
+            response = RAG.run(prompt)
             placeholder = st.empty()
             full_response = ''
             for item in response:
                 full_response += item
                 placeholder.markdown(full_response)
             placeholder.markdown(full_response)
-    
+
     message = {"role": "assistant", "content": full_response}
     st.session_state.conversations[-1]["messages"].append(message)
     
diff --git a/src/RAG/RAG.ipynb b/src/RAG/RAG.ipynb
@@ -5,18 +5,18 @@
    "execution_count": 1,
    "id": "initial_id",
    "metadata": {
+    "collapsed": true,
     "ExecuteTime": {
-     "end_time": "2023-11-25T08:06:52.534235Z",
-     "start_time": "2023-11-25T08:06:52.514168Z"
-    },
-    "collapsed": true
+     "end_time": "2023-11-27T20:36:37.381119Z",
+     "start_time": "2023-11-27T20:36:37.371708Z"
+    }
    },
    "outputs": [],
    "source": [
     "from dotenv import load_dotenv\n",
     "import os\n",
     "\n",
-    "YourAPIKey = \"\"\n",
+    "YourAPIKey = \"\"  # never commit a real key; set OPENAI_API_KEY in .env instead\n",
     "\n",
     "load_dotenv()\n",
     "\n",
@@ -25,14 +25,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 8,
    "id": "640e19e3961c5559",
    "metadata": {
+    "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-11-25T08:06:53.629849Z",
-     "start_time": "2023-11-25T08:06:52.521264Z"
-    },
-    "collapsed": false
+     "end_time": "2023-11-27T20:37:28.740784Z",
+     "start_time": "2023-11-27T20:37:28.697880Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -47,14 +47,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
    "id": "33597feb02573078",
    "metadata": {
+    "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-11-25T08:06:53.663555Z",
-     "start_time": "2023-11-25T08:06:53.630098Z"
-    },
-    "collapsed": false
+     "end_time": "2023-11-27T20:37:28.924429Z",
+     "start_time": "2023-11-27T20:37:28.892412Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -63,18 +63,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 10,
    "id": "ce645e118f29cf79",
    "metadata": {
+    "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-11-25T08:06:53.686887Z",
-     "start_time": "2023-11-25T08:06:53.664778Z"
-    },
-    "collapsed": false
+     "end_time": "2023-11-27T20:37:29.445589Z",
+     "start_time": "2023-11-27T20:37:29.422117Z"
+    }
    },
    "outputs": [],
    "source": [
-    "root_dir = '/amos2023ws05-pipeline-config-chat-ai/src/RAG/pipelines'\n",
+    "root_dir = '/Users/zainhazzouri/projects/amos2023ws05-pipeline-config-chat-ai/src/RAG/pipelines'\n",
     "docs = []\n",
     "\n",
     "# Go through each folder\n",
@@ -92,21 +92,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 11,
    "id": "c6e41366a23e6224",
    "metadata": {
+    "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-11-25T08:06:53.687897Z",
-     "start_time": "2023-11-25T08:06:53.685060Z"
-    },
-    "collapsed": false
+     "end_time": "2023-11-27T20:37:32.037810Z",
+     "start_time": "2023-11-27T20:37:32.031816Z"
+    }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "You have 190 documents\n",
+      "You have 219 documents\n",
       "\n",
       "------ Start Document ------\n",
       "# Copyright 2022 RTDIP\n",
@@ -129,14 +129,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 12,
    "id": "e9847352294eee40",
    "metadata": {
+    "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-11-25T08:06:57.051920Z",
-     "start_time": "2023-11-25T08:06:53.687581Z"
-    },
-    "collapsed": false
+     "end_time": "2023-11-27T20:37:37.370376Z",
+     "start_time": "2023-11-27T20:37:33.328494Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -145,14 +145,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 13,
    "id": "90fd0d8a51a5cf31",
    "metadata": {
+    "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-11-25T08:06:59.831522Z",
-     "start_time": "2023-11-25T08:06:59.822748Z"
-    },
-    "collapsed": false
+     "end_time": "2023-11-27T20:37:38.431113Z",
+     "start_time": "2023-11-27T20:37:38.428419Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -162,14 +162,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 14,
    "id": "103f11e7d6f49f6e",
    "metadata": {
+    "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-11-25T08:11:12.280923Z",
-     "start_time": "2023-11-25T08:10:35.004427Z"
-    },
-    "collapsed": false
+     "end_time": "2023-11-27T20:38:16.799202Z",
+     "start_time": "2023-11-27T20:37:39.004977Z"
+    }
    },
    "outputs": [],
    "source": [
@@ -179,59 +179,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 15,
    "id": "7b73d941ef97f4bb",
    "metadata": {
+    "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2023-11-25T08:11:13.673641Z",
-     "start_time": "2023-11-25T08:11:13.668368Z"
-    },
-    "collapsed": false
+     "end_time": "2023-11-27T20:38:18.862176Z",
+     "start_time": "2023-11-27T20:38:18.858254Z"
+    }
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Sure! Here's an example of how you can use RTDIP components to read from an Eventhub using a connection string and consumer group, transform the data from binary to string, and then write it to a Delta table:\n",
+      "Certainly! Here's the code snippet that reads from an Eventhub using a connection string and consumer group, applies the BinaryToStringTransformer and EdgeXOPCUAJsonToPCDMTransformer transformations, and writes the data to a Delta table:\n",
       "\n",
       "```python\n",
-      "from rtdip_sdk.pipelines.sources import SparkEventhubSource\n",
-      "from rtdip_sdk.pipelines.transforms import BinaryToStringTransformer, EdgeXTransformer\n",
-      "from rtdip_sdk.pipelines.destinations import DeltaDestination\n",
+      "from rtdip_sdk.pipelines.sources.spark.eventhub import SparkEventhubSource\n",
+      "from rtdip_sdk.pipelines.transformers.spark.binary_to_string import BinaryToStringTransformer\n",
+      "from rtdip_sdk.pipelines.transformers.spark.edgex_opcua_json_to_pcdm import EdgeXOPCUAJsonToPCDMTransformer\n",
+      "from rtdip_sdk.pipelines.destinations.spark.delta import SparkDeltaDestination\n",
       "from rtdip_sdk.pipelines.utilities import SparkSessionUtility\n",
+      "import json\n",
       "\n",
-      "# Not required if using Databricks\n",
-      "spark = SparkSessionUtility(config={}).execute()\n",
-      "\n",
-      "# Eventhub connection string and consumer group\n",
-      "connection_string = \"YOUR_EVENTHUB_CONNECTION_STRING\"\n",
-      "consumer_group = \"YOUR_CONSUMER_GROUP\"\n",
-      "\n",
-      "# Create the Eventhub source\n",
-      "eventhub_source = SparkEventhubSource(spark=spark, options={\"eventhubs.connectionString\": connection_string, \"eventhubs.consumerGroup\": consumer_group})\n",
+      "def pipeline():\n",
+      "    spark = SparkSessionUtility(config={}).execute()\n",
       "\n",
-      "# Read from Eventhub\n",
-      "data = eventhub_source.read_stream()\n",
+      "    ehConf = {\n",
+      "        \"eventhubs.connectionString\": \"{EventhubConnectionString}\",\n",
+      "        \"eventhubs.consumerGroup\": \"{EventhubConsumerGroup}\",\n",
+      "        \"eventhubs.startingPosition\": json.dumps(\n",
+      "            {\"offset\": \"0\", \"seqNo\": -1, \"enqueuedTime\": None, \"isInclusive\": True}\n",
+      "        ),\n",
+      "    }\n",
       "\n",
-      "# Transform data from binary to string\n",
-      "binary_to_string_transformer = BinaryToStringTransformer()\n",
-      "transformed_data = binary_to_string_transformer.transform(data)\n",
+      "    source = SparkEventhubSource(spark, ehConf).read_batch()\n",
+      "    string_data = BinaryToStringTransformer(source, \"body\", \"body\").transform()\n",
+      "    PCDM_data = EdgeXOPCUAJsonToPCDMTransformer(string_data, \"body\").transform()\n",
+      "    SparkDeltaDestination(\n",
+      "        data=PCDM_data, options={}, destination=\"{path/to/table}\"\n",
+      "    ).write_batch()\n",
       "\n",
-      "# Apply EdgeX transformation\n",
-      "edgex_transformer = EdgeXTransformer()\n",
-      "transformed_data = edgex_transformer.transform(transformed_data)\n",
-      "\n",
-      "# Write transformed data to Delta table\n",
-      "delta_destination = DeltaDestination(spark=spark, data=transformed_data, table_name=\"YOUR_DELTA_TABLE_NAME\")\n",
-      "delta_destination.write_stream()\n",
+      "if __name__ == \"__main__\":\n",
+      "    pipeline()\n",
       "```\n",
       "\n",
-      "Make sure to replace `YOUR_EVENTHUB_CONNECTION_STRING`, `YOUR_CONSUMER_GROUP`, and `YOUR_DELTA_TABLE_NAME` with your actual values.\n",
-      "\n",
-      "This code will create a streaming pipeline that reads data from an Eventhub, transforms it using binary to string and EdgeX transformations, and then writes the transformed data to a Delta table.\n",
-      "\n",
-      "Note that you'll need to have the necessary dependencies installed and import the required modules for the components to work properly.\n"
+      "Please replace `{EventhubConnectionString}`, `{EventhubConsumerGroup}`, and `{path/to/table}` with your specific values.\n"
      ]
     }
    ],