
Commit 66e324c

Merge pull request #724 from microsoft/vNext-Dev
v1.1.1 - Hotfixes for v1.1
2 parents: 79af91d + b8470c8


63 files changed, +3961 -406 lines

.gitignore (+6, -1)

```diff
@@ -395,4 +395,9 @@ terraform.tfstate
 terraform.tfstate.d
 .tfplan.txt
 infra/infoasst*
-infra/sp_config/config.json
+infra/sp_config/config.json
+
+#Upgrade & Migrate Support
+scripts/upgrade_repoint.config.json
+azcopy.tar.gz
+azcopy_dir
```

Makefile (+23, -1)

```diff
@@ -64,5 +64,27 @@ destroy-inf: check-subscription
 functional-tests: extract-env ## Run functional tests to check the processing pipeline is working
 	@./scripts/functional-tests.sh

-run-migration: ## Migrate from BICEP to Terraform
+merge-databases: ## Upgrade from bicep to terraform
+	@figlet "Upgrading in place"
 	python ./scripts/merge-databases.py
+
+import-state: check-subscription ## import state of current services to TF state
+	@./scripts/inf-import-state.sh
+
+prep-upgrade: ## Command to merge databases and import TF state in prep for an upgrade from 1.0 to 1.n
+	@figlet "Upgrading"
+	merge-databases
+	import-state
+
+prep-env: ## Apply role assignments as needed to upgrade
+	@figlet "Preparing Environment"
+	@./scripts/prep-env.sh
+
+prep-migration-env: ## Prepare the environment for migration by assigning required roles
+	@./scripts/prep-migration-env.sh
+
+run-data-migration: ## Run the data migration moving data from one resource group to another
+	python ./scripts/extract-content.py
+
+manual-inf-destroy: ## A command triggered by a user to destroy a resource group, associated resources, and related Entra items
+	@./scripts/inf-manual-destroy.sh
```

README.md (+2, -2)

```diff
@@ -35,7 +35,7 @@

 [![Open in GitHub Codespaces](https://img.shields.io/static/v1?style=for-the-badge&label=GitHub+Codespaces&message=Open&color=brightgreen&logo=github)](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=601652366&machine=basicLinux32gb&devcontainer_path=.devcontainer%2Fdevcontainer.json&location=eastus)

-This industry accelerator showcases integration between Azure and OpenAI's large language models. It leverages Azure AI Search for data retrieval and ChatGPT-style Q&A interactions. Using the Retrieval Augmented Generation (RAG) design pattern with Azure Open AI's GPT models, it provides a natural language interaction to discover relevant responses to user queries. Azure AI Search simplifies data ingestion, transformation, indexing, and multilingual translation.
+This industry accelerator showcases integration between Azure and OpenAI's large language models. It leverages Azure AI Search for data retrieval and ChatGPT-style Q&A interactions. Using the Retrieval Augmented Generation (RAG) design pattern with Azure OpenAI's GPT models, it provides a natural language interaction to discover relevant responses to user queries. Azure AI Search simplifies data ingestion, transformation, indexing, and multilingual translation.

 The accelerator adapts prompts based on the model type for enhanced performance. Users can customize settings like temperature and persona for personalized AI interactions. It offers features like explainable thought processes, referenceable citations, and direct content for verification.

@@ -124,7 +124,7 @@ Find out more with Microsoft's [Responsible AI resources](https://www.microsoft.

 ### Content Safety

-Content safety is provided through Azure Open AI service. The Azure OpenAI Service includes a content filtering system that runs alongside the core AI models. This system uses an ensemble of classification models to detect four categories of potentially harmful content (violence, hate, sexual, and self-harm) at four severity levels (safe, low, medium, high).These 4 categories may not be sufficient for all use cases, especially for minors. Please read our [Transaparncy Note](/docs/transparency.md)
+Content safety is provided through Azure OpenAI service. The Azure OpenAI Service includes a content filtering system that runs alongside the core AI models. This system uses an ensemble of classification models to detect four categories of potentially harmful content (violence, hate, sexual, and self-harm) at four severity levels (safe, low, medium, high).These 4 categories may not be sufficient for all use cases, especially for minors. Please read our [Transaparncy Note](/docs/transparency.md)

 By default, the content filters are set to filter out prompts and completions that are detected as medium or high severity for those four harm categories. Content labeled as low or safe severity is not filtered.
```
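The README paragraph above names the Retrieval Augmented Generation pattern: retrieve with Azure AI Search, then ground the model's answer in the results. As a rough illustration only (not this repo's code; the endpoint, key, index, field, and deployment names below are placeholders), a minimal RAG sketch might look like:

```python
# Minimal RAG sketch: retrieve with Azure AI Search, then answer with Azure OpenAI.
# All endpoint/key/index/deployment values are placeholders, not this repo's config.
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from openai import AzureOpenAI

search = SearchClient("https://<search>.search.windows.net", "<index>",
                      AzureKeyCredential("<search-key>"))
client = AzureOpenAI(azure_endpoint="https://<aoai>.openai.azure.com",
                     api_key="<aoai-key>", api_version="2024-02-01")

def answer(question: str) -> str:
    # 1. Retrieve: top documents for the user's question.
    hits = search.search(search_text=question, top=3)
    sources = "\n".join(doc["content"] for doc in hits)  # assumes a 'content' field
    # 2. Augment and generate: stuff the sources into the prompt.
    completion = client.chat.completions.create(
        model="<chat-deployment>",  # Azure deployment name
        messages=[
            {"role": "system", "content": "Answer only from the provided sources."},
            {"role": "user", "content": f"{question}\nSources:\n{sources}"},
        ],
        temperature=0.0,
    )
    return completion.choices[0].message.content
```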

app/backend/app.py (+4, -24)

```diff
@@ -126,7 +126,7 @@
     AUTHORITY = AzureAuthorityHosts.AZURE_GOVERNMENT
 else:
     AUTHORITY = AzureAuthorityHosts.AZURE_PUBLIC_CLOUD
-openai.api_version = "2023-12-01-preview"
+openai.api_version = "2024-02-01"
 # Use the current user identity to authenticate with Azure OpenAI, Cognitive Search and Blob Storage (no secrets needed,
 # just use 'az login' locally, and managed identity when deployed on Azure). If you need to use keys, use separate AzureKeyCredential instances with the
 # keys for each service
@@ -295,20 +295,11 @@ async def chat(request: Request):
             return {"error": "unknown approach"}, 400

         if (Approaches(int(approach)) == Approaches.CompareWorkWithWeb or Approaches(int(approach)) == Approaches.CompareWebWithWork):
-            r = await impl.run(json_body.get("history", []), json_body.get("overrides", {}), json_body.get("citation_lookup", {}), json_body.get("thought_chain", {}))
+            r = impl.run(json_body.get("history", []), json_body.get("overrides", {}), json_body.get("citation_lookup", {}), json_body.get("thought_chain", {}))
         else:
-            r = await impl.run(json_body.get("history", []), json_body.get("overrides", {}), {}, json_body.get("thought_chain", {}))
+            r = impl.run(json_body.get("history", []), json_body.get("overrides", {}), {}, json_body.get("thought_chain", {}))

-        response = {
-            "data_points": r["data_points"],
-            "answer": r["answer"],
-            "thoughts": r["thoughts"],
-            "thought_chain": r["thought_chain"],
-            "work_citation_lookup": r["work_citation_lookup"],
-            "web_citation_lookup": r["web_citation_lookup"]
-        }
-
-        return response
+        return StreamingResponse(r, media_type="application/x-ndjson")

     except Exception as ex:
         log.error(f"Error in chat:: {ex}")
@@ -824,17 +815,6 @@ async def stream_agent_response(question: str):
     Raises:
         HTTPException: If an error occurs while processing the question.
     """
-    # try:
-    #     def event_stream():
-    #         data_generator = iter(process_agent_response(question))
-    #         while True:
-    #             try:
-    #                 chunk = next(data_generator)
-    #                 yield chunk
-    #             except StopIteration:
-    #                 yield "data: keep-alive\n\n"
-    #                 time.sleep(5)
-    #     return StreamingResponse(event_stream(), media_type="text/event-stream")
    if question is None:
        raise HTTPException(status_code=400, detail="Question is required")
```
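With this change, `/chat` no longer returns one JSON object; it hands the (now un-awaited) async generator from `impl.run` to a `StreamingResponse` and streams newline-delimited JSON. A hedged sketch of how a client might consume such a stream follows; the URL and request body are illustrative placeholders, and the payload shapes (a metadata object, then `content`/`error` chunks) are taken from the generator changes shown further down.

```python
# Sketch of consuming an application/x-ndjson stream: one JSON document per line.
# URL and request body are illustrative placeholders, not the accelerator's exact contract.
import json
import requests

body = {"history": [{"user": "What is in my data?"}], "overrides": {},
        "approach": 1, "citation_lookup": {}, "thought_chain": {}}

with requests.post("http://localhost:8000/chat", json=body, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():        # each non-empty line is one JSON object
        if not line:
            continue
        event = json.loads(line)
        if "error" in event:              # errors arrive in-band on the stream
            raise RuntimeError(event["error"])
        if "content" in event:            # incremental answer tokens
            print(event["content"] or "", end="", flush=True)
        else:                             # leading metadata: thoughts, citations, ...
            citations = event.get("work_citation_lookup", {})
```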

app/backend/approaches/chatreadretrieveread.py (+116, -85)

```diff
@@ -1,13 +1,16 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

+import json
 import re
 import logging
 import urllib.parse
 from datetime import datetime, timedelta
-from typing import Any, Sequence
+from typing import Any, AsyncGenerator, Coroutine, Sequence

 import openai
+from openai import AzureOpenAI
+from openai import AsyncAzureOpenAI
 from approaches.approach import Approach
 from azure.search.documents import SearchClient
 from azure.search.documents.models import RawVectorQuery
@@ -128,17 +131,28 @@ def __init__(
         openai.api_base = oai_endpoint
         openai.api_type = 'azure'
         openai.api_key = oai_service_key
+        openai.api_version = "2024-02-01"
+
+        self.client = AsyncAzureOpenAI(
+            azure_endpoint = openai.api_base,
+            api_key=openai.api_key,
+            api_version=openai.api_version)
+

         self.model_name = model_name
         self.model_version = model_version

+
+
+
     # def run(self, history: list[dict], overrides: dict) -> any:
     async def run(self, history: Sequence[dict[str, str]], overrides: dict[str, Any], citation_lookup: dict[str, Any], thought_chain: dict[str, Any]) -> Any:

         log = logging.getLogger("uvicorn")
         log.setLevel('DEBUG')
         log.propagate = True

+        chat_completion = None
         use_semantic_captions = True if overrides.get("semantic_captions") else False
         top = overrides.get("top") or 3
         user_persona = overrides.get("user_persona", "")
@@ -170,14 +184,19 @@ async def run(self, history: Sequence[dict[str, str]], overrides: dict[str, Any]
             self.chatgpt_token_limit - len(user_question)
         )

-        chat_completion = await openai.ChatCompletion.acreate(
-            deployment_id=self.chatgpt_deployment,
-            model=self.model_name,
-            messages=messages,
-            temperature=0.0,
-            # max_tokens=32, # setting it too low may cause malformed JSON
-            max_tokens=100,
-            n=1)
+        try:
+            chat_completion= await self.client.chat.completions.create(
+                model=self.chatgpt_deployment,
+                messages=messages,
+                temperature=0.0,
+                # max_tokens=32, # setting it too low may cause malformed JSON
+                max_tokens=100,
+                n=1)
+
+        except Exception as e:
+            log.error(f"Error generating optimized keyword search: {str(e)}")
+            yield json.dumps({"error": f"Error generating optimized keyword search: {str(e)}"}) + "\n"
+            return

         generated_query = chat_completion.choices[0].message.content
```
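The hunk above swaps the legacy module-level call (`openai.ChatCompletion.acreate(deployment_id=...)`) for the openai-python v1 client built in `__init__`. In v1, configuration lives on the client rather than on the module, and the Azure deployment name is passed as `model=`. A minimal sketch of that pattern, with placeholder endpoint, key, and deployment values:

```python
# openai-python v1 pattern used in this commit: config on the client, deployment as model=.
# Endpoint, key, and deployment names below are placeholders.
import asyncio
from openai import AsyncAzureOpenAI

client = AsyncAzureOpenAI(
    azure_endpoint="https://<aoai>.openai.azure.com",
    api_key="<aoai-key>",
    api_version="2024-02-01",
)

async def main() -> None:
    completion = await client.chat.completions.create(
        model="<chat-deployment>",   # v1 takes the Azure deployment name as model=
        messages=[{"role": "user", "content": "Rewrite this as a search query: ..."}],
        temperature=0.0,
        max_tokens=100,
        n=1,
    )
    print(completion.choices[0].message.content)

asyncio.run(main())
```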

```diff
@@ -186,22 +205,33 @@ async def run(self, history: Sequence[dict[str, str]], overrides: dict[str, Any]
             generated_query = history[-1]["user"]

         thought_chain["work_search_term"] = generated_query
+
         # Generate embedding using REST API
         url = f'{self.embedding_service_url}/models/{self.escaped_target_model}/embed'
         data = [f'"{generated_query}"']
+
         headers = {
             'Accept': 'application/json',
             'Content-Type': 'application/json',
         }

-        response = requests.post(url, json=data,headers=headers,timeout=60)
-        if response.status_code == 200:
-            response_data = response.json()
-            embedded_query_vector =response_data.get('data')
-        else:
-            log.error(f"Error generating embedding:: {response.status_code}")
-            raise Exception('Error generating embedding:', response.status_code)
-
+        embedded_query_vector = None
+        try:
+            response = requests.post(url, json=data,headers=headers,timeout=60)
+            if response.status_code == 200:
+                response_data = response.json()
+                embedded_query_vector =response_data.get('data')
+            else:
+                # Generate an error message if the embedding generation fails
+                log.error(f"Error generating embedding:: {response.status_code}")
+                yield json.dumps({"error": "Error generating embedding"}) + "\n"
+                return # Go no further
+        except Exception as e:
+            # Timeout or other error has occurred
+            log.error(f"Error generating embedding: {str(e)}")
+            yield json.dumps({"error": f"Error generating embedding: {str(e)}"}) + "\n"
+            return # Go no further
+
         #vector set up for pure vector search & Hybrid search & Hybrid semantic
         vector = RawVectorQuery(vector=embedded_query_vector, k=top, fields="contentVector")
@@ -325,17 +355,19 @@ async def run(self, history: Sequence[dict[str, str]], overrides: dict[str, Any]
             userPersona=user_persona,
             systemPersona=system_persona,
         )
-        # STEP 3: Generate a contextual and content-specific answer using the search results and chat history.
-        #Added conditional block to use different system messages for different models.
-        if self.model_name.startswith("gpt-35-turbo"):
-            messages = self.get_messages_from_history(
-                system_message,
-                self.model_name,
-                history,
-                history[-1]["user"] + "Sources:\n" + content + "\n\n", # 3.5 has recency Bias that is why this is here
-                self.RESPONSE_PROMPT_FEW_SHOTS,
-                max_tokens=self.chatgpt_token_limit - 500
-            )
+
+        try:
+            # STEP 3: Generate a contextual and content-specific answer using the search results and chat history.
+            #Added conditional block to use different system messages for different models.
+            if self.model_name.startswith("gpt-35-turbo"):
+                messages = self.get_messages_from_history(
+                    system_message,
+                    self.model_name,
+                    history,
+                    history[-1]["user"] + "Sources:\n" + content + "\n\n", # 3.5 has recency Bias that is why this is here
+                    self.RESPONSE_PROMPT_FEW_SHOTS,
+                    max_tokens=self.chatgpt_token_limit - 500
+                )

             #Uncomment to debug token usage.
             #print(messages)
@@ -347,66 +379,65 @@ async def run(self, history: Sequence[dict[str, str]], overrides: dict[str, Any]
             #print("System Message Tokens: ", self.num_tokens_from_string(system_message, "cl100k_base"))
             #print("Few Shot Tokens: ", self.num_tokens_from_string(self.response_prompt_few_shots[0]['content'], "cl100k_base"))
             #print("Message Tokens: ", self.num_tokens_from_string(message_string, "cl100k_base"))
-            chat_completion = await openai.ChatCompletion.acreate(
-                deployment_id=self.chatgpt_deployment,
-                model=self.model_name,
-                messages=messages,
-                temperature=float(overrides.get("response_temp")) or 0.6,
-                n=1
-            )
-
-        elif self.model_name.startswith("gpt-4"):
-            messages = self.get_messages_from_history(
-                system_message,
-                # "Sources:\n" + content + "\n\n" + system_message,
-                self.model_name,
-                history,
-                # history[-1]["user"],
-                history[-1]["user"] + "Sources:\n" + content + "\n\n", # GPT 4 starts to degrade with long system messages. so moving sources here
-                self.RESPONSE_PROMPT_FEW_SHOTS,
-                max_tokens=self.chatgpt_token_limit
-            )
+            chat_completion= await self.client.chat.completions.create(
+                model=self.chatgpt_deployment,
+                messages=messages,
+                temperature=float(overrides.get("response_temp")) or 0.6,
+                n=1,
+                stream=True
+            )

-            #Uncomment to debug token usage.
-            #print(messages)
-            #message_string = ""
-            #for message in messages:
-            #    # enumerate the messages and add the role and content elements of the dictoinary to the message_string
-            #    message_string += f"{message['role']}: {message['content']}\n"
-            #print("Content Tokens: ", self.num_tokens_from_string("Sources:\n" + content + "\n\n", "cl100k_base"))
-            #print("System Message Tokens: ", self.num_tokens_from_string(system_message, "cl100k_base"))
-            #print("Few Shot Tokens: ", self.num_tokens_from_string(self.response_prompt_few_shots[0]['content'], "cl100k_base"))
-            #print("Message Tokens: ", self.num_tokens_from_string(message_string, "cl100k_base"))
+            elif self.model_name.startswith("gpt-4"):
+                messages = self.get_messages_from_history(
+                    system_message,
+                    # "Sources:\n" + content + "\n\n" + system_message,
+                    self.model_name,
+                    history,
+                    # history[-1]["user"],
+                    history[-1]["user"] + "Sources:\n" + content + "\n\n", # GPT 4 starts to degrade with long system messages. so moving sources here
+                    self.RESPONSE_PROMPT_FEW_SHOTS,
+                    max_tokens=self.chatgpt_token_limit
+                )

-            chat_completion = await openai.ChatCompletion.acreate(
-                deployment_id=self.chatgpt_deployment,
-                model=self.model_name,
-                messages=messages,
-                temperature=float(overrides.get("response_temp")) or 0.6,
-                max_tokens=1024,
-                n=1
-            )
-        # STEP 4: Format the response
-        msg_to_display = '\n\n'.join([str(message) for message in messages])
-        generated_response=chat_completion.choices[0].message.content
-
-        # # Detect the language of the response
-        response_language = self.detect_language(generated_response)
-        #if response is not in user's language, translate it to user's language
-        if response_language != detectedlanguage:
-            translated_response = self.translate_response(generated_response, detectedlanguage)
-        else:
-            translated_response = generated_response
-        thought_chain["work_response"] = urllib.parse.unquote(translated_response)
+                #Uncomment to debug token usage.
+                #print(messages)
+                #message_string = ""
+                #for message in messages:
+                #    # enumerate the messages and add the role and content elements of the dictoinary to the message_string
+                #    message_string += f"{message['role']}: {message['content']}\n"
+                #print("Content Tokens: ", self.num_tokens_from_string("Sources:\n" + content + "\n\n", "cl100k_base"))
+                #print("System Message Tokens: ", self.num_tokens_from_string(system_message, "cl100k_base"))
+                #print("Few Shot Tokens: ", self.num_tokens_from_string(self.response_prompt_few_shots[0]['content'], "cl100k_base"))
+                #print("Message Tokens: ", self.num_tokens_from_string(message_string, "cl100k_base"))
+
+                chat_completion= await self.client.chat.completions.create(
+                    model=self.chatgpt_deployment,
+                    messages=messages,
+                    temperature=float(overrides.get("response_temp")) or 0.6,
+                    n=1,
+                    stream=True
+
+                )
+            msg_to_display = '\n\n'.join([str(message) for message in messages])

-        return {
-            "data_points": data_points,
-            "answer": f"{urllib.parse.unquote(translated_response)}",
-            "thoughts": f"Searched for:<br>{generated_query}<br><br>Conversations:<br>" + msg_to_display.replace('\n', '<br>'),
-            "thought_chain": thought_chain,
-            "work_citation_lookup": citation_lookup,
-            "web_citation_lookup": {}
-        }
+
+            # Return the data we know
+            yield json.dumps({"data_points": {},
+                              "thoughts": f"Searched for:<br>{generated_query}<br><br>Conversations:<br>" + msg_to_display.replace('\n', '<br>'),
+                              "thought_chain": thought_chain,
+                              "work_citation_lookup": citation_lookup,
+                              "web_citation_lookup": {}}) + "\n"
+
+            # STEP 4: Format the response
+            async for chunk in chat_completion:
+                # Check if there is at least one element and the first element has the key 'delta'
+                if len(chunk.choices) > 0:
+                    yield json.dumps({"content": chunk.choices[0].delta.content}) + "\n"
+        except Exception as e:
+            log.error(f"Error generating chat completion: {str(e)}")
+            yield json.dumps({"error": f"Error generating chat completion: {str(e)}"}) + "\n"
+            return
+

     def detect_language(self, text: str) -> str:
         """ Function to detect the language of the text"""
```
