Add Speech + LLM Sample. (#2704)

v-bingyang · rhurey · web-flow · commit d5f0d936fd20 · 2025-01-12T17:50:42.000-08:00
* Add Speech + LLM Sample.

* Update README.md

* Update premium_speech_demo.py

* Update sample name.

* Update readme.

* Update.

* Update README.md

* Add *.ps1 text to .gitattributes

* Fix: Set executable permission for app_manager.sh

---------

Co-authored-by: Ryan Hurey <RHUREY@MICROSOFT.COM>
diff --git a/.gitattributes b/.gitattributes
@@ -69,6 +69,7 @@ proguard-rules.pro text
 *.xml text
 *.yaml text
 *.yml text
+*.ps1 text
 
 
 # Bash only likes Unix line endings
diff --git a/scenarios/python/console/speech_rewrite_sample/.vscode/tasks.json b/scenarios/python/console/speech_rewrite_sample/.vscode/tasks.json
@@ -0,0 +1,52 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Configuration and Setup",
+            "type": "shell",
+            "command": "/bin/bash",
+            "args": [
+                "-c",
+                "chmod u+x ${workspaceFolder}/app_manager.sh && ${workspaceFolder}/app_manager.sh configure"
+            ],
+            "group": {
+                "kind": "build",
+                "isDefault": false
+            },
+            "problemMatcher": [],
+            "windows": {
+                "command": "powershell",
+                "args": [
+                    "-ExecutionPolicy",
+                    "Bypass",
+                    "-File",
+                    "${workspaceFolder}/app_manager.ps1",
+                    "configure"
+                ]
+            }
+        },
+        {
+            "label": "Run the App",
+            "type": "shell",
+            "command": "${workspaceFolder}/app_manager.sh",
+            "args": [
+                "run"
+            ],
+            "group": {
+                "kind": "none",
+                "isDefault": false
+            },
+            "problemMatcher": [],
+            "windows": {
+                "command": "powershell",
+                "args": [
+                    "-ExecutionPolicy",
+                    "Bypass",
+                    "-File",
+                    "${workspaceFolder}/app_manager.ps1",
+                    "run"
+                ]
+            }
+        }
+    ]
+}
diff --git a/scenarios/python/console/speech_rewrite_sample/README.md b/scenarios/python/console/speech_rewrite_sample/README.md
@@ -0,0 +1,39 @@
+# Scenario: Continuous Speech Recognition and Rewriting via Azure OpenAI
+This project integrates Azure Cognitive Services Speech SDK with Azure OpenAI Service to perform real-time speech recognition and refine the recognized text for improved grammar and readability.
+
+# Features
+1. Real-time speech-to-text transcription using Azure Cognitive Services Speech SDK.
+2. Automatic refinement of recognized text using Azure OpenAI Service.
+3. Grammar correction, minor rewrites for improved readability, and spelling fixes for predefined phrases.
+
+## Run the Sample within VS Code
+1. Install "Azure AI Speech Toolkit" extension in VS Code.
+2. Download this sample from the sample gallery to your local machine.
+3. Trigger "Azure AI Speech Toolkit: Configure Azure Speech Resources" command from command palette to select an **Azure AI Service** resource.
+4. Trigger "Azure AI Speech Toolkit: Configure and Setup the Sample App" command from command palette to configure and set up the sample. This command only needs to be run once.
+5. Trigger "Azure AI Speech Toolkit: Run the Sample App" command from command palette to run the sample.
+
+## Prerequisites
+- Install [Python 3.7 or later](https://www.python.org/downloads/).
+
+## Environment Setup
+- Azure AI Speech Toolkit will automatically help you set these environment variables. If you want to run outside of VS Code, you can manually set the following environment variables.
+
+  - `SPEECH_REGION`: Azure region for the Speech Service (e.g., `eastus`).
+  - `SPEECH_KEY`: Azure Cognitive Services Speech API key.
+  - `AZURE_OPENAI_ENDPOINT`: Endpoint for Azure OpenAI Service (e.g., `https://<your-resource-name>.openai.azure.com`).
+  - `AZURE_OPENAI_API_KEY`: API key for Azure OpenAI Service.
+
+When running the sample app, you can optionally pass the `--relevant_phrases` parameter:
+  - `--relevant_phrases`: (Optional) Default: Azure Cognitive Services, non-profit organization, speech recognition, OpenAI API
+
+----
+
+## Example Output
+Speak into the microphone. The sample application will print both the recognition result and the rewritten version.
+For instance, if you speak "how ar you" into the microphone, the output will be:
+
+```
+RAW RECO: how ar you
+REWRITE: How are you?
+```
diff --git a/scenarios/python/console/speech_rewrite_sample/app.py b/scenarios/python/console/speech_rewrite_sample/app.py
@@ -0,0 +1,99 @@
+import os
+import argparse
+import azure.cognitiveservices.speech as speechsdk
+from openai import AzureOpenAI
+
+# Initialize speech recognition engine
+# Region and key are read from the SPEECH_REGION / SPEECH_KEY environment
+# variables; os.environ.get() yields None for any variable that is not set.
+service_region = os.environ.get('SPEECH_REGION')
+speech_key = os.environ.get('SPEECH_KEY')
+speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
+# The recognition language is fixed to "en-us".
+speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, language="en-us")
+
+# Initialize Azure OpenAI client
+# Endpoint and API key are read from AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY.
+client = AzureOpenAI(
+    azure_endpoint=os.environ.get('AZURE_OPENAI_ENDPOINT'), 
+    api_key=os.environ.get('AZURE_OPENAI_API_KEY'), 
+    api_version="2024-10-21"
+)
+
+# Parse command-line arguments
+# The only option is --relevant_phrases, a single comma-separated string.
+parser = argparse.ArgumentParser(description="Run app.py with custom parameters.")
+parser.add_argument(
+    "--relevant_phrases", 
+    type=str, 
+    default="Azure Cognitive Services, non-profit organization, speech recognition, OpenAI API", 
+    help="Comma-separated relevant phrases for text rewriting."
+)
+# Parsing happens at module level, so `args` is a module global that
+# rewrite_content() reads on every call.
+args = parser.parse_args()
+
+# Use user-provided or default relevant_phrases
+relevant_phrases = args.relevant_phrases
+
+def rewrite_content(input_reco):
+    """
+    Refines the user's input sentence by fixing grammar issues, making it more readable,
+    and ensuring spelling correctness for specific phrases.
+
+    Args:
+        input_reco (str): The raw input sentence to rewrite.
+
+    Returns:
+        str: The refined sentence.
+    """
+
+    # A list of phrases relevant to the context, used to ensure their correct spelling and formatting.
+    # Users can customize these phrases based on their specific use case or domain.
+    relevant_phrases = args.relevant_phrases
+    
+    my_messages = [
+        {
+            "role": "system", 
+            "content": (
+                "You are a helpful assistant to help the user rewrite sentences. "
+                "Please fix the grammar errors in the user-provided sentence and make it more readable. "
+                "You can do minor rewriting but MUST NOT change the sentence's meaning. "
+                "DO NOT make up new content. DO NOT answer questions. "
+                "Here are phrases relevant to the sentences: '{}'. "
+                "If they appear in the sentence and are misspelled, please fix them. "
+                "Example corrections:\n"
+                "User: how ar you\nYour response: How are you?\n\n"
+                "User: what yur name?\nYour response: What's your name?\n\n"
+            ).format(relevant_phrases)
+        },
+        {"role": "user", "content": input_reco}
+    ]
+
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=my_messages
+    )
+
+    return response.choices[0].message.content
+
+def recognized_cb(evt: speechsdk.SpeechRecognitionEventArgs):
+    """
+    Callback function triggered when speech is recognized.
+
+    Args:
+        evt (SpeechRecognitionEventArgs): The event argument containing recognized text.
+    """
+    current_sentence = evt.result.text
+    if not current_sentence:
+        return
+
+    print("RAW RECO:", current_sentence)
+    print("REWRITE:", rewrite_content(current_sentence))
+
+# Connect the speech recognizer to the callback
+speech_recognizer.recognized.connect(recognized_cb)
+result_future = speech_recognizer.start_continuous_recognition_async()
+result_future.get()  # Ensure engine initialization is complete
+
+print('Continuous Recognition is now running. Say something.')
+while True:
+    print('Type "stop" then press Enter to stop recognition.')
+    stop = input()
+    if stop.lower() == "stop":
+        print('Stopping async recognition...')
+        speech_recognizer.stop_continuous_recognition_async()
+        break
diff --git a/scenarios/python/console/speech_rewrite_sample/app_manager.ps1 b/scenarios/python/console/speech_rewrite_sample/app_manager.ps1
@@ -0,0 +1,75 @@
+param(
+    [string]$action
+)
+
+function Test-PythonInstalled {
+    # Emits the command info for `python` when it can be resolved and nothing
+    # otherwise, so the result is truthy only when python is available.
+    Get-Command -Name python -ErrorAction SilentlyContinue
+}
+
+function Test-PipInstalled {
+    # Emits the command info for `pip` when it can be resolved and nothing
+    # otherwise, so the result is truthy only when pip is available.
+    Get-Command -Name pip -ErrorAction SilentlyContinue
+}
+
+# Dispatch on the requested action: "configure" installs the Python
+# requirements, "run" loads the environment file and starts app.py.
+if ($action -eq "configure") {
+    if (-not (Test-PythonInstalled)) {
+        Write-Host "Python is not installed. Please install Python to proceed." -ForegroundColor Red
+        exit 1
+    }
+
+    if (-not (Test-PipInstalled)) {
+        Write-Host "pip is not installed. Please install pip to proceed." -ForegroundColor Red
+        exit 1
+    }
+
+    Write-Host "Installing requirements packages..."
+    try {
+        pip install -r requirements.txt
+        # pip is a native executable: a failed install only sets a non-zero
+        # exit code and does not raise an exception, so convert it into one.
+        if ($LASTEXITCODE -ne 0) {
+            throw "pip exited with code $LASTEXITCODE"
+        }
+        Write-Host "Requirements packages installation succeeded." -ForegroundColor Green
+    }
+    catch {
+        Write-Host "Requirements packages installation failed. Please check your pip installation." -ForegroundColor Red
+        exit 1
+    }
+}
+elseif ($action -eq "run") {
+    # Define the path to your .env file
+    $envFilePath = ".env/.env.dev"
+
+    if (Test-Path $envFilePath) {
+        # Read each line of the file and process it
+        Get-Content -Path $envFilePath | ForEach-Object {
+            # Ignore blank lines, comment lines (starting with `#`) and lines
+            # that are not KEY=VALUE pairs.
+            if ($_ -notmatch '^\s*(#|$)' -and $_ -match '=') {
+                # Split each line into key and value (only at the first `=`)
+                $parts = $_ -split '=', 2
+                $key = $parts[0].Trim()
+                $value = $parts[1].Trim()
+
+                # Set the environment variable; an empty key is not a valid name
+                if ($key) {
+                    [System.Environment]::SetEnvironmentVariable($key, $value)
+                }
+            }
+        }
+
+        # Map the loaded values onto the names app.py reads. This is done once,
+        # after the whole file has been processed.
+        [System.Environment]::SetEnvironmentVariable("SPEECH_KEY", $env:SPEECH_RESOURCE_KEY)
+        [System.Environment]::SetEnvironmentVariable("AZURE_OPENAI_API_KEY", $env:SPEECH_RESOURCE_KEY)
+        [System.Environment]::SetEnvironmentVariable("SPEECH_REGION", $env:SERVICE_REGION)
+        [System.Environment]::SetEnvironmentVariable("AZURE_OPENAI_ENDPOINT", "https://$env:CUSTOM_SUBDOMAIN_NAME.openai.azure.com/")
+
+        Write-Host "Environment variables loaded from $envFilePath"
+    }
+    else {
+        Write-Host "File not found: $envFilePath. You can create one to set environment variables or manually set secrets in environment variables."
+    }
+
+    # An empty answer falls back to the same default phrases app.py uses.
+    $relevantPhrases = Read-Host "Enter relevant phrases (or press Enter to use defaults)"
+    if ([string]::IsNullOrEmpty($relevantPhrases)) {
+        $relevantPhrases = "Azure Cognitive Services, non-profit organization, speech recognition, OpenAI API"
+    }
+    Write-Host "Running app.py with relevant phrases: $relevantPhrases"
+    python app.py --relevant_phrases "$relevantPhrases"
+}
+else {
+    Write-Host "Invalid action: $action" -ForegroundColor Red
+    Write-Host "Usage: -action configure or -action run"
+    exit 1
+}
diff --git a/scenarios/python/console/speech_rewrite_sample/app_manager.sh b/scenarios/python/console/speech_rewrite_sample/app_manager.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+action=$1
+
+# Succeeds (exit status 0) only when `python` can be resolved; all output is discarded.
+check_python_installed() {
+    command -v python > /dev/null 2>&1
+}
+
+# Succeeds (exit status 0) only when `pip` can be resolved; all output is discarded.
+check_pip_installed() {
+    command -v pip > /dev/null 2>&1
+}
+
+# Dispatch on the requested action: "configure" installs system and Python
+# dependencies, "run" loads the environment file and starts app.py.
+if [ "$action" == "configure" ]; then
+    # System packages are installed through apt-get, which requires sudo.
+    echo "Installing Linux platform required dependencies..."
+    sudo apt-get update
+    sudo apt-get install -y build-essential libssl-dev libasound2 wget
+
+    if ! check_python_installed; then
+        echo -e "\e[31mPython is not installed. Please install Python to proceed.\e[0m"
+        exit 1
+    fi
+
+    if ! check_pip_installed; then
+        echo -e "\e[31mpip is not installed. Please install pip to proceed.\e[0m"
+        exit 1
+    fi
+
+    echo "Installing requirements packages..."
+    if ! pip install -r requirements.txt; then
+        echo -e "\e[31mRequirements packages installation failed. Please check your pip installation.\e[0m"
+        exit 1
+    fi
+    echo -e "\e[32mRequirements packages installation succeeded.\e[0m"
+elif [ "$action" == "run" ]; then
+
+    # Load environment variables from .env file
+    ENV_FILE=".env/.env.dev"
+    if [ -f "$ENV_FILE" ]; then
+        source "$ENV_FILE"
+
+        # Export the variable names app.py reads, derived from the values
+        # defined in the environment file.
+        export SPEECH_KEY="$SPEECH_RESOURCE_KEY"
+        export AZURE_OPENAI_API_KEY="$SPEECH_RESOURCE_KEY"
+        export SPEECH_REGION="$SERVICE_REGION"
+        export AZURE_OPENAI_ENDPOINT="https://${CUSTOM_SUBDOMAIN_NAME}.openai.azure.com/"
+        echo "Environment variables loaded from $ENV_FILE"
+
+    else
+        echo "Environment file $ENV_FILE not found. You can create one to set environment variables or manually set secrets in environment variables."
+    fi
+
+    # -r keeps backslashes in the typed phrases literal instead of treating
+    # them as escape characters.
+    read -r -p "Enter relevant phrases (or press Enter to use defaults): " relevant_phrases
+    if [ -z "$relevant_phrases" ]; then
+        relevant_phrases="Azure Cognitive Services, non-profit organization, speech recognition, OpenAI API"
+    fi
+    echo "Running app.py with relevant phrases: $relevant_phrases"
+    python app.py --relevant_phrases "$relevant_phrases"
+else
+    echo -e "\e[31mInvalid action: $action\e[0m"
+    echo "Usage: $0 configure or $0 run"
+    exit 1
+fi
diff --git a/scenarios/python/console/speech_rewrite_sample/requirements.txt b/scenarios/python/console/speech_rewrite_sample/requirements.txt
@@ -0,0 +1,2 @@
+azure-cognitiveservices-speech
+openai

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+azure-cognitiveservices-speech`
	`2`	`+openai`