diff --git a/.gitattributes b/.gitattributes index b9bbf1dd4..a3f951687 100644 --- a/.gitattributes +++ b/.gitattributes @@ -69,6 +69,7 @@ proguard-rules.pro text *.xml text *.yaml text *.yml text +*.ps1 text # Bash only likes Unix line endings diff --git a/scenarios/python/console/speech_rewrite_sample/.vscode/tasks.json b/scenarios/python/console/speech_rewrite_sample/.vscode/tasks.json new file mode 100644 index 000000000..6ed192775 --- /dev/null +++ b/scenarios/python/console/speech_rewrite_sample/.vscode/tasks.json @@ -0,0 +1,52 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Configuration and Setup", + "type": "shell", + "command": "/bin/bash", + "args": [ + "-c", + "chmod u+x ${workspaceFolder}/app_manager.sh && ${workspaceFolder}/app_manager.sh configure" + ], + "group": { + "kind": "build", + "isDefault": false + }, + "problemMatcher": [], + "windows": { + "command": "powershell", + "args": [ + "-ExecutionPolicy", + "Bypass", + "-File", + "${workspaceFolder}/app_manager.ps1", + "configure" + ] + } + }, + { + "label": "Run the App", + "type": "shell", + "command": "${workspaceFolder}/app_manager.sh", + "args": [ + "run" + ], + "group": { + "kind": "none", + "isDefault": false + }, + "problemMatcher": [], + "windows": { + "command": "powershell", + "args": [ + "-ExecutionPolicy", + "Bypass", + "-File", + "${workspaceFolder}/app_manager.ps1", + "run" + ] + } + } + ] +} \ No newline at end of file diff --git a/scenarios/python/console/speech_rewrite_sample/README.md b/scenarios/python/console/speech_rewrite_sample/README.md new file mode 100644 index 000000000..5454ccd3c --- /dev/null +++ b/scenarios/python/console/speech_rewrite_sample/README.md @@ -0,0 +1,39 @@ +# Scenario: Continuous Speech Recognition and Rewriting via Azure OpenAI +This project integrates Azure Cognitive Services Speech SDK with Azure OpenAI Service to perform real-time speech recognition and refine the recognized text for improved grammar and readability. + +# Features +1. 
Real-time speech-to-text transcription using Azure Cognitive Services Speech SDK. +2. Automatic refinement of recognized text using Azure OpenAI Service. +3. Grammar correction, minor rewrites for improved readability, and spelling fixes for predefined phrases. + +## Run the Sample within VS Code +1. Install the "Azure AI Speech Toolkit" extension in VS Code. +2. Download this sample from the sample gallery to your local machine. +3. Trigger the "Azure AI Speech Toolkit: Configure Azure Speech Resources" command from the command palette to select an **Azure AI Service** resource. +4. Trigger the "Azure AI Speech Toolkit: Configure and Setup the Sample App" command from the command palette to configure and set up the sample. This command only needs to be run once. +5. Trigger the "Azure AI Speech Toolkit: Run the Sample App" command from the command palette to run the sample. + +## Prerequisites +- Install [Python 3.7 or later](https://www.python.org/downloads/). + +## Environment Setup +- Azure AI Speech Toolkit will automatically help you set these environment variables. If you want to run outside of VS Code, you can manually set the following environment variables. + + - `SPEECH_REGION`: Azure region for the Speech Service (e.g., `eastus`). + - `SPEECH_KEY`: Azure Cognitive Services Speech API key. + - `AZURE_OPENAI_ENDPOINT`: Endpoint for Azure OpenAI Service (e.g., `https://<your-resource-name>.openai.azure.com`). + - `AZURE_OPENAI_API_KEY`: API key for Azure OpenAI Service. + +When running the sample app, you can set the `--relevant_phrases` parameter. + - `--relevant_phrases`: (Optional) Default: Azure Cognitive Services, non-profit organization, speech recognition, OpenAI API + +---- + +## Example Output +Speak into the microphone. The sample application will print both the recognition result and the rewritten version. +For instance, if you speak "how ar you" into the microphone, the output will be: + +``` +RAW RECO: how ar you +REWRITE: How are you? 
+```
diff --git a/scenarios/python/console/speech_rewrite_sample/app.py b/scenarios/python/console/speech_rewrite_sample/app.py
new file mode 100644
index 000000000..dfbe6f4a4
--- /dev/null
+++ b/scenarios/python/console/speech_rewrite_sample/app.py
@@ -0,0 +1,121 @@
+import argparse
+import os
+import sys
+
+import azure.cognitiveservices.speech as speechsdk
+from openai import AzureOpenAI, OpenAIError
+
+# Environment variables that must be set before the app can start
+REQUIRED_ENV_VARS = ("SPEECH_REGION", "SPEECH_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY")
+
+# Parse command-line arguments first so that --help works without credentials
+parser = argparse.ArgumentParser(description="Run app.py with custom parameters.")
+parser.add_argument(
+    "--relevant_phrases",
+    type=str,
+    default="Azure Cognitive Services, non-profit organization, speech recognition, OpenAI API",
+    help="Comma-separated relevant phrases for text rewriting."
+)
+args = parser.parse_args()
+
+# A list of phrases relevant to the context, used to ensure their correct spelling and formatting.
+# Users can customize these phrases based on their specific use case or domain.
+relevant_phrases = args.relevant_phrases
+
+# Fail fast with a readable message instead of an SDK error when a credential is missing
+missing_vars = [name for name in REQUIRED_ENV_VARS if not os.environ.get(name)]
+if missing_vars:
+    sys.exit("Missing required environment variables: " + ", ".join(missing_vars))
+
+# Initialize speech recognition engine
+service_region = os.environ['SPEECH_REGION']
+speech_key = os.environ['SPEECH_KEY']
+speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
+speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, language="en-us")
+
+# Initialize Azure OpenAI client
+client = AzureOpenAI(
+    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
+    api_key=os.environ['AZURE_OPENAI_API_KEY'],
+    api_version="2024-10-21"
+)
+
+
+def rewrite_content(input_reco):
+    """
+    Refines the user's input sentence by fixing grammar issues, making it more readable,
+    and ensuring spelling correctness for specific phrases.
+
+    Args:
+        input_reco (str): The raw input sentence to rewrite.
+
+    Returns:
+        str: The refined sentence, or the raw input if the service returns no text.
+
+    Raises:
+        OpenAIError: If the Azure OpenAI request fails.
+    """
+    my_messages = [
+        {
+            "role": "system",
+            "content": (
+                "You are a helpful assistant to help the user rewrite sentences. "
+                "Please fix the grammar errors in the user-provided sentence and make it more readable. "
+                "You can do minor rewriting but MUST NOT change the sentence's meaning. "
+                "DO NOT make up new content. DO NOT answer questions. "
+                "Here are phrases relevant to the sentences: '{}'. "
+                "If they appear in the sentence and are misspelled, please fix them. "
+                "Example corrections:\n"
+                "User: how ar you\nYour response: How are you?\n\n"
+                "User: what yur name?\nYour response: What's your name?\n\n"
+            ).format(relevant_phrases)
+        },
+        {"role": "user", "content": input_reco}
+    ]
+
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=my_messages
+    )
+
+    # The SDK types the message content as optional; keep the raw text when it is empty
+    return response.choices[0].message.content or input_reco
+
+
+def recognized_cb(evt: speechsdk.SpeechRecognitionEventArgs):
+    """
+    Callback function triggered when speech is recognized.
+
+    Args:
+        evt (SpeechRecognitionEventArgs): The event argument containing recognized text.
+    """
+    current_sentence = evt.result.text
+    if not current_sentence:
+        return
+
+    print("RAW RECO:", current_sentence)
+    try:
+        print("REWRITE:", rewrite_content(current_sentence))
+    except OpenAIError as err:
+        # Keep recognizing even if a single rewrite request fails
+        print("REWRITE FAILED:", err)
+
+
+# Connect the speech recognizer to the callback
+speech_recognizer.recognized.connect(recognized_cb)
+result_future = speech_recognizer.start_continuous_recognition_async()
+result_future.get()  # Ensure engine initialization is complete
+
+print('Continuous Recognition is now running. Say something.')
+try:
+    while True:
+        print('Type "stop" then press Enter to stop recognition.')
+        if input().strip().lower() == "stop":
+            break
+except (EOFError, KeyboardInterrupt):
+    # Treat a closed stdin or Ctrl+C like an explicit "stop"
+    pass
+finally:
+    print('Stopping async recognition...')
+    # Wait for the stop to finish so the process does not exit mid-shutdown
+    speech_recognizer.stop_continuous_recognition_async().get()
diff --git a/scenarios/python/console/speech_rewrite_sample/app_manager.ps1 b/scenarios/python/console/speech_rewrite_sample/app_manager.ps1
new file mode 100644
index 000000000..4159392f2
--- /dev/null
+++ b/scenarios/python/console/speech_rewrite_sample/app_manager.ps1
@@ -0,0 +1,86 @@
+param(
+    [string]$action
+)
+
+function Test-PythonInstalled {
+    return Get-Command python -ErrorAction SilentlyContinue
+}
+
+function Test-PipInstalled {
+    return Get-Command pip -ErrorAction SilentlyContinue
+}
+
+if ($action -eq "configure") {
+    if (-not (Test-PythonInstalled)) {
+        Write-Host "Python is not installed. Please install Python to proceed." -ForegroundColor Red
+        exit 1
+    }
+
+    if (-not (Test-PipInstalled)) {
+        Write-Host "pip is not installed. Please install pip to proceed." -ForegroundColor Red
+        exit 1
+    }
+
+    Write-Host "Installing requirements packages..."
+    pip install -r requirements.txt
+
+    # pip is a native command: a failure sets $LASTEXITCODE instead of throwing,
+    # so it has to be checked explicitly rather than with try/catch
+    if ($LASTEXITCODE -ne 0) {
+        Write-Host "Requirements packages installation failed. Please check your pip installation." -ForegroundColor Red
+        exit 1
+    }
+    Write-Host "Requirements packages installation succeeded." -ForegroundColor Green
+}
+elseif ($action -eq "run") {
+    # Define the path to your .env file
+    $envFilePath = ".env/.env.dev"
+
+    if (Test-Path $envFilePath) {
+        # Read each line of the file and process it
+        Get-Content -Path $envFilePath | ForEach-Object {
+            # Ignore empty lines, comments (lines that start with `#`) and lines without `=`
+            if ($_ -and $_ -notmatch '^\s*#' -and $_ -match '=') {
+                # Split each line into key and value
+                $parts = $_ -split '=', 2
+                $key = $parts[0].Trim()
+                $value = $parts[1].Trim()
+
+                # Set the environment variable (entries with an empty key are skipped)
+                if ($key) {
+                    [System.Environment]::SetEnvironmentVariable($key, $value)
+                }
+            }
+        }
+
+        # Map the loaded values to the variable names app.py reads. This runs once after the
+        # whole file is loaded and leaves manually set variables alone when a value is absent.
+        if ($env:SPEECH_RESOURCE_KEY) {
+            [System.Environment]::SetEnvironmentVariable("SPEECH_KEY", $env:SPEECH_RESOURCE_KEY)
+            [System.Environment]::SetEnvironmentVariable("AZURE_OPENAI_API_KEY", $env:SPEECH_RESOURCE_KEY)
+        }
+        if ($env:SERVICE_REGION) {
+            [System.Environment]::SetEnvironmentVariable("SPEECH_REGION", $env:SERVICE_REGION)
+        }
+        if ($env:CUSTOM_SUBDOMAIN_NAME) {
+            [System.Environment]::SetEnvironmentVariable("AZURE_OPENAI_ENDPOINT", "https://${env:CUSTOM_SUBDOMAIN_NAME}.openai.azure.com/")
+        }
+
+        Write-Host "Environment variables loaded from $envFilePath"
+    }
+    else {
+        Write-Host "File not found: $envFilePath. You can create one to set environment variables or manually set secrets in environment variables."
+    }
+
+    $relevantPhrases = Read-Host "Enter relevant phrases (or press Enter to use defaults)"
+    if ([string]::IsNullOrEmpty($relevantPhrases)) {
+        $relevantPhrases = "Azure Cognitive Services, non-profit organization, speech recognition, OpenAI API"
+    }
+    Write-Host "Running app.py with relevant phrases: $relevantPhrases"
+    python app.py --relevant_phrases "$relevantPhrases"
+}
+else {
+    Write-Host "Invalid action: $action" -ForegroundColor Red
+    Write-Host "Usage: -action configure or -action run"
+    exit 1
+}
\ No newline at end of file
diff --git a/scenarios/python/console/speech_rewrite_sample/app_manager.sh b/scenarios/python/console/speech_rewrite_sample/app_manager.sh
new file mode 100755
index 000000000..6b6f2bdb4
--- /dev/null
+++ b/scenarios/python/console/speech_rewrite_sample/app_manager.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+action=$1
+
+# Prefer `python`, but fall back to `python3`: many Linux distributions ship only the latter
+if command -v python >/dev/null 2>&1; then
+    PYTHON_BIN=python
+else
+    PYTHON_BIN=python3
+fi
+
+function check_python_installed() {
+    command -v "$PYTHON_BIN" >/dev/null 2>&1
+}
+
+function check_pip_installed() {
+    "$PYTHON_BIN" -m pip --version >/dev/null 2>&1
+}
+
+if [ "$action" == "configure" ]; then
+    # The system packages below are installed with apt-get, so skip them where it is unavailable
+    if command -v apt-get >/dev/null 2>&1; then
+        echo "Installing Linux platform required dependencies..."
+        sudo apt-get update
+        sudo apt-get install -y build-essential libssl-dev libasound2 wget
+    else
+        echo "apt-get not found. Skipping installation of Linux platform required dependencies."
+    fi
+
+    if ! check_python_installed; then
+        echo -e "\e[31mPython is not installed. Please install Python to proceed.\e[0m"
+        exit 1
+    fi
+
+    if ! check_pip_installed; then
+        echo -e "\e[31mpip is not installed. Please install pip to proceed.\e[0m"
+        exit 1
+    fi
+
+    echo "Installing requirements packages..."
+    if ! "$PYTHON_BIN" -m pip install -r requirements.txt; then
+        echo -e "\e[31mRequirements packages installation failed. Please check your pip installation.\e[0m"
+        exit 1
+    fi
+elif [ "$action" == "run" ]; then
+
+    # Load environment variables from .env file
+    ENV_FILE=".env/.env.dev"
+    if [ -f "$ENV_FILE" ]; then
+        source "$ENV_FILE"
+
+        # Export the names app.py reads so they are visible to the Python process.
+        # Values missing from the file leave manually set variables untouched.
+        if [ -n "$SPEECH_RESOURCE_KEY" ]; then
+            export SPEECH_KEY="$SPEECH_RESOURCE_KEY"
+            export AZURE_OPENAI_API_KEY="$SPEECH_RESOURCE_KEY"
+        fi
+        if [ -n "$SERVICE_REGION" ]; then
+            export SPEECH_REGION="$SERVICE_REGION"
+        fi
+        if [ -n "$CUSTOM_SUBDOMAIN_NAME" ]; then
+            export AZURE_OPENAI_ENDPOINT="https://${CUSTOM_SUBDOMAIN_NAME}.openai.azure.com/"
+        fi
+        echo "Environment variables loaded from $ENV_FILE"
+    else
+        echo "Environment file $ENV_FILE not found. You can create one to set environment variables or manually set secrets in environment variables."
+    fi
+
+    read -r -p "Enter relevant phrases (or press Enter to use defaults): " relevant_phrases
+    if [ -z "$relevant_phrases" ]; then
+        relevant_phrases="Azure Cognitive Services, non-profit organization, speech recognition, OpenAI API"
+    fi
+    echo "Running app.py with relevant phrases: $relevant_phrases"
+    "$PYTHON_BIN" app.py --relevant_phrases "$relevant_phrases"
+else
+    echo -e "\e[31mInvalid action: $action\e[0m"
+    echo "Usage: $0 configure or $0 run"
+    exit 1
+fi
\ No newline at end of file
diff --git a/scenarios/python/console/speech_rewrite_sample/requirements.txt b/scenarios/python/console/speech_rewrite_sample/requirements.txt
new file mode 100644
index 000000000..5fa41212c
--- /dev/null
+++ b/scenarios/python/console/speech_rewrite_sample/requirements.txt
@@ -0,0 +1,2 @@
+azure-cognitiveservices-speech
+openai>=1.0.0
\ No newline at end of file