Submission: Speech transcription with the web audio API in the browser #3990

Open · wants to merge 11 commits into main
10 changes: 10 additions & 0 deletions aider/gui.py
@@ -9,6 +9,7 @@
from aider import urls
from aider.coders import Coder
from aider.dump import dump # noqa: F401
from aider.gui_speech_to_text import SpeechToText
from aider.io import InputOutput
from aider.main import main as cli_main
from aider.scrape import Scraper, has_playwright
@@ -153,6 +154,7 @@ def do_sidebar(self):

        # self.do_recommended_actions()
        self.do_add_to_chat()
        self.do_speech_to_text()
        self.do_recent_msgs()
        self.do_clear_chat_history()
        # st.container(height=150, border=False)
@@ -211,6 +213,14 @@ def do_add_web_page(self):
        with st.popover("Add a web page to the chat"):
            self.do_web()

    def do_speech_to_text(self):
        # Initialize the speech-to-text component if not already done
        if not hasattr(self, "speech_to_text"):
            self.speech_to_text = SpeechToText()

        # Render the speech-to-text component
        self.speech_to_text.render()

    def do_add_image(self):
        with st.popover("Add image"):
            st.markdown("Hello World 👋")
196 changes: 196 additions & 0 deletions aider/gui_speech_to_text.js
@@ -0,0 +1,196 @@
(function() {
// Generate a unique ID for this component instance
const compId = 'st-speech-to-text-' + Math.random().toString(36).substring(2, 9);

// Find the container element
const container = document.getElementById('speech-to-text-container');
if (!container) {
console.error('Could not find speech-to-text-container');
return;
}

// Style the container
container.style.display = 'flex';
container.style.alignItems = 'center';
container.style.padding = '5px';
container.style.justifyContent = 'space-between';

// Create LED indicator
const led = document.createElement('div');
led.id = 'led-' + compId;
led.style.width = '12px';
led.style.height = '12px';
led.style.borderRadius = '50%';
led.style.backgroundColor = 'gray';
led.style.marginRight = '10px';

// Create button
const button = document.createElement('button');
button.id = 'button-' + compId;
button.textContent = 'Voice Input';
button.style.padding = '4px 8px';

// Create stop button (initially hidden)
const stopButton = document.createElement('button');
stopButton.id = 'stop-button-' + compId;
stopButton.textContent = 'Stop';
stopButton.style.padding = '4px 8px';
stopButton.style.marginLeft = '5px';
stopButton.style.display = 'none';

// Create checkbox and label container
const checkContainer = document.createElement('div');
checkContainer.style.display = 'flex';
checkContainer.style.alignItems = 'center';
checkContainer.style.marginLeft = '10px';

// Create auto-transcribe checkbox
const autoTranscribe = document.createElement('input');
autoTranscribe.id = 'auto-transcribe-' + compId;
autoTranscribe.type = 'checkbox';
autoTranscribe.style.marginRight = '5px';

// Create label for checkbox
const label = document.createElement('label');
label.htmlFor = autoTranscribe.id;
label.textContent = 'Auto Transcribe';
label.style.fontSize = '14px';
label.style.color = 'white';

// Assemble components
checkContainer.appendChild(autoTranscribe);
checkContainer.appendChild(label);

// Add elements to container
container.appendChild(led);
container.appendChild(button);
container.appendChild(stopButton);
container.appendChild(checkContainer);

// Check if browser supports the Web Speech API
if (!('webkitSpeechRecognition' in window) && !('SpeechRecognition' in window)) {
button.disabled = true;
button.textContent = 'Not supported';
return;
}

// Function to populate the chat input
function populateChatInput(text) {
  const parentDoc = window.parent.document;
  const chatInput = parentDoc.querySelector('textarea[data-testid="stChatInputTextArea"]');
  if (!chatInput) {
    console.error('Could not find chat input textarea');
    return false;
  }

  // Find React's internal props key so we can call its onChange handler directly
  const reactProps = Object.keys(chatInput).find(key => key.startsWith('__reactProps$'));
  if (!reactProps) {
    console.error('Could not find React props on the chat input textarea');
    return false;
  }

  // Append to the existing value
  chatInput.value = chatInput.value + ' ' + text;

  // Call React's onChange handler with a synthetic event so the new value is registered
  const syntheticEvent = { target: chatInput, currentTarget: chatInput,
    preventDefault: () => {}, nativeEvent: new Event('input', { bubbles: true }) };
  chatInput[reactProps].onChange(syntheticEvent);
  return true;
}

// Initialize speech recognition
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
const recognition = new SpeechRecognition();
let isListening = false;

recognition.continuous = false;
recognition.interimResults = false;
// Use browser's language or fall back to 'en-US'
recognition.lang = navigator.language || 'en-US';
console.log('Speech recognition language:', recognition.lang);

// Setup button click handler
button.addEventListener('click', function() {
if (isListening) return;

isListening = true;

// Set initial LED color based on auto-transcribe mode
if (autoTranscribe.checked) {
led.style.backgroundColor = 'red'; // Red when waiting for voice
stopButton.style.display = 'inline-block';
recognition.continuous = true;
} else {
led.style.backgroundColor = 'lime';
}

recognition.start();
});

// Setup stop button click handler
stopButton.addEventListener('click', function() {
if (isListening) {
recognition.stop();
stopButton.style.display = 'none';
isListening = false;
}
});

// Handle speech detection
recognition.onspeechstart = function() {
console.log('Speech detected');
if (autoTranscribe.checked) {
led.style.backgroundColor = 'lime'; // Lime green when voice is detected
}
};

// Handle speech end
recognition.onspeechend = function() {
console.log('Speech ended');
if (autoTranscribe.checked && isListening) {
led.style.backgroundColor = 'red'; // Red when waiting for voice
}
};

// Combined event handler function for speech recognition events
function handleSpeechEvent(eventType, event) {
if (eventType === 'result') {
// Get the latest transcript
const resultIndex = event.resultIndex;
const transcript = event.results[resultIndex][0].transcript;

// Try to populate the chat input directly
const success = populateChatInput(transcript);
if (!success)
console.error('populateChatInput failed');

// If not in auto-transcribe mode, reset the LED
if (!autoTranscribe.checked) {
led.style.backgroundColor = 'gray';
}
// In auto-transcribe mode, we'll keep the LED color as is (lime while speaking)
// The LED will be set back to red in the speechend event
}
else if (eventType === 'error') {
console.error('Speech recognition error', event.error);
isListening = false;
stopButton.style.display = 'none';
led.style.backgroundColor = 'gray';
}
else if (eventType === 'end') {
// If auto transcribe is enabled and we're still supposed to be listening,
// restart recognition
if (autoTranscribe.checked && isListening) {
setTimeout(() => recognition.start(), 100);
} else {
isListening = false;
stopButton.style.display = 'none';
led.style.backgroundColor = 'gray';
}
}
}

// Set up event handlers using the combined function
recognition.onresult = function(event) { handleSpeechEvent('result', event); };
recognition.onerror = function(event) { handleSpeechEvent('error', event); };
recognition.onend = function() { handleSpeechEvent('end'); };
})();
37 changes: 37 additions & 0 deletions aider/gui_speech_to_text.py
@@ -0,0 +1,37 @@
import base64
import os

import streamlit as st
import streamlit.components.v1 as components


class SpeechToText:
    """Class to handle speech-to-text functionality in the GUI"""

    def render(self):
        """Render the speech-to-text component with LED indicator"""
        self._js_dir = os.path.dirname(__file__)

        # Create JS file path
        js_path = os.path.join(self._js_dir, "gui_speech_to_text.js")
        if not os.path.exists(js_path):
            st.error(f"JavaScript file not found: {js_path}")
            return

        # Read the JS file for data URL
        with open(js_path, "r") as f:
            js_content = f.read()

        # Create data URL for the JS file
        js_b64 = base64.b64encode(js_content.encode("utf-8")).decode("utf-8")
        js_data_url = f"data:text/javascript;base64,{js_b64}"

        # Create simple HTML component with a container for the JS to populate
        components.html(
            f"""
            <div id="speech-to-text-container"></div>
            <!-- Load JS file via data URL since direct src paths don't work in Streamlit iframe -->
            <script src="{js_data_url}"></script>
            """,
            height=50,
        )