Skip to content

Conversation

@sandeepsalwan1
Copy link

@sandeepsalwan1 sandeepsalwan1 commented May 19, 2025

Category: Prompt Guide
Description: Improve the current documentation on best prompting practices.

The reference script below helped me learn better prompting practices. It is a full script for building an app in which blind people can interact with the browser by voice, using browser-use:

import asyncio
import os
import time
import threading
import subprocess
import queue
from langchain_openai import ChatOpenAI
from browser_use import Agent, Browser, BrowserConfig
from dotenv import load_dotenv
import speech_recognition as sr

# Load environment variables via python-dotenv before VoiceBrowser.__init__
# reads OPENAI_API_KEY with os.getenv.
load_dotenv()

"""
simple voice browser that lets users control web browsing by voice
- accepts all voice input as commands
- interruptible speech (say 'next' or 'stop talking')
- cancelable commands ('cancel' during execution)
- simplified outputs with minimal voice feedback
"""

class VoiceBrowser:
	"""Voice-driven front end for a browser-use agent.

	A daemon thread (continuous_listen) records microphone audio, transcribes
	it with recognizer.recognize_google and either handles a control phrase
	(interrupt speech, cancel, exit) or queues the text as a task. The async
	run() loop takes queued tasks one at a time, runs each through an Agent
	and speaks a short summary of the result.

	Shared state between the two sides is held in plain attributes:
	is_processing, should_stop, current_agent, speaking, interrupt_speech.
	"""

	# Phrases that cut off the current utterance; matched as substrings.
	INTERRUPT_PHRASES = ('next', 'stop talking', 'quiet', 'skip')
	# Phrases that cancel the running task; the whole utterance must match.
	CANCEL_PHRASES = ('cancel', 'stop', 'cancel that', 'never mind')
	# Phrases that shut the application down; the whole utterance must match.
	EXIT_PHRASES = ('exit browser', 'quit browser', 'goodbye browser', 'shutdown')

	def __init__(self):
		"""Create the browser, LLM client, recognizer and listener thread.

		Raises:
			ValueError: if OPENAI_API_KEY is not set in the environment.
		"""
		# Core components
		self.browser = Browser(config=BrowserConfig(headless=False))

		# Setup LLM
		openai_api_key = os.getenv('OPENAI_API_KEY')
		if not openai_api_key:
			raise ValueError('OPENAI_API_KEY not found in environment')
		self.llm = ChatOpenAI(model='gpt-4o', api_key=openai_api_key)

		# speech recognition
		self.recognizer = sr.Recognizer()
		self.recognizer.energy_threshold = 1500
		self.recognizer.dynamic_energy_threshold = True

		# Command management
		self.command_queue = queue.Queue()
		self.is_processing = False  # a task is queued or running
		self.should_stop = False  # set to end both loops
		self.current_agent = None  # Agent currently running, if any
		self.speaking = False  # speak() is in progress
		self.interrupt_speech = False  # request to cut speak() short

		# listening thread (started by run()); daemon so it never blocks exit
		self.listen_thread = threading.Thread(target=self.continuous_listen, daemon=True)

	@staticmethod
	def _sentence_end(text):
		"""Return the index of the first sentence-ending '.', or -1.

		A period only counts when it is the last character or is followed by
		whitespace, so decimals ('3.99') and host names ('example.com') are
		skipped instead of cutting the text mid-token.
		"""
		pos = text.find('.')
		while pos != -1:
			if pos + 1 == len(text) or text[pos + 1].isspace():
				return pos
			pos = text.find('.', pos + 1)
		return -1

	@classmethod
	def _shorten(cls, text, limit=100):
		"""Return text cut to its first sentence, or to limit chars plus '...'.

		Text of at most limit characters is returned unchanged.
		"""
		if len(text) <= limit:
			return text
		end = cls._sentence_end(text)
		if 0 < end < limit:
			return text[: end + 1]
		return text[:limit] + '...'

	def _speak_posix(self, speech_text):
		"""Speak through the `say` command, stopping early on interrupt_speech."""
		try:
			proc = subprocess.Popen(['say', speech_text])
		except OSError as e:
			# `say` is missing or not executable: speech is best-effort, and
			# the text has already been printed by speak().
			print(f'speech unavailable: {e}')
			return

		# monitor for interruption
		while proc.poll() is None and not self.interrupt_speech:
			time.sleep(0.1)

		if proc.poll() is None:
			# Stop only the process started here, not every `say` on the machine.
			proc.terminate()
			proc.wait()

	def speak(self, text, short=False):
		"""Print text and speak it, blocking until speech ends or is interrupted.

		Args:
			text: message to print and speak.
			short: when True, text longer than 100 characters is spoken as its
				first sentence only (or its first 100 characters plus '...').
				The full text is still printed.
		"""
		print(f'🔊 {text}')

		# truncate long outputs
		speech_text = self._shorten(text) if short else text

		# enable interruption
		self.speaking = True
		self.interrupt_speech = False
		try:
			# platform-specific speech
			if os.name == 'posix':  # macOS or Linux
				self._speak_posix(speech_text)
			elif os.name == 'nt':  # Windows
				# NOTE: this path does not poll interrupt_speech, so speech
				# cannot be cut short here.
				import win32com.client
				speaker = win32com.client.Dispatch('SAPI.SpVoice')
				speaker.Speak(speech_text)
		finally:
			# Always clear the flag so a failed utterance cannot leave the
			# listener treating every later phrase as a speech interruption.
			self.speaking = False

	def _handle_utterance(self, text):
		"""Act on one recognised, lower-cased utterance.

		Checks are made in priority order: speech interruption, task
		cancellation, exit, and finally "treat it as a new task".
		"""
		# check for interruption commands
		if self.speaking and any(phrase in text for phrase in self.INTERRUPT_PHRASES):
			print('speech interrupted')
			self.interrupt_speech = True
			return

		# check for cancel command
		if text in self.CANCEL_PHRASES and self.is_processing:
			# Read the attribute once: run() resets it to None from another thread.
			agent = self.current_agent
			if agent:
				agent.stop()
				self.speak('cancelling')
				self.is_processing = False
			# With no agent running yet there is nothing to stop; the phrase
			# is dropped rather than queued as a task.
			return

		# exit commands
		if text in self.EXIT_PHRASES:
			self.should_stop = True
			self.speak('shutting down')
			return

		# treat all other input as commands when not already processing
		if not self.is_processing:
			self.is_processing = True
			self.speak(f'running: {text}', short=True)
			self.command_queue.put(text)

	def continuous_listen(self):
		"""Listen for voice commands until should_stop is set.

		Runs in listen_thread. Unrecognised audio is ignored, recognition
		service errors are printed, and any other error is printed followed by
		a one-second pause before listening again.
		"""
		self.speak('voice browser ready')

		while not self.should_stop:
			try:
				# show status
				if self.is_processing:
					print("🔄 processing... (say 'cancel' to stop)")
				else:
					print('🎤 listening...')

				# listen for command
				with sr.Microphone() as source:
					self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
					audio = self.recognizer.listen(source, phrase_time_limit=5)

				try:
					text = self.recognizer.recognize_google(audio).lower()
				except sr.UnknownValueError:
					continue  # speech wasn't understood
				except sr.RequestError as e:
					print(f'recognition error: {e}')
					continue

				print(f'📢 heard: {text}')
				self._handle_utterance(text)

			except Exception as e:
				print(f'listening error: {e}')
				time.sleep(1)

	def summarize_result(self, result_text):
		"""Return a brief version of result_text.

		Empty values and text of at most 200 characters are returned unchanged.
		Longer text is cut to its first sentence when that sentence ends
		between index 10 and 100 (exclusive); otherwise to its first 100
		characters plus '...'.
		"""
		if not result_text or len(result_text) <= 200:
			return result_text

		# extract key information from long results
		end = self._sentence_end(result_text)
		if 10 < end < 100:
			return result_text[: end + 1]
		return result_text[:100] + '...'

	async def _execute(self, command):
		"""Run one queued command through an Agent and speak the outcome."""
		print(f'🚀 processing: {command}')

		try:
			# Built inside the try so a construction failure is spoken as an
			# error instead of ending the run loop with is_processing stuck on.
			agent = Agent(
				task=command,
				llm=self.llm,
				browser=self.browser,
				enable_memory=False,
			)
			self.current_agent = agent

			history = await agent.run()
			final = history.final_result()
			if final:
				# Shorten before adding the prefix: speak(short=True) would
				# treat 'done.' itself as the first sentence and drop the result.
				summary = self._shorten(self.summarize_result(final))
				self.speak(f'done. {summary}')
			else:
				self.speak('done')
		except Exception as e:
			self.speak(f'error: {str(e)[:100]}', short=True)
			print(f'agent error: {str(e)}')
		finally:
			self.current_agent = None

		self.is_processing = False
		self.speak('ready')

	async def run(self):
		"""Start the listener thread and process queued commands until stopped.

		Polls command_queue every 0.5 seconds. The browser is closed on the
		way out, whatever ended the loop.
		"""
		self.listen_thread.start()

		try:
			while not self.should_stop:
				try:
					command = self.command_queue.get_nowait()
				except queue.Empty:
					await asyncio.sleep(0.5)
					continue

				await self._execute(command)
				await asyncio.sleep(0.5)

		except KeyboardInterrupt:
			self.speak('shutting down')
		except Exception as e:
			print(f'error: {str(e)}')
		finally:
			self.should_stop = True
			await self.browser.close()

async def main():
	"""Create a VoiceBrowser and run it until its loop finishes."""
	await VoiceBrowser().run()

# Script entry point: run the async main() to completion when the file is
# executed directly (not when it is imported).
if __name__ == '__main__':
	asyncio.run(main())

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant