Skip to content
Merged
8 changes: 8 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"@eslint/js": "^9.17.0",
"@tailwindcss/postcss": "^4.1.11",
"@tailwindcss/vite": "^4.1.11",
"@types/dom-speech-recognition": "^0.0.6",
"@types/node": "^22.13.1",
"@types/react": "^18.3.18",
"@types/react-dom": "^18.3.5",
Expand Down
46 changes: 45 additions & 1 deletion src/components/ChatInput.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import { memo, useCallback, useEffect, useMemo } from 'react';
import toast from 'react-hot-toast';
import { useTranslation } from 'react-i18next';
import { LuArrowUp, LuPaperclip, LuSquare } from 'react-icons/lu';
import {
LuArrowUp,
LuCircleStop,
LuMic,
LuPaperclip,
LuSquare,
} from 'react-icons/lu';
import { TbAdjustmentsHorizontal } from 'react-icons/tb';
import { useNavigate } from 'react-router';
import { useChatContext } from '../context/chat';
Expand All @@ -10,6 +16,10 @@ import { useFileUpload } from '../hooks/useFileUpload';
import { MessageExtra } from '../types';
import { classNames, cleanCurrentUrl } from '../utils';
import { DropzoneArea } from './DropzoneArea';
import SpeechToText, {
IS_SPEECH_RECOGNITION_SUPPORTED,
SpeechRecordCallback,
} from './SpeechToText';

/**
* If the current URL contains "?m=...", prefill the message input with the value.
Expand Down Expand Up @@ -50,6 +60,11 @@ export const ChatInput = memo(
stopGenerating(convId);
}, [convId, stopGenerating]);

const handleRecord: SpeechRecordCallback = useCallback(
(text: string) => textarea.setValue(text),
[textarea]
);

const sendNewMessage = async () => {
const lastInpMsg = textarea.value();
if (lastInpMsg.trim().length === 0) {
Expand Down Expand Up @@ -145,6 +160,35 @@ export const ChatInput = memo(
</div>

<div className="flex items-center">
{IS_SPEECH_RECOGNITION_SUPPORTED && !isPending && (
<SpeechToText onRecord={handleRecord}>
{({ isRecording, startRecording, stopRecording }) => (
<>
{!isRecording && (
<button
className="btn btn-ghost w-8 h-8 p-0 rounded-full mr-2"
onClick={startRecording}
title="Record"
aria-label="Start Recording"
>
<LuMic className="h-5 w-5" />
</button>
)}
{isRecording && (
<button
className="btn btn-ghost w-8 h-8 p-0 rounded-full mr-2"
onClick={stopRecording}
title="Stop"
aria-label="Stop Recording"
>
<LuCircleStop className="h-5 w-5" />
</button>
)}
</>
)}
</SpeechToText>
)}

{isPending && (
<button
className="btn btn-neutral w-8 h-8 p-0 rounded-full"
Expand Down
167 changes: 167 additions & 0 deletions src/components/SpeechToText.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import {
forwardRef,
Fragment,
ReactNode,
useCallback,
useEffect,
useImperativeHandle,
useRef,
useState,
} from 'react';

/**
 * Callback invoked with the best-effort transcript each time the
 * recognizer produces a result (interim or final).
 */
export type SpeechRecordCallback = (text: string) => void;

// Resolve the browser implementation (standard or webkit-prefixed).
// Guarded with a `typeof window` check so this module is safe to
// import during SSR, where `window` is undefined.
const SpeechRecognition =
  typeof window === 'undefined'
    ? undefined
    : window.SpeechRecognition || window.webkitSpeechRecognition;

/** True when the current environment exposes a SpeechRecognition implementation. */
export const IS_SPEECH_RECOGNITION_SUPPORTED = !!SpeechRecognition;

/** Configuration accepted by the speech-to-text hook and component. */
interface SpeechToTextProps {
  // BCP 47 language tag; falls back to the browser locale when omitted.
  lang?: string;
  // Keep listening across pauses instead of stopping after one phrase.
  continuous?: boolean;
  // Emit partial (not-yet-final) results while the user is speaking.
  interimResults?: boolean;
  // Receives the accumulated transcript on every recognition result.
  onRecord?: SpeechRecordCallback;
}

/** State and controls exposed to consumers of the hook / render prop. */
interface SpeechToTextState {
  isRecording: boolean;
  transcript: string;
  startRecording: () => void;
  stopRecording: () => void;
}

/**
 * Manages a browser SpeechRecognition session.
 *
 * Accumulates finalized segments in a ref and layers the current interim
 * result on top, reporting the combined transcript through `onRecord`.
 * With `continuous`, the session is transparently restarted when the
 * browser ends it (e.g. after a silence timeout) unless the user stopped
 * it or a fatal permission error occurred.
 */
const useSpeechToText = ({
  lang,
  continuous = true,
  interimResults = true,
  onRecord,
}: SpeechToTextProps): SpeechToTextState => {
  const [isRecording, setIsRecording] = useState<boolean>(false);
  const [transcript, setTranscript] = useState<string>('');
  const recognitionRef = useRef<SpeechRecognition | null>(null);
  // Distinguishes a user-initiated stop from the browser ending the
  // session on its own, so `onend` knows whether to auto-restart.
  const stoppedManuallyRef = useRef<boolean>(false);
  // Latest callback kept in a ref so changing it does not tear down and
  // rebuild the recognition instance.
  const onRecordRef = useRef<SpeechRecordCallback | undefined>(onRecord);
  // Text already finalized by the recognizer; interim results are
  // appended on top of this when composing the full transcript.
  const finalTranscriptRef = useRef<string>('');

  useEffect(() => {
    onRecordRef.current = onRecord;
  }, [onRecord]);

  useEffect(() => {
    if (!IS_SPEECH_RECOGNITION_SUPPORTED) {
      console.error('Speech Recognition is not supported in this browser.');
      return;
    }

    const recognition = new SpeechRecognition!();
    recognition.continuous = continuous;
    recognition.interimResults = interimResults;
    recognition.lang =
      lang || navigator.languages?.[0] || navigator.language || 'en-US';

    recognition.onstart = () => {
      setIsRecording(true);
    };
    recognition.onresult = (event: SpeechRecognitionEvent) => {
      if (!event?.results) return;

      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i];
        const { isFinal, length } = result;
        if (length <= 0) continue;
        const { transcript, confidence } = result[0];
        // Join only non-empty parts: before anything is finalized the
        // accumulator is '', and joining against it would prefix the
        // transcript with a stray leading space.
        const fullTranscript = [finalTranscriptRef.current, transcript]
          .filter((part) => part.length > 0)
          .join(' ');
        setTranscript(fullTranscript);
        onRecordRef.current?.(fullTranscript);
        if (isFinal && confidence > 0) {
          // Keep a separator between consecutive finalized segments;
          // bare `+=` would fuse them into one word.
          finalTranscriptRef.current = finalTranscriptRef.current
            ? `${finalTranscriptRef.current} ${transcript}`
            : transcript;
        }
      }
    };
    recognition.onerror = (event: SpeechRecognitionErrorEvent) => {
      console.warn('Speech recognition error:', event);
      // Permission failures are fatal: without this flag, `onend` below
      // would restart recognition and loop start -> error -> end forever.
      if (
        event.error === 'not-allowed' ||
        event.error === 'service-not-allowed'
      ) {
        stoppedManuallyRef.current = true;
      }
      setIsRecording(false);
    };
    recognition.onend = () => {
      setIsRecording(false);
      // Automatically restart if not stopped manually
      if (continuous && !stoppedManuallyRef.current) {
        try {
          recognition.start();
        } catch (error) {
          console.error('Error restarting speech recognition:', error);
        }
      }
    };

    recognitionRef.current = recognition;

    return () => {
      if (!recognitionRef.current) return;

      // Detach handlers first so `stop()` cannot trigger a restart or
      // state updates on an unmounted component.
      recognitionRef.current.onresult = null;
      recognitionRef.current.onend = null;
      recognitionRef.current.onerror = null;
      recognitionRef.current.onstart = null;
      recognitionRef.current.stop();
      recognitionRef.current = null;
    };
  }, [lang, continuous, interimResults]);

  const startRecording = useCallback(() => {
    const recognition = recognitionRef.current;
    if (recognition && !isRecording) {
      // Fresh session: clear any transcript from a previous recording.
      setTranscript('');
      finalTranscriptRef.current = '';
      stoppedManuallyRef.current = false;
      try {
        recognition.start();
      } catch (error) {
        // `start()` throws if the session is already active.
        console.error('Failed to start recording:', error);
        setIsRecording(false);
      }
    }
  }, [isRecording]);

  const stopRecording = useCallback(() => {
    const recognition = recognitionRef.current;
    if (recognition && isRecording) {
      stoppedManuallyRef.current = true;
      try {
        recognition.stop();
      } catch (error) {
        console.error('Failed to stop recording:', error);
        setIsRecording(false);
      }
    }
  }, [isRecording]);

  return {
    isRecording,
    transcript,
    startRecording,
    stopRecording,
  };
};

/**
 * Render-prop wrapper around `useSpeechToText`.
 *
 * Exposes the recording state and controls both to the `children`
 * function and, via the forwarded ref, to parent components that need
 * imperative access (e.g. stopping recording from outside).
 */
const SpeechToText = forwardRef<
  SpeechToTextState,
  SpeechToTextProps & { children: (props: SpeechToTextState) => ReactNode }
>((props, ref) => {
  // Everything except `children` is hook configuration.
  const { children, ...options } = props;
  const state = useSpeechToText(options);

  useImperativeHandle(ref, () => state, [state]);

  return <Fragment>{children(state)}</Fragment>;
});

export default SpeechToText;
7 changes: 4 additions & 3 deletions src/components/TextToSpeech.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ const popularLanguages = [
'ar',
];

export const IS_SPEECH_SYNTHESIS_SUPPORTED = !!speechSynthesis || false;
export const IS_SPEECH_SYNTHESIS_SUPPORTED = !!window.speechSynthesis;
export const getSpeechSynthesisVoices = () =>
speechSynthesis
?.getVoices()
Expand Down Expand Up @@ -93,7 +93,7 @@ const useTextToSpeech = ({
utteranceRef.current.onerror = null;
}

const utterance = new SpeechSynthesisUtterance(text);
const utterance = new window.SpeechSynthesisUtterance(text);

utterance.voice = voice;
utterance.pitch = pitch;
Expand All @@ -109,7 +109,8 @@ const useTextToSpeech = ({
setIsPlaying(false);
};

utterance.onerror = () => {
utterance.onerror = (event) => {
console.error('Speech synthesis error: ', event.error);
setIsPlaying(false);
};

Expand Down