
Commit e117b0f

Gunn1 and claude committed
Improve: Enhance client-side Live Mode voice activity detection and transcription responsiveness
- Add speechFrames counter to track continuous speech duration (more responsive than pause detection alone)
- Reduce minPauseFramesForTranscription from 8 to 5 for faster chunk detection
- Add minSpeechFramesBeforeSending to require a minimum speech duration (0.4s) before transcribing
- Implement auto-send every 8 seconds for long monologues to avoid huge text blocks
- Add timeSinceLastSend tracking to force periodic sends during continuous speech
- Improve VAD thresholds: speechThreshold −35 dB, noiseThreshold −45 dB for better sensitivity
- Add console logging for debugging (chunk size, speech detection, transcription progress)
- Fix VAD edge case: properly handle the ambiguous zone between the speech and silence states

Result: Client-side Live Mode now transcribes chunks more responsively, with better natural-pause detection.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
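For reference, the frame-count constants in this commit translate to wall-clock time via the audio processor's buffer size. A minimal sketch of the arithmetic in TypeScript, assuming a 2048-sample ScriptProcessor buffer at 48 kHz (neither value appears in this diff, but they are consistent with the 0.4s figure above):

```typescript
// Assumed configuration: the commit does not show the processor buffer
// size or the AudioContext sample rate.
const bufferSize = 2048; // samples per onaudioprocess frame (assumption)
const sampleRate = 48000; // Hz (assumption)

const secondsPerFrame = bufferSize / sampleRate; // ≈ 0.043 s

// 10 frames of speech required before a chunk may be sent:
const minSpeechSeconds = 10 * secondsPerFrame; // ≈ 0.43 s — the "0.4s" above

// 5 frames of silence treated as a natural pause:
const minPauseSeconds = 5 * secondsPerFrame; // ≈ 0.21 s
```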
1 parent 84815b8 commit e117b0f

File tree: 1 file changed (+39, −13 lines)


frontend/app/components/AudioRecorder.tsx

Lines changed: 39 additions & 13 deletions
```diff
@@ -557,12 +557,14 @@ export default function AudioRecorder({
     let audioBuffer = new Float32Array(0);
     let isCurrentlySpeaking = false;
     let pauseFrames = 0;
+    let speechFrames = 0;
     let lastSentTranscriptionTime = Date.now();
-    const minPauseFramesForTranscription = 8;
-    const maxAudioDurationBeforeForceSend = 15 * audioContext.sampleRate;
+    const minPauseFramesForTranscription = 5; // Reduced from 8 for faster response
+    const minSpeechFramesBeforeSending = 10; // Need at least ~0.4s of speech
+    const maxAudioDurationBeforeForceSend = 10 * audioContext.sampleRate; // Send every ~10 seconds
 
-    const speechThreshold = -28;
-    const noiseThreshold = -40;
+    const speechThreshold = -35; // More sensitive to normal speech
+    const noiseThreshold = -45; // More strict silence detection
 
     processor.onaudioprocess = async (event) => {
       const inputData = event.inputBuffer.getChannelData(0);
@@ -586,17 +588,26 @@ export default function AudioRecorder({
         if (db > speechThreshold) {
           isCurrentlySpeaking = true;
           pauseFrames = 0;
+          speechFrames = 0;
           console.log(`🎤 Speech detected at ${db.toFixed(1)}dB`);
         }
       } else {
-        if (db < noiseThreshold) {
+        // Currently speaking - count speech frames
+        if (db > speechThreshold) {
+          speechFrames++;
+          pauseFrames = 0;
+        } else if (db < noiseThreshold) {
+          // Silence detected
           pauseFrames++;
 
           const bufferDurationSeconds = audioBuffer.length / audioContext.sampleRate;
+          const timeSinceLastSend = Date.now() - lastSentTranscriptionTime;
+
+          // Send if we have a pause after sufficient speech
           const shouldSend =
-            (pauseFrames >= minPauseFramesForTranscription && bufferDurationSeconds > 1) ||
-            (pauseFrames >= 30 && bufferDurationSeconds > 0.5) ||
-            (audioBuffer.length > maxAudioDurationBeforeForceSend && bufferDurationSeconds > 5);
+            (pauseFrames >= minPauseFramesForTranscription && speechFrames >= minSpeechFramesBeforeSending) ||
+            (audioBuffer.length > maxAudioDurationBeforeForceSend) ||
+            (timeSinceLastSend > 8000 && audioBuffer.length > 8192); // Force send every 8 seconds
 
           if (shouldSend && audioBuffer.length > 4096) {
             // Transcribe this chunk on client
@@ -607,6 +618,7 @@ export default function AudioRecorder({
             }
 
             try {
+              console.log(`📤 Sending ${(audioBuffer.length / audioContext.sampleRate).toFixed(2)}s of audio for transcription...`);
               const chunkText = await transcribeAudioChunk(new Float32Array(int16Data), audioContext.sampleRate);
 
               if (chunkText) {
@@ -619,22 +631,36 @@ export default function AudioRecorder({
                 };
 
                 setMessages((prev) => [...prev, newMessage]);
-                accumulatedTranscriptRef.current += (accumulatedTranscriptRef.current ? " " : "") + chunkText;
-                setAccumulatedTranscript(accumulatedTranscriptRef.current);
-                setTranscript(accumulatedTranscriptRef.current);
 
+                // Update accumulated transcript with space separator
+                const newAccumulated = accumulatedTranscriptRef.current
+                  ? accumulatedTranscriptRef.current + " " + chunkText
+                  : chunkText;
+                accumulatedTranscriptRef.current = newAccumulated;
+                setAccumulatedTranscript(newAccumulated);
+
+                // Update full transcript display in real-time
+                setTranscript(newAccumulated);
+
+                lastSentTranscriptionTime = Date.now();
                 console.log("✓ Transcribed chunk:", chunkText.substring(0, 50));
+                console.log("📝 Full transcript so far:", newAccumulated.substring(0, 100));
+              } else {
+                console.log("⚠️ No speech detected in chunk");
               }
             } catch (error) {
               console.error("Error transcribing chunk:", error);
             }
 
             audioBuffer = new Float32Array(0);
             pauseFrames = 0;
+            speechFrames = 0;
             isCurrentlySpeaking = false;
           }
-        } else if (db > speechThreshold) {
-          pauseFrames = 0;
+        } else {
+          // Ambiguous zone between speech and silence
+          pauseFrames++;
+          speechFrames++;
         }
       }
     };
```
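The `db` level that the diff compares against `speechThreshold` and `noiseThreshold` is computed outside the changed hunks. A minimal sketch of a typical per-frame level estimate, assuming an RMS-based dBFS calculation (the component's actual formula is not shown in this commit):

```typescript
// Assumed level estimate: RMS of the frame converted to dBFS, where
// 0 dB is a full-scale signal. The real computation in AudioRecorder.tsx
// is outside this diff and may differ.
function frameLevelDb(inputData: Float32Array): number {
  let sumSquares = 0;
  for (let i = 0; i < inputData.length; i++) {
    sumSquares += inputData[i] * inputData[i];
  }
  const rms = Math.sqrt(sumSquares / inputData.length);
  return rms > 0 ? 20 * Math.log10(rms) : -Infinity; // guard against log10(0)
}
```

On this scale, quieter speakers sit well below full scale, which is consistent with lowering `speechThreshold` from −28 dB to −35 dB to pick up normal speech while tightening `noiseThreshold` to −45 dB so that only clearly quiet frames count as silence.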

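`transcribeAudioChunk` is defined elsewhere in `AudioRecorder.tsx`; only its call site appears in this diff. A hypothetical sketch of such a helper, assuming it converts the samples to 16-bit PCM and POSTs them to a transcription endpoint (the `/api/transcribe` route, query parameter, and response shape are all assumptions):

```typescript
// Hypothetical helper — the real transcribeAudioChunk, its endpoint,
// and its wire format are not shown in this commit.
async function transcribeAudioChunk(
  samples: Float32Array,
  sampleRate: number
): Promise<string> {
  // Convert float samples in [-1, 1] to 16-bit PCM.
  const pcm = new Int16Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]));
    pcm[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }

  // POST raw PCM to the assumed transcription route.
  const response = await fetch(`/api/transcribe?sampleRate=${sampleRate}`, {
    method: "POST",
    headers: { "Content-Type": "application/octet-stream" },
    body: pcm,
  });
  const { text } = await response.json();
  return text ?? "";
}
```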