Skip to content

Commit 839fa1c

Browse files
Add video audio source support to Voice Triggers
Users can now transcribe audio directly from the sample video (or any other video source) instead of only from the microphone. Added an audio source dropdown (Video Audio / Microphone) that defaults to Video Audio when the sample video is playing. Uses the Web Audio API's MediaElementSource to capture audio from the video element. Auto-switches to the microphone when the user selects the live camera. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
1 parent 0e077e6 commit 839fa1c

File tree

2 files changed

+104
-3
lines changed

2 files changed

+104
-3
lines changed

15-voice-triggers/app.js

Lines changed: 96 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ class VoiceControl {
3131
this.transcriptHistory = [];
3232
this.maxHistory = 50;
3333

34+
// Audio source: 'video' or 'microphone'
35+
this.audioSource = options.audioSource || 'video';
36+
this.videoElement = options.videoElement || null;
37+
3438
this._loadRules();
3539
}
3640

@@ -107,6 +111,8 @@ class VoiceControl {
107111

108112
async startAudioCapture() {
109113
try {
114+
// Mute video when using microphone to avoid echo
115+
if (this.videoElement) this.videoElement.muted = true;
110116
this.mediaStream = await navigator.mediaDevices.getUserMedia({
111117
audio: {
112118
channelCount: 1,
@@ -170,6 +176,61 @@ class VoiceControl {
170176
this.audioChunks = [];
171177
}
172178

179+
async startVideoAudioCapture(videoElement) {
180+
try {
181+
// Unmute video so MediaElementSource receives audio
182+
videoElement.muted = false;
183+
videoElement.volume = 0.3;
184+
185+
this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
186+
sampleRate: this.sampleRate
187+
});
188+
189+
// MediaElementSource can only be created once per element
190+
if (!videoElement._mediaElementSource) {
191+
videoElement._mediaElementSource = this.audioContext.createMediaElementSource(videoElement);
192+
}
193+
const source = videoElement._mediaElementSource;
194+
195+
// Must connect to destination so user can still hear the video
196+
source.connect(this.audioContext.destination);
197+
198+
this.analyser = this.audioContext.createAnalyser();
199+
this.analyser.fftSize = 256;
200+
source.connect(this.analyser);
201+
202+
this.processor = this.audioContext.createScriptProcessor(4096, 1, 1);
203+
source.connect(this.processor);
204+
this.processor.connect(this.audioContext.destination);
205+
206+
this.audioChunks = [];
207+
208+
this.processor.onaudioprocess = (event) => {
209+
if (!this.isRunning) return;
210+
const inputData = event.inputBuffer.getChannelData(0);
211+
this.audioChunks.push(new Float32Array(inputData));
212+
};
213+
214+
this._monitorAudioLevel();
215+
216+
console.log('[VoiceControl] Video audio capture started');
217+
return true;
218+
} catch (error) {
219+
console.error('[VoiceControl] Video audio capture failed:', error);
220+
this.onStatusUpdate('Video audio capture failed: ' + error.message);
221+
return false;
222+
}
223+
}
224+
225+
setAudioSource(source, videoElement) {
226+
this.audioSource = source;
227+
if (videoElement) this.videoElement = videoElement;
228+
if (this.isRunning) {
229+
this.stop();
230+
this.start();
231+
}
232+
}
233+
173234
_monitorAudioLevel() {
174235
if (!this.analyser || !this.isRunning) return;
175236

@@ -340,9 +401,13 @@ class VoiceControl {
340401
if (!loaded) return false;
341402
}
342403

343-
const capturing = await this.startAudioCapture();
404+
let capturing;
405+
if (this.audioSource === 'video' && this.videoElement) {
406+
capturing = await this.startVideoAudioCapture(this.videoElement);
407+
} else {
408+
capturing = await this.startAudioCapture();
409+
}
344410
if (!capturing) return false;
345-
346411
this.isRunning = true;
347412

348413
this.processingInterval = setInterval(() => {
@@ -415,6 +480,8 @@ class VoiceTriggersApp {
415480
this.voiceControl = new VoiceControl({
416481
chunkDuration: parseInt(document.getElementById('chunkDuration').value) * 1000,
417482
cooldown: parseInt(document.getElementById('cooldown').value) * 1000,
483+
audioSource: 'video',
484+
videoElement: document.getElementById('video'),
418485

419486
onTranscript: (text, entry) => this.handleTranscript(text, entry),
420487
onRuleTriggered: (info) => this.handleRuleTrigger(info),
@@ -438,6 +505,16 @@ class VoiceTriggersApp {
438505
cameraSelect.disabled = source === 'sample';
439506
if (refreshBtn) refreshBtn.disabled = source === 'sample';
440507
this.reasoningConsole.logInfo(`Switched to ${source === 'camera' ? 'live camera' : 'sample video'}`);
508+
509+
// Auto-switch audio source based on video source
510+
const audioSourceSelect = document.getElementById('audioSourceSelect');
511+
if (source === 'sample') {
512+
if (audioSourceSelect) audioSourceSelect.value = 'video';
513+
this.voiceControl.setAudioSource('video', videoElement);
514+
} else {
515+
if (audioSourceSelect) audioSourceSelect.value = 'microphone';
516+
this.voiceControl.setAudioSource('microphone', videoElement);
517+
}
441518
}
442519
});
443520
VideoSourceAdapter.switchToSample().catch(() => {
@@ -449,6 +526,23 @@ class VoiceTriggersApp {
449526
initEventListeners() {
450527
document.getElementById('startBtn').addEventListener('click', () => this.toggleVoice());
451528

529+
// Audio source toggle
530+
const audioSourceSelect = document.getElementById('audioSourceSelect');
531+
if (audioSourceSelect) {
532+
audioSourceSelect.addEventListener('change', (e) => {
533+
const source = e.target.value;
534+
const video = document.getElementById('video');
535+
if (source === 'video') {
536+
video.muted = false;
537+
video.volume = 0.3;
538+
} else {
539+
video.muted = true;
540+
}
541+
this.voiceControl.setAudioSource(source, video);
542+
this.reasoningConsole.logInfo(`Audio source: ${source === 'video' ? 'Video Audio' : 'Microphone'}`);
543+
});
544+
}
545+
452546
document.getElementById('chunkDuration').addEventListener('input', (e) => {
453547
const value = e.target.value;
454548
document.getElementById('chunkDurationValue').textContent = `${value}s`;

15-voice-triggers/index.html

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -698,7 +698,14 @@ <h4>No API Key Required for Speech Recognition</h4>
698698

699699
<!-- Audio Visualizer -->
700700
<div class="audio-section">
701-
<h3>&#x1F3A4; Microphone Input</h3>
701+
<h3>🎤 Audio Input</h3>
702+
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;">
703+
<label for="audioSourceSelect" style="font-size: 0.85rem; font-weight: 600; color: var(--text-secondary, #aaa);">Source:</label>
704+
<select id="audioSourceSelect" style="padding: 4px 8px; border-radius: 4px; border: 1px solid var(--border, #444); background: var(--bg-secondary, #2a2a2a); color: var(--text-primary, #fff); font-size: 0.85rem;">
705+
<option value="video" selected>📹 Video Audio</option>
706+
<option value="microphone">🎤 Microphone</option>
707+
</select>
708+
</div>
702709
<div class="audio-visualizer" id="audioVisualizer">
703710
<!-- Bars will be generated by JS -->
704711
</div>

0 commit comments

Comments
 (0)