Skip to content

Commit 1120710

Browse files
rkpatel33 and claude committed
Merge streaming mode into context-aware-capitalization
Resolved conflicts: - lib.rs: Added both streaming and context-aware commands - settings.rs: Combined both feature settings - shortcut.rs: Added both setting change handlers - bindings.ts: Combined TypeScript bindings - AdvancedSettings.tsx: Both components imported and used - settingsStore.ts: Both handlers added Removed append_trailing_space (replaced by context-aware logic). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <[email protected]>
2 parents 3da6b3f + 1aaf9b8 commit 1120710

File tree

23 files changed

+1899
-47
lines changed

23 files changed

+1899
-47
lines changed

.claude/docs/2025-12-07-01_streaming-output-architecture.md

Lines changed: 387 additions & 0 deletions
Large diffs are not rendered by default.

src-tauri/src/actions.rs

Lines changed: 106 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use crate::managers::history::HistoryManager;
44
use crate::managers::transcription::TranscriptionManager;
55
use crate::settings::{get_settings, AppSettings};
66
use crate::shortcut;
7+
use crate::streaming::StreamingManager;
78
use crate::tray::{change_tray_icon, TrayIconState};
89
use crate::utils::{self, show_recording_overlay, show_transcribing_overlay};
910
use async_openai::types::{
@@ -219,12 +220,21 @@ impl ShortcutAction for TranscribeAction {
219220
show_recording_overlay(app);
220221

221222
let rm = app.state::<Arc<AudioRecordingManager>>();
223+
let sm = app.state::<Arc<StreamingManager>>();
222224

223225
// Get the microphone mode to determine audio feedback timing
224226
let settings = get_settings(app);
225227
let is_always_on = settings.always_on_microphone;
226228
debug!("Microphone mode - always_on: {}", is_always_on);
227229

230+
// Set up streaming VAD callback BEFORE starting recording
231+
// This avoids recreating the microphone stream after recording starts
232+
let streaming_enabled = sm.is_streaming_enabled();
233+
if streaming_enabled {
234+
sm.setup_vad_callback();
235+
debug!("Streaming VAD callback set up before recording");
236+
}
237+
228238
let mut recording_started = false;
229239
if is_always_on {
230240
// Always-on mode: Play audio feedback immediately, then apply mute after sound finishes
@@ -267,6 +277,12 @@ impl ShortcutAction for TranscribeAction {
267277
if recording_started {
268278
// Dynamically register the cancel shortcut in a separate task to avoid deadlock
269279
shortcut::register_cancel_shortcut(app);
280+
281+
// Start streaming session if enabled (VAD callback already set up above)
282+
if streaming_enabled {
283+
sm.start_controller();
284+
debug!("Streaming session started");
285+
}
270286
}
271287

272288
debug!(
@@ -286,6 +302,10 @@ impl ShortcutAction for TranscribeAction {
286302
let rm = Arc::clone(&app.state::<Arc<AudioRecordingManager>>());
287303
let tm = Arc::clone(&app.state::<Arc<TranscriptionManager>>());
288304
let hm = Arc::clone(&app.state::<Arc<HistoryManager>>());
305+
let sm = Arc::clone(&app.state::<Arc<StreamingManager>>());
306+
307+
// Check if streaming was active
308+
let streaming_was_active = sm.is_session_active();
289309

290310
change_tray_icon(app, TrayIconState::Transcribing);
291311
show_transcribing_overlay(app);
@@ -313,34 +333,62 @@ impl ShortcutAction for TranscribeAction {
313333
samples.len()
314334
);
315335

316-
let transcription_time = Instant::now();
336+
// If streaming was active, get the final text from streaming
337+
// (which does its own final transcription)
338+
let (transcription, skip_paste) = if streaming_was_active {
339+
let samples_for_streaming = samples.clone();
340+
let streaming_text = sm.stop_session(Some(samples_for_streaming));
341+
match streaming_text {
342+
Some(text) if !text.is_empty() => {
343+
debug!("Streaming session final text: '{}'", text);
344+
// Streaming already pasted incrementally, but we may need to
345+
// replace with post-processed version
346+
(Ok(text), true)
347+
}
348+
_ => {
349+
// Streaming didn't produce output, fall back to normal transcription
350+
debug!("Streaming produced no output, falling back to batch transcription");
351+
let transcription_time = Instant::now();
352+
let result = tm.transcribe(samples.clone());
353+
debug!("Batch transcription completed in {:?}", transcription_time.elapsed());
354+
(result, false)
355+
}
356+
}
357+
} else {
358+
// Normal (non-streaming) mode
359+
let transcription_time = Instant::now();
360+
let result = tm.transcribe(samples.clone());
361+
debug!("Transcription completed in {:?}", transcription_time.elapsed());
362+
(result, false)
363+
};
364+
317365
let samples_clone = samples.clone(); // Clone for history saving
318-
match tm.transcribe(samples) {
366+
367+
match transcription {
319368
Ok(transcription) => {
320-
debug!(
321-
"Transcription completed in {:?}: '{}'",
322-
transcription_time.elapsed(),
323-
transcription
324-
);
369+
debug!("Final transcription: '{}'", transcription);
325370
if !transcription.is_empty() {
326371
let settings = get_settings(&ah);
327372
let mut final_text = transcription.clone();
328373
let mut post_processed_text: Option<String> = None;
329374
let mut post_process_prompt: Option<String> = None;
375+
let mut needs_replacement = false;
330376

331377
// First, check if Chinese variant conversion is needed
332378
if let Some(converted_text) =
333379
maybe_convert_chinese_variant(&settings, &transcription).await
334380
{
335381
final_text = converted_text.clone();
336382
post_processed_text = Some(converted_text);
383+
needs_replacement = skip_paste; // Need to replace streaming output
337384
}
338385
// Then apply regular post-processing if enabled
339386
else if let Some(processed_text) =
340387
maybe_post_process_transcription(&settings, &transcription).await
341388
{
342389
final_text = processed_text.clone();
343390
post_processed_text = Some(processed_text);
391+
needs_replacement = skip_paste; // Need to replace streaming output
344392

345393
// Get the prompt that was used
346394
if let Some(prompt_id) = &settings.post_process_selected_prompt_id {
@@ -381,26 +429,55 @@ impl ShortcutAction for TranscribeAction {
381429
}
382430
});
383431

384-
// Paste the final text (either processed or original)
385-
let ah_clone = ah.clone();
386-
let paste_time = Instant::now();
387-
ah.run_on_main_thread(move || {
388-
match utils::paste(final_text, ah_clone.clone()) {
389-
Ok(()) => debug!(
390-
"Text pasted successfully in {:?}",
391-
paste_time.elapsed()
392-
),
393-
Err(e) => error!("Failed to paste transcription: {}", e),
394-
}
395-
// Hide the overlay after transcription is complete
396-
utils::hide_recording_overlay(&ah_clone);
397-
change_tray_icon(&ah_clone, TrayIconState::Idle);
398-
})
399-
.unwrap_or_else(|e| {
400-
error!("Failed to run paste on main thread: {:?}", e);
432+
// Paste the final text
433+
// - If streaming was active and post-processing changed the text,
434+
// we need to replace what was streamed
435+
// - If streaming was active but no post-processing, skip pasting
436+
// (text already output)
437+
// - If not streaming, paste normally
438+
let should_paste = !skip_paste || needs_replacement;
439+
440+
if should_paste {
441+
let ah_clone = ah.clone();
442+
let paste_time = Instant::now();
443+
444+
// If replacing streaming output, we need to delete old text first
445+
let chars_to_delete = if needs_replacement {
446+
transcription.chars().count()
447+
} else {
448+
0
449+
};
450+
451+
ah.run_on_main_thread(move || {
452+
// Delete streaming output if we're replacing
453+
if chars_to_delete > 0 {
454+
debug!("Replacing streaming output ({} chars) with post-processed text", chars_to_delete);
455+
if let Err(e) = utils::delete_chars(chars_to_delete) {
456+
error!("Failed to delete streaming output: {}", e);
457+
}
458+
}
459+
460+
match utils::paste(final_text, ah_clone.clone()) {
461+
Ok(()) => debug!(
462+
"Text pasted successfully in {:?}",
463+
paste_time.elapsed()
464+
),
465+
Err(e) => error!("Failed to paste transcription: {}", e),
466+
}
467+
// Hide the overlay after transcription is complete
468+
utils::hide_recording_overlay(&ah_clone);
469+
change_tray_icon(&ah_clone, TrayIconState::Idle);
470+
})
471+
.unwrap_or_else(|e| {
472+
error!("Failed to run paste on main thread: {:?}", e);
473+
utils::hide_recording_overlay(&ah);
474+
change_tray_icon(&ah, TrayIconState::Idle);
475+
});
476+
} else {
477+
// Streaming output is already there, just clean up
401478
utils::hide_recording_overlay(&ah);
402479
change_tray_icon(&ah, TrayIconState::Idle);
403-
});
480+
}
404481
} else {
405482
utils::hide_recording_overlay(&ah);
406483
change_tray_icon(&ah, TrayIconState::Idle);
@@ -414,6 +491,10 @@ impl ShortcutAction for TranscribeAction {
414491
}
415492
} else {
416493
debug!("No samples retrieved from recording stop");
494+
// Also stop streaming session if it was active
495+
if streaming_was_active {
496+
sm.stop_session(None);
497+
}
417498
utils::hide_recording_overlay(&ah);
418499
change_tray_icon(&ah, TrayIconState::Idle);
419500
}

src-tauri/src/audio_toolkit/audio/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ mod utils;
66
mod visualizer;
77

88
pub use device::{list_input_devices, list_output_devices, CpalDeviceInfo};
9-
pub use recorder::AudioRecorder;
9+
pub use recorder::{AudioRecorder, VadCallback};
1010
pub use resampler::FrameResampler;
1111
pub use utils::save_wav_file;
1212
pub use visualizer::AudioVisualiser;

src-tauri/src/audio_toolkit/audio/recorder.rs

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,17 @@ enum Cmd {
2222
Shutdown,
2323
}
2424

25+
/// Callback type for VAD frame events.
26+
/// Arguments: (is_speech: bool, frame_samples: &[f32])
27+
pub type VadCallback = Arc<dyn Fn(bool, &[f32]) + Send + Sync + 'static>;
28+
2529
pub struct AudioRecorder {
2630
device: Option<Device>,
2731
cmd_tx: Option<mpsc::Sender<Cmd>>,
2832
worker_handle: Option<std::thread::JoinHandle<()>>,
2933
vad: Option<Arc<Mutex<Box<dyn vad::VoiceActivityDetector>>>>,
3034
level_cb: Option<Arc<dyn Fn(Vec<f32>) + Send + Sync + 'static>>,
35+
vad_cb: Option<VadCallback>,
3136
}
3237

3338
impl AudioRecorder {
@@ -38,6 +43,7 @@ impl AudioRecorder {
3843
worker_handle: None,
3944
vad: None,
4045
level_cb: None,
46+
vad_cb: None,
4147
})
4248
}
4349

@@ -54,6 +60,18 @@ impl AudioRecorder {
5460
self
5561
}
5662

63+
/// Set a callback to receive VAD frame results during recording.
64+
///
65+
/// The callback receives (is_speech: bool, frame_samples: &[f32]) for each
66+
/// 30ms VAD frame. This is useful for streaming transcription pause detection.
67+
pub fn with_vad_callback<F>(mut self, cb: F) -> Self
68+
where
69+
F: Fn(bool, &[f32]) + Send + Sync + 'static,
70+
{
71+
self.vad_cb = Some(Arc::new(cb));
72+
self
73+
}
74+
5775
pub fn open(&mut self, device: Option<Device>) -> Result<(), Box<dyn std::error::Error>> {
5876
if self.worker_handle.is_some() {
5977
return Ok(()); // already open
@@ -74,6 +92,7 @@ impl AudioRecorder {
7492
let vad = self.vad.clone();
7593
// Move the optional level callback into the worker thread
7694
let level_cb = self.level_cb.clone();
95+
let vad_cb = self.vad_cb.clone();
7796

7897
let worker = std::thread::spawn(move || {
7998
let config = AudioRecorder::get_preferred_config(&thread_device)
@@ -117,7 +136,7 @@ impl AudioRecorder {
117136
stream.play().expect("failed to start stream");
118137

119138
// keep the stream alive while we process samples
120-
run_consumer(sample_rate, vad, sample_rx, cmd_rx, level_cb);
139+
run_consumer(sample_rate, vad, sample_rx, cmd_rx, level_cb, vad_cb);
121140
// stream is dropped here, after run_consumer returns
122141
});
123142

@@ -245,6 +264,7 @@ fn run_consumer(
245264
sample_rx: mpsc::Receiver<Vec<f32>>,
246265
cmd_rx: mpsc::Receiver<Cmd>,
247266
level_cb: Option<Arc<dyn Fn(Vec<f32>) + Send + Sync + 'static>>,
267+
vad_cb: Option<VadCallback>,
248268
) {
249269
let mut frame_resampler = FrameResampler::new(
250270
in_sample_rate as usize,
@@ -266,26 +286,38 @@ fn run_consumer(
266286
4000.0, // vocal_max_hz
267287
);
268288

269-
fn handle_frame(
270-
samples: &[f32],
271-
recording: bool,
272-
vad: &Option<Arc<Mutex<Box<dyn vad::VoiceActivityDetector>>>>,
273-
out_buf: &mut Vec<f32>,
274-
) {
289+
// Helper closure to process a VAD frame
290+
let handle_frame = |samples: &[f32],
291+
recording: bool,
292+
vad: &Option<Arc<Mutex<Box<dyn vad::VoiceActivityDetector>>>>,
293+
vad_cb: &Option<VadCallback>,
294+
out_buf: &mut Vec<f32>| {
275295
if !recording {
276296
return;
277297
}
278298

279299
if let Some(vad_arc) = vad {
280300
let mut det = vad_arc.lock().unwrap();
281-
match det.push_frame(samples).unwrap_or(VadFrame::Speech(samples)) {
301+
let vad_result = det.push_frame(samples).unwrap_or(VadFrame::Speech(samples));
302+
let is_speech = vad_result.is_speech();
303+
304+
// Call the VAD callback with the result
305+
if let Some(cb) = vad_cb {
306+
cb(is_speech, samples);
307+
}
308+
309+
match vad_result {
282310
VadFrame::Speech(buf) => out_buf.extend_from_slice(buf),
283311
VadFrame::Noise => {}
284312
}
285313
} else {
314+
// No VAD - assume all frames are speech
315+
if let Some(cb) = vad_cb {
316+
cb(true, samples);
317+
}
286318
out_buf.extend_from_slice(samples);
287319
}
288-
}
320+
};
289321

290322
loop {
291323
let raw = match sample_rx.recv() {
@@ -302,7 +334,7 @@ fn run_consumer(
302334

303335
// ---------- existing pipeline ------------------------------------ //
304336
frame_resampler.push(&raw, &mut |frame: &[f32]| {
305-
handle_frame(frame, recording, &vad, &mut processed_samples)
337+
handle_frame(frame, recording, &vad, &vad_cb, &mut processed_samples)
306338
});
307339

308340
// non-blocking check for a command
@@ -321,7 +353,7 @@ fn run_consumer(
321353

322354
frame_resampler.finish(&mut |frame: &[f32]| {
323355
// we still want to process the last few frames
324-
handle_frame(frame, true, &vad, &mut processed_samples)
356+
handle_frame(frame, true, &vad, &vad_cb, &mut processed_samples)
325357
});
326358

327359
let _ = reply_tx.send(std::mem::take(&mut processed_samples));

src-tauri/src/audio_toolkit/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ pub mod vad;
66

77
pub use audio::{
88
list_input_devices, list_output_devices, save_wav_file, AudioRecorder, CpalDeviceInfo,
9+
VadCallback,
910
};
1011
pub use text::apply_custom_words;
1112
pub use utils::get_cpal_host;

src-tauri/src/clipboard.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,3 +257,35 @@ pub fn paste(text: String, app_handle: AppHandle) -> Result<(), String> {
257257

258258
Ok(())
259259
}
260+
261+
/// Delete a specified number of characters by sending backspace keys.
262+
///
263+
/// This is used when we need to replace previously output text (e.g., when
264+
/// post-processing changes streaming output).
265+
pub fn delete_chars(count: usize) -> Result<(), String> {
266+
if count == 0 {
267+
return Ok(());
268+
}
269+
270+
use enigo::Direction;
271+
use std::time::Duration;
272+
273+
let mut enigo =
274+
Enigo::new(&Settings::default()).map_err(|e| format!("Failed to init Enigo: {}", e))?;
275+
276+
for i in 0..count {
277+
enigo
278+
.key(Key::Backspace, Direction::Click)
279+
.map_err(|e| format!("Failed to send backspace: {}", e))?;
280+
281+
// Small delay every 10 backspaces to avoid overwhelming the input system
282+
if i > 0 && i % 10 == 0 {
283+
std::thread::sleep(Duration::from_millis(5));
284+
}
285+
}
286+
287+
// Small delay after all backspaces
288+
std::thread::sleep(Duration::from_millis(20));
289+
290+
Ok(())
291+
}

0 commit comments

Comments
 (0)