Skip to content

Commit 1120710

Browse files
rkpatel33 and claude committed
Merge streaming mode into context-aware-capitalization
Resolved conflicts: - lib.rs: Added both streaming and context-aware commands - settings.rs: Combined both feature settings - shortcut.rs: Added both setting change handlers - bindings.ts: Combined TypeScript bindings - AdvancedSettings.tsx: Both components imported and used - settingsStore.ts: Both handlers added Removed append_trailing_space (replaced by context-aware logic). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <[email protected]>
2 parents 3da6b3f + 1aaf9b8 commit 1120710

File tree

23 files changed

+1899
-47
lines changed

23 files changed

+1899
-47
lines changed

.claude/docs/2025-12-07-01_streaming-output-architecture.md

Lines changed: 387 additions & 0 deletions
Large diffs are not rendered by default.

src-tauri/src/actions.rs

Lines changed: 106 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use crate::managers::history::HistoryManager;
44
use crate::managers::transcription::TranscriptionManager;
55
use crate::settings::{get_settings, AppSettings};
66
use crate::shortcut;
7+
use crate::streaming::StreamingManager;
78
use crate::tray::{change_tray_icon, TrayIconState};
89
use crate::utils::{self, show_recording_overlay, show_transcribing_overlay};
910
use async_openai::types::{
@@ -219,12 +220,21 @@ impl ShortcutAction for TranscribeAction {
219220
show_recording_overlay(app);
220221

221222
let rm = app.state::<Arc<AudioRecordingManager>>();
223+
let sm = app.state::<Arc<StreamingManager>>();
222224

223225
// Get the microphone mode to determine audio feedback timing
224226
let settings = get_settings(app);
225227
let is_always_on = settings.always_on_microphone;
226228
debug!("Microphone mode - always_on: {}", is_always_on);
227229

230+
// Set up streaming VAD callback BEFORE starting recording
231+
// This avoids recreating the microphone stream after recording starts
232+
let streaming_enabled = sm.is_streaming_enabled();
233+
if streaming_enabled {
234+
sm.setup_vad_callback();
235+
debug!("Streaming VAD callback set up before recording");
236+
}
237+
228238
let mut recording_started = false;
229239
if is_always_on {
230240
// Always-on mode: Play audio feedback immediately, then apply mute after sound finishes
@@ -267,6 +277,12 @@ impl ShortcutAction for TranscribeAction {
267277
if recording_started {
268278
// Dynamically register the cancel shortcut in a separate task to avoid deadlock
269279
shortcut::register_cancel_shortcut(app);
280+
281+
// Start streaming session if enabled (VAD callback already set up above)
282+
if streaming_enabled {
283+
sm.start_controller();
284+
debug!("Streaming session started");
285+
}
270286
}
271287

272288
debug!(
@@ -286,6 +302,10 @@ impl ShortcutAction for TranscribeAction {
286302
let rm = Arc::clone(&app.state::<Arc<AudioRecordingManager>>());
287303
let tm = Arc::clone(&app.state::<Arc<TranscriptionManager>>());
288304
let hm = Arc::clone(&app.state::<Arc<HistoryManager>>());
305+
let sm = Arc::clone(&app.state::<Arc<StreamingManager>>());
306+
307+
// Check if streaming was active
308+
let streaming_was_active = sm.is_session_active();
289309

290310
change_tray_icon(app, TrayIconState::Transcribing);
291311
show_transcribing_overlay(app);
@@ -313,34 +333,62 @@ impl ShortcutAction for TranscribeAction {
313333
samples.len()
314334
);
315335

316-
let transcription_time = Instant::now();
336+
// If streaming was active, get the final text from streaming
337+
// (which does its own final transcription)
338+
let (transcription, skip_paste) = if streaming_was_active {
339+
let samples_for_streaming = samples.clone();
340+
let streaming_text = sm.stop_session(Some(samples_for_streaming));
341+
match streaming_text {
342+
Some(text) if !text.is_empty() => {
343+
debug!("Streaming session final text: '{}'", text);
344+
// Streaming already pasted incrementally, but we may need to
345+
// replace with post-processed version
346+
(Ok(text), true)
347+
}
348+
_ => {
349+
// Streaming didn't produce output, fall back to normal transcription
350+
debug!("Streaming produced no output, falling back to batch transcription");
351+
let transcription_time = Instant::now();
352+
let result = tm.transcribe(samples.clone());
353+
debug!("Batch transcription completed in {:?}", transcription_time.elapsed());
354+
(result, false)
355+
}
356+
}
357+
} else {
358+
// Normal (non-streaming) mode
359+
let transcription_time = Instant::now();
360+
let result = tm.transcribe(samples.clone());
361+
debug!("Transcription completed in {:?}", transcription_time.elapsed());
362+
(result, false)
363+
};
364+
317365
let samples_clone = samples.clone(); // Clone for history saving
318-
match tm.transcribe(samples) {
366+
367+
match transcription {
319368
Ok(transcription) => {
320-
debug!(
321-
"Transcription completed in {:?}: '{}'",
322-
transcription_time.elapsed(),
323-
transcription
324-
);
369+
debug!("Final transcription: '{}'", transcription);
325370
if !transcription.is_empty() {
326371
let settings = get_settings(&ah);
327372
let mut final_text = transcription.clone();
328373
let mut post_processed_text: Option<String> = None;
329374
let mut post_process_prompt: Option<String> = None;
375+
let mut needs_replacement = false;
330376

331377
// First, check if Chinese variant conversion is needed
332378
if let Some(converted_text) =
333379
maybe_convert_chinese_variant(&settings, &transcription).await
334380
{
335381
final_text = converted_text.clone();
336382
post_processed_text = Some(converted_text);
383+
needs_replacement = skip_paste; // Need to replace streaming output
337384
}
338385
// Then apply regular post-processing if enabled
339386
else if let Some(processed_text) =
340387
maybe_post_process_transcription(&settings, &transcription).await
341388
{
342389
final_text = processed_text.clone();
343390
post_processed_text = Some(processed_text);
391+
needs_replacement = skip_paste; // Need to replace streaming output
344392

345393
// Get the prompt that was used
346394
if let Some(prompt_id) = &settings.post_process_selected_prompt_id {
@@ -381,26 +429,55 @@ impl ShortcutAction for TranscribeAction {
381429
}
382430
});
383431

384-
// Paste the final text (either processed or original)
385-
let ah_clone = ah.clone();
386-
let paste_time = Instant::now();
387-
ah.run_on_main_thread(move || {
388-
match utils::paste(final_text, ah_clone.clone()) {
389-
Ok(()) => debug!(
390-
"Text pasted successfully in {:?}",
391-
paste_time.elapsed()
392-
),
393-
Err(e) => error!("Failed to paste transcription: {}", e),
394-
}
395-
// Hide the overlay after transcription is complete
396-
utils::hide_recording_overlay(&ah_clone);
397-
change_tray_icon(&ah_clone, TrayIconState::Idle);
398-
})
399-
.unwrap_or_else(|e| {
400-
error!("Failed to run paste on main thread: {:?}", e);
432+
// Paste the final text
433+
// - If streaming was active and post-processing changed the text,
434+
// we need to replace what was streamed
435+
// - If streaming was active but no post-processing, skip pasting
436+
// (text already output)
437+
// - If not streaming, paste normally
438+
let should_paste = !skip_paste || needs_replacement;
439+
440+
if should_paste {
441+
let ah_clone = ah.clone();
442+
let paste_time = Instant::now();
443+
444+
// If replacing streaming output, we need to delete old text first
445+
let chars_to_delete = if needs_replacement {
446+
transcription.chars().count()
447+
} else {
448+
0
449+
};
450+
451+
ah.run_on_main_thread(move || {
452+
// Delete streaming output if we're replacing
453+
if chars_to_delete > 0 {
454+
debug!("Replacing streaming output ({} chars) with post-processed text", chars_to_delete);
455+
if let Err(e) = utils::delete_chars(chars_to_delete) {
456+
error!("Failed to delete streaming output: {}", e);
457+
}
458+
}
459+
460+
match utils::paste(final_text, ah_clone.clone()) {
461+
Ok(()) => debug!(
462+
"Text pasted successfully in {:?}",
463+
paste_time.elapsed()
464+
),
465+
Err(e) => error!("Failed to paste transcription: {}", e),
466+
}
467+
// Hide the overlay after transcription is complete
468+
utils::hide_recording_overlay(&ah_clone);
469+
change_tray_icon(&ah_clone, TrayIconState::Idle);
470+
})
471+
.unwrap_or_else(|e| {
472+
error!("Failed to run paste on main thread: {:?}", e);
473+
utils::hide_recording_overlay(&ah);
474+
change_tray_icon(&ah, TrayIconState::Idle);
475+
});
476+
} else {
477+
// Streaming output is already there, just clean up
401478
utils::hide_recording_overlay(&ah);
402479
change_tray_icon(&ah, TrayIconState::Idle);
403-
});
480+
}
404481
} else {
405482
utils::hide_recording_overlay(&ah);
406483
change_tray_icon(&ah, TrayIconState::Idle);
@@ -414,6 +491,10 @@ impl ShortcutAction for TranscribeAction {
414491
}
415492
} else {
416493
debug!("No samples retrieved from recording stop");
494+
// Also stop streaming session if it was active
495+
if streaming_was_active {
496+
sm.stop_session(None);
497+
}
417498
utils::hide_recording_overlay(&ah);
418499
change_tray_icon(&ah, TrayIconState::Idle);
419500
}

src-tauri/src/audio_toolkit/audio/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ mod utils;
66
mod visualizer;
77

88
pub use device::{list_input_devices, list_output_devices, CpalDeviceInfo};
9-
pub use recorder::AudioRecorder;
9+
pub use recorder::{AudioRecorder, VadCallback};
1010
pub use resampler::FrameResampler;
1111
pub use utils::save_wav_file;
1212
pub use visualizer::AudioVisualiser;

src-tauri/src/audio_toolkit/audio/recorder.rs

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,17 @@ enum Cmd {
2222
Shutdown,
2323
}
2424

25+
/// Callback type for VAD frame events.
26+
/// Arguments: (is_speech: bool, frame_samples: &[f32])
27+
pub type VadCallback = Arc<dyn Fn(bool, &[f32]) + Send + Sync + 'static>;
28+
2529
pub struct AudioRecorder {
2630
device: Option<Device>,
2731
cmd_tx: Option<mpsc::Sender<Cmd>>,
2832
worker_handle: Option<std::thread::JoinHandle<()>>,
2933
vad: Option<Arc<Mutex<Box<dyn vad::VoiceActivityDetector>>>>,
3034
level_cb: Option<Arc<dyn Fn(Vec<f32>) + Send + Sync + 'static>>,
35+
vad_cb: Option<VadCallback>,
3136
}
3237

3338
impl AudioRecorder {
@@ -38,6 +43,7 @@ impl AudioRecorder {
3843
worker_handle: None,
3944
vad: None,
4045
level_cb: None,
46+
vad_cb: None,
4147
})
4248
}
4349

@@ -54,6 +60,18 @@ impl AudioRecorder {
5460
self
5561
}
5662

63+
/// Set a callback to receive VAD frame results during recording.
64+
///
65+
/// The callback receives (is_speech: bool, frame_samples: &[f32]) for each
66+
/// 30ms VAD frame. This is useful for streaming transcription pause detection.
67+
pub fn with_vad_callback<F>(mut self, cb: F) -> Self
68+
where
69+
F: Fn(bool, &[f32]) + Send + Sync + 'static,
70+
{
71+
self.vad_cb = Some(Arc::new(cb));
72+
self
73+
}
74+
5775
pub fn open(&mut self, device: Option<Device>) -> Result<(), Box<dyn std::error::Error>> {
5876
if self.worker_handle.is_some() {
5977
return Ok(()); // already open
@@ -74,6 +92,7 @@ impl AudioRecorder {
7492
let vad = self.vad.clone();
7593
// Move the optional level callback into the worker thread
7694
let level_cb = self.level_cb.clone();
95+
let vad_cb = self.vad_cb.clone();
7796

7897
let worker = std::thread::spawn(move || {
7998
let config = AudioRecorder::get_preferred_config(&thread_device)
@@ -117,7 +136,7 @@ impl AudioRecorder {
117136
stream.play().expect("failed to start stream");
118137

119138
// keep the stream alive while we process samples
120-
run_consumer(sample_rate, vad, sample_rx, cmd_rx, level_cb);
139+
run_consumer(sample_rate, vad, sample_rx, cmd_rx, level_cb, vad_cb);
121140
// stream is dropped here, after run_consumer returns
122141
});
123142

@@ -245,6 +264,7 @@ fn run_consumer(
245264
sample_rx: mpsc::Receiver<Vec<f32>>,
246265
cmd_rx: mpsc::Receiver<Cmd>,
247266
level_cb: Option<Arc<dyn Fn(Vec<f32>) + Send + Sync + 'static>>,
267+
vad_cb: Option<VadCallback>,
248268
) {
249269
let mut frame_resampler = FrameResampler::new(
250270
in_sample_rate as usize,
@@ -266,26 +286,38 @@ fn run_consumer(
266286
4000.0, // vocal_max_hz
267287
);
268288

269-
fn handle_frame(
270-
samples: &[f32],
271-
recording: bool,
272-
vad: &Option<Arc<Mutex<Box<dyn vad::VoiceActivityDetector>>>>,
273-
out_buf: &mut Vec<f32>,
274-
) {
289+
// Helper closure to process a VAD frame
290+
let handle_frame = |samples: &[f32],
291+
recording: bool,
292+
vad: &Option<Arc<Mutex<Box<dyn vad::VoiceActivityDetector>>>>,
293+
vad_cb: &Option<VadCallback>,
294+
out_buf: &mut Vec<f32>| {
275295
if !recording {
276296
return;
277297
}
278298

279299
if let Some(vad_arc) = vad {
280300
let mut det = vad_arc.lock().unwrap();
281-
match det.push_frame(samples).unwrap_or(VadFrame::Speech(samples)) {
301+
let vad_result = det.push_frame(samples).unwrap_or(VadFrame::Speech(samples));
302+
let is_speech = vad_result.is_speech();
303+
304+
// Call the VAD callback with the result
305+
if let Some(cb) = vad_cb {
306+
cb(is_speech, samples);
307+
}
308+
309+
match vad_result {
282310
VadFrame::Speech(buf) => out_buf.extend_from_slice(buf),
283311
VadFrame::Noise => {}
284312
}
285313
} else {
314+
// No VAD - assume all frames are speech
315+
if let Some(cb) = vad_cb {
316+
cb(true, samples);
317+
}
286318
out_buf.extend_from_slice(samples);
287319
}
288-
}
320+
};
289321

290322
loop {
291323
let raw = match sample_rx.recv() {
@@ -302,7 +334,7 @@ fn run_consumer(
302334

303335
// ---------- existing pipeline ------------------------------------ //
304336
frame_resampler.push(&raw, &mut |frame: &[f32]| {
305-
handle_frame(frame, recording, &vad, &mut processed_samples)
337+
handle_frame(frame, recording, &vad, &vad_cb, &mut processed_samples)
306338
});
307339

308340
// non-blocking check for a command
@@ -321,7 +353,7 @@ fn run_consumer(
321353

322354
frame_resampler.finish(&mut |frame: &[f32]| {
323355
// we still want to process the last few frames
324-
handle_frame(frame, true, &vad, &mut processed_samples)
356+
handle_frame(frame, true, &vad, &vad_cb, &mut processed_samples)
325357
});
326358

327359
let _ = reply_tx.send(std::mem::take(&mut processed_samples));

src-tauri/src/audio_toolkit/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ pub mod vad;
66

77
pub use audio::{
88
list_input_devices, list_output_devices, save_wav_file, AudioRecorder, CpalDeviceInfo,
9+
VadCallback,
910
};
1011
pub use text::apply_custom_words;
1112
pub use utils::get_cpal_host;

src-tauri/src/clipboard.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,3 +257,35 @@ pub fn paste(text: String, app_handle: AppHandle) -> Result<(), String> {
257257

258258
Ok(())
259259
}
260+
261+
/// Delete a specified number of characters by sending backspace keys.
262+
///
263+
/// This is used when we need to replace previously output text (e.g., when
264+
/// post-processing changes streaming output).
265+
pub fn delete_chars(count: usize) -> Result<(), String> {
266+
if count == 0 {
267+
return Ok(());
268+
}
269+
270+
use enigo::Direction;
271+
use std::time::Duration;
272+
273+
let mut enigo =
274+
Enigo::new(&Settings::default()).map_err(|e| format!("Failed to init Enigo: {}", e))?;
275+
276+
for i in 0..count {
277+
enigo
278+
.key(Key::Backspace, Direction::Click)
279+
.map_err(|e| format!("Failed to send backspace: {}", e))?;
280+
281+
// Small delay every 10 backspaces to avoid overwhelming the input system
282+
if i > 0 && i % 10 == 0 {
283+
std::thread::sleep(Duration::from_millis(5));
284+
}
285+
}
286+
287+
// Small delay after all backspaces
288+
std::thread::sleep(Duration::from_millis(20));
289+
290+
Ok(())
291+
}

0 commit comments

Comments
 (0)