gptguy
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 110 additions & 147 deletions
diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml
Lines changed: 4 additions & 2 deletions
diff --git a/src-tauri/src/app.rs b/src-tauri/src/app.rs
Lines changed: 1 addition & 1 deletion
diff --git a/src-tauri/src/asr/audio_io.rs b/src-tauri/src/asr/audio_io.rs
Lines changed: 0 additions & 33 deletions
diff --git a/src-tauri/src/asr/decoder.rs b/src-tauri/src/asr/decoder.rs
Lines changed: 129 additions & 87 deletions
diff --git a/src-tauri/src/asr/mod.rs b/src-tauri/src/asr/mod.rs
Lines changed: 5 additions & 19 deletions
@@ -27,11 +27,13 @@ thiserror = "2.0.17"
 ort = "2.0.0-rc.10"
 num_cpus = "1.17.0"
 enigo = "0.6.1"
-reqwest = { version = "0.12.25", default-features = false, features = ["blocking", "rustls-tls", "gzip"] }
+ureq = { version = "2.12.1", features = ["json", "charset"] }
 dirs-next = "2.0.0"
 cpal = "0.16.0"
+tauri-plugin-single-instance = "2.3.6"
 tauri-plugin-store = "2.4.1"
 tauri-plugin-log = "2.7.1"
 tauri-plugin-dialog = "2.4.2"
 tauri-plugin-global-shortcut = "2.3.1"
-tauri-plugin-single-instance = "2.3.6"
+rtrb = "0.3.2"
+rubato = "0.16.2"
@@ -86,7 +86,7 @@ fn handle_run_event(app_handle: &AppHandle, event: RunEvent) {
 fn handle_run_event(_app_handle: &AppHandle, _event: RunEvent) {}
 
 fn on_second_instance(app: &AppHandle, argv: Vec<String>, cwd: String) {
-    log::info!("{}, {argv:?}, {cwd}", app.package_info().name);
+    log::info!("Second instance detected (args={argv:?}, cwd={cwd})");
     if let Err(err) = app.emit("single-instance", ()) {
         log::error!("Failed to emit single-instance event: {err}");
     }
 
@@ -1,34 +1 @@
 pub(crate) const TARGET_SAMPLE_RATE: u32 = 16_000;
-
-pub fn resample_linear(input: &[f32], from_sr: u32, to_sr: u32) -> Vec<f32> {
-    if from_sr == 0 || to_sr == 0 || input.is_empty() {
-        return Vec::new();
-    }
-
-    if from_sr == to_sr {
-        return input.to_vec();
-    }
-
-    let out_len = ((input.len() as f64) * (to_sr as f64) / (from_sr as f64))
-        .ceil()
-        .max(1.0) as usize;
-    let step = from_sr as f64 / to_sr as f64;
-
-    let mut output = Vec::with_capacity(out_len);
-    let input_len = input.len();
-
-    for i in 0..out_len {
-        let pos = (i as f64) * step;
-        let idx = pos.floor() as usize;
-        let frac = (pos - idx as f64) as f32;
-
-        unsafe {
-            let current = *input.get_unchecked(idx.min(input_len - 1));
-            let next_idx = (idx + 1).min(input_len - 1);
-            let next = *input.get_unchecked(next_idx);
-            output.push(current + (next - current) * frac);
-        }
-    }
-
-    output
-}
@@ -6,7 +6,7 @@ use ort::inputs;
 use ort::value::TensorRef;
 use regex::Regex;
 
-use crate::asr::recognizer::{AsrError, AsrModel, Transcript};
+use crate::asr::recognizer::{AsrError, AsrModel, InferenceConfig, Transcript};
 
 type DecoderState = (Array3<f32>, Array3<f32>);
 
@@ -17,102 +17,72 @@ const MAX_TOKENS_PER_STEP: usize = 10;
 static DECODE_SPACE_RE: LazyLock<Result<Regex, regex::Error>> =
     LazyLock::new(|| Regex::new(r"\A\s|\s\B|(\s)\b"));
 
-pub(crate) struct DecoderWorkspace {
-    encoder_step: Array3<f32>,
-    targets: Array2<i32>,
-    target_length: Array1<i32>,
-    state: DecoderState,
+pub struct DecoderSession<'m> {
+    model: &'m mut AsrModel,
+    workspace: DecoderWorkspace,
+    last_token: i32,
 }
 
-impl DecoderWorkspace {
-    pub(crate) fn new(session: &ort::session::Session) -> Result<Self, AsrError> {
-        let encoder_dim = session
-            .inputs
-            .iter()
-            .find(|input| input.name == "encoder_outputs")
-            .and_then(|input| input.input_type.tensor_shape())
-            .and_then(|shape| shape.get(1).copied())
-            .and_then(|d| usize::try_from(d).ok())
-            .unwrap_or(1024);
-
-        let state1_shape = session
-            .inputs
-            .iter()
-            .find(|input| input.name == "input_states_1")
-            .ok_or_else(|| AsrError::InputNotFound("input_states_1".to_string()))?
-            .input_type
-            .tensor_shape()
-            .ok_or_else(|| AsrError::TensorShape("input_states_1".to_string()))?;
-
-        let state2_shape = session
-            .inputs
-            .iter()
-            .find(|input| input.name == "input_states_2")
-            .ok_or_else(|| AsrError::InputNotFound("input_states_2".to_string()))?
-            .input_type
-            .tensor_shape()
-            .ok_or_else(|| AsrError::TensorShape("input_states_2".to_string()))?;
+impl<'m> DecoderSession<'m> {
+    pub(crate) fn new(
+        model: &'m mut AsrModel,
+        workspace: Option<DecoderWorkspace>,
+        last_token: i32,
+    ) -> Result<Self, AsrError> {
+        let workspace = if let Some(ws) = workspace {
+            log::debug!("Reusing cached decoder workspace");
+            ws
+        } else {
+            log::debug!("Initializing new decoder workspace");
+            DecoderWorkspace::new(&model.decoder_joint)?
+        };
 
-        let state1 = Array::zeros((state1_shape[0] as usize, 1, state1_shape[2] as usize));
-        let state2 = Array::zeros((state2_shape[0] as usize, 1, state2_shape[2] as usize));
+        log::debug!("Decoder session initialized with last_token={}", last_token);
 
         Ok(Self {
-            encoder_step: Array::zeros((1, encoder_dim, 1)),
-            targets: Array2::zeros((1, 1)),
-            target_length: Array1::from_vec(vec![1]),
-            state: (state1, state2),
+            model,
+            workspace,
+            last_token,
         })
     }
 
-    #[inline]
-    pub(crate) fn reset_state(&mut self) {
-        self.state.0.fill(0.0);
-        self.state.1.fill(0.0);
-    }
-
-    pub(crate) fn set_encoder_step(&mut self, frame: &ArrayView1<f32>) {
-        let mut view = self.encoder_step.index_axis_mut(ndarray::Axis(2), 0);
-        let mut view = view.index_axis_mut(ndarray::Axis(0), 0);
-        view.assign(frame);
-    }
-
-    pub(crate) fn set_target(&mut self, token: i32) {
-        self.targets[[0, 0]] = token;
+    pub(crate) fn into_parts(self) -> (DecoderWorkspace, i32) {
+        (self.workspace, self.last_token)
     }
-}
 
-impl AsrModel {
     pub(crate) fn decode_sequence(
         &mut self,
         encodings: &ArrayViewD<f32>,
         encodings_len: usize,
+        _config: &InferenceConfig,
     ) -> Result<(Vec<i32>, Vec<usize>), AsrError> {
         let decode_start = Instant::now();
-        let mut tokens = Vec::with_capacity(encodings_len / 2 + 4);
-        let mut timestamps = Vec::with_capacity(encodings_len / 2 + 4);
-
-        let workspace = &mut self.decoder_workspace;
-        workspace.reset_state();
+        let mut tokens = Vec::with_capacity(std::cmp::max(1, encodings_len / 2));
+        let mut timestamps = Vec::with_capacity(std::cmp::max(1, encodings_len / 2));
 
         let mut t = 0;
         let mut emitted_tokens = 0;
 
         while t < encodings_len {
             let encoder_step = encodings.slice(ndarray::s![t, ..]);
-            workspace.set_encoder_step(&encoder_step);
+            self.workspace.set_encoder_step(&encoder_step);
 
-            let target_token = tokens.last().copied().unwrap_or(self.blank_idx);
-            workspace.set_target(target_token);
+            let target_token = if let Some(last) = tokens.last() {
+                *last
+            } else {
+                self.last_token
+            };
+            self.workspace.set_target(target_token);
 
             let inputs = inputs![
-                "encoder_outputs" => TensorRef::from_array_view(workspace.encoder_step.view())?,
-                "targets" => TensorRef::from_array_view(workspace.targets.view())?,
-                "target_length" => TensorRef::from_array_view(workspace.target_length.view())?,
-                "input_states_1" => TensorRef::from_array_view(workspace.state.0.view())?,
-                "input_states_2" => TensorRef::from_array_view(workspace.state.1.view())?,
+                "encoder_outputs" => TensorRef::from_array_view(self.workspace.encoder_step.view())?,
+                "targets" => TensorRef::from_array_view(self.workspace.targets.view())?,
+                "target_length" => TensorRef::from_array_view(self.workspace.target_length.view())?,
+                "input_states_1" => TensorRef::from_array_view(self.workspace.state.0.view())?,
+                "input_states_2" => TensorRef::from_array_view(self.workspace.state.1.view())?,
             ];
 
-            let outputs = self.decoder_joint.run(inputs)?;
+            let outputs = self.model.decoder_joint.run(inputs)?;
 
             let logits = outputs
                 .get("outputs")
@@ -127,13 +97,8 @@ impl AsrModel {
                 ))
             })?;
 
-            let vocab_logits = if logits.len() > self.vocab_size {
-                log::trace!(
-                    "TDT model detected: splitting {} logits into vocab({}) + duration",
-                    logits.len(),
-                    self.vocab_size
-                );
-                &vocab_logits_slice[..self.vocab_size]
+            let vocab_logits = if logits.len() > self.model.vocab_size {
+                &vocab_logits_slice[..self.model.vocab_size]
             } else {
                 vocab_logits_slice
             };
@@ -142,10 +107,9 @@ impl AsrModel {
                 .iter()
                 .enumerate()
                 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
-                .map(|(idx, _)| idx as i32)
-                .unwrap_or(self.blank_idx);
+                .map_or(self.model.blank_idx, |(idx, _)| idx as i32);
 
-            if token != self.blank_idx {
+            if token != self.model.blank_idx {
                 let state1 = outputs
                     .get("output_states_1")
                     .ok_or_else(|| AsrError::OutputNotFound("output_states_1".to_string()))?
@@ -156,26 +120,37 @@ impl AsrModel {
                     .try_extract_array::<f32>()?;
 
                 if let Ok(state1_view) = state1.view().into_dimensionality::<ndarray::Ix3>() {
-                    if workspace.state.0.shape() == state1_view.shape() {
-                        workspace.state.0.assign(&state1_view);
+                    if self.workspace.state.0.shape() == state1_view.shape() {
+                        self.workspace.state.0.assign(&state1_view);
                     } else {
-                        workspace.state.0 = state1_view.to_owned();
+                        log::warn!(
+                            "Decoder state_1 shape changed: {:?} -> {:?}",
+                            self.workspace.state.0.shape(),
+                            state1_view.shape()
+                        );
+                        self.workspace.state.0 = state1_view.to_owned();
                     }
                 }
                 if let Ok(state2_view) = state2.view().into_dimensionality::<ndarray::Ix3>() {
-                    if workspace.state.1.shape() == state2_view.shape() {
-                        workspace.state.1.assign(&state2_view);
+                    if self.workspace.state.1.shape() == state2_view.shape() {
+                        self.workspace.state.1.assign(&state2_view);
                     } else {
-                        workspace.state.1 = state2_view.to_owned();
+                        log::warn!(
+                            "Decoder state_2 shape changed: {:?} -> {:?}",
+                            self.workspace.state.1.shape(),
+                            state2_view.shape()
+                        );
+                        self.workspace.state.1 = state2_view.to_owned();
                     }
                 }
 
                 tokens.push(token);
                 timestamps.push(t);
                 emitted_tokens += 1;
+                self.last_token = token;
             }
 
-            if token == self.blank_idx || emitted_tokens == MAX_TOKENS_PER_STEP {
+            if token == self.model.blank_idx || emitted_tokens == MAX_TOKENS_PER_STEP {
                 t += 1;
                 emitted_tokens = 0;
             }
@@ -190,7 +165,74 @@ impl AsrModel {
 
         Ok((tokens, timestamps))
     }
+}
+
+pub(crate) struct DecoderWorkspace {
+    encoder_step: Array3<f32>,
+    targets: Array2<i32>,
+    target_length: Array1<i32>,
+    state: DecoderState,
+}
+
+impl DecoderWorkspace {
+    pub(crate) fn new(session: &ort::session::Session) -> Result<Self, AsrError> {
+        let encoder_dim = session
+            .inputs
+            .iter()
+            .find(|input| input.name == "encoder_outputs")
+            .and_then(|input| input.input_type.tensor_shape())
+            .and_then(|shape| shape.get(1).copied())
+            .and_then(|d| usize::try_from(d).ok());
+
+        let encoder_dim = match encoder_dim {
+            Some(dim) => dim,
+            None => {
+                log::warn!("Could not determine encoder_dim from model, falling back to 1024");
+                1024
+            }
+        };
+
+        let state1_shape = session
+            .inputs
+            .iter()
+            .find(|input| input.name == "input_states_1")
+            .ok_or_else(|| AsrError::InputNotFound("input_states_1".to_string()))?
+            .input_type
+            .tensor_shape()
+            .ok_or_else(|| AsrError::TensorShape("input_states_1".to_string()))?;
+
+        let state2_shape = session
+            .inputs
+            .iter()
+            .find(|input| input.name == "input_states_2")
+            .ok_or_else(|| AsrError::InputNotFound("input_states_2".to_string()))?
+            .input_type
+            .tensor_shape()
+            .ok_or_else(|| AsrError::TensorShape("input_states_2".to_string()))?;
+
+        let state1 = Array::zeros((state1_shape[0] as usize, 1, state1_shape[2] as usize));
+        let state2 = Array::zeros((state2_shape[0] as usize, 1, state2_shape[2] as usize));
+
+        Ok(Self {
+            encoder_step: Array::zeros((1, encoder_dim, 1)),
+            targets: Array2::zeros((1, 1)),
+            target_length: Array1::from_vec(vec![1]),
+            state: (state1, state2),
+        })
+    }
+
+    pub(crate) fn set_encoder_step(&mut self, frame: &ArrayView1<f32>) {
+        let mut view = self.encoder_step.index_axis_mut(ndarray::Axis(2), 0);
+        let mut view = view.index_axis_mut(ndarray::Axis(0), 0);
+        view.assign(frame);
+    }
 
+    pub(crate) fn set_target(&mut self, token: i32) {
+        self.targets[[0, 0]] = token;
+    }
+}
+
+impl AsrModel {
     pub(crate) fn decode_tokens(&self, ids: Vec<i32>, timestamps: Vec<usize>) -> Transcript {
         let tokens: Vec<String> = ids
             .iter()
 
@@ -4,31 +4,17 @@ pub mod download_progress;
 mod model_store;
 mod recognizer;
 
-use crate::vad::VadModel;
-use std::sync::{Arc, OnceLock};
 use tauri::AppHandle;
 
-static VAD_MODEL: OnceLock<Arc<VadModel>> = OnceLock::new();
-
-pub fn get_or_init_vad_model(app: &AppHandle) -> Arc<VadModel> {
-    VAD_MODEL
-        .get_or_init(|| {
-            let path = model_store::vad_model_path(app);
-            match VadModel::new(&path) {
-                Ok(m) => Arc::new(m),
-                Err(e) => {
-                    log::error!("Failed to load VAD model: {e}");
-                    panic!("VAD model failed to load at {}: {e}", path.display());
-                }
-            }
-        })
-        .clone()
+pub fn get_or_init_vad_model(app: &AppHandle) -> Result<std::path::PathBuf, String> {
+    model_store::ensure_vad_model(app).map_err(|e| e.user_message().to_string())
 }
 
 pub use model_store::{
-    default_model_root, ensure_vad_model, fallback_model_root, resolve_model_dir, vad_model_path,
+    default_model_root, ensure_vad_model, fallback_model_root, missing_model_files_for_tests,
+    resolve_model_dir, vad_model_path,
 };
 pub use recognizer::{AsrError, AsrModel, Transcript};
 
-pub(crate) use audio_io::{resample_linear, TARGET_SAMPLE_RATE};
+pub(crate) use audio_io::TARGET_SAMPLE_RATE;
 pub(crate) use download_progress::{current_download_progress, record_failure, DownloadProgress};
Original file line number	Diff line number	Diff line change
`@@ -86,7 +86,7 @@ fn handle_run_event(app_handle: &AppHandle, event: RunEvent) {`
`86`	`86`	`fn handle_run_event(_app_handle: &AppHandle, _event: RunEvent) {}`
`87`	`87`
`88`	`88`	`fn on_second_instance(app: &AppHandle, argv: Vec<String>, cwd: String) {`
`89`		`- log::info!("{}, {argv:?}, {cwd}", app.package_info().name);`
	`89`	`+ log::info!("Second instance detected (args={argv:?}, cwd={cwd})");`
`90`	`90`	`if let Err(err) = app.emit("single-instance", ()) {`
`91`	`91`	`log::error!("Failed to emit single-instance event: {err}");`
`92`	`92`	`}`