From 805874719937e0704c175f72314d3f015b8a0a44 Mon Sep 17 00:00:00 2001 From: ComputelessComputer <63365510+ComputelessComputer@users.noreply.github.com> Date: Wed, 20 May 2026 00:55:48 +0900 Subject: [PATCH] fix(stt): improve Soniqo Qwen transcript coverage Normalize Soniqo language hints to ISO codes and pad short Qwen audio chunks so VAD-split speech is not dropped. --- crates/listener2-core/src/batch/simple.rs | 33 ++++++++++++++++--- .../transcribe-soniqo/swift-lib/src/lib.swift | 21 +++++++----- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/crates/listener2-core/src/batch/simple.rs b/crates/listener2-core/src/batch/simple.rs index aca42f5fab..cbb564a105 100644 --- a/crates/listener2-core/src/batch/simple.rs +++ b/crates/listener2-core/src/batch/simple.rs @@ -116,10 +116,7 @@ pub(super) async fn run_soniqo_batch( })?; let file_path = params.file_path.clone(); - let language = listen_params - .languages - .first() - .map(hypr_language::Language::bcp47_code); + let language = soniqo_language_hint(&listen_params.languages); let transcribed = tokio::task::spawn_blocking(move || { transcribe_soniqo_file(model, &file_path, language.as_deref()) @@ -223,6 +220,12 @@ fn transcribe_soniqo_samples( hypr_transcribe_soniqo::transcribe_file(model, file.path(), language).map_err(|e| e.to_string()) } +fn soniqo_language_hint(languages: &[hypr_language::Language]) -> Option<String> { + languages + .first() + .map(|language| language.iso639().code().to_string()) +} + fn collapse_identical_channels(channels: Vec>) -> Vec> { if channels.len() != 2 || !channels_are_effectively_identical(&channels[0], &channels[1]) { return channels; @@ -269,4 +272,26 @@ mod tests { assert_eq!(channels, vec![vec![0.1, 0.2], vec![0.9, 0.8]]); } + + #[test] + fn soniqo_batch_uses_iso_language_hint() { + let params = BatchParams { + session_id: "session".to_string(), + provider: super::super::BatchProvider::Soniqo, + file_path: "/tmp/audio.wav".to_string(), + model: 
Some("soniqo-qwen3-small".to_string()), + base_url: "soniqo://local".to_string(), + api_key: String::new(), + languages: vec!["en-US".parse().unwrap()], + keywords: vec![], + num_speakers: None, + min_speakers: None, + max_speakers: None, + }; + + assert_eq!( + soniqo_language_hint(&params.languages).as_deref(), + Some("en") + ); + } } diff --git a/crates/transcribe-soniqo/swift-lib/src/lib.swift b/crates/transcribe-soniqo/swift-lib/src/lib.swift index 3fb0ad0b81..612f780d19 100644 --- a/crates/transcribe-soniqo/swift-lib/src/lib.swift +++ b/crates/transcribe-soniqo/swift-lib/src/lib.swift @@ -255,23 +255,28 @@ private enum LoadedSpeechModel { language: String? ) -> String { let minimumSamples = max(sampleRate, 1) - guard audio.count >= minimumSamples else { + guard !audio.isEmpty else { return "" } + let preparedAudio = + audio.count < minimumSamples + ? audio + [Float](repeating: 0, count: minimumSamples - audio.count) + : audio + let chunkSamples = max(sampleRate * 30, minimumSamples) - guard audio.count > chunkSamples else { - return model.transcribe(audio: audio, sampleRate: sampleRate, language: language) + guard preparedAudio.count > chunkSamples else { + return model.transcribe(audio: preparedAudio, sampleRate: sampleRate, language: language) } var chunks: [String] = [] var offset = 0 - while offset < audio.count { - var end = min(offset + chunkSamples, audio.count) - let trailingSamples = audio.count - end + while offset < preparedAudio.count { + var end = min(offset + chunkSamples, preparedAudio.count) + let trailingSamples = preparedAudio.count - end if trailingSamples > 0 && trailingSamples < minimumSamples { - end = audio.count + end = preparedAudio.count } defer { offset = end @@ -279,7 +284,7 @@ private enum LoadedSpeechModel { let text = autoreleasepool { model.transcribe( - audio: Array(audio[offset..