Skip to content

Commit 038bb92

Browse files
Clarify STT model modes (#5569)
Show batch and realtime badges for Soniox, AssemblyAI, ElevenLabs, and Mistral model variants, and keep realtime aliases safe for batch transcription.
1 parent c5f1608 commit 038bb92

12 files changed

Lines changed: 252 additions & 53 deletions

File tree

apps/desktop/src/settings/ai/stt/select.tsx

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -370,16 +370,44 @@ function getProviderModelMode(
370370
providerId: ProviderId,
371371
model: string,
372372
): ModelEntry["mode"] {
373-
if (providerId !== "soniox") {
374-
return undefined;
373+
if (providerId === "assemblyai") {
374+
if (model === "universal-3-pro") {
375+
return "batch";
376+
}
377+
378+
if (model === "u3-rt-pro") {
379+
return "realtime";
380+
}
381+
}
382+
383+
if (providerId === "elevenlabs") {
384+
if (model === "scribe_v2") {
385+
return "batch";
386+
}
387+
388+
if (model === "scribe_v2_realtime") {
389+
return "realtime";
390+
}
375391
}
376392

377-
if (model === "stt-v5" || model === "stt-async-v5") {
378-
return "batch";
393+
if (providerId === "mistral") {
394+
if (model === "voxtral-mini-2602" || model === "voxtral-mini-latest") {
395+
return "batch";
396+
}
397+
398+
if (model === "voxtral-mini-transcribe-realtime-2602") {
399+
return "realtime";
400+
}
379401
}
380402

381-
if (model === "stt-v4" || model === "stt-rt-v4") {
382-
return "realtime";
403+
if (providerId === "soniox") {
404+
if (model === "stt-v5" || model === "stt-async-v5") {
405+
return "batch";
406+
}
407+
408+
if (model === "stt-v4" || model === "stt-rt-v4") {
409+
return "realtime";
410+
}
383411
}
384412

385413
return undefined;

apps/desktop/src/settings/ai/stt/shared.tsx

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@ export const displayModelId = (model: string) => {
3434
return "Pro (Cloud)";
3535
}
3636

37-
if (
38-
model === "universal-3-pro" ||
39-
model === "u3-rt-pro" ||
40-
model === "universal"
41-
) {
37+
if (model === "u3-rt-pro") {
38+
return "Universal-3 Pro Streaming";
39+
}
40+
41+
if (model === "universal-3-pro" || model === "universal") {
4242
return "Universal-3 Pro";
4343
}
4444

@@ -62,6 +62,10 @@ export const displayModelId = (model: string) => {
6262
return "Solaria 1";
6363
}
6464

65+
if (model === "scribe_v2_realtime") {
66+
return "Scribe V2 Realtime";
67+
}
68+
6569
if (model === "scribe_v2") {
6670
return "Scribe V2";
6771
}
@@ -78,6 +82,10 @@ export const displayModelId = (model: string) => {
7882
return "GPT-4o mini Transcribe";
7983
}
8084

85+
if (model === "voxtral-mini-transcribe-realtime-2602") {
86+
return "Voxtral Realtime";
87+
}
88+
8189
if (model === "voxtral-mini-2602") {
8290
return "Voxtral Mini Transcribe 2";
8391
}
@@ -155,7 +163,7 @@ const _PROVIDERS = [
155163
badge: null,
156164
icon: <AssemblyAI size={12} />,
157165
baseUrl: "https://api.assemblyai.com",
158-
models: ["universal-3-pro"],
166+
models: ["universal-3-pro", "u3-rt-pro"],
159167
requirements: [{ kind: "requires_config", fields: ["api_key"] }],
160168
},
161169
{
@@ -207,7 +215,7 @@ const _PROVIDERS = [
207215
badge: null,
208216
icon: <ElevenLabs size={16} />,
209217
baseUrl: "https://api.elevenlabs.io",
210-
models: ["scribe_v2"],
218+
models: ["scribe_v2", "scribe_v2_realtime"],
211219
requirements: [{ kind: "requires_config", fields: ["api_key"] }],
212220
},
213221
{
@@ -217,7 +225,7 @@ const _PROVIDERS = [
217225
badge: null,
218226
icon: <Mistral size={16} />,
219227
baseUrl: "https://api.mistral.ai/v1",
220-
models: ["voxtral-mini-2602"],
228+
models: ["voxtral-mini-2602", "voxtral-mini-transcribe-realtime-2602"],
221229
requirements: [{ kind: "requires_config", fields: ["api_key"] }],
222230
},
223231
{

crates/owhisper-client/src/adapter/elevenlabs/batch.rs

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,7 @@ impl ElevenLabsAdapter {
7171
))
7272
})?;
7373

74-
let default = crate::providers::Provider::ElevenLabs.default_batch_model();
75-
let model = match params.model.as_deref() {
76-
Some(m) if crate::providers::is_meta_model(m) => default,
77-
Some("scribe_v2") => default,
78-
Some(m) => m,
79-
None => default,
80-
};
74+
let model = Self::resolve_batch_model(params.model.as_deref());
8175

8276
let part = reqwest::multipart::Part::bytes(file_bytes).file_name(file_name);
8377
let mut form = reqwest::multipart::Form::new()
@@ -124,6 +118,16 @@ impl ElevenLabsAdapter {
124118
params.num_speakers.or(params.max_speakers)
125119
}
126120

121+
/// Picks the model id to use for an ElevenLabs batch transcription request.
///
/// Returns the provider's default batch model when `model` is `None`, when
/// `is_meta_model` reports the id as a meta model, or when the id is
/// `scribe_v2` or `scribe_v2_realtime`. Any other id is returned unchanged.
fn resolve_batch_model(model: Option<&str>) -> &str {
122+
let default = crate::providers::Provider::ElevenLabs.default_batch_model();
123+
match model {
124+
Some(m) if crate::providers::is_meta_model(m) => default,
125+
Some("scribe_v2" | "scribe_v2_realtime") => default,
126+
Some(m) => m,
127+
None => default,
128+
}
129+
}
130+
127131
fn convert_to_batch_response(response: TranscriptResponse) -> BatchResponse {
128132
let words: Vec<BatchWord> = response
129133
.words
@@ -192,6 +196,14 @@ mod tests {
192196
);
193197
}
194198

199+
// Checks that asking for `scribe_v2_realtime` in batch mode resolves to
// `scribe_v2` instead of being passed through unchanged.
#[test]
200+
fn batch_realtime_model_alias_uses_batch_model() {
201+
assert_eq!(
202+
ElevenLabsAdapter::resolve_batch_model(Some("scribe_v2_realtime")),
203+
"scribe_v2"
204+
);
205+
}
206+
195207
#[test]
196208
fn speaker_labeled_words_use_mixed_capture_channel() {
197209
let response = TranscriptResponse {

crates/owhisper-client/src/adapter/elevenlabs/live.rs

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,7 @@ impl RealtimeSttAdapter for ElevenLabsAdapter {
3434
query_pairs.append_pair(key, value);
3535
}
3636

37-
let default = crate::providers::Provider::ElevenLabs.default_live_model();
38-
let model = match params.model.as_deref() {
39-
Some(m) if crate::providers::is_meta_model(m) => default,
40-
Some("scribe_v2") => default,
41-
Some(m) => m,
42-
None => default,
43-
};
37+
let model = Self::resolve_live_model(params.model.as_deref());
4438
query_pairs.append_pair("model_id", model);
4539

4640
let audio_format = format!("pcm_{}", params.sample_rate);
@@ -201,6 +195,16 @@ enum ElevenLabsMessage {
201195
}
202196

203197
impl ElevenLabsAdapter {
198+
/// Picks the model id to use for an ElevenLabs live (streaming) session.
///
/// Returns the provider's default live model when `model` is `None`, when
/// `is_meta_model` reports the id as a meta model, or when the id is
/// `scribe_v2`. Any other id is returned unchanged.
// NOTE(review): presumably `scribe_v2` is the batch-only id and is swapped for
// the live default for that reason; confirm against `default_live_model`.
fn resolve_live_model(model: Option<&str>) -> &str {
199+
let default = crate::providers::Provider::ElevenLabs.default_live_model();
200+
match model {
201+
Some(m) if crate::providers::is_meta_model(m) => default,
202+
Some("scribe_v2") => default,
203+
Some(m) => m,
204+
None => default,
205+
}
206+
}
207+
204208
fn build_response(
205209
text: &str,
206210
words: Vec<ElevenLabsWord>,

crates/owhisper-client/src/adapter/mistral/batch.rs

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,7 @@ async fn do_transcribe_file(
9595
.mime_str(mime_type)
9696
.map_err(|e| Error::AudioProcessing(e.to_string()))?;
9797

98-
let default = Provider::Mistral.default_batch_model();
99-
let model = match params.model.as_deref() {
100-
Some(m) if is_meta_model(m) => default,
101-
Some(m) => m,
102-
None => default,
103-
};
98+
let model = resolve_batch_model(params.model.as_deref());
10499

105100
let mut form = Form::new()
106101
.part("file", file_part)
@@ -149,6 +144,16 @@ fn strip_punctuation(s: &str) -> String {
149144
.to_string()
150145
}
151146

147+
/// Picks the model id to use for a Mistral batch transcription request.
///
/// Returns the provider's default batch model when `model` is `None`, when
/// `is_meta_model` reports the id as a meta model, or when the id is
/// `voxtral-mini-transcribe-realtime-2602`. Any other id is returned
/// unchanged.
fn resolve_batch_model(model: Option<&str>) -> &str {
148+
let default = Provider::Mistral.default_batch_model();
149+
match model {
150+
Some(m) if is_meta_model(m) => default,
151+
Some("voxtral-mini-transcribe-realtime-2602") => default,
152+
Some(m) => m,
153+
None => default,
154+
}
155+
}
156+
152157
fn convert_response(response: MistralBatchResponse) -> BatchResponse {
153158
let (words, timing_source): (Vec<Word>, &str) = if !response.words.is_empty() {
154159
(
@@ -247,6 +252,14 @@ mod tests {
247252
use crate::adapter::BatchSttAdapter;
248253
use crate::http_client::create_client;
249254

255+
// Checks that asking for `voxtral-mini-transcribe-realtime-2602` in batch
// mode resolves to `voxtral-mini-2602` instead of being passed through.
#[test]
256+
fn batch_realtime_model_alias_uses_batch_model() {
257+
assert_eq!(
258+
resolve_batch_model(Some("voxtral-mini-transcribe-realtime-2602")),
259+
"voxtral-mini-2602"
260+
);
261+
}
262+
250263
#[test]
251264
fn convert_response_marks_segment_interpolated_words() {
252265
let response = convert_response(MistralBatchResponse {

crates/transcribe-proxy/src/relay/builder.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ pub use tokio_tungstenite::tungstenite::ClientRequestBuilder;
77

88
use super::handler::WebSocketProxy;
99
use super::types::{
10-
ClientMessageFilter, FirstMessageTransformer, InitialMessage, OnCloseCallback,
11-
ResponseTransformer,
10+
ClientBinaryMessageMapper, ClientMessageFilter, FirstMessageTransformer, InitialMessage,
11+
OnCloseCallback, ResponseTransformer,
1212
};
1313
use crate::config::DEFAULT_CONNECT_TIMEOUT_MS;
1414
use crate::provider_selector::SelectedProvider;
@@ -38,6 +38,7 @@ pub struct WebSocketProxyBuilder<S = NoUpstream> {
3838
connect_timeout: Duration,
3939
on_close: Option<OnCloseCallback>,
4040
client_message_filter: Option<ClientMessageFilter>,
41+
client_binary_message_mapper: Option<ClientBinaryMessageMapper>,
4142
}
4243

4344
impl Default for WebSocketProxyBuilder<NoUpstream> {
@@ -51,6 +52,7 @@ impl Default for WebSocketProxyBuilder<NoUpstream> {
5152
connect_timeout: Duration::from_millis(DEFAULT_CONNECT_TIMEOUT_MS),
5253
on_close: None,
5354
client_message_filter: None,
55+
client_binary_message_mapper: None,
5456
}
5557
}
5658
}
@@ -66,6 +68,7 @@ impl<S> WebSocketProxyBuilder<S> {
6668
connect_timeout: self.connect_timeout,
6769
on_close: self.on_close,
6870
client_message_filter: self.client_message_filter,
71+
client_binary_message_mapper: self.client_binary_message_mapper,
6972
}
7073
}
7174

@@ -79,6 +82,7 @@ impl<S> WebSocketProxyBuilder<S> {
7982
connect_timeout: Duration,
8083
on_close: Option<OnCloseCallback>,
8184
client_message_filter: Option<ClientMessageFilter>,
85+
client_binary_message_mapper: Option<ClientBinaryMessageMapper>,
8286
) -> WebSocketProxy {
8387
let control_message_types = if control_message_types.is_empty() {
8488
None
@@ -95,6 +99,7 @@ impl<S> WebSocketProxyBuilder<S> {
9599
connect_timeout,
96100
on_close,
97101
client_message_filter,
102+
client_binary_message_mapper,
98103
)
99104
}
100105

@@ -145,6 +150,11 @@ impl<S> WebSocketProxyBuilder<S> {
145150
self.client_message_filter = Some(filter);
146151
self
147152
}
153+
154+
/// Stores `mapper` as the builder's client binary message mapper, replacing
/// any mapper set earlier, and returns the builder so calls can be chained.
pub fn client_binary_message_mapper(mut self, mapper: ClientBinaryMessageMapper) -> Self {
155+
self.client_binary_message_mapper = Some(mapper);
156+
self
157+
}
148158
}
149159

150160
impl WebSocketProxyBuilder<NoUpstream> {
@@ -209,6 +219,7 @@ impl WebSocketProxyBuilder<WithUrl> {
209219
self.connect_timeout,
210220
self.on_close,
211221
self.client_message_filter,
222+
self.client_binary_message_mapper,
212223
))
213224
}
214225
}

crates/transcribe-proxy/src/relay/channel_split/io.rs

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ use futures_util::{SinkExt, StreamExt};
33
use tokio_tungstenite::tungstenite::Message as TungsteniteMessage;
44

55
use super::super::types::{
6-
ClientMessageFilter, ClientReceiver, ClientSender, DEFAULT_CLOSE_CODE, ShutdownSignal,
7-
UpstreamReceiver, UpstreamSender, convert,
6+
ClientBinaryMessage, ClientBinaryMessageMapper, ClientMessageFilter, ClientReceiver,
7+
ClientSender, DEFAULT_CLOSE_CODE, ShutdownSignal, UpstreamReceiver, UpstreamSender, convert,
88
};
99
use super::coordinator::SplitEvent;
1010
use super::payload::RewrittenSplitResponse;
@@ -74,6 +74,7 @@ pub(super) async fn relay_client_to_upstreams(
7474
mut mic_tx: UpstreamSender,
7575
mut spk_tx: UpstreamSender,
7676
client_message_filter: Option<ClientMessageFilter>,
77+
client_binary_message_mapper: Option<ClientBinaryMessageMapper>,
7778
shutdown_tx: tokio::sync::broadcast::Sender<ShutdownSignal>,
7879
event_tx: tokio::sync::mpsc::Sender<SplitEvent>,
7980
) {
@@ -122,14 +123,30 @@ pub(super) async fn relay_client_to_upstreams(
122123
}
123124

124125
let (mic, spk) = deinterleave(&bytes);
126+
let to_upstream = |data: Vec<u8>| {
127+
let mapped = match client_binary_message_mapper.as_ref() {
128+
Some(mapper) => mapper(data)?,
129+
None => ClientBinaryMessage::Binary(data),
130+
};
131+
132+
Some(match mapped {
133+
ClientBinaryMessage::Text(text) => TungsteniteMessage::Text(text.into()),
134+
ClientBinaryMessage::Binary(data) => TungsteniteMessage::Binary(data.into()),
135+
})
136+
};
137+
138+
let Some(mic_message) = to_upstream(mic) else {
139+
continue;
140+
};
141+
let Some(spk_message) = to_upstream(spk) else {
142+
continue;
143+
};
144+
125145
if mic_tx
126-
.send(TungsteniteMessage::Binary(mic.into()))
146+
.send(mic_message)
127147
.await
128148
.is_err()
129-
|| spk_tx
130-
.send(TungsteniteMessage::Binary(spk.into()))
131-
.await
132-
.is_err()
149+
|| spk_tx.send(spk_message).await.is_err()
133150
{
134151
let _ = event_tx
135152
.send(SplitEvent::Fatal(upstream_send_failed_signal()))

0 commit comments

Comments
 (0)