Skip to content

Commit df1b1c0

Browse files
tulsi-builder and claude committed
feat: wire up voice dictation in goose2 via ACP
Add voice dictation support to the goose2 Tauri app by exposing
transcription and config as ACP custom methods, then wiring the frontend
to use them.

Backend (crates/):
- Add DictationTranscribeRequest/Response and DictationConfigRequest/Response
  types to goose-sdk custom_requests.rs with model metadata fields
- Add #[custom_method] handlers in goose-acp server.rs for transcribe
  (OpenAI, Groq, ElevenLabs, Local) and config
- Register methods in acp-meta.json
- Forward local-inference feature from goose-cli to goose-acp

Tauri (ui/goose2/src-tauri/):
- Rewrite dictation.rs to use call_ext_method via ACP instead of importing
  the goose crate directly
- Add generic CallExt command to the ACP manager with method-name
  normalization (strips leading _ to avoid double-prefix)
- Register get_dictation_config and transcribe_dictation commands

Frontend (ui/goose2/src/):
- Wire useDictationRecorder + useVoiceInputPreferences into ChatInput
- Replace the placeholder mic button with a working toggle (recording/
  transcribing states, auto-submit on keyword)
- Stop recording on manual send and on the auto-submit keyword
- Show "Listening..."/"Transcribing..." placeholder in the textarea
- Add a Voice section to SettingsModal with VoiceInputSettings
- Add all voice i18n strings (en + es)
- Fix pre-existing type errors in dictationVad.ts and VoiceInputSettings

Known issue: Local Whisper reports configured: false despite the model being
downloaded and the config set. The is_downloaded() path check needs
investigation in a follow-up.

Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
1 parent 0ec73c8 commit df1b1c0

31 files changed

Lines changed: 2362 additions & 188 deletions

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/goose-acp/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ path = "src/bin/generate_acp_schema.rs"
1414
[features]
1515
default = ["code-mode", "rustls-tls"]
1616
code-mode = ["goose/code-mode"]
17+
local-inference = ["goose/local-inference"]
1718
rustls-tls = ["goose/rustls-tls", "goose-mcp/rustls-tls"]
1819
native-tls = ["goose/native-tls", "goose-mcp/native-tls"]
1920

@@ -48,6 +49,7 @@ uuid = { workspace = true, features = ["v7"] }
4849
schemars = { workspace = true, features = ["derive"] }
4950
goose-acp-macros = { path = "../goose-acp-macros" }
5051
goose-sdk = { path = "../goose-sdk" }
52+
base64 = { workspace = true }
5153

5254
[dev-dependencies]
5355
async-trait = { workspace = true }

crates/goose-acp/acp-meta.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,16 @@
109109
"method": "_goose/session/unarchive",
110110
"requestType": "UnarchiveSessionRequest",
111111
"responseType": "EmptyResponse"
112+
},
113+
{
114+
"method": "_goose/dictation/transcribe",
115+
"requestType": "DictationTranscribeRequest",
116+
"responseType": "DictationTranscribeResponse"
117+
},
118+
{
119+
"method": "_goose/dictation/config",
120+
"requestType": "DictationConfigRequest",
121+
"responseType": "DictationConfigResponse"
112122
}
113123
]
114124
}

crates/goose-acp/src/server.rs

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@ use goose::config::paths::Paths;
1616
use goose::config::permission::PermissionManager;
1717
use goose::config::{Config, GooseMode};
1818
use goose::conversation::message::{ActionRequiredData, Message, MessageContent};
19+
#[cfg(feature = "local-inference")]
20+
use goose::dictation::providers::transcribe_local;
21+
use goose::dictation::providers::{
22+
all_providers, is_configured, transcribe_with_provider, DictationProvider,
23+
};
24+
#[cfg(feature = "local-inference")]
25+
use goose::dictation::whisper;
1926
use goose::mcp_utils::ToolResult;
2027
use goose::permission::permission_confirmation::PrincipalType;
2128
use goose::permission::{Permission, PermissionConfirmation};
@@ -68,6 +75,9 @@ pub type AcpProviderFactory = Arc<
6875

6976
const DEFAULT_PROVIDER_ID: &str = "goose";
7077
const DEFAULT_PROVIDER_LABEL: &str = "Goose (Default)";
78+
const OPENAI_TRANSCRIPTION_MODEL: &str = "whisper-1";
79+
const GROQ_TRANSCRIPTION_MODEL: &str = "whisper-large-v3-turbo";
80+
const ELEVENLABS_TRANSCRIPTION_MODEL: &str = "scribe_v1";
7181

7282
/// In-memory state for an active ACP session.
7383
///
@@ -2651,6 +2661,197 @@ impl GooseAcpAgent {
26512661
.map_err(|e| sacp::Error::internal_error().data(e.to_string()))?;
26522662
Ok(EmptyResponse {})
26532663
}
2664+
2665+
#[custom_method(DictationTranscribeRequest)]
2666+
async fn on_dictation_transcribe(
2667+
&self,
2668+
req: DictationTranscribeRequest,
2669+
) -> Result<DictationTranscribeResponse, sacp::Error> {
2670+
use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
2671+
2672+
let provider: DictationProvider = serde_json::from_value(serde_json::Value::String(
2673+
req.provider.clone(),
2674+
))
2675+
.map_err(|_| {
2676+
sacp::Error::invalid_params().data(format!("Unknown provider: {}", req.provider))
2677+
})?;
2678+
2679+
let audio_bytes = BASE64
2680+
.decode(&req.audio)
2681+
.map_err(|_| sacp::Error::invalid_params().data("Invalid base64 audio data"))?;
2682+
2683+
if audio_bytes.len() > 50 * 1024 * 1024 {
2684+
return Err(sacp::Error::invalid_params().data("Audio too large (max 50MB)"));
2685+
}
2686+
2687+
let extension = match req.mime_type.as_str() {
2688+
"audio/webm" | "audio/webm;codecs=opus" => "webm",
2689+
"audio/mp4" => "mp4",
2690+
"audio/mpeg" | "audio/mpga" => "mp3",
2691+
"audio/m4a" => "m4a",
2692+
"audio/wav" | "audio/x-wav" => "wav",
2693+
other => {
2694+
return Err(
2695+
sacp::Error::invalid_params().data(format!("Unsupported format: {other}"))
2696+
)
2697+
}
2698+
};
2699+
2700+
let text = match provider {
2701+
DictationProvider::OpenAI => {
2702+
transcribe_with_provider(
2703+
DictationProvider::OpenAI,
2704+
"model".to_string(),
2705+
"whisper-1".to_string(),
2706+
audio_bytes,
2707+
extension,
2708+
&req.mime_type,
2709+
)
2710+
.await
2711+
}
2712+
DictationProvider::Groq => {
2713+
transcribe_with_provider(
2714+
DictationProvider::Groq,
2715+
"model".to_string(),
2716+
"whisper-large-v3-turbo".to_string(),
2717+
audio_bytes,
2718+
extension,
2719+
&req.mime_type,
2720+
)
2721+
.await
2722+
}
2723+
DictationProvider::ElevenLabs => {
2724+
transcribe_with_provider(
2725+
DictationProvider::ElevenLabs,
2726+
"model_id".to_string(),
2727+
"scribe_v1".to_string(),
2728+
audio_bytes,
2729+
extension,
2730+
&req.mime_type,
2731+
)
2732+
.await
2733+
}
2734+
#[cfg(feature = "local-inference")]
2735+
DictationProvider::Local => transcribe_local(audio_bytes).await,
2736+
#[cfg(not(feature = "local-inference"))]
2737+
DictationProvider::Local => {
2738+
return Err(sacp::Error::invalid_params()
2739+
.data("Local inference is not available in this build"));
2740+
}
2741+
}
2742+
.map_err(|e| sacp::Error::internal_error().data(e.to_string()))?;
2743+
2744+
Ok(DictationTranscribeResponse { text })
2745+
}
2746+
2747+
#[custom_method(DictationConfigRequest)]
2748+
async fn on_dictation_config(
2749+
&self,
2750+
_req: DictationConfigRequest,
2751+
) -> Result<DictationConfigResponse, sacp::Error> {
2752+
let config = goose::config::Config::global();
2753+
let mut providers = std::collections::HashMap::new();
2754+
2755+
for def in all_providers() {
2756+
let provider = def.provider;
2757+
let host = if let Some(host_key) = def.host_key {
2758+
config
2759+
.get(host_key, false)
2760+
.ok()
2761+
.and_then(|v| v.as_str().map(|s| s.to_string()))
2762+
} else {
2763+
None
2764+
};
2765+
2766+
let provider_key = serde_json::to_value(provider)
2767+
.ok()
2768+
.and_then(|v| v.as_str().map(|s| s.to_string()))
2769+
.unwrap_or_else(|| format!("{:?}", provider).to_lowercase());
2770+
providers.insert(
2771+
provider_key,
2772+
DictationProviderStatusEntry {
2773+
configured: is_configured(provider),
2774+
host,
2775+
description: def.description.to_string(),
2776+
uses_provider_config: def.uses_provider_config,
2777+
settings_path: def.settings_path.map(|s| s.to_string()),
2778+
config_key: if !def.uses_provider_config {
2779+
Some(def.config_key.to_string())
2780+
} else {
2781+
None
2782+
},
2783+
model_config_key: dictation_model_config_key(provider),
2784+
default_model: dictation_default_model(provider),
2785+
selected_model: dictation_selected_model(&config, provider),
2786+
available_models: dictation_available_models(provider),
2787+
},
2788+
);
2789+
}
2790+
2791+
Ok(DictationConfigResponse { providers })
2792+
}
2793+
}
2794+
2795+
fn dictation_model_config_key(provider: DictationProvider) -> Option<String> {
2796+
#[cfg(feature = "local-inference")]
2797+
if provider == DictationProvider::Local {
2798+
return Some(whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY.to_string());
2799+
}
2800+
2801+
None
2802+
}
2803+
2804+
fn dictation_default_model(provider: DictationProvider) -> Option<String> {
2805+
match provider {
2806+
DictationProvider::OpenAI => Some(OPENAI_TRANSCRIPTION_MODEL.to_string()),
2807+
DictationProvider::Groq => Some(GROQ_TRANSCRIPTION_MODEL.to_string()),
2808+
DictationProvider::ElevenLabs => Some(ELEVENLABS_TRANSCRIPTION_MODEL.to_string()),
2809+
#[cfg(feature = "local-inference")]
2810+
DictationProvider::Local => Some(whisper::recommend_model().to_string()),
2811+
}
2812+
}
2813+
2814+
fn dictation_selected_model(config: &Config, provider: DictationProvider) -> Option<String> {
2815+
#[cfg(feature = "local-inference")]
2816+
if provider == DictationProvider::Local {
2817+
return config
2818+
.get(whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY, false)
2819+
.ok()
2820+
.and_then(|value| value.as_str().map(str::to_owned))
2821+
.filter(|model_id| whisper::get_model(model_id).is_some())
2822+
.or_else(|| dictation_default_model(provider));
2823+
}
2824+
2825+
dictation_default_model(provider)
2826+
}
2827+
2828+
fn dictation_available_models(provider: DictationProvider) -> Vec<DictationModelOption> {
2829+
match provider {
2830+
DictationProvider::OpenAI => vec![DictationModelOption {
2831+
id: OPENAI_TRANSCRIPTION_MODEL.to_string(),
2832+
label: "Whisper-1".to_string(),
2833+
description: "OpenAI's hosted Whisper transcription model.".to_string(),
2834+
}],
2835+
DictationProvider::Groq => vec![DictationModelOption {
2836+
id: GROQ_TRANSCRIPTION_MODEL.to_string(),
2837+
label: "Whisper Large V3 Turbo".to_string(),
2838+
description: "Groq's fast hosted Whisper transcription model.".to_string(),
2839+
}],
2840+
DictationProvider::ElevenLabs => vec![DictationModelOption {
2841+
id: ELEVENLABS_TRANSCRIPTION_MODEL.to_string(),
2842+
label: "Scribe v1".to_string(),
2843+
description: "ElevenLabs' hosted speech-to-text model.".to_string(),
2844+
}],
2845+
#[cfg(feature = "local-inference")]
2846+
DictationProvider::Local => whisper::available_models()
2847+
.iter()
2848+
.map(|model| DictationModelOption {
2849+
id: model.id.to_string(),
2850+
label: model.id.to_string(),
2851+
description: model.description.to_string(),
2852+
})
2853+
.collect(),
2854+
}
26542855
}
26552856

26562857
pub struct GooseAcpHandler {

crates/goose-cli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ winapi = { workspace = true }
7171
[features]
7272
default = ["code-mode", "local-inference", "aws-providers", "telemetry", "otel", "rustls-tls"]
7373
code-mode = ["goose/code-mode", "goose-acp/code-mode"]
74-
local-inference = ["goose/local-inference"]
74+
local-inference = ["goose/local-inference", "goose-acp/local-inference"]
7575
aws-providers = ["goose/aws-providers"]
7676
cuda = ["goose/cuda", "local-inference"]
7777
telemetry = ["goose/telemetry"]

crates/goose-sdk/src/custom_requests.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,66 @@ pub struct ProviderConfigKey {
330330
pub primary: bool,
331331
}
332332

333+
/// Transcribe audio via a dictation provider.
#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
#[request(method = "_goose/dictation/transcribe", response = DictationTranscribeResponse)]
#[serde(rename_all = "camelCase")]
pub struct DictationTranscribeRequest {
    /// Base64-encoded audio data
    pub audio: String,
    /// MIME type (e.g. "audio/wav", "audio/webm")
    pub mime_type: String,
    /// Provider to use: "openai", "groq", "elevenlabs", or "local"
    pub provider: String,
}

/// Transcription result.
#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
pub struct DictationTranscribeResponse {
    // The recognized text returned by the provider.
    pub text: String,
}

/// Get the configuration status of all dictation providers.
#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
#[request(method = "_goose/dictation/config", response = DictationConfigResponse)]
pub struct DictationConfigRequest {}

/// A selectable transcription model offered by a dictation provider.
#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)]
pub struct DictationModelOption {
    // Stable model identifier (what gets stored in config).
    pub id: String,
    // Human-readable name for display in settings UI.
    pub label: String,
    // Short description of the model.
    pub description: String,
}

/// Per-provider configuration status.
#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct DictationProviderStatusEntry {
    // Whether the provider is ready to transcribe (credentials/model present).
    pub configured: bool,
    // Optional host override read from config, if the provider declares one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub host: Option<String>,
    // Human-readable provider description.
    pub description: String,
    // True when the provider reuses the main provider configuration
    // instead of a standalone API key.
    pub uses_provider_config: bool,
    // Settings UI path to configure the provider, when applicable.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub settings_path: Option<String>,
    // Standalone API-key config key; absent when uses_provider_config is true.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub config_key: Option<String>,
    // Config key storing the selected model, for providers with model choice.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model_config_key: Option<String>,
    // Provider's default model id, if it has one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub default_model: Option<String>,
    // Currently selected model id (falls back to the default).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub selected_model: Option<String>,
    // All models the provider offers; empty when there is no choice.
    #[serde(default)]
    pub available_models: Vec<DictationModelOption>,
}

/// Dictation config response — map of provider name to status.
#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
pub struct DictationConfigResponse {
    pub providers: HashMap<String, DictationProviderStatusEntry>,
}
392+
333393
/// Empty success response for operations that return no data.
334394
#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
335395
pub struct EmptyResponse {}

0 commit comments

Comments
 (0)