@@ -16,6 +16,13 @@ use goose::config::paths::Paths;
1616use goose:: config:: permission:: PermissionManager ;
1717use goose:: config:: { Config , GooseMode } ;
1818use goose:: conversation:: message:: { ActionRequiredData , Message , MessageContent } ;
19+ #[ cfg( feature = "local-inference" ) ]
20+ use goose:: dictation:: providers:: transcribe_local;
21+ use goose:: dictation:: providers:: {
22+ all_providers, is_configured, transcribe_with_provider, DictationProvider ,
23+ } ;
24+ #[ cfg( feature = "local-inference" ) ]
25+ use goose:: dictation:: whisper;
1926use goose:: mcp_utils:: ToolResult ;
2027use goose:: permission:: permission_confirmation:: PrincipalType ;
2128use goose:: permission:: { Permission , PermissionConfirmation } ;
@@ -68,6 +75,9 @@ pub type AcpProviderFactory = Arc<
6875
// NOTE(review): presumably the id and display label of the built-in default
// provider entry; their usage is outside this chunk, so confirm against callers.
const DEFAULT_PROVIDER_ID: &str = "goose";
const DEFAULT_PROVIDER_LABEL: &str = "Goose (Default)";
// Model ids reported by the dictation config helpers as the default (and only
// listed) transcription model for each hosted dictation provider.
const OPENAI_TRANSCRIPTION_MODEL: &str = "whisper-1";
const GROQ_TRANSCRIPTION_MODEL: &str = "whisper-large-v3-turbo";
const ELEVENLABS_TRANSCRIPTION_MODEL: &str = "scribe_v1";
7181
7282/// In-memory state for an active ACP session.
7383///
@@ -2651,6 +2661,197 @@ impl GooseAcpAgent {
26512661 . map_err ( |e| sacp:: Error :: internal_error ( ) . data ( e. to_string ( ) ) ) ?;
26522662 Ok ( EmptyResponse { } )
26532663 }
2664+
2665+ #[ custom_method( DictationTranscribeRequest ) ]
2666+ async fn on_dictation_transcribe (
2667+ & self ,
2668+ req : DictationTranscribeRequest ,
2669+ ) -> Result < DictationTranscribeResponse , sacp:: Error > {
2670+ use base64:: { engine:: general_purpose:: STANDARD as BASE64 , Engine } ;
2671+
2672+ let provider: DictationProvider = serde_json:: from_value ( serde_json:: Value :: String (
2673+ req. provider . clone ( ) ,
2674+ ) )
2675+ . map_err ( |_| {
2676+ sacp:: Error :: invalid_params ( ) . data ( format ! ( "Unknown provider: {}" , req. provider) )
2677+ } ) ?;
2678+
2679+ let audio_bytes = BASE64
2680+ . decode ( & req. audio )
2681+ . map_err ( |_| sacp:: Error :: invalid_params ( ) . data ( "Invalid base64 audio data" ) ) ?;
2682+
2683+ if audio_bytes. len ( ) > 50 * 1024 * 1024 {
2684+ return Err ( sacp:: Error :: invalid_params ( ) . data ( "Audio too large (max 50MB)" ) ) ;
2685+ }
2686+
2687+ let extension = match req. mime_type . as_str ( ) {
2688+ "audio/webm" | "audio/webm;codecs=opus" => "webm" ,
2689+ "audio/mp4" => "mp4" ,
2690+ "audio/mpeg" | "audio/mpga" => "mp3" ,
2691+ "audio/m4a" => "m4a" ,
2692+ "audio/wav" | "audio/x-wav" => "wav" ,
2693+ other => {
2694+ return Err (
2695+ sacp:: Error :: invalid_params ( ) . data ( format ! ( "Unsupported format: {other}" ) )
2696+ )
2697+ }
2698+ } ;
2699+
2700+ let text = match provider {
2701+ DictationProvider :: OpenAI => {
2702+ transcribe_with_provider (
2703+ DictationProvider :: OpenAI ,
2704+ "model" . to_string ( ) ,
2705+ "whisper-1" . to_string ( ) ,
2706+ audio_bytes,
2707+ extension,
2708+ & req. mime_type ,
2709+ )
2710+ . await
2711+ }
2712+ DictationProvider :: Groq => {
2713+ transcribe_with_provider (
2714+ DictationProvider :: Groq ,
2715+ "model" . to_string ( ) ,
2716+ "whisper-large-v3-turbo" . to_string ( ) ,
2717+ audio_bytes,
2718+ extension,
2719+ & req. mime_type ,
2720+ )
2721+ . await
2722+ }
2723+ DictationProvider :: ElevenLabs => {
2724+ transcribe_with_provider (
2725+ DictationProvider :: ElevenLabs ,
2726+ "model_id" . to_string ( ) ,
2727+ "scribe_v1" . to_string ( ) ,
2728+ audio_bytes,
2729+ extension,
2730+ & req. mime_type ,
2731+ )
2732+ . await
2733+ }
2734+ #[ cfg( feature = "local-inference" ) ]
2735+ DictationProvider :: Local => transcribe_local ( audio_bytes) . await ,
2736+ #[ cfg( not( feature = "local-inference" ) ) ]
2737+ DictationProvider :: Local => {
2738+ return Err ( sacp:: Error :: invalid_params ( )
2739+ . data ( "Local inference is not available in this build" ) ) ;
2740+ }
2741+ }
2742+ . map_err ( |e| sacp:: Error :: internal_error ( ) . data ( e. to_string ( ) ) ) ?;
2743+
2744+ Ok ( DictationTranscribeResponse { text } )
2745+ }
2746+
2747+ #[ custom_method( DictationConfigRequest ) ]
2748+ async fn on_dictation_config (
2749+ & self ,
2750+ _req : DictationConfigRequest ,
2751+ ) -> Result < DictationConfigResponse , sacp:: Error > {
2752+ let config = goose:: config:: Config :: global ( ) ;
2753+ let mut providers = std:: collections:: HashMap :: new ( ) ;
2754+
2755+ for def in all_providers ( ) {
2756+ let provider = def. provider ;
2757+ let host = if let Some ( host_key) = def. host_key {
2758+ config
2759+ . get ( host_key, false )
2760+ . ok ( )
2761+ . and_then ( |v| v. as_str ( ) . map ( |s| s. to_string ( ) ) )
2762+ } else {
2763+ None
2764+ } ;
2765+
2766+ let provider_key = serde_json:: to_value ( provider)
2767+ . ok ( )
2768+ . and_then ( |v| v. as_str ( ) . map ( |s| s. to_string ( ) ) )
2769+ . unwrap_or_else ( || format ! ( "{:?}" , provider) . to_lowercase ( ) ) ;
2770+ providers. insert (
2771+ provider_key,
2772+ DictationProviderStatusEntry {
2773+ configured : is_configured ( provider) ,
2774+ host,
2775+ description : def. description . to_string ( ) ,
2776+ uses_provider_config : def. uses_provider_config ,
2777+ settings_path : def. settings_path . map ( |s| s. to_string ( ) ) ,
2778+ config_key : if !def. uses_provider_config {
2779+ Some ( def. config_key . to_string ( ) )
2780+ } else {
2781+ None
2782+ } ,
2783+ model_config_key : dictation_model_config_key ( provider) ,
2784+ default_model : dictation_default_model ( provider) ,
2785+ selected_model : dictation_selected_model ( & config, provider) ,
2786+ available_models : dictation_available_models ( provider) ,
2787+ } ,
2788+ ) ;
2789+ }
2790+
2791+ Ok ( DictationConfigResponse { providers } )
2792+ }
2793+ }
2794+
2795+ fn dictation_model_config_key ( provider : DictationProvider ) -> Option < String > {
2796+ #[ cfg( feature = "local-inference" ) ]
2797+ if provider == DictationProvider :: Local {
2798+ return Some ( whisper:: LOCAL_WHISPER_MODEL_CONFIG_KEY . to_string ( ) ) ;
2799+ }
2800+
2801+ None
2802+ }
2803+
2804+ fn dictation_default_model ( provider : DictationProvider ) -> Option < String > {
2805+ match provider {
2806+ DictationProvider :: OpenAI => Some ( OPENAI_TRANSCRIPTION_MODEL . to_string ( ) ) ,
2807+ DictationProvider :: Groq => Some ( GROQ_TRANSCRIPTION_MODEL . to_string ( ) ) ,
2808+ DictationProvider :: ElevenLabs => Some ( ELEVENLABS_TRANSCRIPTION_MODEL . to_string ( ) ) ,
2809+ #[ cfg( feature = "local-inference" ) ]
2810+ DictationProvider :: Local => Some ( whisper:: recommend_model ( ) . to_string ( ) ) ,
2811+ }
2812+ }
2813+
2814+ fn dictation_selected_model ( config : & Config , provider : DictationProvider ) -> Option < String > {
2815+ #[ cfg( feature = "local-inference" ) ]
2816+ if provider == DictationProvider :: Local {
2817+ return config
2818+ . get ( whisper:: LOCAL_WHISPER_MODEL_CONFIG_KEY , false )
2819+ . ok ( )
2820+ . and_then ( |value| value. as_str ( ) . map ( str:: to_owned) )
2821+ . filter ( |model_id| whisper:: get_model ( model_id) . is_some ( ) )
2822+ . or_else ( || dictation_default_model ( provider) ) ;
2823+ }
2824+
2825+ dictation_default_model ( provider)
2826+ }
2827+
2828+ fn dictation_available_models ( provider : DictationProvider ) -> Vec < DictationModelOption > {
2829+ match provider {
2830+ DictationProvider :: OpenAI => vec ! [ DictationModelOption {
2831+ id: OPENAI_TRANSCRIPTION_MODEL . to_string( ) ,
2832+ label: "Whisper-1" . to_string( ) ,
2833+ description: "OpenAI's hosted Whisper transcription model." . to_string( ) ,
2834+ } ] ,
2835+ DictationProvider :: Groq => vec ! [ DictationModelOption {
2836+ id: GROQ_TRANSCRIPTION_MODEL . to_string( ) ,
2837+ label: "Whisper Large V3 Turbo" . to_string( ) ,
2838+ description: "Groq's fast hosted Whisper transcription model." . to_string( ) ,
2839+ } ] ,
2840+ DictationProvider :: ElevenLabs => vec ! [ DictationModelOption {
2841+ id: ELEVENLABS_TRANSCRIPTION_MODEL . to_string( ) ,
2842+ label: "Scribe v1" . to_string( ) ,
2843+ description: "ElevenLabs' hosted speech-to-text model." . to_string( ) ,
2844+ } ] ,
2845+ #[ cfg( feature = "local-inference" ) ]
2846+ DictationProvider :: Local => whisper:: available_models ( )
2847+ . iter ( )
2848+ . map ( |model| DictationModelOption {
2849+ id : model. id . to_string ( ) ,
2850+ label : model. id . to_string ( ) ,
2851+ description : model. description . to_string ( ) ,
2852+ } )
2853+ . collect ( ) ,
2854+ }
26542855}
26552856
26562857pub struct GooseAcpHandler {
0 commit comments