feat(gateway): feishu voice message STT via gateway audio attachment

wangyuyan-agent · wangyuyan-agent · commit 646d99aad490 · 2026-05-07T07:47:07.000+08:00
- Add msg_type=audio support to feishu adapter (parse, download, base64 encode)
- Add MediaRef::Audio variant and download_feishu_audio() function
- Add "audio" attachment type to core gateway handler (decode → stt::transcribe)
- Pass SttConfig to gateway handler via GatewayParams
- Update docs/feishu.md and docs/stt.md for multi-platform voice support

Feishu voice messages (opus/ogg) are downloaded by the gateway, passed as
base64-encoded audio attachments to core, and transcribed via the existing
[stt] infrastructure (Groq Whisper by default). This is the first gateway
platform to support audio — LINE/Telegram can reuse the core-side handler.

Tested: 102 gateway tests + 197 core tests pass. E2E verified.
diff --git a/docs/feishu.md b/docs/feishu.md
@@ -167,6 +167,7 @@ The gateway downloads and forwards image and text file attachments to the AI age
 | `text` | Text extracted, forwarded as prompt |
 | `image` | Image downloaded, resized (max 1200px), JPEG compressed, base64 encoded → `ContentBlock::Image` |
 | `file` | Text files only (`.txt`, `.py`, `.rs`, `.md`, `.json`, etc., max 512KB). Non-text files (`.pdf`, `.zip`, etc.) are silently ignored. |
+| `audio` | Voice message downloaded (opus/ogg, max 25MB), base64 encoded, forwarded to core as an `audio/ogg` attachment. If `[stt]` is enabled, core transcribes it via the Whisper API and injects `[Voice message transcript]: ...` into the prompt. If STT is disabled or transcription fails, the audio attachment is silently skipped and no transcript is added. |
 | `post` | Rich text: text nodes extracted as prompt, `img` nodes downloaded as image attachments. This is the format Feishu uses when @mention + paste image in a group. |
 
 **Group chat limitation:** Feishu does not allow @mention and image upload in the same message. However, @mention + paste (Ctrl+V) an image works — Feishu sends this as a `post` message containing both the mention and the image. Direct image upload (via the attachment button) cannot include @mention, so the bot will not respond in groups.
diff --git a/docs/stt.md b/docs/stt.md
@@ -1,6 +1,6 @@
 # Speech-to-Text (STT) for Voice Messages
 
-openab can automatically transcribe Discord voice message attachments and forward the transcript to your ACP agent as text.
+openab can automatically transcribe voice message attachments from Discord and from gateway platforms that send audio attachments (currently Feishu), and forward the transcript to your ACP agent as text.
 
 ## Quick Start
 
@@ -24,7 +24,7 @@ api_key = "${GROQ_API_KEY}"
 ## How It Works
 
 ```
-Discord voice message (.ogg)
+Voice message (Discord .ogg, Feishu opus/ogg, etc.)
        │
        ▼
   openab downloads the audio file
@@ -161,6 +161,6 @@ When disabled, audio attachments are silently skipped with no impact on existing
 ## Technical Notes
 
 - openab sends `response_format=json` in the transcription request to ensure the response is always parseable JSON. Some local whisper servers default to plain text output without this parameter.
-- The actual MIME type from the Discord attachment is passed through to the STT API (e.g. `audio/ogg`, `audio/mp4`, `audio/wav`).
+- The MIME type supplied with the attachment is passed through to the STT API (e.g. `audio/ogg`, `audio/mp4`, `audio/wav`). Discord provides the attachment's actual MIME type; the Feishu gateway always labels voice messages as `audio/ogg`.
 - Environment variables in config values are expanded via `${VAR}` syntax (e.g. `api_key = "${GROQ_API_KEY}"`).
 - The `api_key` field is auto-detected from the `GROQ_API_KEY` environment variable when using the default Groq endpoint. If you set a custom `base_url` (e.g. local server), auto-detect is disabled to avoid leaking the Groq key to unrelated endpoints — you must set `api_key` explicitly.
diff --git a/gateway/src/adapters/feishu.rs b/gateway/src/adapters/feishu.rs
@@ -297,7 +297,7 @@ mod event_types {
         let sender = event.sender.as_ref()?;
 
         let msg_type = msg.message_type.as_deref().unwrap_or("text");
-        if !matches!(msg_type, "text" | "image" | "file" | "post") {
+        if !matches!(msg_type, "text" | "image" | "file" | "post" | "audio") {
             return None;
         }
         // Skip bot messages with explicit sender_type
@@ -385,6 +385,17 @@ mod event_types {
                 }];
                 (String::new(), mentions.1, refs)
             }
+            "audio" => {
+                let file_key = content_json.get("file_key")?.as_str()?;
+                let mentions = extract_mentions(
+                    "", msg.mentions.as_deref().unwrap_or(&[]), bot_open_id,
+                );
+                let refs = vec![MediaRef::Audio {
+                    message_id: message_id.to_string(),
+                    file_key: file_key.to_string(),
+                }];
+                (String::new(), mentions.1, refs)
+            }
             "post" => {
                 // Rich text: content is {"title":"...","content":[[{tag,text,...},{tag,image_key,...}]]}
                 let mut texts = Vec::new();
@@ -1038,6 +1049,9 @@ async fn handle_ws_message(
                         MediaRef::File { message_id, file_key, file_name } => {
                             download_feishu_file(client, &api_base, &token, message_id, file_key, file_name).await
                         }
+                        MediaRef::Audio { message_id, file_key } => {
+                            download_feishu_audio(client, &api_base, &token, message_id, file_key).await
+                        }
                     };
                     if let Some(att) = attachment {
                         gateway_event.content.attachments.push(att);
@@ -1343,6 +1357,7 @@ fn try_parse_link(chars: &[char], start: usize) -> Option<(String, String, usize
 pub enum MediaRef {
     Image { message_id: String, image_key: String },
     File { message_id: String, file_key: String, file_name: String },
+    Audio { message_id: String, file_key: String }, // voice message resource; resolved to an attachment by download_feishu_audio
 }
 
 const IMAGE_MAX_DIMENSION_PX: u32 = 1200;
@@ -1497,6 +1512,56 @@ pub async fn download_feishu_file(
     })
 }
 
+const AUDIO_MAX_DOWNLOAD: u64 = 25 * 1024 * 1024; // 25 MB cap on downloaded audio (Whisper API limit)
+
+/// Download a Feishu audio message by message_id + file_key → base64 `"audio"` Attachment; returns `None` on request failure, non-success status, body read error, or when the audio exceeds `AUDIO_MAX_DOWNLOAD`.
+pub async fn download_feishu_audio(
+    client: &reqwest::Client,
+    api_base: &str,
+    token: &str,
+    message_id: &str,
+    file_key: &str,
+) -> Option<crate::schema::Attachment> {
+    let url = format!(
+        "{}/open-apis/im/v1/messages/{}/resources/{}?type=file", // audio is requested from the message-resource endpoint with type=file
+        api_base, message_id, file_key
+    );
+    let resp = match client.get(&url).bearer_auth(token).send().await { // bearer-authenticated GET; send errors are logged and yield None
+        Ok(r) => r,
+        Err(e) => {
+            tracing::warn!(file_key, error = %e, "feishu audio download failed");
+            return None;
+        }
+    };
+    if !resp.status().is_success() { // non-success HTTP status: log and give up
+        tracing::warn!(file_key, status = %resp.status(), "feishu audio download failed");
+        return None;
+    }
+    if let Some(cl) = resp.headers().get(reqwest::header::CONTENT_LENGTH) { // pre-check the declared size before reading the body
+        if let Ok(size) = cl.to_str().unwrap_or("0").parse::<u64>() { // a malformed header never rejects here; the post-download check below still applies
+            if size > AUDIO_MAX_DOWNLOAD {
+                tracing::warn!(file_key, size, "feishu audio exceeds 25MB limit");
+                return None;
+            }
+        }
+    }
+    let bytes = resp.bytes().await.ok()?; // body read errors return None without logging
+    if bytes.len() as u64 > AUDIO_MAX_DOWNLOAD { // enforce the limit on the actual body size; the header check is skipped when Content-Length is missing
+        tracing::warn!(file_key, size = bytes.len(), "feishu audio exceeds 25MB limit");
+        return None;
+    }
+    tracing::debug!(file_key, size = bytes.len(), "feishu audio downloaded");
+    use base64::Engine;
+    let data = base64::engine::general_purpose::STANDARD.encode(&bytes);
+    Some(crate::schema::Attachment {
+        attachment_type: "audio".into(),
+        filename: format!("{}.ogg", file_key), // name is synthesized from file_key; the .ogg extension is fixed, not read from the response
+        mime_type: "audio/ogg".into(), // hard-coded; the response Content-Type is not consulted
+        data,
+        size: bytes.len() as u64, // size of the raw audio bytes, before base64 encoding
+    })
+}
+
 /// Send a post (rich text) message to a feishu chat_id.
 /// Returns the sent message_id on success, None on failure.
 /// When `reply_to` is Some(root_id), uses the reply API to stay in a thread.
@@ -2260,6 +2325,9 @@ pub async fn webhook(
                             MediaRef::File { message_id, file_key, file_name } => {
                                 download_feishu_file(&feishu.client, &api_base, &token, message_id, file_key, file_name).await
                             }
+                            MediaRef::Audio { message_id, file_key } => {
+                                download_feishu_audio(&feishu.client, &api_base, &token, message_id, file_key).await
+                            }
                         };
                         if let Some(att) = attachment {
                             gateway_event.content.attachments.push(att);
diff --git a/src/gateway.rs b/src/gateway.rs
@@ -487,6 +487,7 @@ pub struct GatewayParams {
     pub allow_all_users: bool,
     pub allowed_users: Vec<String>,
     pub streaming: bool,
+    pub stt: crate::config::SttConfig,
 }
 
 pub async fn run_gateway_adapter(
@@ -504,6 +505,7 @@ pub async fn run_gateway_adapter(
     let allow_all_users = params.allow_all_users;
     let allowed_users = params.allowed_users;
     let streaming = params.streaming;
+    let stt_config = params.stt;
 
     let connect_url = match &params.token {
         Some(token) => {
@@ -662,6 +664,26 @@ pub async fn run_gateway_adapter(
                                                     });
                                                 }
                                             }
+                                            "audio" => {
+                                                if stt_config.enabled {
+                                                    use base64::Engine;
+                                                    if let Ok(audio_bytes) = base64::engine::general_purpose::STANDARD.decode(&att.data) {
+                                                        if let Some(transcript) = crate::stt::transcribe(
+                                                            &crate::media::HTTP_CLIENT,
+                                                            &stt_config,
+                                                            audio_bytes,
+                                                            att.filename.clone(),
+                                                            &att.mime_type,
+                                                        ).await {
+                                                            extra_blocks.push(ContentBlock::Text {
+                                                                text: format!("[Voice message transcript]: {transcript}"),
+                                                            });
+                                                        }
+                                                    } else {
+                                                        warn!(filename = %att.filename, "audio attachment base64 decode failed");
+                                                    }
+                                                }
+                                            }
                                             _ => {}
                                         }
                                     }
diff --git a/src/main.rs b/src/main.rs
@@ -228,6 +228,7 @@ async fn main() -> anyhow::Result<()> {
             allow_all_users: config::resolve_allow_all(gw_cfg.allow_all_users, &gw_cfg.allowed_users),
             allowed_users: gw_cfg.allowed_users,
             streaming: gw_cfg.streaming,
+            stt: cfg.stt.clone(),
         };
         Some(tokio::spawn(async move {
             if let Err(e) = gateway::run_gateway_adapter(params, router, shutdown_rx).await {