Skip to content

Commit 646d99a

Browse files
feat(gateway): feishu voice message STT via gateway audio attachment
- Add msg_type=audio support to feishu adapter (parse, download, base64 encode) - Add MediaRef::Audio variant and download_feishu_audio() function - Add "audio" attachment type to core gateway handler (decode → stt::transcribe) - Pass SttConfig to gateway handler via GatewayParams - Update docs/feishu.md and docs/stt.md for multi-platform voice support Feishu voice messages (opus/ogg) are downloaded by the gateway, passed as base64-encoded audio attachments to core, and transcribed via the existing [stt] infrastructure (Groq Whisper by default). This is the first gateway platform to support audio — LINE/Telegram can reuse the core-side handler. Tested: 102 gateway tests + 197 core tests pass. E2E verified.
1 parent 2ab7577 commit 646d99a

5 files changed

Lines changed: 96 additions & 4 deletions

File tree

docs/feishu.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ The gateway downloads and forwards image and text file attachments to the AI age
167167
| `text` | Text extracted, forwarded as prompt |
168168
| `image` | Image downloaded, resized (max 1200px), JPEG compressed, base64 encoded → `ContentBlock::Image` |
169169
| `file` | Text files only (`.txt`, `.py`, `.rs`, `.md`, `.json`, etc., max 512KB). Non-text files (`.pdf`, `.zip`, etc.) are silently ignored. |
170+
| `audio` | Voice message downloaded (opus/ogg, max 25MB), base64 encoded, forwarded to core. If `[stt]` is enabled, core transcribes via Whisper API and injects `[Voice message transcript]: ...` into the prompt. If STT is disabled or transcription fails, the audio attachment is silently skipped. |
170171
| `post` | Rich text: text nodes extracted as prompt, `img` nodes downloaded as image attachments. This is the format Feishu uses when @mention + paste image in a group. |
171172

172173
**Group chat limitation:** Feishu does not allow @mention and image upload in the same message. However, @mention + paste (Ctrl+V) an image works — Feishu sends this as a `post` message containing both the mention and the image. Direct image upload (via the attachment button) cannot include @mention, so the bot will not respond in groups.

docs/stt.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Speech-to-Text (STT) for Voice Messages
22

3-
openab can automatically transcribe Discord voice message attachments and forward the transcript to your ACP agent as text.
3+
openab can automatically transcribe voice message attachments (Discord and Feishu today; other gateway platforms can reuse the same core handler) and forward the transcript to your ACP agent as text.
44

55
## Quick Start
66

@@ -24,7 +24,7 @@ api_key = "${GROQ_API_KEY}"
2424
## How It Works
2525

2626
```
27-
Discord voice message (.ogg)
27+
Voice message (Discord .ogg, Feishu opus/ogg, etc.)
2828
2929
3030
openab downloads the audio file
@@ -161,6 +161,6 @@ When disabled, audio attachments are silently skipped with no impact on existing
161161
## Technical Notes
162162

163163
- openab sends `response_format=json` in the transcription request to ensure the response is always parseable JSON. Some local whisper servers default to plain text output without this parameter.
164-
- The actual MIME type from the Discord attachment is passed through to the STT API (e.g. `audio/ogg`, `audio/mp4`, `audio/wav`).
164+
- The MIME type attached to the platform attachment is passed through to the STT API (e.g. `audio/ogg`, `audio/mp4`, `audio/wav`). Discord supplies the attachment's actual MIME type; for Feishu voice messages the gateway always labels the download as `audio/ogg`.
165165
- Environment variables in config values are expanded via `${VAR}` syntax (e.g. `api_key = "${GROQ_API_KEY}"`).
166166
- The `api_key` field is auto-detected from the `GROQ_API_KEY` environment variable when using the default Groq endpoint. If you set a custom `base_url` (e.g. local server), auto-detect is disabled to avoid leaking the Groq key to unrelated endpoints — you must set `api_key` explicitly.

gateway/src/adapters/feishu.rs

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ mod event_types {
297297
let sender = event.sender.as_ref()?;
298298

299299
let msg_type = msg.message_type.as_deref().unwrap_or("text");
300-
if !matches!(msg_type, "text" | "image" | "file" | "post") {
300+
if !matches!(msg_type, "text" | "image" | "file" | "post" | "audio") {
301301
return None;
302302
}
303303
// Skip bot messages with explicit sender_type
@@ -385,6 +385,17 @@ mod event_types {
385385
}];
386386
(String::new(), mentions.1, refs)
387387
}
388+
"audio" => {
389+
let file_key = content_json.get("file_key")?.as_str()?;
390+
let mentions = extract_mentions(
391+
"", msg.mentions.as_deref().unwrap_or(&[]), bot_open_id,
392+
);
393+
let refs = vec![MediaRef::Audio {
394+
message_id: message_id.to_string(),
395+
file_key: file_key.to_string(),
396+
}];
397+
(String::new(), mentions.1, refs)
398+
}
388399
"post" => {
389400
// Rich text: content is {"title":"...","content":[[{tag,text,...},{tag,image_key,...}]]}
390401
let mut texts = Vec::new();
@@ -1038,6 +1049,9 @@ async fn handle_ws_message(
10381049
MediaRef::File { message_id, file_key, file_name } => {
10391050
download_feishu_file(client, &api_base, &token, message_id, file_key, file_name).await
10401051
}
1052+
MediaRef::Audio { message_id, file_key } => {
1053+
download_feishu_audio(client, &api_base, &token, message_id, file_key).await
1054+
}
10411055
};
10421056
if let Some(att) = attachment {
10431057
gateway_event.content.attachments.push(att);
@@ -1343,6 +1357,7 @@ fn try_parse_link(chars: &[char], start: usize) -> Option<(String, String, usize
13431357
/// Reference to a media resource carried by a Feishu message.
///
/// Each variant holds the identifiers needed to fetch the resource later;
/// the message handlers match on the variant and call the corresponding
/// `download_feishu_*` function to turn it into an attachment.
pub enum MediaRef {
13441358
/// Image resource identified by its owning message and `image_key`.
// NOTE(review): the download call for this variant is not visible here — confirm.
Image { message_id: String, image_key: String },
13451359
/// File resource identified by its owning message and `file_key`, with the
/// original file name; fetched via `download_feishu_file`.
File { message_id: String, file_key: String, file_name: String },
1360+
/// Voice message identified by its owning message and `file_key`; fetched
/// via `download_feishu_audio`.
Audio { message_id: String, file_key: String },
13461361
}
13471362

13481363
const IMAGE_MAX_DIMENSION_PX: u32 = 1200;
@@ -1497,6 +1512,56 @@ pub async fn download_feishu_file(
14971512
})
14981513
}
14991514

1515+
const AUDIO_MAX_DOWNLOAD: u64 = 25 * 1024 * 1024; // 25 MB (Whisper API limit)
1516+
1517+
/// Download a Feishu audio message by message_id + file_key → base64 Attachment.
1518+
pub async fn download_feishu_audio(
1519+
client: &reqwest::Client,
1520+
api_base: &str,
1521+
token: &str,
1522+
message_id: &str,
1523+
file_key: &str,
1524+
) -> Option<crate::schema::Attachment> {
1525+
let url = format!(
1526+
"{}/open-apis/im/v1/messages/{}/resources/{}?type=file",
1527+
api_base, message_id, file_key
1528+
);
1529+
let resp = match client.get(&url).bearer_auth(token).send().await {
1530+
Ok(r) => r,
1531+
Err(e) => {
1532+
tracing::warn!(file_key, error = %e, "feishu audio download failed");
1533+
return None;
1534+
}
1535+
};
1536+
if !resp.status().is_success() {
1537+
tracing::warn!(file_key, status = %resp.status(), "feishu audio download failed");
1538+
return None;
1539+
}
1540+
if let Some(cl) = resp.headers().get(reqwest::header::CONTENT_LENGTH) {
1541+
if let Ok(size) = cl.to_str().unwrap_or("0").parse::<u64>() {
1542+
if size > AUDIO_MAX_DOWNLOAD {
1543+
tracing::warn!(file_key, size, "feishu audio exceeds 25MB limit");
1544+
return None;
1545+
}
1546+
}
1547+
}
1548+
let bytes = resp.bytes().await.ok()?;
1549+
if bytes.len() as u64 > AUDIO_MAX_DOWNLOAD {
1550+
tracing::warn!(file_key, size = bytes.len(), "feishu audio exceeds 25MB limit");
1551+
return None;
1552+
}
1553+
tracing::debug!(file_key, size = bytes.len(), "feishu audio downloaded");
1554+
use base64::Engine;
1555+
let data = base64::engine::general_purpose::STANDARD.encode(&bytes);
1556+
Some(crate::schema::Attachment {
1557+
attachment_type: "audio".into(),
1558+
filename: format!("{}.ogg", file_key),
1559+
mime_type: "audio/ogg".into(),
1560+
data,
1561+
size: bytes.len() as u64,
1562+
})
1563+
}
1564+
15001565
/// Send a post (rich text) message to a feishu chat_id.
15011566
/// Returns the sent message_id on success, None on failure.
15021567
/// When `reply_to` is Some(root_id), uses the reply API to stay in a thread.
@@ -2260,6 +2325,9 @@ pub async fn webhook(
22602325
MediaRef::File { message_id, file_key, file_name } => {
22612326
download_feishu_file(&feishu.client, &api_base, &token, message_id, file_key, file_name).await
22622327
}
2328+
MediaRef::Audio { message_id, file_key } => {
2329+
download_feishu_audio(&feishu.client, &api_base, &token, message_id, file_key).await
2330+
}
22632331
};
22642332
if let Some(att) = attachment {
22652333
gateway_event.content.attachments.push(att);

src/gateway.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,7 @@ pub struct GatewayParams {
487487
pub allow_all_users: bool,
488488
pub allowed_users: Vec<String>,
489489
pub streaming: bool,
490+
pub stt: crate::config::SttConfig,
490491
}
491492

492493
pub async fn run_gateway_adapter(
@@ -504,6 +505,7 @@ pub async fn run_gateway_adapter(
504505
let allow_all_users = params.allow_all_users;
505506
let allowed_users = params.allowed_users;
506507
let streaming = params.streaming;
508+
let stt_config = params.stt;
507509

508510
let connect_url = match &params.token {
509511
Some(token) => {
@@ -662,6 +664,26 @@ pub async fn run_gateway_adapter(
662664
});
663665
}
664666
}
667+
"audio" => {
668+
if stt_config.enabled {
669+
use base64::Engine;
670+
if let Ok(audio_bytes) = base64::engine::general_purpose::STANDARD.decode(&att.data) {
671+
if let Some(transcript) = crate::stt::transcribe(
672+
&crate::media::HTTP_CLIENT,
673+
&stt_config,
674+
audio_bytes,
675+
att.filename.clone(),
676+
&att.mime_type,
677+
).await {
678+
extra_blocks.push(ContentBlock::Text {
679+
text: format!("[Voice message transcript]: {transcript}"),
680+
});
681+
}
682+
} else {
683+
warn!(filename = %att.filename, "audio attachment base64 decode failed");
684+
}
685+
}
686+
}
665687
_ => {}
666688
}
667689
}

src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ async fn main() -> anyhow::Result<()> {
228228
allow_all_users: config::resolve_allow_all(gw_cfg.allow_all_users, &gw_cfg.allowed_users),
229229
allowed_users: gw_cfg.allowed_users,
230230
streaming: gw_cfg.streaming,
231+
stt: cfg.stt.clone(),
231232
};
232233
Some(tokio::spawn(async move {
233234
if let Err(e) = gateway::run_gateway_adapter(params, router, shutdown_rx).await {

0 commit comments

Comments
 (0)