Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 152 additions & 8 deletions src/channels/transcription.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use anyhow::{bail, Context, Result};
use reqwest::multipart::{Form, Part};
use reqwest::Url;

use crate::config::TranscriptionConfig;

Expand Down Expand Up @@ -31,11 +32,31 @@ fn normalize_audio_filename(file_name: &str) -> String {
}
}

/// Reports whether `api_url` targets a Mistral host.
///
/// The URL is parsed and only its host component is examined, so a
/// `mistral.ai` substring appearing in the path or query does not count.
/// The host is lower-cased before comparison and matches when it is exactly
/// `mistral.ai` or ends with `.mistral.ai`.
///
/// Returns `false` when the URL cannot be parsed or has no host, which makes
/// callers treat the endpoint as a non-Mistral one.
fn is_mistral_host(api_url: &str) -> bool {
    let parsed = match Url::parse(api_url) {
        Ok(url) => url,
        // Unparseable input is never considered a Mistral endpoint.
        Err(_) => return false,
    };

    match parsed.host_str() {
        Some(raw_host) => {
            let host = raw_host.to_ascii_lowercase();
            // Require a full-label match: the bare domain or a true subdomain.
            host == "mistral.ai" || host.ends_with(".mistral.ai")
        }
        None => false,
    }
}

/// Transcribe audio bytes via a Whisper-compatible transcription API.
///
/// Returns the transcribed text on success. Requires `GROQ_API_KEY` in the
/// environment. The caller is responsible for enforcing duration limits
/// *before* downloading the file; this function enforces the byte-size cap.
/// Supports Groq Whisper (default) and Mistral Voxtral endpoints.
/// The provider is detected from `config.api_url` by inspecting the host.
///
/// API key resolution order:
/// 1. Explicit `config.api_key` (highest priority).
/// 2. `MISTRAL_API_KEY` env var (when the endpoint is `*.mistral.ai`).
/// 3. `GROQ_API_KEY` env var (all other endpoints).
///
/// The caller is responsible for enforcing duration limits *before*
/// downloading the file; this function enforces the byte-size cap.
pub async fn transcribe_audio(
audio_data: Vec<u8>,
file_name: &str,
Expand All @@ -59,11 +80,35 @@ pub async fn transcribe_audio(
)
})?;

let api_key = std::env::var("GROQ_API_KEY").context(
"GROQ_API_KEY environment variable is not set — required for voice transcription",
)?;
let mistral = is_mistral_host(&config.api_url);

let api_key = config
.api_key
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
.map(ToOwned::to_owned)
.or_else(|| {
let var = if mistral {
"MISTRAL_API_KEY"
} else {
"GROQ_API_KEY"
};
std::env::var(var)
.ok()
.map(|value| value.trim().to_string())
.filter(|value| !value.is_empty())
})
.context(
"Missing transcription API key: set [transcription].api_key, MISTRAL_API_KEY, or GROQ_API_KEY environment variable",
)?;

let client = crate::config::build_runtime_proxy_client("transcription.groq");
let proxy_name = if mistral {
"transcription.mistral"
} else {
"transcription.groq"
};
let client = crate::config::build_runtime_proxy_client(proxy_name);

let file_part = Part::bytes(audio_data)
.file_name(normalized_name)
Expand Down Expand Up @@ -135,11 +180,110 @@ mod tests {
.await
.unwrap_err();
assert!(
err.to_string().contains("GROQ_API_KEY"),
err.to_string().contains("Missing transcription API key"),
"expected missing-key error, got: {err}"
);
}

#[tokio::test]
async fn uses_config_api_key_without_groq_env() {
std::env::remove_var("GROQ_API_KEY");
std::env::remove_var("MISTRAL_API_KEY");

let mut config = TranscriptionConfig::default();
config.api_key = Some("explicit-key".to_string());

// Will fail on the HTTP request (no real server), but should NOT
// fail on API key resolution.
let err = transcribe_audio(vec![0u8; 100], "test.ogg", &config)
.await
.unwrap_err();
assert!(
!err.to_string().contains("Missing transcription API key"),
"should not fail on key resolution when config.api_key is set, got: {err}"
);
}

#[tokio::test]
async fn mistral_url_falls_back_to_mistral_env_key() {
std::env::remove_var("GROQ_API_KEY");
std::env::remove_var("MISTRAL_API_KEY");

let mut config = TranscriptionConfig::default();
config.api_url = "https://api.mistral.ai/v1/audio/transcriptions".to_string();
config.api_key = None;

// Without MISTRAL_API_KEY set, should get the missing-key error.
let err = transcribe_audio(vec![0u8; 100], "test.ogg", &config)
.await
.unwrap_err();
assert!(
err.to_string().contains("Missing transcription API key"),
"expected missing-key error for Mistral URL without env key, got: {err}"
);
}

#[tokio::test]
async fn whitespace_only_api_key_is_rejected() {
std::env::remove_var("GROQ_API_KEY");
std::env::remove_var("MISTRAL_API_KEY");

let mut config = TranscriptionConfig::default();
config.api_key = Some(" ".to_string());

let err = transcribe_audio(vec![0u8; 100], "test.ogg", &config)
.await
.unwrap_err();
assert!(
err.to_string().contains("Missing transcription API key"),
"whitespace-only api_key should be treated as missing, got: {err}"
);
}

// ── is_mistral_host tests ───────────────────────────────────────

// A subdomain of mistral.ai counts as a Mistral host.
#[test]
fn is_mistral_host_detects_api_subdomain() {
    let url = "https://api.mistral.ai/v1/audio/transcriptions";
    assert!(is_mistral_host(url));
}

// The apex domain itself, with no subdomain, is also accepted.
#[test]
fn is_mistral_host_detects_bare_domain() {
    let url = "https://mistral.ai/endpoint";
    assert!(is_mistral_host(url));
}

// Host matching must not depend on letter case.
#[test]
fn is_mistral_host_case_insensitive() {
    let url = "https://API.MISTRAL.AI/v1/audio/transcriptions";
    assert!(is_mistral_host(url));
}

// The Groq endpoint must not be classified as Mistral.
#[test]
fn is_mistral_host_rejects_groq_url() {
    let url = "https://api.groq.com/openai/v1/audio/transcriptions";
    assert!(!is_mistral_host(url));
}

// Only the host is inspected: "mistral.ai" appearing in the path of another
// host must not be enough to match.
#[test]
fn is_mistral_host_rejects_spoofed_path() {
    let url = "https://evil.com/mistral.ai/v1/audio/transcriptions";
    assert!(!is_mistral_host(url));
}

// Inputs that do not parse as URLs (including the empty string) yield `false`.
#[test]
fn is_mistral_host_returns_false_for_invalid_url() {
    for input in ["not-a-url", ""] {
        assert!(!is_mistral_host(input));
    }
}

// ── MIME / filename tests ───────────────────────────────────────

#[test]
fn mime_for_audio_maps_accepted_formats() {
let cases = [
Expand Down
74 changes: 70 additions & 4 deletions src/config/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ const SUPPORTED_PROXY_SERVICE_KEYS: &[&str] = &[
"memory.embeddings",
"tunnel.custom",
"transcription.groq",
"transcription.mistral",
];

const SUPPORTED_PROXY_SERVICE_SELECTORS: &[&str] = &[
Expand Down Expand Up @@ -538,22 +539,50 @@ fn default_transcription_max_duration_secs() -> u64 {
120
}

/// Voice transcription configuration (Whisper API via Groq).
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
/// Voice transcription configuration (Groq Whisper default; also supports Mistral Voxtral endpoints).
///
/// # Defaults
///
/// - `enabled`: `false` — transcription is opt-in.
/// - `api_url`: `https://api.groq.com/openai/v1/audio/transcriptions`
/// - `model`: `whisper-large-v3-turbo`
///
/// # Compatibility
///
/// Additive and backward-compatible for existing Groq configurations.
/// New providers may differ in supported models, encoding, or rate limits.
///
/// # Migration / Rollback
///
/// To switch to Mistral: set `api_url` to the Mistral endpoint, `model`
/// to e.g. `voxtral-mini-latest`, and provide `api_key` or `MISTRAL_API_KEY`.
/// To revert: restore `api_url`/`model` to the Groq defaults above (or
/// remove the keys to use serde defaults).
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
pub struct TranscriptionConfig {
/// Enable voice transcription for channels that support it.
/// Default: `false`.
#[serde(default)]
pub enabled: bool,
/// Whisper API endpoint URL.
/// API key used for transcription requests.
///
/// If unset, runtime falls back to `MISTRAL_API_KEY` (for Mistral
/// endpoints) or `GROQ_API_KEY` (all others).
#[serde(default)]
pub api_key: Option<String>,
/// Transcription API endpoint URL.
/// Default: `https://api.groq.com/openai/v1/audio/transcriptions`.
#[serde(default = "default_transcription_api_url")]
pub api_url: String,
/// Whisper model name.
/// Whisper or Voxtral model name (e.g. `whisper-large-v3-turbo`, `voxtral-mini-latest`).
/// Default: `whisper-large-v3-turbo`.
#[serde(default = "default_transcription_model")]
pub model: String,
/// Optional language hint (ISO-639-1, e.g. "en", "ru").
#[serde(default)]
pub language: Option<String>,
/// Maximum voice duration in seconds (messages longer than this are skipped).
/// Default: `120`.
#[serde(default = "default_transcription_max_duration_secs")]
pub max_duration_secs: u64,
}
Expand All @@ -562,6 +591,7 @@ impl Default for TranscriptionConfig {
fn default() -> Self {
Self {
enabled: false,
api_key: None,
api_url: default_transcription_api_url(),
model: default_transcription_model(),
language: None,
Expand All @@ -570,6 +600,26 @@ impl Default for TranscriptionConfig {
}
}

impl std::fmt::Debug for TranscriptionConfig {
    /// Formats the configuration for diagnostics without exposing secrets.
    ///
    /// - `api_key` is shown as `Some("<redacted>")` when set and `None`
    ///   otherwise; the key itself is never printed.
    /// - `api_url` is printed with any userinfo (`user:pass@`) replaced by
    ///   `<redacted>@` and any query string or fragment replaced by
    ///   `?<redacted>` / `#<redacted>`, because those parts of a URL can carry
    ///   credentials or tokens. A URL without such parts is printed unchanged.
    /// - All other fields are printed as-is.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        /// Returns `url` with userinfo and query/fragment contents masked.
        ///
        /// This is a purely textual scrub (no URL parsing), so it also works
        /// on malformed values that a strict parser would reject.
        fn scrub_url(url: &str) -> String {
            // Everything from the first `?` or `#` onward may hold tokens.
            let (base, tail_marker) = match url.find(|c: char| matches!(c, '?' | '#')) {
                // `?` and `#` are single-byte ASCII, so `idx + 1` is a char boundary.
                Some(idx) => (&url[..idx], Some(&url[idx..idx + 1])),
                None => (url, None),
            };

            // Separate `scheme://` (when present) from the remainder.
            let (scheme, rest) = match base.find("://") {
                Some(idx) => base.split_at(idx + 3),
                None => ("", base),
            };

            // The authority runs up to the first `/` of the path.
            let authority_end = rest.find('/').unwrap_or(rest.len());
            let (authority, path) = rest.split_at(authority_end);

            let mut scrubbed = String::with_capacity(url.len());
            scrubbed.push_str(scheme);
            match authority.rfind('@') {
                // Anything before the last `@` in the authority is userinfo.
                Some(at) => {
                    scrubbed.push_str("<redacted>@");
                    scrubbed.push_str(&authority[at + 1..]);
                }
                None => scrubbed.push_str(authority),
            }
            scrubbed.push_str(path);
            if let Some(marker) = tail_marker {
                scrubbed.push_str(marker);
                scrubbed.push_str("<redacted>");
            }
            scrubbed
        }

        f.debug_struct("TranscriptionConfig")
            .field("enabled", &self.enabled)
            .field("api_key", &self.api_key.as_ref().map(|_| "<redacted>"))
            .field("api_url", &scrub_url(&self.api_url))
            .field("model", &self.model)
            .field("language", &self.language)
            .field("max_duration_secs", &self.max_duration_secs)
            .finish()
    }
}
Comment on lines +603 to +621
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Redact or sanitize api_url in Debug output.

api_key is redacted, but api_url is still logged verbatim. URLs can contain embedded credentials or tokenized query params, which risks secret leakage in logs.

🔒 Proposed fix
 impl std::fmt::Debug for TranscriptionConfig {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("TranscriptionConfig")
             .field("enabled", &self.enabled)
             .field(
                 "api_key",
                 &if self.api_key.is_some() {
                     Some("<redacted>")
                 } else {
                     None::<&str>
                 },
             )
-            .field("api_url", &self.api_url)
+            .field("api_url_configured", &!self.api_url.trim().is_empty())
             .field("model", &self.model)
             .field("language", &self.language)
             .field("max_duration_secs", &self.max_duration_secs)
             .finish()
     }
 }

Based on learnings: "Never log secrets, raw tokens, or sensitive payloads".

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
impl std::fmt::Debug for TranscriptionConfig {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("TranscriptionConfig")
.field("enabled", &self.enabled)
.field(
"api_key",
&if self.api_key.is_some() {
Some("<redacted>")
} else {
None::<&str>
},
)
.field("api_url", &self.api_url)
.field("model", &self.model)
.field("language", &self.language)
.field("max_duration_secs", &self.max_duration_secs)
.finish()
}
}
impl std::fmt::Debug for TranscriptionConfig {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("TranscriptionConfig")
.field("enabled", &self.enabled)
.field(
"api_key",
&if self.api_key.is_some() {
Some("<redacted>")
} else {
None::<&str>
},
)
.field("api_url_configured", &!self.api_url.trim().is_empty())
.field("model", &self.model)
.field("language", &self.language)
.field("max_duration_secs", &self.max_duration_secs)
.finish()
}
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/config/schema.rs` around lines 554 - 572, The Debug implementation for
TranscriptionConfig currently prints api_url verbatim; update the impl of
std::fmt::Debug for TranscriptionConfig (the fmt method) to redact or sanitize
the api_url field before logging (similar to api_key). Replace the direct
.field("api_url", &self.api_url) with a masked representation (e.g., None if
empty, or a string that strips credentials and query params or shows only the
scheme+host or "<redacted_url>") so no embedded credentials or tokenized query
params are emitted; keep the rest of the fields unchanged.


// ── Agents IPC ──────────────────────────────────────────────────

fn default_agents_ipc_db_path() -> String {
Expand Down Expand Up @@ -5870,6 +5920,11 @@ impl Config {
&mut config.storage.provider.config.db_url,
"config.storage.provider.config.db_url",
)?;
decrypt_optional_secret(
&store,
&mut config.transcription.api_key,
"config.transcription.api_key",
)?;
decrypt_vec_secrets(
&store,
&mut config.reliability.api_keys,
Expand Down Expand Up @@ -6739,6 +6794,11 @@ impl Config {
&mut config_to_save.storage.provider.config.db_url,
"config.storage.provider.config.db_url",
)?;
encrypt_optional_secret(
&store,
&mut config_to_save.transcription.api_key,
"config.transcription.api_key",
)?;
encrypt_vec_secrets(
&store,
&mut config_to_save.reliability.api_keys,
Expand Down Expand Up @@ -7781,6 +7841,8 @@ tool_dispatcher = "xml"
},
);

config.transcription.api_key = Some("transcription-credential".into());

config.save().await.unwrap();

let contents = tokio::fs::read_to_string(config.config_path.clone())
Expand Down Expand Up @@ -7888,6 +7950,10 @@ tool_dispatcher = "xml"
"telegram-credential"
);

let transcription_key = stored.transcription.api_key.as_deref().unwrap();
assert!(crate::security::SecretStore::is_encrypted(transcription_key));
assert_eq!(store.decrypt(transcription_key).unwrap(), "transcription-credential");

let _ = fs::remove_dir_all(&dir).await;
}

Expand Down
Loading