feat: add configuration for AI voice input/output settings and models

jxxghp · jxxghp · commit 96684a8d1365 · 2026-04-29T18:15:50.000+08:00
diff --git a/src/composables/useSetupWizard.ts b/src/composables/useSetupWizard.ts
@@ -56,9 +56,17 @@ export interface WizardData {
     model: string
     thinkingLevel: string
     supportImageInput: boolean
+    supportAudioInputOutput: boolean
     apiKey: string
     baseUrl: string
     maxContextTokens: number
+    voiceApiKey: string
+    voiceBaseUrl: string
+    voiceSttModel: string
+    voiceTtsModel: string
+    voiceTtsVoice: string
+    voiceLanguage: string
+    voiceReplyWithText: boolean
     jobInterval: number
     retryTransfer: boolean
     recommendEnabled: boolean
@@ -226,9 +234,17 @@ const wizardData = ref<WizardData>({
     model: 'deepseek-chat',
     thinkingLevel: 'off',
     supportImageInput: true,
+    supportAudioInputOutput: false,
     apiKey: '',
     baseUrl: 'https://api.deepseek.com',
     maxContextTokens: 64,
+    voiceApiKey: '',
+    voiceBaseUrl: '',
+    voiceSttModel: 'gpt-4o-mini-transcribe',
+    voiceTtsModel: 'gpt-4o-mini-tts',
+    voiceTtsVoice: 'alloy',
+    voiceLanguage: 'zh',
+    voiceReplyWithText: false,
     jobInterval: 0,
     retryTransfer: false,
     recommendEnabled: false,
@@ -1363,9 +1379,17 @@ export function useSetupWizard() {
         LLM_MODEL: wizardData.value.agent.model,
         LLM_THINKING_LEVEL: wizardData.value.agent.thinkingLevel,
         LLM_SUPPORT_IMAGE_INPUT: wizardData.value.agent.supportImageInput,
+        LLM_SUPPORT_AUDIO_INPUT_OUTPUT: wizardData.value.agent.supportAudioInputOutput,
         LLM_API_KEY: wizardData.value.agent.apiKey,
         LLM_BASE_URL: wizardData.value.agent.baseUrl || null,
         LLM_MAX_CONTEXT_TOKENS: wizardData.value.agent.maxContextTokens,
+        AI_VOICE_API_KEY: wizardData.value.agent.voiceApiKey || null,
+        AI_VOICE_BASE_URL: wizardData.value.agent.voiceBaseUrl || null,
+        AI_VOICE_STT_MODEL: wizardData.value.agent.voiceSttModel,
+        AI_VOICE_TTS_MODEL: wizardData.value.agent.voiceTtsModel,
+        AI_VOICE_TTS_VOICE: wizardData.value.agent.voiceTtsVoice,
+        AI_VOICE_LANGUAGE: wizardData.value.agent.voiceLanguage,
+        AI_VOICE_REPLY_WITH_TEXT: wizardData.value.agent.voiceReplyWithText,
         AI_AGENT_JOB_INTERVAL: wizardData.value.agent.enabled ? wizardData.value.agent.jobInterval : 0,
         AI_AGENT_RETRY_TRANSFER: wizardData.value.agent.enabled ? wizardData.value.agent.retryTransfer : false,
         AI_RECOMMEND_ENABLED:
@@ -1461,9 +1485,17 @@ export function useSetupWizard() {
         wizardData.value.agent.model = result.data.LLM_MODEL || ''
         wizardData.value.agent.thinkingLevel = resolveThinkingLevelValue(result.data)
         wizardData.value.agent.supportImageInput = result.data.LLM_SUPPORT_IMAGE_INPUT ?? true
+        wizardData.value.agent.supportAudioInputOutput = Boolean(result.data.LLM_SUPPORT_AUDIO_INPUT_OUTPUT)
         wizardData.value.agent.apiKey = result.data.LLM_API_KEY || ''
         wizardData.value.agent.baseUrl = result.data.LLM_BASE_URL || ''
         wizardData.value.agent.maxContextTokens = result.data.LLM_MAX_CONTEXT_TOKENS || 64
+        wizardData.value.agent.voiceApiKey = result.data.AI_VOICE_API_KEY || ''
+        wizardData.value.agent.voiceBaseUrl = result.data.AI_VOICE_BASE_URL || ''
+        wizardData.value.agent.voiceSttModel = result.data.AI_VOICE_STT_MODEL || 'gpt-4o-mini-transcribe'
+        wizardData.value.agent.voiceTtsModel = result.data.AI_VOICE_TTS_MODEL || 'gpt-4o-mini-tts'
+        wizardData.value.agent.voiceTtsVoice = result.data.AI_VOICE_TTS_VOICE || 'alloy'
+        wizardData.value.agent.voiceLanguage = result.data.AI_VOICE_LANGUAGE ?? 'zh' // keep '' (blank = backend default); only unset falls back to 'zh'
+        wizardData.value.agent.voiceReplyWithText = Boolean(result.data.AI_VOICE_REPLY_WITH_TEXT)
         wizardData.value.agent.jobInterval = result.data.AI_AGENT_JOB_INTERVAL || 0
         wizardData.value.agent.retryTransfer = Boolean(result.data.AI_AGENT_RETRY_TRANSFER)
         wizardData.value.agent.recommendEnabled = Boolean(result.data.AI_RECOMMEND_ENABLED)
diff --git a/src/locales/en-US.ts b/src/locales/en-US.ts
@@ -1340,6 +1340,12 @@ export default {
       llmThinkingLevelHigh: 'High (high)',
       llmThinkingLevelMax: 'Max (max)',
       llmThinkingLevelXhigh: 'XHigh (xhigh)',
+      llmSupportImageInput: 'Model Supports Image Input',
+      llmSupportImageInputHint:
+        'When enabled, message images are sent to the LLM as multimodal image input. When disabled, images are saved locally as attachments and only the file path is passed to the AI assistant.',
+      llmSupportAudioInputOutput: 'Support Audio Input and Output',
+      llmSupportAudioInputOutputHint:
+        'When enabled, the AI assistant can transcribe incoming audio messages and reply with voice on supported channels.',
       llmMaxContextTokens: 'LLM Max Context Tokens (K)',
       llmMaxContextTokensHint:
         'Set the maximum number of context tokens (in thousands) for the LLM. Exceeding this limit will trigger context trimming.',
@@ -1348,6 +1354,23 @@ export default {
       llmApiKeyPlaceholder: 'Please enter API key',
       llmBaseUrl: 'LLM Base URL',
       llmBaseUrlHint: 'Base URL for LLM API, used for custom API endpoints',
+      aiVoiceApiKey: 'Audio API Key',
+      aiVoiceApiKeyHint:
+        'API key used for audio transcription and speech synthesis. Falls back to the current LLM API key when left blank.',
+      aiVoiceBaseUrl: 'Audio Base URL',
+      aiVoiceBaseUrlHint:
+        'Base URL used for audio transcription and speech synthesis. Falls back to the current LLM base URL when left blank.',
+      aiVoiceSttModel: 'Audio Transcription Model',
+      aiVoiceSttModelHint: 'Model name used to convert audio content into text.',
+      aiVoiceTtsModel: 'Speech Synthesis Model',
+      aiVoiceTtsModelHint: 'Model name used to convert text content into speech.',
+      aiVoiceTtsVoice: 'Voice Preset',
+      aiVoiceTtsVoiceHint: 'Speaker or voice preset used for speech synthesis.',
+      aiVoiceLanguage: 'Recognition Language',
+      aiVoiceLanguageHint:
+        'Default language for audio transcription, such as zh or en. Leave blank to use the backend default.',
+      aiVoiceReplyWithText: 'Include Text with Voice Replies',
+      aiVoiceReplyWithTextHint: 'When sending a voice reply, also include the text version of the response.',
       llmTestAction: 'Test Call',
       llmTestSuccessToast: 'LLM test call succeeded',
       llmTestFailedToast: 'LLM test call failed',
diff --git a/src/locales/zh-CN.ts b/src/locales/zh-CN.ts
@@ -1337,6 +1337,9 @@ export default {
       llmSupportImageInput: '模型支持图片输入',
       llmSupportImageInputHint:
         '启用后，消息中的图片会按多模态图片发送给 LLM；关闭后图片会作为附件保存到本地，并将文件路径提供给智能助手处理',
+      llmSupportAudioInputOutput: '支持音频输入输出',
+      llmSupportAudioInputOutputHint:
+        '启用后，智能助手可以转写用户发送的音频消息，并在支持的渠道上回复语音',
       llmMaxContextTokens: 'LLM 最大上下文 Token 数量 (K)',
       llmMaxContextTokensHint:
         '设定 LLM 记录会话历史的最大 Token 数量上限（千），超出后将自动修整历史记录以节省 Token 消耗及防止超出 LLM 限制',
@@ -1345,6 +1348,20 @@ export default {
       llmApiKeyPlaceholder: '请输入API密钥',
       llmBaseUrl: 'LLM基础URL',
       llmBaseUrlHint: 'LLM API的基础URL地址，用于自定义API端点',
+      aiVoiceApiKey: '音频 API密钥',
+      aiVoiceApiKeyHint: '音频转写与语音合成使用的 API 密钥，留空时回退到当前 LLM API 密钥',
+      aiVoiceBaseUrl: '音频基础URL',
+      aiVoiceBaseUrlHint: '音频转写与语音合成接口的基础URL，留空时回退到当前 LLM 基础 URL',
+      aiVoiceSttModel: '音频转写模型',
+      aiVoiceSttModelHint: '用于将音频内容转换为文字的模型名称',
+      aiVoiceTtsModel: '语音合成模型',
+      aiVoiceTtsModelHint: '用于将文字内容转换为语音的模型名称',
+      aiVoiceTtsVoice: '语音音色',
+      aiVoiceTtsVoiceHint: '语音合成使用的发音人或音色标识',
+      aiVoiceLanguage: '识别语言',
+      aiVoiceLanguageHint: '音频转写默认语言，例如 zh、en，留空时按后端默认处理',
+      aiVoiceReplyWithText: '语音回复附带文字',
+      aiVoiceReplyWithTextHint: '发送语音回复时，同时附带一份文字内容',
       llmTestAction: '测试调用',
       llmTestSuccessToast: 'LLM 调用测试成功',
       llmTestFailedToast: 'LLM 调用测试失败',
diff --git a/src/locales/zh-TW.ts b/src/locales/zh-TW.ts
@@ -1339,6 +1339,9 @@ export default {
       llmSupportImageInput: '模型支援圖片輸入',
       llmSupportImageInputHint:
         '啟用後，消息中的圖片會按多模態圖片發送給 LLM；關閉後圖片會作為附件保存到本地，並將檔案路徑提供給智能助手處理',
+      llmSupportAudioInputOutput: '支援音頻輸入輸出',
+      llmSupportAudioInputOutputHint:
+        '啟用後，智能助手可以轉寫用戶發送的音頻消息，並在支援的渠道上回覆語音',
       llmMaxContextTokens: 'LLM 最大上下文 Token 數量 (K)',
       llmMaxContextTokensHint:
         '設定 LLM 記錄會話歷史的最大 Token 數量上限（千），超出後將自動修整歷史記錄以節省 Token 消耗及防止超出 LLM 限制',
@@ -1347,6 +1350,20 @@ export default {
       llmApiKeyPlaceholder: '請輸入API密鑰',
       llmBaseUrl: 'LLM基礎URL',
       llmBaseUrlHint: 'LLM API的基礎URL地址，用於自定義API端點',
+      aiVoiceApiKey: '音頻 API密鑰',
+      aiVoiceApiKeyHint: '音頻轉寫與語音合成使用的 API 密鑰，留空時回退到當前 LLM API 密鑰',
+      aiVoiceBaseUrl: '音頻基礎URL',
+      aiVoiceBaseUrlHint: '音頻轉寫與語音合成接口的基礎URL，留空時回退到當前 LLM 基礎 URL',
+      aiVoiceSttModel: '音頻轉寫模型',
+      aiVoiceSttModelHint: '用於將音頻內容轉換為文字的模型名稱',
+      aiVoiceTtsModel: '語音合成模型',
+      aiVoiceTtsModelHint: '用於將文字內容轉換為語音的模型名稱',
+      aiVoiceTtsVoice: '語音音色',
+      aiVoiceTtsVoiceHint: '語音合成使用的發音人或音色標識',
+      aiVoiceLanguage: '識別語言',
+      aiVoiceLanguageHint: '音頻轉寫預設語言，例如 zh、en，留空時按後端預設處理',
+      aiVoiceReplyWithText: '語音回覆附帶文字',
+      aiVoiceReplyWithTextHint: '發送語音回覆時，同時附帶一份文字內容',
       llmTestAction: '測試調用',
       llmTestSuccessToast: 'LLM 調用測試成功',
       llmTestFailedToast: 'LLM 調用測試失敗',
diff --git a/src/views/setting/AccountSettingSystem.vue b/src/views/setting/AccountSettingSystem.vue
@@ -42,8 +42,16 @@ const SystemSettings = ref<any>({
     LLM_MODEL: 'deepseek-chat',
     LLM_THINKING_LEVEL: 'off',
     LLM_SUPPORT_IMAGE_INPUT: false,
+    LLM_SUPPORT_AUDIO_INPUT_OUTPUT: false,
     LLM_API_KEY: null,
     LLM_BASE_URL: 'https://api.deepseek.com',
+    AI_VOICE_API_KEY: null,
+    AI_VOICE_BASE_URL: null,
+    AI_VOICE_STT_MODEL: 'gpt-4o-mini-transcribe',
+    AI_VOICE_TTS_MODEL: 'gpt-4o-mini-tts',
+    AI_VOICE_TTS_VOICE: 'alloy',
+    AI_VOICE_LANGUAGE: 'zh',
+    AI_VOICE_REPLY_WITH_TEXT: false,
     AI_AGENT_RETRY_TRANSFER: false,
     AI_RECOMMEND_ENABLED: false,
     AI_RECOMMEND_USER_PREFERENCE: null,
@@ -1016,22 +1024,126 @@ watch(currentLlmSnapshotKey, (snapshotKey, previousSnapshotKey) => {
                   </VCol>
                 </VRow>
                 <VRow>
-                  <VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12" md="6">
+                  <VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12" md="4">
                     <VSwitch
                       v-model="SystemSettings.Basic.LLM_SUPPORT_IMAGE_INPUT"
                       :label="t('setting.system.llmSupportImageInput')"
                       :hint="t('setting.system.llmSupportImageInputHint')"
                       persistent-hint
                     />
                   </VCol>
-                  <VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12" md="6">
+                </VRow>
+                <VRow>
+                  <VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12">
+                    <VSwitch
+                      v-model="SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
+                      :label="t('setting.system.llmSupportAudioInputOutput')"
+                      :hint="t('setting.system.llmSupportAudioInputOutputHint')"
+                      persistent-hint
+                    />
+                  </VCol>
+                  <VCol
+                    v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
+                    cols="12"
+                    md="6"
+                  >
+                    <VTextField
+                      v-model="SystemSettings.Basic.AI_VOICE_API_KEY"
+                      :label="t('setting.system.aiVoiceApiKey')"
+                      :hint="t('setting.system.aiVoiceApiKeyHint')"
+                      persistent-hint
+                      prepend-inner-icon="mdi-key-variant"
+                      type="password"
+                    />
+                  </VCol>
+                  <VCol
+                    v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
+                    cols="12"
+                    md="6"
+                  >
+                    <VTextField
+                      v-model="SystemSettings.Basic.AI_VOICE_BASE_URL"
+                      :label="t('setting.system.aiVoiceBaseUrl')"
+                      :hint="t('setting.system.aiVoiceBaseUrlHint')"
+                      persistent-hint
+                      prepend-inner-icon="mdi-link-variant"
+                    />
+                  </VCol>
+                  <VCol
+                    v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
+                    cols="12"
+                    md="6"
+                  >
+                    <VTextField
+                      v-model="SystemSettings.Basic.AI_VOICE_STT_MODEL"
+                      :label="t('setting.system.aiVoiceSttModel')"
+                      :hint="t('setting.system.aiVoiceSttModelHint')"
+                      persistent-hint
+                      prepend-inner-icon="mdi-waveform"
+                    />
+                  </VCol>
+                  <VCol
+                    v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
+                    cols="12"
+                    md="6"
+                  >
+                    <VTextField
+                      v-model="SystemSettings.Basic.AI_VOICE_TTS_MODEL"
+                      :label="t('setting.system.aiVoiceTtsModel')"
+                      :hint="t('setting.system.aiVoiceTtsModelHint')"
+                      persistent-hint
+                      prepend-inner-icon="mdi-waveform"
+                    />
+                  </VCol>
+                  <VCol
+                    v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
+                    cols="12"
+                    md="6"
+                  >
+                    <VTextField
+                      v-model="SystemSettings.Basic.AI_VOICE_TTS_VOICE"
+                      :label="t('setting.system.aiVoiceTtsVoice')"
+                      :hint="t('setting.system.aiVoiceTtsVoiceHint')"
+                      persistent-hint
+                      prepend-inner-icon="mdi-account-voice"
+                    />
+                  </VCol>
+                  <VCol
+                    v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
+                    cols="12"
+                    md="6"
+                  >
+                    <VTextField
+                      v-model="SystemSettings.Basic.AI_VOICE_LANGUAGE"
+                      :label="t('setting.system.aiVoiceLanguage')"
+                      :hint="t('setting.system.aiVoiceLanguageHint')"
+                      persistent-hint
+                      prepend-inner-icon="mdi-translate"
+                    />
+                  </VCol>
+                  <VCol
+                    v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
+                    cols="12"
+                  >
+                    <VSwitch
+                      v-model="SystemSettings.Basic.AI_VOICE_REPLY_WITH_TEXT"
+                      :label="t('setting.system.aiVoiceReplyWithText')"
+                      :hint="t('setting.system.aiVoiceReplyWithTextHint')"
+                      persistent-hint
+                    />
+                  </VCol>
+                </VRow>
+                <VRow>
+                  <VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12">
                     <VSwitch
                       v-model="SystemSettings.Basic.AI_AGENT_RETRY_TRANSFER"
                       :label="t('setting.system.aiAgentRetryTransfer')"
                       :hint="t('setting.system.aiAgentRetryTransferHint')"
                       persistent-hint
                     />
                   </VCol>
+                </VRow>
+                <VRow>
                   <VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12">
                     <VSwitch
                       v-model="SystemSettings.Basic.AI_RECOMMEND_ENABLED"
diff --git a/src/views/setup/AgentSettingsStep.vue b/src/views/setup/AgentSettingsStep.vue