Skip to content

Commit 96684a8

Browse files
committed
feat: add configuration for AI voice input/output settings and models
1 parent: fc9fe5e · commit: 96684a8

6 files changed

Lines changed: 299 additions & 15 deletions

File tree

src/composables/useSetupWizard.ts

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,9 +56,17 @@ export interface WizardData {
5656
model: string
5757
thinkingLevel: string
5858
supportImageInput: boolean
59+
supportAudioInputOutput: boolean
5960
apiKey: string
6061
baseUrl: string
6162
maxContextTokens: number
63+
voiceApiKey: string
64+
voiceBaseUrl: string
65+
voiceSttModel: string
66+
voiceTtsModel: string
67+
voiceTtsVoice: string
68+
voiceLanguage: string
69+
voiceReplyWithText: boolean
6270
jobInterval: number
6371
retryTransfer: boolean
6472
recommendEnabled: boolean
@@ -226,9 +234,17 @@ const wizardData = ref<WizardData>({
226234
model: 'deepseek-chat',
227235
thinkingLevel: 'off',
228236
supportImageInput: true,
237+
supportAudioInputOutput: false,
229238
apiKey: '',
230239
baseUrl: 'https://api.deepseek.com',
231240
maxContextTokens: 64,
241+
voiceApiKey: '',
242+
voiceBaseUrl: '',
243+
voiceSttModel: 'gpt-4o-mini-transcribe',
244+
voiceTtsModel: 'gpt-4o-mini-tts',
245+
voiceTtsVoice: 'alloy',
246+
voiceLanguage: 'zh',
247+
voiceReplyWithText: false,
232248
jobInterval: 0,
233249
retryTransfer: false,
234250
recommendEnabled: false,
@@ -1363,9 +1379,17 @@ export function useSetupWizard() {
13631379
LLM_MODEL: wizardData.value.agent.model,
13641380
LLM_THINKING_LEVEL: wizardData.value.agent.thinkingLevel,
13651381
LLM_SUPPORT_IMAGE_INPUT: wizardData.value.agent.supportImageInput,
1382+
LLM_SUPPORT_AUDIO_INPUT_OUTPUT: wizardData.value.agent.supportAudioInputOutput,
13661383
LLM_API_KEY: wizardData.value.agent.apiKey,
13671384
LLM_BASE_URL: wizardData.value.agent.baseUrl || null,
13681385
LLM_MAX_CONTEXT_TOKENS: wizardData.value.agent.maxContextTokens,
1386+
AI_VOICE_API_KEY: wizardData.value.agent.voiceApiKey || null,
1387+
AI_VOICE_BASE_URL: wizardData.value.agent.voiceBaseUrl || null,
1388+
AI_VOICE_STT_MODEL: wizardData.value.agent.voiceSttModel,
1389+
AI_VOICE_TTS_MODEL: wizardData.value.agent.voiceTtsModel,
1390+
AI_VOICE_TTS_VOICE: wizardData.value.agent.voiceTtsVoice,
1391+
AI_VOICE_LANGUAGE: wizardData.value.agent.voiceLanguage,
1392+
AI_VOICE_REPLY_WITH_TEXT: wizardData.value.agent.voiceReplyWithText,
13691393
AI_AGENT_JOB_INTERVAL: wizardData.value.agent.enabled ? wizardData.value.agent.jobInterval : 0,
13701394
AI_AGENT_RETRY_TRANSFER: wizardData.value.agent.enabled ? wizardData.value.agent.retryTransfer : false,
13711395
AI_RECOMMEND_ENABLED:
@@ -1461,9 +1485,17 @@ export function useSetupWizard() {
14611485
wizardData.value.agent.model = result.data.LLM_MODEL || ''
14621486
wizardData.value.agent.thinkingLevel = resolveThinkingLevelValue(result.data)
14631487
wizardData.value.agent.supportImageInput = result.data.LLM_SUPPORT_IMAGE_INPUT ?? true
1488+
wizardData.value.agent.supportAudioInputOutput = Boolean(result.data.LLM_SUPPORT_AUDIO_INPUT_OUTPUT)
14641489
wizardData.value.agent.apiKey = result.data.LLM_API_KEY || ''
14651490
wizardData.value.agent.baseUrl = result.data.LLM_BASE_URL || ''
14661491
wizardData.value.agent.maxContextTokens = result.data.LLM_MAX_CONTEXT_TOKENS || 64
1492+
wizardData.value.agent.voiceApiKey = result.data.AI_VOICE_API_KEY || ''
1493+
wizardData.value.agent.voiceBaseUrl = result.data.AI_VOICE_BASE_URL || ''
1494+
wizardData.value.agent.voiceSttModel = result.data.AI_VOICE_STT_MODEL || 'gpt-4o-mini-transcribe'
1495+
wizardData.value.agent.voiceTtsModel = result.data.AI_VOICE_TTS_MODEL || 'gpt-4o-mini-tts'
1496+
wizardData.value.agent.voiceTtsVoice = result.data.AI_VOICE_TTS_VOICE || 'alloy'
1497+
wizardData.value.agent.voiceLanguage = result.data.AI_VOICE_LANGUAGE || 'zh'
1498+
wizardData.value.agent.voiceReplyWithText = Boolean(result.data.AI_VOICE_REPLY_WITH_TEXT)
14671499
wizardData.value.agent.jobInterval = result.data.AI_AGENT_JOB_INTERVAL || 0
14681500
wizardData.value.agent.retryTransfer = Boolean(result.data.AI_AGENT_RETRY_TRANSFER)
14691501
wizardData.value.agent.recommendEnabled = Boolean(result.data.AI_RECOMMEND_ENABLED)

src/locales/en-US.ts

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1340,6 +1340,12 @@ export default {
13401340
llmThinkingLevelHigh: 'High (high)',
13411341
llmThinkingLevelMax: 'Max (max)',
13421342
llmThinkingLevelXhigh: 'XHigh (xhigh)',
1343+
llmSupportImageInput: 'Model Supports Image Input',
1344+
llmSupportImageInputHint:
1345+
'When enabled, message images are sent to the LLM as multimodal image input. When disabled, images are saved locally as attachments and only the file path is passed to the AI assistant.',
1346+
llmSupportAudioInputOutput: 'Support Audio Input and Output',
1347+
llmSupportAudioInputOutputHint:
1348+
'When enabled, the AI assistant can transcribe incoming audio messages and reply with voice on supported channels.',
13431349
llmMaxContextTokens: 'LLM Max Context Tokens (K)',
13441350
llmMaxContextTokensHint:
13451351
'Set the maximum number of context tokens (in thousands) for the LLM. Exceeding this limit will trigger context trimming.',
@@ -1348,6 +1354,23 @@ export default {
13481354
llmApiKeyPlaceholder: 'Please enter API key',
13491355
llmBaseUrl: 'LLM Base URL',
13501356
llmBaseUrlHint: 'Base URL for LLM API, used for custom API endpoints',
1357+
aiVoiceApiKey: 'Audio API Key',
1358+
aiVoiceApiKeyHint:
1359+
'API key used for audio transcription and speech synthesis. Falls back to the current LLM API key when left blank.',
1360+
aiVoiceBaseUrl: 'Audio Base URL',
1361+
aiVoiceBaseUrlHint:
1362+
'Base URL used for audio transcription and speech synthesis. Falls back to the current LLM base URL when left blank.',
1363+
aiVoiceSttModel: 'Audio Transcription Model',
1364+
aiVoiceSttModelHint: 'Model name used to convert audio content into text.',
1365+
aiVoiceTtsModel: 'Speech Synthesis Model',
1366+
aiVoiceTtsModelHint: 'Model name used to convert text content into speech.',
1367+
aiVoiceTtsVoice: 'Voice Preset',
1368+
aiVoiceTtsVoiceHint: 'Speaker or voice preset used for speech synthesis.',
1369+
aiVoiceLanguage: 'Recognition Language',
1370+
aiVoiceLanguageHint:
1371+
'Default language for audio transcription, such as zh or en. Leave blank to use the backend default.',
1372+
aiVoiceReplyWithText: 'Include Text with Voice Replies',
1373+
aiVoiceReplyWithTextHint: 'When sending a voice reply, also include the text version of the response.',
13511374
llmTestAction: 'Test Call',
13521375
llmTestSuccessToast: 'LLM test call succeeded',
13531376
llmTestFailedToast: 'LLM test call failed',

src/locales/zh-CN.ts

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1337,6 +1337,9 @@ export default {
13371337
llmSupportImageInput: '模型支持图片输入',
13381338
llmSupportImageInputHint:
13391339
'启用后,消息中的图片会按多模态图片发送给 LLM;关闭后图片会作为附件保存到本地,并将文件路径提供给智能助手处理',
1340+
llmSupportAudioInputOutput: '支持音频输入输出',
1341+
llmSupportAudioInputOutputHint:
1342+
'启用后,智能助手可以转写用户发送的音频消息,并在支持的渠道上回复语音',
13401343
llmMaxContextTokens: 'LLM 最大上下文 Token 数量 (K)',
13411344
llmMaxContextTokensHint:
13421345
'设定 LLM 记录会话历史的最大 Token 数量上限(千),超出后将自动修整历史记录以节省 Token 消耗及防止超出 LLM 限制',
@@ -1345,6 +1348,20 @@ export default {
13451348
llmApiKeyPlaceholder: '请输入API密钥',
13461349
llmBaseUrl: 'LLM基础URL',
13471350
llmBaseUrlHint: 'LLM API的基础URL地址,用于自定义API端点',
1351+
aiVoiceApiKey: '音频 API密钥',
1352+
aiVoiceApiKeyHint: '音频转写与语音合成使用的 API 密钥,留空时回退到当前 LLM API 密钥',
1353+
aiVoiceBaseUrl: '音频基础URL',
1354+
aiVoiceBaseUrlHint: '音频转写与语音合成接口的基础URL,留空时回退到当前 LLM 基础 URL',
1355+
aiVoiceSttModel: '音频转写模型',
1356+
aiVoiceSttModelHint: '用于将音频内容转换为文字的模型名称',
1357+
aiVoiceTtsModel: '语音合成模型',
1358+
aiVoiceTtsModelHint: '用于将文字内容转换为语音的模型名称',
1359+
aiVoiceTtsVoice: '语音音色',
1360+
aiVoiceTtsVoiceHint: '语音合成使用的发音人或音色标识',
1361+
aiVoiceLanguage: '识别语言',
1362+
aiVoiceLanguageHint: '音频转写默认语言,例如 zh、en,留空时按后端默认处理',
1363+
aiVoiceReplyWithText: '语音回复附带文字',
1364+
aiVoiceReplyWithTextHint: '发送语音回复时,同时附带一份文字内容',
13481365
llmTestAction: '测试调用',
13491366
llmTestSuccessToast: 'LLM 调用测试成功',
13501367
llmTestFailedToast: 'LLM 调用测试失败',

src/locales/zh-TW.ts

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1339,6 +1339,9 @@ export default {
13391339
llmSupportImageInput: '模型支援圖片輸入',
13401340
llmSupportImageInputHint:
13411341
'啟用後,消息中的圖片會按多模態圖片發送給 LLM;關閉後圖片會作為附件保存到本地,並將檔案路徑提供給智能助手處理',
1342+
llmSupportAudioInputOutput: '支援音頻輸入輸出',
1343+
llmSupportAudioInputOutputHint:
1344+
'啟用後,智能助手可以轉寫用戶發送的音頻消息,並在支援的渠道上回覆語音',
13421345
llmMaxContextTokens: 'LLM 最大上下文 Token 數量 (K)',
13431346
llmMaxContextTokensHint:
13441347
'設定 LLM 記錄會話歷史的最大 Token 數量上限(千),超出後將自動修整歷史記錄以節省 Token 消耗及防止超出 LLM 限制',
@@ -1347,6 +1350,20 @@ export default {
13471350
llmApiKeyPlaceholder: '請輸入API密鑰',
13481351
llmBaseUrl: 'LLM基礎URL',
13491352
llmBaseUrlHint: 'LLM API的基礎URL地址,用於自定義API端點',
1353+
aiVoiceApiKey: '音頻 API密鑰',
1354+
aiVoiceApiKeyHint: '音頻轉寫與語音合成使用的 API 密鑰,留空時回退到當前 LLM API 密鑰',
1355+
aiVoiceBaseUrl: '音頻基礎URL',
1356+
aiVoiceBaseUrlHint: '音頻轉寫與語音合成接口的基礎URL,留空時回退到當前 LLM 基礎 URL',
1357+
aiVoiceSttModel: '音頻轉寫模型',
1358+
aiVoiceSttModelHint: '用於將音頻內容轉換為文字的模型名稱',
1359+
aiVoiceTtsModel: '語音合成模型',
1360+
aiVoiceTtsModelHint: '用於將文字內容轉換為語音的模型名稱',
1361+
aiVoiceTtsVoice: '語音音色',
1362+
aiVoiceTtsVoiceHint: '語音合成使用的發音人或音色標識',
1363+
aiVoiceLanguage: '識別語言',
1364+
aiVoiceLanguageHint: '音頻轉寫預設語言,例如 zh、en,留空時按後端預設處理',
1365+
aiVoiceReplyWithText: '語音回覆附帶文字',
1366+
aiVoiceReplyWithTextHint: '發送語音回覆時,同時附帶一份文字內容',
13501367
llmTestAction: '測試調用',
13511368
llmTestSuccessToast: 'LLM 調用測試成功',
13521369
llmTestFailedToast: 'LLM 調用測試失敗',

src/views/setting/AccountSettingSystem.vue

Lines changed: 114 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -42,8 +42,16 @@ const SystemSettings = ref<any>({
4242
LLM_MODEL: 'deepseek-chat',
4343
LLM_THINKING_LEVEL: 'off',
4444
LLM_SUPPORT_IMAGE_INPUT: false,
45+
LLM_SUPPORT_AUDIO_INPUT_OUTPUT: false,
4546
LLM_API_KEY: null,
4647
LLM_BASE_URL: 'https://api.deepseek.com',
48+
AI_VOICE_API_KEY: null,
49+
AI_VOICE_BASE_URL: null,
50+
AI_VOICE_STT_MODEL: 'gpt-4o-mini-transcribe',
51+
AI_VOICE_TTS_MODEL: 'gpt-4o-mini-tts',
52+
AI_VOICE_TTS_VOICE: 'alloy',
53+
AI_VOICE_LANGUAGE: 'zh',
54+
AI_VOICE_REPLY_WITH_TEXT: false,
4755
AI_AGENT_RETRY_TRANSFER: false,
4856
AI_RECOMMEND_ENABLED: false,
4957
AI_RECOMMEND_USER_PREFERENCE: null,
@@ -1016,22 +1024,126 @@ watch(currentLlmSnapshotKey, (snapshotKey, previousSnapshotKey) => {
10161024
</VCol>
10171025
</VRow>
10181026
<VRow>
1019-
<VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12" md="6">
1027+
<VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12" md="4">
10201028
<VSwitch
10211029
v-model="SystemSettings.Basic.LLM_SUPPORT_IMAGE_INPUT"
10221030
:label="t('setting.system.llmSupportImageInput')"
10231031
:hint="t('setting.system.llmSupportImageInputHint')"
10241032
persistent-hint
10251033
/>
10261034
</VCol>
1027-
<VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12" md="6">
1035+
</VRow>
1036+
<VRow>
1037+
<VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12">
1038+
<VSwitch
1039+
v-model="SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
1040+
:label="t('setting.system.llmSupportAudioInputOutput')"
1041+
:hint="t('setting.system.llmSupportAudioInputOutputHint')"
1042+
persistent-hint
1043+
/>
1044+
</VCol>
1045+
<VCol
1046+
v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
1047+
cols="12"
1048+
md="6"
1049+
>
1050+
<VTextField
1051+
v-model="SystemSettings.Basic.AI_VOICE_API_KEY"
1052+
:label="t('setting.system.aiVoiceApiKey')"
1053+
:hint="t('setting.system.aiVoiceApiKeyHint')"
1054+
persistent-hint
1055+
prepend-inner-icon="mdi-key-variant"
1056+
type="password"
1057+
/>
1058+
</VCol>
1059+
<VCol
1060+
v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
1061+
cols="12"
1062+
md="6"
1063+
>
1064+
<VTextField
1065+
v-model="SystemSettings.Basic.AI_VOICE_BASE_URL"
1066+
:label="t('setting.system.aiVoiceBaseUrl')"
1067+
:hint="t('setting.system.aiVoiceBaseUrlHint')"
1068+
persistent-hint
1069+
prepend-inner-icon="mdi-link-variant"
1070+
/>
1071+
</VCol>
1072+
<VCol
1073+
v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
1074+
cols="12"
1075+
md="6"
1076+
>
1077+
<VTextField
1078+
v-model="SystemSettings.Basic.AI_VOICE_STT_MODEL"
1079+
:label="t('setting.system.aiVoiceSttModel')"
1080+
:hint="t('setting.system.aiVoiceSttModelHint')"
1081+
persistent-hint
1082+
prepend-inner-icon="mdi-waveform"
1083+
/>
1084+
</VCol>
1085+
<VCol
1086+
v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
1087+
cols="12"
1088+
md="6"
1089+
>
1090+
<VTextField
1091+
v-model="SystemSettings.Basic.AI_VOICE_TTS_MODEL"
1092+
:label="t('setting.system.aiVoiceTtsModel')"
1093+
:hint="t('setting.system.aiVoiceTtsModelHint')"
1094+
persistent-hint
1095+
prepend-inner-icon="mdi-waveform"
1096+
/>
1097+
</VCol>
1098+
<VCol
1099+
v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
1100+
cols="12"
1101+
md="6"
1102+
>
1103+
<VTextField
1104+
v-model="SystemSettings.Basic.AI_VOICE_TTS_VOICE"
1105+
:label="t('setting.system.aiVoiceTtsVoice')"
1106+
:hint="t('setting.system.aiVoiceTtsVoiceHint')"
1107+
persistent-hint
1108+
prepend-inner-icon="mdi-account-voice"
1109+
/>
1110+
</VCol>
1111+
<VCol
1112+
v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
1113+
cols="12"
1114+
md="6"
1115+
>
1116+
<VTextField
1117+
v-model="SystemSettings.Basic.AI_VOICE_LANGUAGE"
1118+
:label="t('setting.system.aiVoiceLanguage')"
1119+
:hint="t('setting.system.aiVoiceLanguageHint')"
1120+
persistent-hint
1121+
prepend-inner-icon="mdi-translate"
1122+
/>
1123+
</VCol>
1124+
<VCol
1125+
v-if="SystemSettings.Basic.AI_AGENT_ENABLE && SystemSettings.Basic.LLM_SUPPORT_AUDIO_INPUT_OUTPUT"
1126+
cols="12"
1127+
>
1128+
<VSwitch
1129+
v-model="SystemSettings.Basic.AI_VOICE_REPLY_WITH_TEXT"
1130+
:label="t('setting.system.aiVoiceReplyWithText')"
1131+
:hint="t('setting.system.aiVoiceReplyWithTextHint')"
1132+
persistent-hint
1133+
/>
1134+
</VCol>
1135+
</VRow>
1136+
<VRow>
1137+
<VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12">
10281138
<VSwitch
10291139
v-model="SystemSettings.Basic.AI_AGENT_RETRY_TRANSFER"
10301140
:label="t('setting.system.aiAgentRetryTransfer')"
10311141
:hint="t('setting.system.aiAgentRetryTransferHint')"
10321142
persistent-hint
10331143
/>
10341144
</VCol>
1145+
</VRow>
1146+
<VRow>
10351147
<VCol v-if="SystemSettings.Basic.AI_AGENT_ENABLE" cols="12">
10361148
<VSwitch
10371149
v-model="SystemSettings.Basic.AI_RECOMMEND_ENABLED"

0 commit comments

Comments
 (0)