Skip to content

Commit aa7dc93

Browse files
committed
feat: 完成字节跳动声音克隆功能集成
- 添加字节跳动声音克隆API调用 - 实现音频文件上传和训练状态查询 - 集成克隆声音的TTS合成功能 - 优化语音合成接口和错误处理
1 parent 032ea2e commit aa7dc93

File tree

5 files changed

+64
-28
lines changed

5 files changed

+64
-28
lines changed

src/components/ByteDanceRecorder.tsx

-4
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ export function ByteDanceRecorder({ onVoiceCloned }: ByteDanceRecorderProps) {
3232
const fileInputRef = useRef<HTMLInputElement>(null);
3333
const [currentVoiceId, setCurrentVoiceId] = useState<string | null>(null);
3434
const [trainingStatus, setTrainingStatus] = useState<VoiceStatusResponse | null>(null);
35-
const [isCheckingStatus, setIsCheckingStatus] = useState(false);
3635
const audioRef = useRef<HTMLAudioElement>(null);
3736
const [availableVoices, setAvailableVoices] = useState<VoiceInfo[]>([]);
3837
const [selectedVoiceId, setSelectedVoiceId] = useState<string>('');
@@ -91,7 +90,6 @@ export function ByteDanceRecorder({ onVoiceCloned }: ByteDanceRecorderProps) {
9190
}
9291

9392
try {
94-
setIsCheckingStatus(true);
9593
const status = await checkVoiceStatus(currentVoiceId);
9694
setTrainingStatus(status);
9795

@@ -100,8 +98,6 @@ export function ByteDanceRecorder({ onVoiceCloned }: ByteDanceRecorderProps) {
10098
}
10199
} catch (error) {
102100
logger.log(`Failed to check voice status: ${error}`, 'ERROR', ModelName);
103-
} finally {
104-
setIsCheckingStatus(false);
105101
}
106102
};
107103

src/components/InteractionInterface.tsx

+33-9
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import {
2727
AlertDialogHeader,
2828
AlertDialogTitle,
2929
} from "@/components/ui/alert-dialog";
30+
import { generateBytedanceSpeech } from '@/lib/bytedanceTts';
3031

3132
const ModelName = "InteractionInterface";
3233

@@ -59,7 +60,7 @@ export const InteractionInterface: React.FC = () => {
5960
deviceName: 'Default Device'
6061
});
6162
const [audioBuffer, setAudioBuffer] = useState<ArrayBuffer | null>(null);
62-
const [voices, setVoices] = useState<Array<{ id: string, name: string }>>(defaultVoices);
63+
const [voices, setVoices] = useState<Array<CustomVoice | { id: string; name: string; }>>(defaultVoices);
6364
const [selectedVoice, setSelectedVoice] = useState<string>(defaultVoices[0].id);
6465
const [isVoiceCloningOpen, setIsVoiceCloningOpen] = useState(false);
6566
const [voiceToDelete, setVoiceToDelete] = useState<string | null>(null);
@@ -167,13 +168,34 @@ export const InteractionInterface: React.FC = () => {
167168
try {
168169
logger.log(`Starting chat submission with prompt: ${prompt}`, 'INFO', ModelName);
169170

170-
// 生成回复
171171
const result = await generateResponse(prompt);
172172
logger.log(`Generated response: ${JSON.stringify(result)}`, 'INFO', ModelName);
173173
setResponse(result);
174174

175-
// 生成语音
176-
const audioBuffer = await generateSpeech(result.response, selectedVoice);
175+
const selectedVoiceConfig = voices.find(voice => voice.id === selectedVoice);
176+
if (!selectedVoiceConfig) {
177+
throw new Error('Selected voice not found');
178+
}
179+
180+
let audioBuffer: ArrayBuffer;
181+
if ('isCustom' in selectedVoiceConfig) {
182+
if (selectedVoiceConfig.provider === 'bytedance') {
183+
if (!selectedVoiceConfig.speakerId) {
184+
throw new Error('Speaker ID not found for bytedance voice');
185+
}
186+
audioBuffer = await generateBytedanceSpeech(result.response, selectedVoiceConfig.speakerId);
187+
} else {
188+
// Minimax case
189+
if (!selectedVoiceConfig.modelPath) {
190+
throw new Error('Model path not found for minimax voice');
191+
}
192+
throw new Error('Minimax TTS not implemented yet');
193+
}
194+
} else {
195+
// Default TTS case
196+
audioBuffer = await generateSpeech(result.response, selectedVoiceConfig.id);
197+
}
198+
177199
logger.log(`Generated speech buffer size: ${audioBuffer.byteLength}`, 'DEBUG', ModelName);
178200
setAudioBuffer(audioBuffer);
179201

@@ -232,20 +254,22 @@ export const InteractionInterface: React.FC = () => {
232254
const handleVoiceCloned = async (voiceId: string, name: string) => {
233255
try {
234256
const customVoice: CustomVoice = {
235-
id: voiceId,
257+
id: '', // 会在 addCustomVoice 中生成
236258
name: name,
237259
isCustom: true,
238-
originalVoiceId: voiceId
260+
provider: 'bytedance',
261+
originalVoiceId: voiceId,
262+
speakerId: voiceId,
263+
modelPath: ''
239264
};
240265

241-
await addCustomVoice(customVoice);
266+
const newId = await addCustomVoice(customVoice);
242267

243268
// 重新加载语音列表
244269
const voicesSetting = await db.settings.get('voices');
245270
if (voicesSetting) {
246271
setVoices(voicesSetting.value);
247-
// 自动选择新创建的语音
248-
setSelectedVoice(voiceId);
272+
setSelectedVoice(newId);
249273
}
250274
} catch (error) {
251275
logger.log(`Error adding cloned voice: ${error}`, 'ERROR', ModelName);

src/lib/bytedanceTts.ts

+25-13
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,16 @@ interface TTSRequest {
2525
voice_type: string;
2626
encoding: string;
2727
speed_ratio: number;
28+
volume_ratio: number;
29+
pitch_ratio: number;
2830
};
2931
request: {
3032
reqid: string;
3133
text: string;
34+
text_type: string;
3235
operation: string;
36+
with_frontend: number;
37+
frontend_type: string;
3338
};
3439
}
3540

@@ -66,48 +71,55 @@ async function getTTSSettings(): Promise<TTSSettings> {
6671
}
6772

6873
return {
69-
baseUrl: baseUrl.value,
74+
baseUrl: baseUrl.value || 'https://openspeech.bytedance.com',
7075
appId: appId.value,
7176
token: token.value,
7277
cluster: cluster.value
7378
};
7479
}
7580

76-
export async function generateSpeech(text: string, voiceId: string): Promise<ArrayBuffer> {
81+
export async function generateBytedanceSpeech(text: string, voiceId: string): Promise<ArrayBuffer> {
7782
try {
78-
logger.log(`Generating speech for text: ${text}`, 'INFO', ModelName);
83+
logger.log(`Generating ByteDance speech for text: ${text}`, 'INFO', ModelName);
7984
const settings = await getTTSSettings();
8085

81-
const reqId = nanoid();
82-
8386
const requestBody: TTSRequest = {
8487
app: {
8588
appid: settings.appId,
8689
token: settings.token,
8790
cluster: settings.cluster
8891
},
8992
user: {
90-
uid: reqId
93+
uid: nanoid() // 生成唯一ID
9194
},
9295
audio: {
9396
voice_type: voiceId,
9497
encoding: 'mp3',
95-
speed_ratio: 1.0
98+
speed_ratio: 1.0,
99+
volume_ratio: 1.0,
100+
pitch_ratio: 1.0
96101
},
97102
request: {
98-
reqid: reqId,
103+
reqid: nanoid(),
99104
text: text,
100-
operation: 'query'
105+
text_type: 'plain',
106+
operation: 'query',
107+
with_frontend: 1,
108+
frontend_type: 'unitTson'
101109
}
102110
};
103111

104-
const response = await invoke<string>('proxy_request', {
112+
const response = await invoke<string>('proxy_request_with_headers', {
105113
targetUrl: `${settings.baseUrl}/api/v1/tts`,
106114
method: 'POST',
115+
headers: {
116+
'Authorization': `Bearer;${settings.token}`,
117+
'Content-Type': 'application/json'
118+
},
107119
body: Array.from(new TextEncoder().encode(JSON.stringify(requestBody)))
108120
});
109121

110-
const result: TTSResponse = JSON.parse(response);
122+
const result = JSON.parse(response);
111123

112124
if (result.code !== 3000) {
113125
throw new Error(`TTS error: ${result.message}`);
@@ -120,11 +132,11 @@ export async function generateSpeech(text: string, voiceId: string): Promise<Arr
120132
bytes[i] = binaryString.charCodeAt(i);
121133
}
122134

123-
logger.log(`Speech generated successfully, duration: ${result.addition.duration}ms`, 'INFO', ModelName);
135+
logger.log(`Speech generated successfully`, 'INFO', ModelName);
124136
return bytes.buffer;
125137

126138
} catch (error) {
127-
logger.log(`Error generating speech: ${error}`, 'ERROR', ModelName);
139+
logger.log(`Error generating ByteDance speech: ${error}`, 'ERROR', ModelName);
128140
throw error;
129141
}
130142
}

src/lib/openai.ts

+2-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ export async function generateResponse(prompt: string) {
3939
logger.log(`Using model: ${modelName?.value || "gpt-3.5-turbo-0125"}`, 'INFO', ModelName);
4040
logger.log(`System prompt: ${systemPrompt?.value}`, 'INFO', ModelName);
4141

42-
const defaultSystemPrompt = "假设你是一个可以和人类对话的具身机器人,反应内容包括响应内容,以及对应的kaomoji表情和头部动作(双轴舵机转动参数)。以json格式返回,响应内容定义为response,表情定义为kaomoji,kaomoji表情要反映响应内容情感。与表情对应的头部动作水平角度(无需单位)为servoX,范围是10~170,面向正前方是90。与表情对应的头部动作垂直角度(无需单位)为servoY,范围是10~170,水平面是90。";
42+
const defaultSystemPrompt = "假设你是一个可以和人类对话的具身机器人,返回内容包括响应内容,以及对应的kaomoji表情和头部动作(双轴舵机转动参数)。以json格式返回,响应内容定义为response,表情定义为kaomoji,kaomoji表情要反映响应内容情感。与表情对应的头部动作水平角度(无需单位)为servoX,范围是10~170,面向正前方是90。与表情对应的头部动作垂直角度(无需单位)为servoY,范围是10~170,水平面是90。";
4343

4444
const response = await openai.chat.completions.create({
4545
model: modelName?.value || "gpt-3.5-turbo-0125",
@@ -51,6 +51,7 @@ export async function generateResponse(prompt: string) {
5151
});
5252

5353
logger.log(`Response received from OpenAI`, 'INFO', ModelName);
54+
logger.log(`Raw response content: ${JSON.stringify(response)}`, 'INFO', ModelName);
5455
const parsedResponse = JSON.parse(response.choices[0].message.content || '{}');
5556
logger.log(`Parsed response: ${JSON.stringify(parsedResponse)}`, 'INFO', ModelName);
5657

src/lib/voiceSettings.ts

+4-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ export interface CustomVoice {
44
id: string;
55
name: string;
66
isCustom: boolean;
7+
provider: 'bytedance' | 'minimax';
78
originalVoiceId: string;
9+
speakerId?: string;
810
modelPath?: string;
911
}
1012

@@ -83,7 +85,8 @@ export async function addCustomVoice(customVoice: CustomVoice) {
8385
const existingVoices = await db.settings.get('voices');
8486
const currentVoices = existingVoices?.value || defaultVoices;
8587

86-
const uniqueId = `custom-${Date.now()}-${customVoice.id}`;
88+
const prefix = customVoice.provider === 'bytedance' ? 'bd' : 'mx';
89+
const uniqueId = `${prefix}-${Date.now()}-${customVoice.originalVoiceId}`;
8790
customVoice.id = uniqueId;
8891

8992
const updatedVoices = [...currentVoices, customVoice];

0 commit comments

Comments (0)