Skip to content

Commit aa7dc93

Browse files
committed
feat: 完成字节跳动声音克隆功能集成
- 添加字节跳动声音克隆API调用 - 实现音频文件上传和训练状态查询 - 集成克隆声音的TTS合成功能 - 优化语音合成接口和错误处理
1 parent 032ea2e commit aa7dc93

File tree

5 files changed

+64
-28
lines changed

5 files changed

+64
-28
lines changed

src/components/ByteDanceRecorder.tsx

-4
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ export function ByteDanceRecorder({ onVoiceCloned }: ByteDanceRecorderProps) {
3232
const fileInputRef = useRef<HTMLInputElement>(null);
3333
const [currentVoiceId, setCurrentVoiceId] = useState<string | null>(null);
3434
const [trainingStatus, setTrainingStatus] = useState<VoiceStatusResponse | null>(null);
35-
const [isCheckingStatus, setIsCheckingStatus] = useState(false);
3635
const audioRef = useRef<HTMLAudioElement>(null);
3736
const [availableVoices, setAvailableVoices] = useState<VoiceInfo[]>([]);
3837
const [selectedVoiceId, setSelectedVoiceId] = useState<string>('');
@@ -91,7 +90,6 @@ export function ByteDanceRecorder({ onVoiceCloned }: ByteDanceRecorderProps) {
9190
}
9291

9392
try {
94-
setIsCheckingStatus(true);
9593
const status = await checkVoiceStatus(currentVoiceId);
9694
setTrainingStatus(status);
9795

@@ -100,8 +98,6 @@ export function ByteDanceRecorder({ onVoiceCloned }: ByteDanceRecorderProps) {
10098
}
10199
} catch (error) {
102100
logger.log(`Failed to check voice status: ${error}`, 'ERROR', ModelName);
103-
} finally {
104-
setIsCheckingStatus(false);
105101
}
106102
};
107103

src/components/InteractionInterface.tsx

+33-9
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import {
2727
AlertDialogHeader,
2828
AlertDialogTitle,
2929
} from "@/components/ui/alert-dialog";
30+
import { generateBytedanceSpeech } from '@/lib/bytedanceTts';
3031

3132
const ModelName = "InteractionInterface";
3233

@@ -59,7 +60,7 @@ export const InteractionInterface: React.FC = () => {
5960
deviceName: 'Default Device'
6061
});
6162
const [audioBuffer, setAudioBuffer] = useState<ArrayBuffer | null>(null);
62-
const [voices, setVoices] = useState<Array<{ id: string, name: string }>>(defaultVoices);
63+
const [voices, setVoices] = useState<Array<CustomVoice | { id: string; name: string; }>>(defaultVoices);
6364
const [selectedVoice, setSelectedVoice] = useState<string>(defaultVoices[0].id);
6465
const [isVoiceCloningOpen, setIsVoiceCloningOpen] = useState(false);
6566
const [voiceToDelete, setVoiceToDelete] = useState<string | null>(null);
@@ -167,13 +168,34 @@ export const InteractionInterface: React.FC = () => {
167168
try {
168169
logger.log(`Starting chat submission with prompt: ${prompt}`, 'INFO', ModelName);
169170

170-
// 生成回复
171171
const result = await generateResponse(prompt);
172172
logger.log(`Generated response: ${JSON.stringify(result)}`, 'INFO', ModelName);
173173
setResponse(result);
174174

175-
// 生成语音
176-
const audioBuffer = await generateSpeech(result.response, selectedVoice);
175+
const selectedVoiceConfig = voices.find(voice => voice.id === selectedVoice);
176+
if (!selectedVoiceConfig) {
177+
throw new Error('Selected voice not found');
178+
}
179+
180+
let audioBuffer: ArrayBuffer;
181+
if ('isCustom' in selectedVoiceConfig) {
182+
if (selectedVoiceConfig.provider === 'bytedance') {
183+
if (!selectedVoiceConfig.speakerId) {
184+
throw new Error('Speaker ID not found for bytedance voice');
185+
}
186+
audioBuffer = await generateBytedanceSpeech(result.response, selectedVoiceConfig.speakerId);
187+
} else {
188+
// Minimax case
189+
if (!selectedVoiceConfig.modelPath) {
190+
throw new Error('Model path not found for minimax voice');
191+
}
192+
throw new Error('Minimax TTS not implemented yet');
193+
}
194+
} else {
195+
// Default TTS case
196+
audioBuffer = await generateSpeech(result.response, selectedVoiceConfig.id);
197+
}
198+
177199
logger.log(`Generated speech buffer size: ${audioBuffer.byteLength}`, 'DEBUG', ModelName);
178200
setAudioBuffer(audioBuffer);
179201

@@ -232,20 +254,22 @@ export const InteractionInterface: React.FC = () => {
232254
const handleVoiceCloned = async (voiceId: string, name: string) => {
233255
try {
234256
const customVoice: CustomVoice = {
235-
id: voiceId,
257+
id: '', // 会在 addCustomVoice 中生成
236258
name: name,
237259
isCustom: true,
238-
originalVoiceId: voiceId
260+
provider: 'bytedance',
261+
originalVoiceId: voiceId,
262+
speakerId: voiceId,
263+
modelPath: ''
239264
};
240265

241-
await addCustomVoice(customVoice);
266+
const newId = await addCustomVoice(customVoice);
242267

243268
// 重新加载语音列表
244269
const voicesSetting = await db.settings.get('voices');
245270
if (voicesSetting) {
246271
setVoices(voicesSetting.value);
247-
// 自动选择新创建的语音
248-
setSelectedVoice(voiceId);
272+
setSelectedVoice(newId);
249273
}
250274
} catch (error) {
251275
logger.log(`Error adding cloned voice: ${error}`, 'ERROR', ModelName);

src/lib/bytedanceTts.ts

+25-13
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,16 @@ interface TTSRequest {
2525
voice_type: string;
2626
encoding: string;
2727
speed_ratio: number;
28+
volume_ratio: number;
29+
pitch_ratio: number;
2830
};
2931
request: {
3032
reqid: string;
3133
text: string;
34+
text_type: string;
3235
operation: string;
36+
with_frontend: number;
37+
frontend_type: string;
3338
};
3439
}
3540

@@ -66,48 +71,55 @@ async function getTTSSettings(): Promise<TTSSettings> {
6671
}
6772

6873
return {
69-
baseUrl: baseUrl.value,
74+
baseUrl: baseUrl.value || 'https://openspeech.bytedance.com',
7075
appId: appId.value,
7176
token: token.value,
7277
cluster: cluster.value
7378
};
7479
}
7580

76-
export async function generateSpeech(text: string, voiceId: string): Promise<ArrayBuffer> {
81+
export async function generateBytedanceSpeech(text: string, voiceId: string): Promise<ArrayBuffer> {
7782
try {
78-
logger.log(`Generating speech for text: ${text}`, 'INFO', ModelName);
83+
logger.log(`Generating ByteDance speech for text: ${text}`, 'INFO', ModelName);
7984
const settings = await getTTSSettings();
8085

81-
const reqId = nanoid();
82-
8386
const requestBody: TTSRequest = {
8487
app: {
8588
appid: settings.appId,
8689
token: settings.token,
8790
cluster: settings.cluster
8891
},
8992
user: {
90-
uid: reqId
93+
uid: nanoid() // 生成唯一ID
9194
},
9295
audio: {
9396
voice_type: voiceId,
9497
encoding: 'mp3',
95-
speed_ratio: 1.0
98+
speed_ratio: 1.0,
99+
volume_ratio: 1.0,
100+
pitch_ratio: 1.0
96101
},
97102
request: {
98-
reqid: reqId,
103+
reqid: nanoid(),
99104
text: text,
100-
operation: 'query'
105+
text_type: 'plain',
106+
operation: 'query',
107+
with_frontend: 1,
108+
frontend_type: 'unitTson'
101109
}
102110
};
103111

104-
const response = await invoke<string>('proxy_request', {
112+
const response = await invoke<string>('proxy_request_with_headers', {
105113
targetUrl: `${settings.baseUrl}/api/v1/tts`,
106114
method: 'POST',
115+
headers: {
116+
'Authorization': `Bearer;${settings.token}`,
117+
'Content-Type': 'application/json'
118+
},
107119
body: Array.from(new TextEncoder().encode(JSON.stringify(requestBody)))
108120
});
109121

110-
const result: TTSResponse = JSON.parse(response);
122+
const result = JSON.parse(response);
111123

112124
if (result.code !== 3000) {
113125
throw new Error(`TTS error: ${result.message}`);
@@ -120,11 +132,11 @@ export async function generateSpeech(text: string, voiceId: string): Promise<Arr
120132
bytes[i] = binaryString.charCodeAt(i);
121133
}
122134

123-
logger.log(`Speech generated successfully, duration: ${result.addition.duration}ms`, 'INFO', ModelName);
135+
logger.log(`Speech generated successfully`, 'INFO', ModelName);
124136
return bytes.buffer;
125137

126138
} catch (error) {
127-
logger.log(`Error generating speech: ${error}`, 'ERROR', ModelName);
139+
logger.log(`Error generating ByteDance speech: ${error}`, 'ERROR', ModelName);
128140
throw error;
129141
}
130142
}

src/lib/openai.ts

+2-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ export async function generateResponse(prompt: string) {
3939
logger.log(`Using model: ${modelName?.value || "gpt-3.5-turbo-0125"}`, 'INFO', ModelName);
4040
logger.log(`System prompt: ${systemPrompt?.value}`, 'INFO', ModelName);
4141

42-
const defaultSystemPrompt = "假设你是一个可以和人类对话的具身机器人,反应内容包括响应内容,以及对应的kaomoji表情和头部动作(双轴舵机转动参数)。以json格式返回,响应内容定义为response,表情定义为kaomoji,kaomoji表情要反映响应内容情感。与表情对应的头部动作水平角度(无需单位)为servoX,范围是10~170,面向正前方是90。与表情对应的头部动作垂直角度(无需单位)为servoY,范围是10~170,水平面是90。";
42+
const defaultSystemPrompt = "假设你是一个可以和人类对话的具身机器人,返回内容包括响应内容,以及对应的kaomoji表情和头部动作(双轴舵机转动参数)。以json格式返回,响应内容定义为response,表情定义为kaomoji,kaomoji表情要反映响应内容情感。与表情对应的头部动作水平角度(无需单位)为servoX,范围是10~170,面向正前方是90。与表情对应的头部动作垂直角度(无需单位)为servoY,范围是10~170,水平面是90。";
4343

4444
const response = await openai.chat.completions.create({
4545
model: modelName?.value || "gpt-3.5-turbo-0125",
@@ -51,6 +51,7 @@ export async function generateResponse(prompt: string) {
5151
});
5252

5353
logger.log(`Response received from OpenAI`, 'INFO', ModelName);
54+
logger.log(`Raw response content: ${JSON.stringify(response)}`, 'INFO', ModelName);
5455
const parsedResponse = JSON.parse(response.choices[0].message.content || '{}');
5556
logger.log(`Parsed response: ${JSON.stringify(parsedResponse)}`, 'INFO', ModelName);
5657

src/lib/voiceSettings.ts

+4-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ export interface CustomVoice {
44
id: string;
55
name: string;
66
isCustom: boolean;
7+
provider: 'bytedance' | 'minimax';
78
originalVoiceId: string;
9+
speakerId?: string;
810
modelPath?: string;
911
}
1012

@@ -83,7 +85,8 @@ export async function addCustomVoice(customVoice: CustomVoice) {
8385
const existingVoices = await db.settings.get('voices');
8486
const currentVoices = existingVoices?.value || defaultVoices;
8587

86-
const uniqueId = `custom-${Date.now()}-${customVoice.id}`;
88+
const prefix = customVoice.provider === 'bytedance' ? 'bd' : 'mx';
89+
const uniqueId = `${prefix}-${Date.now()}-${customVoice.originalVoiceId}`;
8790
customVoice.id = uniqueId;
8891

8992
const updatedVoices = [...currentVoices, customVoice];

0 commit comments

Comments (0)