# Step-by-step usage text for each inference mode, keyed by the mode label
# (the same labels generate_audio compares mode_checkbox_group against).
instruct_dict = {
    '预训练音色': '1. 选择预训练音色\n 2. 点击生成音频按钮',
    '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n 2. 输入prompt文本\n 3. 点击生成音频按钮',
    '跨语种复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n 2. 点击生成音频按钮',
    '自然语言控制': '1. 选择预训练音色(v2模型需要选择或录入prompt音频) \n 2. 输入instruct文本\n 3. 点击生成音频按钮',
}
# (label, value) pairs for the streaming on/off choice.
# NOTE(review): presumably fed to a UI selector; the consumer is not visible here.
stream_mode_list = [('否', False), ('是', True)]
# NOTE(review): usage of max_val is not visible in this excerpt.
max_val = 0.8
# Model generation tag; starts unset and is assigned 'v1' or 'v2' once a model
# class has been constructed successfully. generate_audio branches on it.
model_versions = None
36
37
37
38
38
39
def generate_seed ():
@@ -61,6 +62,10 @@ def change_instruction(mode_checkbox_group):
61
62
62
63
def generate_audio (tts_text , mode_checkbox_group , sft_dropdown , prompt_text , prompt_wav_upload , prompt_wav_record , instruct_text ,
63
64
seed , stream , speed ):
65
+ if model_versions == 'v2' :
66
+ if stream :
67
+ stream = False
68
+ gr .Warning ('您正在使用v2版本模型, 不支持流式推理, 将使用非流式模式.' )
64
69
if prompt_wav_upload is not None :
65
70
prompt_wav = prompt_wav_upload
66
71
elif prompt_wav_record is not None :
@@ -69,13 +74,13 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
69
74
prompt_wav = None
70
75
# if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
71
76
if mode_checkbox_group in ['自然语言控制' ]:
72
- if cosyvoice .instruct is False :
77
+ if cosyvoice .instruct is False and model_versions == 'v1' :
73
78
gr .Warning ('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型' .format (args .model_dir ))
74
79
yield (cosyvoice .sample_rate , default_data )
75
80
if instruct_text == '' :
76
81
gr .Warning ('您正在使用自然语言控制模式, 请输入instruct文本' )
77
82
yield (cosyvoice .sample_rate , default_data )
78
- if prompt_wav is not None or prompt_text != '' :
83
+ if ( prompt_wav is not None or prompt_text != '' ) and model_versions == 'v1 ' :
79
84
gr .Info ('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略' )
80
85
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
81
86
if mode_checkbox_group in ['跨语种复刻' ]:
@@ -128,11 +133,20 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
128
133
set_all_random_seed (seed )
129
134
for i in cosyvoice .inference_cross_lingual (tts_text , prompt_speech_16k , stream = stream , speed = speed ):
130
135
yield (cosyvoice .sample_rate , i ['tts_speech' ].numpy ().flatten ())
131
- else :
136
+ elif mode_checkbox_group == '自然语言控制' :
132
137
logging .info ('get instruct inference request' )
133
138
set_all_random_seed (seed )
134
- for i in cosyvoice .inference_instruct (tts_text , sft_dropdown , instruct_text , stream = stream , speed = speed ):
135
- yield (cosyvoice .sample_rate , i ['tts_speech' ].numpy ().flatten ())
139
+ if model_versions == 'v1' :
140
+ for i in cosyvoice .inference_instruct (tts_text , sft_dropdown , instruct_text , stream = stream , speed = speed ):
141
+ yield (cosyvoice .sample_rate , i ['tts_speech' ].numpy ().flatten ())
142
+ elif model_versions == 'v2' :
143
+ prompt_speech_16k = postprocess (load_wav (prompt_wav , prompt_sr ))
144
+ for i in cosyvoice .inference_instruct2 (tts_text , instruct_text , prompt_speech_16k , stream = stream ):
145
+ yield (cosyvoice .sample_rate , i ['tts_speech' ].numpy ().flatten ())
146
+ else :
147
+ gr .Warning ('非预期的模型版本!' )
148
+ else :
149
+ gr .Warning ('非预期的选项!' )
136
150
137
151
138
152
def main ():
@@ -186,9 +200,11 @@ def main():
186
200
args = parser .parse_args ()
187
201
try :
188
202
cosyvoice = CosyVoice (args .model_dir )
203
+ model_versions = 'v1'
189
204
except Exception :
190
205
try :
191
206
cosyvoice = CosyVoice2 (args .model_dir )
207
+ model_versions = 'v2'
192
208
except Exception :
193
209
raise TypeError ('no valid model_type!' )
194
210
0 commit comments