30
30
# Step-by-step usage text for each inference mode, keyed by the mode's display
# name (presumably looked up by change_instruction -- confirm against caller).
instruct_dict = {
    '预训练音色': '1. 选择预训练音色\n 2. 点击生成音频按钮',
    '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n 2. 输入prompt文本\n 3. 点击生成音频按钮',
    '跨语种复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n 2. 点击生成音频按钮',
    '自然语言控制': '1. 选择预训练音色(v2模型需要选择或录入prompt音频) \n 2. 输入instruct文本\n 3. 点击生成音频按钮',
}
# (label, value) pairs for the streaming on/off choice.
stream_mode_list = [('否', False), ('是', True)]
max_val = 0.8
# Set to 'v1' or 'v2' at startup, depending on which model class loads
# successfully; stays None until then.
model_versions = None
37
37
38
38
def generate_seed ():
39
39
seed = random .randint (1 , 100000000 )
@@ -61,6 +61,10 @@ def change_instruction(mode_checkbox_group):
61
61
62
62
def generate_audio (tts_text , mode_checkbox_group , sft_dropdown , prompt_text , prompt_wav_upload , prompt_wav_record , instruct_text ,
63
63
seed , stream , speed ):
64
# The v2 model is treated as non-streaming here: downgrade a streaming
# request to non-streaming and tell the user, instead of failing later.
if model_versions == 'v2' and stream:
    stream = False
    gr.Warning('您正在使用v2版本模型, 不支持流式推理, 将使用非流式模式.')
64
68
if prompt_wav_upload is not None :
65
69
prompt_wav = prompt_wav_upload
66
70
elif prompt_wav_record is not None :
@@ -69,13 +73,13 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
69
73
prompt_wav = None
70
74
# if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
71
75
if mode_checkbox_group in ['自然语言控制' ]:
72
- if cosyvoice .instruct is False :
76
+ if cosyvoice .instruct is False and model_versions == 'v1' :
73
77
gr .Warning ('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型' .format (args .model_dir ))
74
78
yield (cosyvoice .sample_rate , default_data )
75
79
if instruct_text == '' :
76
80
gr .Warning ('您正在使用自然语言控制模式, 请输入instruct文本' )
77
81
yield (cosyvoice .sample_rate , default_data )
78
- if prompt_wav is not None or prompt_text != '' :
82
+ if prompt_wav is not None or prompt_text != '' and model_versions == 'v1' :
79
83
gr .Info ('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略' )
80
84
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
81
85
if mode_checkbox_group in ['跨语种复刻' ]:
@@ -128,12 +132,20 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
128
132
set_all_random_seed (seed )
129
133
for i in cosyvoice .inference_cross_lingual (tts_text , prompt_speech_16k , stream = stream , speed = speed ):
130
134
yield (cosyvoice .sample_rate , i ['tts_speech' ].numpy ().flatten ())
131
- else :
135
+ elif mode_checkbox_group == '自然语言控制' :
132
136
logging .info ('get instruct inference request' )
133
137
set_all_random_seed (seed )
134
- for i in cosyvoice .inference_instruct (tts_text , sft_dropdown , instruct_text , stream = stream , speed = speed ):
135
- yield (cosyvoice .sample_rate , i ['tts_speech' ].numpy ().flatten ())
136
-
138
+ if model_versions == 'v1' :
139
+ for i in cosyvoice .inference_instruct (tts_text , sft_dropdown , instruct_text , stream = stream , speed = speed ):
140
+ yield (cosyvoice .sample_rate , i ['tts_speech' ].numpy ().flatten ())
141
+ elif model_versions == 'v2' :
142
+ prompt_speech_16k = postprocess (load_wav (prompt_wav , prompt_sr ))
143
+ for i in cosyvoice .inference_instruct2 (tts_text , instruct_text , prompt_speech_16k , stream = stream ):
144
+ yield (cosyvoice .sample_rate , i ['tts_speech' ].numpy ().flatten ())
145
+ else :
146
+ gr .Warning ('非预期的模型版本!' )
147
+ else :
148
+ gr .Warning ('非预期的选项!' )
137
149
138
150
def main ():
139
151
with gr .Blocks () as demo :
@@ -186,9 +198,11 @@ def main():
186
198
args = parser.parse_args()
# Try the v1 model class first and fall back to v2; record which one loaded in
# model_versions so generate_audio can branch on it.
# NOTE(review): this appears to run at module scope (generate_audio reads args
# and cosyvoice as globals); if it is ever moved into a function,
# model_versions needs a `global` declaration to stay visible there.
try:
    cosyvoice = CosyVoice(args.model_dir)
    model_versions = 'v1'
except Exception:
    try:
        cosyvoice = CosyVoice2(args.model_dir)
        model_versions = 'v2'
    except Exception as err:
        # Neither class could load the directory; chain the cause explicitly.
        raise TypeError('no valid model_type!') from err
194
208
0 commit comments