Skip to content

Commit 587604b

Browse files
committed
fix inference_instruct2 speaker ID bug
1 parent e97cd1b commit 587604b

File tree

3 files changed

+41
-4
lines changed

3 files changed

+41
-4
lines changed

cosyvoice/cli/cosyvoice.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -177,10 +177,10 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, use_fl
177177
def inference_instruct(self, *args, **kwargs):
    """Unsupported on CosyVoice2: the v1 instruct API was replaced by inference_instruct2.

    Raises:
        NotImplementedError: always, regardless of arguments.
    """
    message = 'inference_instruct is not implemented for CosyVoice2!'
    raise NotImplementedError(message)
179179

180-
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0, text_frontend=True):
180+
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
181181
assert isinstance(self.model, CosyVoice2Model), 'inference_instruct2 is only implemented for CosyVoice2!'
182182
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
183-
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
183+
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
184184
start_time = time.time()
185185
logging.info('synthesis text {}'.format(i))
186186
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):

cosyvoice/cli/frontend.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -196,8 +196,8 @@ def frontend_instruct(self, tts_text, spk_id, instruct_text):
196196
model_input['prompt_text_len'] = instruct_text_token_len
197197
return model_input
198198

199-
def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
    """Build model inputs for instruct2 synthesis.

    Delegates to the zero-shot frontend with the instruction text terminated by
    the '<|endofprompt|>' control token, then strips the LLM prompt speech
    tokens, which instruct2 inference does not consume.

    Args:
        tts_text: text to synthesize.
        instruct_text: natural-language style instruction.
        prompt_speech_16k: 16 kHz reference speech for the prompt speaker.
        resample_rate: target sample rate for the flow-side prompt features.
        zero_shot_spk_id: id of a previously cached speaker ('' for none).

    Returns:
        dict of model inputs, same as frontend_zero_shot minus the
        llm_prompt_speech_token entries.
    """
    model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate, zero_shot_spk_id)
    # instruct2 conditions the LLM on the instruction text only, so the
    # prompt speech tokens must not be fed to it.
    for unused_key in ('llm_prompt_speech_token', 'llm_prompt_speech_token_len'):
        model_input.pop(unused_key)
    return model_input

test1.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Demo script exercising the CosyVoice2 CLI end to end: zero-shot voice
# cloning, cached-speaker reuse, fine-grained control tokens, and
# instruct2 synthesis. Each loop writes one wav per synthesized segment.
import sys
sys.path.append('third_party/Matcha-TTS')  # vendored Matcha-TTS is imported internally by cosyvoice
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio  # type: ignore

# Load the 0.5B CosyVoice2 checkpoint with all acceleration options disabled.
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False, use_flow_cache=False)

# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
# zero_shot usage
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)  # 16 kHz reference audio for cloning
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)

# save zero_shot spk for future usage
assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True
# Reuse the cached speaker: prompt text/audio are passed empty here, with the
# speaker selected via zero_shot_spk_id instead.
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
cosyvoice.save_spkinfo()  # persist the cached speaker so later runs can reuse it

# fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248
for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)):
    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)

# instruct usage
for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)

# bistream usage, you can use generator as input, this is useful when using text llm model as input
# NOTE you should still have some basic sentence split logic because llm can not handle arbitrary sentence length
31+
def text_generator():
    """Yield the demo sentence in four pre-split chunks, simulating streamed LLM output."""
    chunks = (
        '收到好友从远方寄来的生日礼物,',
        '那份意外的惊喜与深深的祝福',
        '让我心中充满了甜蜜的快乐,',
        '笑容如花儿般绽放。',
    )
    yield from chunks
36+
# Feed the generator directly as tts_text so synthesis can begin before the
# full text is available (bistream usage).
for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)

0 commit comments

Comments
 (0)