1
+ import sys
2
+ sys .path .append ('third_party/Matcha-TTS' )
3
+ from cosyvoice .cli .cosyvoice import CosyVoice , CosyVoice2
4
+ from cosyvoice .utils .file_utils import load_wav
5
+ import torchaudio # type: ignore
6
+
# Instantiate the CosyVoice2 0.5B model from the local checkpoint directory.
# All acceleration paths are disabled here: no TorchScript JIT, no TensorRT,
# full-precision weights (fp16=False), and no streaming flow cache.
cosyvoice = CosyVoice2(
    'pretrained_models/CosyVoice2-0.5B',
    load_jit=False,
    load_trt=False,
    fp16=False,
    use_flow_cache=False,
)
# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2,
# please add text_frontend=False during inference.
# zero_shot usage: clone the voice from a 16 kHz prompt clip onto new text.
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
outputs = cosyvoice.inference_zero_shot(
    '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。',
    '希望你以后能够做的比我还好呦。',
    prompt_speech_16k,
    stream=False,
)
for idx, chunk in enumerate(outputs):
    torchaudio.save(f'zero_shot_{idx}.wav', chunk['tts_speech'], cosyvoice.sample_rate)
# save zero_shot spk for future usage: register the prompt under a speaker id
# so later inference calls can reference it instead of re-sending the prompt.
assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True
outputs = cosyvoice.inference_zero_shot(
    '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。',
    '',
    '',
    zero_shot_spk_id='my_zero_shot_spk',
    stream=False,
)
for idx, chunk in enumerate(outputs):
    torchaudio.save(f'zero_shot_{idx}.wav', chunk['tts_speech'], cosyvoice.sample_rate)
# Persist the registered speaker info to disk for future sessions.
cosyvoice.save_spkinfo()
# fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248
# Inline tags such as [laughter] in the text steer the synthesis.
outputs = cosyvoice.inference_cross_lingual(
    '在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。',
    prompt_speech_16k,
    stream=False,
)
for idx, chunk in enumerate(outputs):
    torchaudio.save(f'fine_grained_control_{idx}.wav', chunk['tts_speech'], cosyvoice.sample_rate)
# instruct usage: a natural-language instruction (here: speak in Sichuan dialect)
# conditions the synthesis alongside the speaker prompt.
outputs = cosyvoice.inference_instruct2(
    '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。',
    '用四川话说这句话',
    prompt_speech_16k,
    stream=False,
)
for idx, chunk in enumerate(outputs):
    torchaudio.save(f'instruct_{idx}.wav', chunk['tts_speech'], cosyvoice.sample_rate)
# bistream usage, you can use generator as input, this is useful when using text llm model as input
# NOTE you should still have some basic sentence split logic because llm can not handle arbitrary sentence length
def text_generator():
    """Yield the demo sentence as four pre-split text chunks, in order."""
    segments = (
        '收到好友从远方寄来的生日礼物,',
        '那份意外的惊喜与深深的祝福',
        '让我心中充满了甜蜜的快乐,',
        '笑容如花儿般绽放。',
    )
    yield from segments
# Feed the text in as a generator; the model consumes chunks incrementally.
outputs = cosyvoice.inference_zero_shot(
    text_generator(),
    '希望你以后能够做的比我还好呦。',
    prompt_speech_16k,
    stream=False,
)
for idx, chunk in enumerate(outputs):
    torchaudio.save(f'zero_shot_{idx}.wav', chunk['tts_speech'], cosyvoice.sample_rate)