From da25078e5a6092399888e8ac12589b5c57033071 Mon Sep 17 00:00:00 2001 From: KrugerCoder Date: Fri, 6 Sep 2024 13:23:40 -0500 Subject: [PATCH 1/7] Add i8n for multi-language support --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 4189c5f5..b56429d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,3 +31,4 @@ wget==3.2 fastapi==0.111.0 fastapi-cli==0.0.4 WeTextProcessing==1.0.3 +i8n==0.3.9 From 83133e86a47faba963f11d2a588df42af1cbc73c Mon Sep 17 00:00:00 2001 From: KrugerCoder Date: Fri, 6 Sep 2024 14:40:50 -0500 Subject: [PATCH 2/7] Add translations using i18n library --- webui.py | 128 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 65 insertions(+), 63 deletions(-) diff --git a/webui.py b/webui.py index 2099ffb1..4ad6d9fd 100644 --- a/webui.py +++ b/webui.py @@ -13,6 +13,9 @@ # limitations under the License. import os import sys +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) + import argparse import gradio as gr import numpy as np @@ -20,18 +23,16 @@ import torchaudio import random import librosa -ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) -sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) +import i18n from cosyvoice.cli.cosyvoice import CosyVoice -from cosyvoice.utils.file_utils import load_wav, logging - -inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制'] -instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮', - '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮', - '跨语种复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 点击生成音频按钮', - '自然语言控制': '1. 选择预训练音色\n2. 输入instruct文本\n3. 
点击生成音频按钮'} -stream_mode_list = [('否', False), ('是', True)] -max_val = 0.8 +from cosyvoice.utils.file_utils import load_wav, speed_change, logging + +# Load available languages +i18n.load_path.append('./locales/') +i18n.set('file_format', 'json') +i18n.set('filename_format', '{locale}.{format}') + + def generate_seed(): @@ -41,14 +42,13 @@ def generate_seed(): "value": seed } - def set_all_random_seed(seed): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) - +max_val = 0.8 def postprocess(speech, top_db=60, hop_length=220, win_length=440): speech, _ = librosa.effects.trim( speech, top_db=top_db, @@ -60,13 +60,16 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440): speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1) return speech - +inference_mode_list = [i18n.t('inference_mode_list.pretrained_voice'), i18n.t('inference_mode_list.3s_fast_replication'), i18n.t('inference_mode_list.crosslingual'), i18n.t('inference_mode_list.natural_language_control')] +instruct_dict = {i18n.t('inference_mode_list.pretrained_voice'): i18n.t('instruct_dict.pretrained_voice'), + i18n.t('inference_mode_list.3s_fast_replication'): i18n.t('instruct_dict.3s_fast_replication'), + i18n.t('inference_mode_list.crosslingual'): i18n.t('instruct_dict.crosslingual'), + i18n.t('inference_mode_list.natural_language_control'): i18n.t('instruct_dict.natural_language_control')} +stream_mode_list = [(i18n.t('boolean.false'), False), (i18n.t('boolean.true'), True)] def change_instruction(mode_checkbox_group): return instruct_dict[mode_checkbox_group] - -def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, - seed, stream, speed_factor): +def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, stream, speed_factor): if prompt_wav_upload is not None: prompt_wav = 
prompt_wav_upload elif prompt_wav_record is not None: @@ -76,107 +79,101 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode if mode_checkbox_group in ['自然语言控制']: if cosyvoice.frontend.instruct is False: - gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir)) - yield (target_sr, default_data) + gr.Warning(i18n.t('warnings.nlp_model_warn').format(args.model_dir)) + return (target_sr, default_data) if instruct_text == '': - gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本') - yield (target_sr, default_data) + gr.Warning(i18n.t('warnings.instruct_text')) + return (target_sr, default_data) if prompt_wav is not None or prompt_text != '': - gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略') + gr.Info(i18n.t('info.prompt_wav')) # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language if mode_checkbox_group in ['跨语种复刻']: if cosyvoice.frontend.instruct is True: - gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir)) - yield (target_sr, default_data) + gr.Warning(i18n.t('warnings.no_crosslingual_support').format(args.model_dir)) + return (target_sr, default_data) if instruct_text != '': - gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略') + gr.Info(i18n.t('warnings.crosslingual_instruct_ignored')) if prompt_wav is None: - gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频') - yield (target_sr, default_data) - gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言') + gr.Warning(i18n.t('warnings.crosslingual_prompt_audio_required')) + return (target_sr, default_data) + gr.Info(i18n.t('info.crosslingual_language_reminder')) # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']: if prompt_wav is None: - gr.Warning('prompt音频为空,您是否忘记输入prompt音频?') - yield 
(target_sr, default_data) + gr.Warning(i18n.t('warnings.prompt_audio_empty')) + return (target_sr, default_data) if torchaudio.info(prompt_wav).sample_rate < prompt_sr: - gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr)) - yield (target_sr, default_data) + gr.Warning(i18n.t('warnings.sample_rate_error').format(torchaudio.info(prompt_wav).sample_rate, prompt_sr)) + return (target_sr, default_data) # sft mode only use sft_dropdown if mode_checkbox_group in ['预训练音色']: if instruct_text != '' or prompt_wav is not None or prompt_text != '': - gr.Info('您正在使用预训练音色模式,prompt文本/prompt音频/instruct文本会被忽略!') + gr.Info(i18n.t('info.pretrained_voice_warning')) # zero_shot mode only use prompt_wav prompt text if mode_checkbox_group in ['3s极速复刻']: if prompt_text == '': - gr.Warning('prompt文本为空,您是否忘记输入prompt文本?') - yield (target_sr, default_data) + gr.Warning(i18n.t('warnings.prompt_text_empty')) + return (target_sr, default_data) if instruct_text != '': - gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!') + gr.Info(i18n.t('info.instruct_text_empty')) if mode_checkbox_group == '预训练音色': logging.info('get sft inference request') set_all_random_seed(seed) for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (target_sr, i['tts_speech'].numpy().flatten()) elif mode_checkbox_group == '3s极速复刻': logging.info('get zero_shot inference request') prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (target_sr, i['tts_speech'].numpy().flatten()) elif mode_checkbox_group == '跨语种复刻': logging.info('get cross_lingual inference request') prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) for i in cosyvoice.inference_cross_lingual(tts_text, 
prompt_speech_16k, stream=stream): - yield (target_sr, i['tts_speech'].numpy().flatten()) + yield (target_sr, i['tts_speech'].numpy().flatten()) else: logging.info('get instruct inference request') set_all_random_seed(seed) for i in cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream): - yield (target_sr, i['tts_speech'].numpy().flatten()) - + yield (target_sr, i['tts_speech'].numpy().flatten()) def main(): with gr.Blocks() as demo: - gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \ - 预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \ - [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \ - [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)") - gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作") - - tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。") - speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True) + gr.Markdown(i18n.t('markdown.code_reference')) + gr.Markdown(i18n.t('markdown.output_text_prompt')) + + tts_text = gr.Textbox(label=i18n.t('input_label.enter_synthesis_text'), lines=1, value=i18n.t('placeholders.enter_synthesis_text')) + speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label=i18n.t('input_label.speed_adjustment'), value=1.0, interactive=True) with gr.Row(): - mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0]) - instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5) - sft_dropdown = gr.Dropdown(choices=sft_spk, label='选择预训练音色', value=sft_spk[0], scale=0.25) - stream = gr.Radio(choices=stream_mode_list, label='是否流式推理', value=stream_mode_list[0][1]) + mode_checkbox_group = gr.Radio(choices=inference_mode_list, label=i18n.t('input_label.select_inference_mode_radio'), value=inference_mode_list[0]) + 
instruction_text = gr.Text(label=i18n.t('input_label.instruction_text'), value=instruct_dict[inference_mode_list[0]], scale=0.5) + sft_dropdown = gr.Dropdown(choices=sft_spk, label=i18n.t('input_label.sft_dropdown'), value=sft_spk[0], scale=0.25) + stream = gr.Radio(choices=stream_mode_list, label=i18n.t('input_label.stream'), value=stream_mode_list[0][1]) with gr.Column(scale=0.25): seed_button = gr.Button(value="\U0001F3B2") - seed = gr.Number(value=0, label="随机推理种子") + seed = gr.Number(value=0, label=i18n.t('input_label.seed_number')) with gr.Row(): - prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件,注意采样率不低于16khz') - prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件') - prompt_text = gr.Textbox(label="输入prompt文本", lines=1, placeholder="请输入prompt文本,需与prompt音频内容一致,暂时不支持自动识别...", value='') - instruct_text = gr.Textbox(label="输入instruct文本", lines=1, placeholder="请输入instruct文本.", value='') + prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label=i18n.t('input_label.prompt_wav_upload')) + prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label=i18n.t('input_label.prompt_wav_record')) + prompt_text = gr.Textbox(label=i18n.t('input_label.prompt_text'), lines=1, placeholder=i18n.t('placeholders.prompt_text'), value='') + instruct_text = gr.Textbox(label=i18n.t('input_label.instruct_text'), lines=1, placeholder=i18n.t('placeholders.instruct_text'), value='') - generate_button = gr.Button("生成音频") + generate_button = gr.Button(i18n.t('input_label.generate_button')) - audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=True) + audio_output = gr.Audio(label=i18n.t('input_label.audio_output'), autoplay=True, streaming=True) seed_button.click(generate_seed, inputs=[], outputs=seed) generate_button.click(generate_audio, - inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, - seed, stream, 
speed_factor], + inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, stream, speed_factor], outputs=[audio_output]) mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text]) demo.queue(max_size=4, default_concurrency_limit=2) demo.launch(server_name='0.0.0.0', server_port=args.port) - if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--port', @@ -186,7 +183,12 @@ def main(): type=str, default='pretrained_models/CosyVoice-300M', help='local path or modelscope repo id') + parser.add_argument('--locale', + type=str, + default='zh', + help='language locale') args = parser.parse_args() + i18n.set('locale', args.locale) cosyvoice = CosyVoice(args.model_dir) sft_spk = cosyvoice.list_avaliable_spks() prompt_sr, target_sr = 16000, 22050 From a3b030aeaf6d7b7aa7c46f7621bd788c24f0fe98 Mon Sep 17 00:00:00 2001 From: KrugerCoder Date: Fri, 6 Sep 2024 14:41:07 -0500 Subject: [PATCH 3/7] Create en.json --- locales/en.json | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 locales/en.json diff --git a/locales/en.json b/locales/en.json new file mode 100644 index 00000000..a6a0d734 --- /dev/null +++ b/locales/en.json @@ -0,0 +1,60 @@ +{ + "en" : { + "inference_mode_list": { + "pretrained_voice": "Pretrained Voice", + "3s_fast_replication": "3s Fast Replication", + "crosslingual": "Crosslingual Replication", + "natural_language_control": "Natural Language Control" + }, + "instruct_dict": { + "pretrained_voice": "1. Select pretrained voice\n2. Click the generate audio button", + "3s_fast_replication": "1. Select prompt audio file or record prompt audio (not exceeding 30s). If both are provided, prompt audio file will be prioritized.\n2. Enter prompt text\n3. Click the generate audio button", + "crosslingual": "1. 
Select prompt audio file or record prompt audio (not exceeding 30s). If both are provided, prompt audio file will be prioritized.\n2. Click the generate audio button", + "natural_language_control": "1. Select pretrained voice\n2. Enter instruct text\n3. Click the generate audio button" + }, + "boolean": { + "true": "Yes", + "false": "No" + }, + "warnings": { + "nlp_model_warn": "You are using the natural language control mode, {} model does not support this mode, please use the iic/CosyVoice-300M-Instruct model", + "instruct_text": "You are using the natural language control mode, please enter instruct text", + "no_crosslingual_support": "You are using the cross-lingual replication mode, {} model does not support this mode, please use the iic/CosyVoice-300M model", + "crosslingual_instruct_ignored": "You are using the cross-lingual replication mode, instruct text will be ignored", + "crosslingual_prompt_audio_required": "You are using the cross-lingual replication mode, please provide prompt audio", + "prompt_audio_empty": "Prompt audio is empty, did you forget to input prompt audio?", + "sample_rate_error": "Prompt audio sample rate {} is lower than {}", + "prompt_text_empty": "Prompt text is empty, did you forget to input prompt text?", + "instruct_text_empty": "You are using the 3s fast replication mode, pretrained voice/instruct text will be ignored!" + }, + "info": { + "prompt_wav": "You are using the natural language control mode, prompt audio/prompt text will be ignored", + "crosslingual_language_reminder": "You are using the cross-lingual replication mode, please ensure that the synthesis text and prompt text are in different languages", + "pretrained_voice_warning": "You are using the pretrained voice mode, prompt text/prompt audio/instruct text will be ignored!" 
+ }, + "markdown": { + "code_reference": "### Code repository [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) Pretrained models [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)", + "output_text_prompt": "#### Please enter the text to be synthesized, select the inference mode, and follow the prompts to proceed" + }, + "input_label": { + "enter_synthesis_text" : "Enter synthesis text", + "speed_adjustment": "Speed adjustment", + "select_inference_mode_radio": "Select inference mode", + "instruction_text": "Instructions", + "sft_dropdown": "Select pre-trained voice", + "stream": "Enable streaming", + "seed_number": "Random seed for inference", + "prompt_wav_upload": "Select prompt audio file, with a minimum sample rate of 16kHz", + "prompt_wav_record": "Record prompt audio file", + "prompt_text": "Enter prompt text", + "instruct_text": "Enter instruct text", + "generate_button": "Generate Audio", + "audio_output": "Synthesized Audio" + }, + "placeholders": { + "enter_synthesis_text": "I am a state-of-the-art generative speech model developed by the CosyVoice team, providing comfortable and natural speech synthesis capabilities.", + "prompt_text": "Please enter prompt text, it should match the content of the prompt audio and automatic recognition is not supported at the moment...", + "instruct_text": "Please enter instruct text." 
+ } + } +} From 966864d86866edb07aeff3adeb3d953b3d447b83 Mon Sep 17 00:00:00 2001 From: KrugerCoder Date: Fri, 6 Sep 2024 14:41:19 -0500 Subject: [PATCH 4/7] Create zh.json --- locales/zh.json | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 locales/zh.json diff --git a/locales/zh.json b/locales/zh.json new file mode 100644 index 00000000..93579d06 --- /dev/null +++ b/locales/zh.json @@ -0,0 +1,61 @@ +{ + "zh": { + "inference_mode_list": { + "pretrained_voice": "预训练音色", + "3s_fast_replication": "3s极速复刻", + "crosslingual": "跨语种复刻", + "natural_language_control": "自然语言控制" + }, + "instruct_dict": { + "pretrained_voice": "1. 选择预训练音色\n2. 点击生成音频按钮", + "3s_fast_replication": "1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮", + "crosslingual": "1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 点击生成音频按钮", + "natural_language_control": "1. 选择预训练音色\n2. 输入instruct文本\n3. 点击生成音频按钮" + }, + "boolean": { + "true": "是", + "false": "否" + }, + "warnings": { + "nlp_model_warn": "您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型", + "instruct_text": "您正在使用自然语言控制模式, 请输入instruct文本", + "no_crosslingual_support": "您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型", + "crosslingual_instruct_ignored": "您正在使用跨语种复刻模式, instruct文本会被忽略", + "crosslingual_prompt_audio_required": "您正在使用跨语种复刻模式, 请提供prompt音频", + "prompt_audio_empty": "prompt音频为空,您是否忘记输入prompt音频?", + "sample_rate_error": "prompt音频采样率{}低于{}", + "prompt_text_empty": "prompt文本为空,您是否忘记输入prompt文本?", + "instruct_text_empty": "您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!" + }, + "info": { + "prompt_wav": "您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略", + "crosslingual_language_reminder": "您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言", + "pretrained_voice_warning": "您正在使用预训练音色模式,prompt文本/prompt音频/instruct文本会被忽略!" 
+ }, + "markdown": { + "code_reference": "### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)", + "output_text_prompt": "#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作" + }, + "input_label": { + "enter_synthesis_text" : "输入合成文本", + "speed_adjustment": "语速调节", + "select_inference_mode_radio": "选择推理模式", + "instruction_text": "操作步骤", + "sft_dropdown": "选择预训练音色", + "stream": "是否流式推理", + "seed_number": "随机推理种子", + "prompt_wav_upload": "选择prompt音频文件,注意采样率不低于16khz", + "prompt_wav_record": "录制prompt音频文件", + "prompt_text": "输入prompt文本", + "instruct_text": "输入instruct文本", + "generate_button": "生成音频", + "audio_output": "合成音频" + }, + "placeholders": { + "enter_synthesis_text": "我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。", + "prompt_text": "请输入prompt文本,需与prompt音频内容一致,暂时不支持自动识别...", + "instruct_text": "请输入instruct文本." + } + } + +} From 732b4e293ed9378db9af1a4b8bd740b6a0108438 Mon Sep 17 00:00:00 2001 From: KrugerCoder Date: Fri, 6 Sep 2024 14:42:18 -0500 Subject: [PATCH 5/7] Update README.md Add locale in demo website example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a376225e..12dd24a7 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ Please see the demo website for details. 
``` python # change iic/CosyVoice-300M-SFT for sft inference, or iic/CosyVoice-300M-Instruct for instruct inference -python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M +python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M --locale zh ``` **Advanced Usage** From c0f0a5816e532d7a750824503bcbe12017c2edae Mon Sep 17 00:00:00 2001 From: KrugerCoder Date: Fri, 6 Sep 2024 14:52:40 -0500 Subject: [PATCH 6/7] Update webui.py Fix error/warning/info section to correspond to locale settings --- webui.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/webui.py b/webui.py index 4ad6d9fd..26ab799c 100644 --- a/webui.py +++ b/webui.py @@ -77,7 +77,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro else: prompt_wav = None # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode - if mode_checkbox_group in ['自然语言控制']: + if mode_checkbox_group in [i18n.t('inference_mode_list.natural_language_control')]: if cosyvoice.frontend.instruct is False: gr.Warning(i18n.t('warnings.nlp_model_warn').format(args.model_dir)) return (target_sr, default_data) @@ -87,7 +87,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro if prompt_wav is not None or prompt_text != '': gr.Info(i18n.t('info.prompt_wav')) # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language - if mode_checkbox_group in ['跨语种复刻']: + if mode_checkbox_group in [i18n.t('inference_mode_list.crosslingual')]: if cosyvoice.frontend.instruct is True: gr.Warning(i18n.t('warnings.no_crosslingual_support').format(args.model_dir)) return (target_sr, default_data) @@ -98,7 +98,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro return (target_sr, default_data) gr.Info(i18n.t('info.crosslingual_language_reminder')) # if in zero_shot cross_lingual, 
please make sure that prompt_text and prompt_wav meets requirements - if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']: + if mode_checkbox_group in [i18n.t('inference_mode_list.3s_fast_replication'), i18n.t('inference_mode_list.crosslingual')]: if prompt_wav is None: gr.Warning(i18n.t('warnings.prompt_audio_empty')) return (target_sr, default_data) @@ -106,29 +106,29 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro gr.Warning(i18n.t('warnings.sample_rate_error').format(torchaudio.info(prompt_wav).sample_rate, prompt_sr)) return (target_sr, default_data) # sft mode only use sft_dropdown - if mode_checkbox_group in ['预训练音色']: + if mode_checkbox_group in [i18n.t('inference_mode_list.pretrained_voice')]: if instruct_text != '' or prompt_wav is not None or prompt_text != '': gr.Info(i18n.t('info.pretrained_voice_warning')) # zero_shot mode only use prompt_wav prompt text - if mode_checkbox_group in ['3s极速复刻']: + if mode_checkbox_group in [i18n.t('inference_mode_list.3s_fast_replication')]: if prompt_text == '': gr.Warning(i18n.t('warnings.prompt_text_empty')) return (target_sr, default_data) if instruct_text != '': gr.Info(i18n.t('info.instruct_text_empty')) - if mode_checkbox_group == '预训练音色': + if mode_checkbox_group == i18n.t('inference_mode_list.pretrained_voice'): logging.info('get sft inference request') set_all_random_seed(seed) for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream): yield (target_sr, i['tts_speech'].numpy().flatten()) - elif mode_checkbox_group == '3s极速复刻': + elif mode_checkbox_group == i18n.t('inference_mode_list.3s_fast_replication'): logging.info('get zero_shot inference request') prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream): yield (target_sr, i['tts_speech'].numpy().flatten()) - elif mode_checkbox_group == '跨语种复刻': + elif mode_checkbox_group == 
i18n.t('inference_mode_list.crosslingual'): logging.info('get cross_lingual inference request') prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr)) set_all_random_seed(seed) From c3517fde363afa83c53a1afc3e7033f44169c1dc Mon Sep 17 00:00:00 2001 From: KrugerCoder Date: Fri, 6 Sep 2024 15:12:51 -0500 Subject: [PATCH 7/7] Fix requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b56429d8..370a2105 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,4 +31,4 @@ wget==3.2 fastapi==0.111.0 fastapi-cli==0.0.4 WeTextProcessing==1.0.3 -i8n==0.3.9 +python-i18n==0.3.9