From da25078e5a6092399888e8ac12589b5c57033071 Mon Sep 17 00:00:00 2001
From: KrugerCoder <KrugerCoder@joshmwilliams.com>
Date: Fri, 6 Sep 2024 13:23:40 -0500
Subject: [PATCH 1/7] Add i18n for multi-language support

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 4189c5f5..b56429d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,3 +31,4 @@ wget==3.2
 fastapi==0.111.0
 fastapi-cli==0.0.4
 WeTextProcessing==1.0.3
+i8n==0.3.9

From 83133e86a47faba963f11d2a588df42af1cbc73c Mon Sep 17 00:00:00 2001
From: KrugerCoder <KrugerCoder@joshmwilliams.com>
Date: Fri, 6 Sep 2024 14:40:50 -0500
Subject: [PATCH 2/7] Add translations using i18n library

---
 webui.py | 128 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 65 insertions(+), 63 deletions(-)

diff --git a/webui.py b/webui.py
index 2099ffb1..4ad6d9fd 100644
--- a/webui.py
+++ b/webui.py
@@ -13,6 +13,9 @@
 # limitations under the License.
 import os
 import sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
+
 import argparse
 import gradio as gr
 import numpy as np
@@ -20,18 +23,16 @@
 import torchaudio
 import random
 import librosa
-ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
-sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
+import i18n 
 from cosyvoice.cli.cosyvoice import CosyVoice
-from cosyvoice.utils.file_utils import load_wav, logging
-
-inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']
-instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮',
-                 '3s极速复刻': '1. 选择prompt音频文件，或录入prompt音频，注意不超过30s，若同时提供，优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮',
-                 '跨语种复刻': '1. 选择prompt音频文件，或录入prompt音频，注意不超过30s，若同时提供，优先选择prompt音频文件\n2. 点击生成音频按钮',
-                 '自然语言控制': '1. 选择预训练音色\n2. 输入instruct文本\n3. 点击生成音频按钮'}
-stream_mode_list = [('否', False), ('是', True)]
-max_val = 0.8
+from cosyvoice.utils.file_utils import load_wav, speed_change, logging
+
+# Load available languages 
+i18n.load_path.append('./locales/')
+i18n.set('file_format', 'json')
+i18n.set('filename_format', '{locale}.{format}')
+
+
 
 
 def generate_seed():
@@ -41,14 +42,13 @@ def generate_seed():
         "value": seed
     }
 
-
 def set_all_random_seed(seed):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
 
-
+max_val = 0.8
 def postprocess(speech, top_db=60, hop_length=220, win_length=440):
     speech, _ = librosa.effects.trim(
         speech, top_db=top_db,
@@ -60,13 +60,16 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440):
     speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
     return speech
 
-
+inference_mode_list = [i18n.t('inference_mode_list.pretrained_voice'), i18n.t('inference_mode_list.3s_fast_replication'), i18n.t('inference_mode_list.crosslingual'), i18n.t('inference_mode_list.natural_language_control')]
+instruct_dict = {i18n.t('inference_mode_list.pretrained_voice'): i18n.t('instruct_dict.pretrained_voice'),
+                 i18n.t('inference_mode_list.3s_fast_replication'): i18n.t('instruct_dict.3s_fast_replication'),
+                 i18n.t('inference_mode_list.crosslingual'): i18n.t('instruct_dict.crosslingual'),
+                 i18n.t('inference_mode_list.natural_language_control'): i18n.t('instruct_dict.natural_language_control')}
+stream_mode_list = [(i18n.t('boolean.false'), False), (i18n.t('boolean.true'), True)]
 def change_instruction(mode_checkbox_group):
     return instruct_dict[mode_checkbox_group]
 
-
-def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
-                   seed, stream, speed_factor):
+def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, stream, speed_factor):
     if prompt_wav_upload is not None:
         prompt_wav = prompt_wav_upload
     elif prompt_wav_record is not None:
@@ -76,107 +79,101 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
     # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
     if mode_checkbox_group in ['自然语言控制']:
         if cosyvoice.frontend.instruct is False:
-            gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(args.model_dir))
-            yield (target_sr, default_data)
+            gr.Warning(i18n.t('warnings.nlp_model_warn').format(args.model_dir))
+            return (target_sr, default_data)
         if instruct_text == '':
-            gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
-            yield (target_sr, default_data)
+            gr.Warning(i18n.t('warnings.instruct_text'))
+            return (target_sr, default_data)
         if prompt_wav is not None or prompt_text != '':
-            gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略')
+            gr.Info(i18n.t('info.prompt_wav'))
     # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
     if mode_checkbox_group in ['跨语种复刻']:
         if cosyvoice.frontend.instruct is True:
-            gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir))
-            yield (target_sr, default_data)
+            gr.Warning(i18n.t('warnings.no_crosslingual_support').format(args.model_dir))
+            return (target_sr, default_data)
         if instruct_text != '':
-            gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
+            gr.Info(i18n.t('warnings.crosslingual_instruct_ignored'))
         if prompt_wav is None:
-            gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频')
-            yield (target_sr, default_data)
-        gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言')
+            gr.Warning(i18n.t('warnings.crosslingual_prompt_audio_required'))
+            return (target_sr, default_data)
+        gr.Info(i18n.t('info.crosslingual_language_reminder'))
     # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
     if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
         if prompt_wav is None:
-            gr.Warning('prompt音频为空，您是否忘记输入prompt音频？')
-            yield (target_sr, default_data)
+            gr.Warning(i18n.t('warnings.prompt_audio_empty'))
+            return (target_sr, default_data)
         if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
-            gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
-            yield (target_sr, default_data)
+            gr.Warning(i18n.t('warnings.sample_rate_error').format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
+            return (target_sr, default_data)
     # sft mode only use sft_dropdown
     if mode_checkbox_group in ['预训练音色']:
         if instruct_text != '' or prompt_wav is not None or prompt_text != '':
-            gr.Info('您正在使用预训练音色模式，prompt文本/prompt音频/instruct文本会被忽略！')
+            gr.Info(i18n.t('info.pretrained_voice_warning'))
     # zero_shot mode only use prompt_wav prompt text
     if mode_checkbox_group in ['3s极速复刻']:
         if prompt_text == '':
-            gr.Warning('prompt文本为空，您是否忘记输入prompt文本？')
-            yield (target_sr, default_data)
+            gr.Warning(i18n.t('warnings.prompt_text_empty'))
+            return (target_sr, default_data)
         if instruct_text != '':
-            gr.Info('您正在使用3s极速复刻模式，预训练音色/instruct文本会被忽略！')
+            gr.Info(i18n.t('info.instruct_text_empty'))
 
     if mode_checkbox_group == '预训练音色':
         logging.info('get sft inference request')
         set_all_random_seed(seed)
         for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
+            yield (target_sr,  i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == '3s极速复刻':
         logging.info('get zero_shot inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
         for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
+            yield (target_sr,  i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == '跨语种复刻':
         logging.info('get cross_lingual inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
         for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
+            yield (target_sr,  i['tts_speech'].numpy().flatten())
     else:
         logging.info('get instruct inference request')
         set_all_random_seed(seed)
         for i in cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text, stream=stream):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
-
+            yield (target_sr,  i['tts_speech'].numpy().flatten())
 
 def main():
     with gr.Blocks() as demo:
-        gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
-                    预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
-                    [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
-                    [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
-        gr.Markdown("#### 请输入需要合成的文本，选择推理模式，并按照提示步骤进行操作")
-
-        tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型，提供舒适自然的语音合成能力。")
-        speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True)
+        gr.Markdown(i18n.t('markdown.code_reference'))
+        gr.Markdown(i18n.t('markdown.output_text_prompt'))
+
+        tts_text = gr.Textbox(label=i18n.t('input_label.enter_synthesis_text'), lines=1, value=i18n.t('placeholders.enter_synthesis_text'))
+        speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label=i18n.t('input_label.speed_adjustment'), value=1.0, interactive=True)
         with gr.Row():
-            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
-            instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
-            sft_dropdown = gr.Dropdown(choices=sft_spk, label='选择预训练音色', value=sft_spk[0], scale=0.25)
-            stream = gr.Radio(choices=stream_mode_list, label='是否流式推理', value=stream_mode_list[0][1])
+            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label=i18n.t('input_label.select_inference_mode_radio'), value=inference_mode_list[0])
+            instruction_text = gr.Text(label=i18n.t('input_label.instruction_text'), value=instruct_dict[inference_mode_list[0]], scale=0.5)
+            sft_dropdown = gr.Dropdown(choices=sft_spk, label=i18n.t('input_label.sft_dropdown'), value=sft_spk[0], scale=0.25)
+            stream = gr.Radio(choices=stream_mode_list, label=i18n.t('input_label.stream'), value=stream_mode_list[0][1])
             with gr.Column(scale=0.25):
                 seed_button = gr.Button(value="\U0001F3B2")
-                seed = gr.Number(value=0, label="随机推理种子")
+                seed = gr.Number(value=0, label=i18n.t('input_label.seed_number'))
 
         with gr.Row():
-            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件，注意采样率不低于16khz')
-            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件')
-        prompt_text = gr.Textbox(label="输入prompt文本", lines=1, placeholder="请输入prompt文本，需与prompt音频内容一致，暂时不支持自动识别...", value='')
-        instruct_text = gr.Textbox(label="输入instruct文本", lines=1, placeholder="请输入instruct文本.", value='')
+            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label=i18n.t('input_label.prompt_wav_upload'))
+            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label=i18n.t('input_label.prompt_wav_record'))
+        prompt_text = gr.Textbox(label=i18n.t('input_label.prompt_text'), lines=1, placeholder=i18n.t('placeholders.prompt_text'), value='')
+        instruct_text = gr.Textbox(label=i18n.t('input_label.instruct_text'), lines=1, placeholder=i18n.t('placeholders.instruct_text'), value='')
 
-        generate_button = gr.Button("生成音频")
+        generate_button = gr.Button(i18n.t('input_label.generate_button'))
 
-        audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=True)
+        audio_output = gr.Audio(label=i18n.t('input_label.audio_output'), autoplay=True, streaming=True)
 
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,
-                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
-                                      seed, stream, speed_factor],
+                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, stream, speed_factor],
                               outputs=[audio_output])
         mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
     demo.queue(max_size=4, default_concurrency_limit=2)
     demo.launch(server_name='0.0.0.0', server_port=args.port)
 
-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--port',
@@ -186,7 +183,12 @@ def main():
                         type=str,
                         default='pretrained_models/CosyVoice-300M',
                         help='local path or modelscope repo id')
+    parser.add_argument('--locale',
+                        type=str,
+                        default='zh',
+                        help='language locale')
     args = parser.parse_args()
+    i18n.set('locale', args.locale)
     cosyvoice = CosyVoice(args.model_dir)
     sft_spk = cosyvoice.list_avaliable_spks()
     prompt_sr, target_sr = 16000, 22050

From a3b030aeaf6d7b7aa7c46f7621bd788c24f0fe98 Mon Sep 17 00:00:00 2001
From: KrugerCoder <KrugerCoder@joshmwilliams.com>
Date: Fri, 6 Sep 2024 14:41:07 -0500
Subject: [PATCH 3/7] Create en.json

---
 locales/en.json | 60 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 locales/en.json

diff --git a/locales/en.json b/locales/en.json
new file mode 100644
index 00000000..a6a0d734
--- /dev/null
+++ b/locales/en.json
@@ -0,0 +1,60 @@
+{
+    "en" : {
+        "inference_mode_list": {
+            "pretrained_voice": "Pretrained Voice", 
+            "3s_fast_replication": "3s Fast Replication", 
+            "crosslingual": "Crosslingual Replication", 
+            "natural_language_control": "Natural Language Control"
+        },
+        "instruct_dict": {
+            "pretrained_voice": "1. Select pretrained voice\n2. Click the generate audio button", 
+            "3s_fast_replication": "1. Select prompt audio file or record prompt audio (not exceeding 30s). If both are provided, prompt audio file will be prioritized.\n2. Enter prompt text\n3. Click the generate audio button", 
+            "crosslingual": "1. Select prompt audio file or record prompt audio (not exceeding 30s). If both are provided, prompt audio file will be prioritized.\n2. Click the generate audio button", 
+            "natural_language_control": "1. Select pretrained voice\n2. Enter instruct text\n3. Click the generate audio button"
+        },
+        "boolean": {
+            "true": "Yes",
+            "false": "No"
+        },
+        "warnings": {
+            "nlp_model_warn": "You are using natural language control mode, but the {} model does not support this mode. Please use the iic/CosyVoice-300M-Instruct model",
+            "instruct_text": "You are using the natural language control mode, please enter instruct text",
+            "no_crosslingual_support": "You are using cross-lingual replication mode, but the {} model does not support this mode. Please use the iic/CosyVoice-300M model",
+            "crosslingual_instruct_ignored": "You are using the cross-lingual replication mode, instruct text will be ignored",
+            "crosslingual_prompt_audio_required": "You are using the cross-lingual replication mode, please provide prompt audio",
+            "prompt_audio_empty": "Prompt audio is empty, did you forget to input prompt audio?",
+            "sample_rate_error": "Prompt audio sample rate {} is lower than {}",
+            "prompt_text_empty": "Prompt text is empty, did you forget to input prompt text?",
+            "instruct_text_empty": "You are using the 3s fast replication mode, pretrained voice/instruct text will be ignored!"
+        },
+        "info": {
+            "prompt_wav": "You are using the natural language control mode, prompt audio/prompt text will be ignored",
+            "crosslingual_language_reminder": "You are using the cross-lingual replication mode, please ensure that the synthesis text and prompt text are in different languages",
+            "pretrained_voice_warning": "You are using the pretrained voice mode, prompt text/prompt audio/instruct text will be ignored!"
+        },
+        "markdown": {
+            "code_reference": "### Code repository [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) Pretrained models [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)",
+            "output_text_prompt": "#### Please enter the text to be synthesized, select the inference mode, and follow the prompts to proceed"
+        },
+        "input_label": {
+            "enter_synthesis_text" : "Enter synthesis text",
+            "speed_adjustment": "Speed adjustment",
+            "select_inference_mode_radio": "Select inference mode",
+            "instruction_text": "Instructions",
+            "sft_dropdown": "Select pretrained voice",
+            "stream": "Enable streaming",
+            "seed_number": "Random seed for inference",
+            "prompt_wav_upload": "Select prompt audio file, with a minimum sample rate of 16kHz",
+            "prompt_wav_record": "Record prompt audio file",
+            "prompt_text": "Enter prompt text",
+            "instruct_text": "Enter instruct text",
+            "generate_button": "Generate Audio",
+            "audio_output": "Synthesized Audio"
+        },
+        "placeholders": {
+            "enter_synthesis_text": "I am a state-of-the-art generative speech model developed by the CosyVoice team, providing comfortable and natural speech synthesis capabilities.",
+            "prompt_text": "Please enter the prompt text. It must match the content of the prompt audio; automatic recognition is not supported at the moment...",
+            "instruct_text": "Please enter instruct text."
+        }
+    }
+}

From 966864d86866edb07aeff3adeb3d953b3d447b83 Mon Sep 17 00:00:00 2001
From: KrugerCoder <KrugerCoder@joshmwilliams.com>
Date: Fri, 6 Sep 2024 14:41:19 -0500
Subject: [PATCH 4/7] Create zh.json

---
 locales/zh.json | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 locales/zh.json

diff --git a/locales/zh.json b/locales/zh.json
new file mode 100644
index 00000000..93579d06
--- /dev/null
+++ b/locales/zh.json
@@ -0,0 +1,61 @@
+{
+    "zh": {
+        "inference_mode_list": {
+            "pretrained_voice": "预训练音色", 
+            "3s_fast_replication": "3s极速复刻", 
+            "crosslingual": "跨语种复刻", 
+            "natural_language_control": "自然语言控制"
+        },
+        "instruct_dict": {
+            "pretrained_voice": "1. 选择预训练音色\n2. 点击生成音频按钮", 
+            "3s_fast_replication": "1. 选择prompt音频文件，或录入prompt音频，注意不超过30s，若同时提供，优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮", 
+            "crosslingual": "1. 选择prompt音频文件，或录入prompt音频，注意不超过30s，若同时提供，优先选择prompt音频文件\n2. 点击生成音频按钮", 
+            "natural_language_control": "1. 选择预训练音色\n2. 输入instruct文本\n3. 点击生成音频按钮"
+        },
+        "boolean": {
+            "true": "是",
+            "false": "否"
+        },
+        "warnings": {
+            "nlp_model_warn": "您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型",
+            "instruct_text": "您正在使用自然语言控制模式, 请输入instruct文本",
+            "no_crosslingual_support": "您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型",
+            "crosslingual_instruct_ignored": "您正在使用跨语种复刻模式, instruct文本会被忽略",
+            "crosslingual_prompt_audio_required": "您正在使用跨语种复刻模式, 请提供prompt音频",
+            "prompt_audio_empty": "prompt音频为空，您是否忘记输入prompt音频？",
+            "sample_rate_error": "prompt音频采样率{}低于{}",
+            "prompt_text_empty": "prompt文本为空，您是否忘记输入prompt文本？",
+            "instruct_text_empty": "您正在使用3s极速复刻模式，预训练音色/instruct文本会被忽略！"
+        },
+        "info": {
+            "prompt_wav": "您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略",
+            "crosslingual_language_reminder": "您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言",
+            "pretrained_voice_warning": "您正在使用预训练音色模式，prompt文本/prompt音频/instruct文本会被忽略！"
+        },
+        "markdown": {
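+            "code_reference": "### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)",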
+            "output_text_prompt": "#### 请输入需要合成的文本，选择推理模式，并按照提示步骤进行操作"
+        },
+        "input_label": {
+            "enter_synthesis_text" : "输入合成文本",
+            "speed_adjustment": "语速调节",
+            "select_inference_mode_radio": "选择推理模式",
+            "instruction_text": "操作步骤",
+            "sft_dropdown": "选择预训练音色",
+            "stream": "是否流式推理",
+            "seed_number": "随机推理种子",
+            "prompt_wav_upload": "选择prompt音频文件，注意采样率不低于16khz",
+            "prompt_wav_record": "录制prompt音频文件",
+            "prompt_text": "输入prompt文本",
+            "instruct_text": "输入instruct文本",
+            "generate_button": "生成音频",
+            "audio_output": "合成音频"
+        },
+        "placeholders": {
+            "enter_synthesis_text": "我是通义实验室语音团队全新推出的生成式语音大模型，提供舒适自然的语音合成能力。",
+            "prompt_text": "请输入prompt文本，需与prompt音频内容一致，暂时不支持自动识别...",
+            "instruct_text": "请输入instruct文本."
+        }
+    }
+
+}

From 732b4e293ed9378db9af1a4b8bd740b6a0108438 Mon Sep 17 00:00:00 2001
From: KrugerCoder <KrugerCoder@joshmwilliams.com>
Date: Fri, 6 Sep 2024 14:42:18 -0500
Subject: [PATCH 5/7] Update README.md

Add locale in demo website example
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a376225e..12dd24a7 100644
--- a/README.md
+++ b/README.md
@@ -146,7 +146,7 @@ Please see the demo website for details.
 
 ``` python
 # change iic/CosyVoice-300M-SFT for sft inference, or iic/CosyVoice-300M-Instruct for instruct inference
-python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M
+python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M --locale zh
 ```
 
 **Advanced Usage**

From c0f0a5816e532d7a750824503bcbe12017c2edae Mon Sep 17 00:00:00 2001
From: KrugerCoder <KrugerCoder@joshmwilliams.com>
Date: Fri, 6 Sep 2024 14:52:40 -0500
Subject: [PATCH 6/7] Update webui.py

Fix error/warning/info section to correspond to locale settings
---
 webui.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/webui.py b/webui.py
index 4ad6d9fd..26ab799c 100644
--- a/webui.py
+++ b/webui.py
@@ -77,7 +77,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
     else:
         prompt_wav = None
     # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
-    if mode_checkbox_group in ['自然语言控制']:
+    if mode_checkbox_group in [i18n.t('inference_mode_list.natural_language_control')]:
         if cosyvoice.frontend.instruct is False:
             gr.Warning(i18n.t('warnings.nlp_model_warn').format(args.model_dir))
             return (target_sr, default_data)
@@ -87,7 +87,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
         if prompt_wav is not None or prompt_text != '':
             gr.Info(i18n.t('info.prompt_wav'))
     # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
-    if mode_checkbox_group in ['跨语种复刻']:
+    if mode_checkbox_group in [i18n.t('inference_mode_list.crosslingual')]:
         if cosyvoice.frontend.instruct is True:
             gr.Warning(i18n.t('warnings.no_crosslingual_support').format(args.model_dir))
             return (target_sr, default_data)
@@ -98,7 +98,7 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
             return (target_sr, default_data)
         gr.Info(i18n.t('info.crosslingual_language_reminder'))
     # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
-    if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
+    if mode_checkbox_group in [i18n.t('inference_mode_list.3s_fast_replication'), i18n.t('inference_mode_list.crosslingual')]:
         if prompt_wav is None:
             gr.Warning(i18n.t('warnings.prompt_audio_empty'))
             return (target_sr, default_data)
@@ -106,29 +106,29 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
             gr.Warning(i18n.t('warnings.sample_rate_error').format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
             return (target_sr, default_data)
     # sft mode only use sft_dropdown
-    if mode_checkbox_group in ['预训练音色']:
+    if mode_checkbox_group in [i18n.t('inference_mode_list.pretrained_voice')]:
         if instruct_text != '' or prompt_wav is not None or prompt_text != '':
             gr.Info(i18n.t('info.pretrained_voice_warning'))
     # zero_shot mode only use prompt_wav prompt text
-    if mode_checkbox_group in ['3s极速复刻']:
+    if mode_checkbox_group in [i18n.t('inference_mode_list.3s_fast_replication')]:
         if prompt_text == '':
             gr.Warning(i18n.t('warnings.prompt_text_empty'))
             return (target_sr, default_data)
         if instruct_text != '':
             gr.Info(i18n.t('info.instruct_text_empty'))
 
-    if mode_checkbox_group == '预训练音色':
+    if mode_checkbox_group == i18n.t('inference_mode_list.pretrained_voice'):
         logging.info('get sft inference request')
         set_all_random_seed(seed)
         for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream):
             yield (target_sr,  i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == '3s极速复刻':
+    elif mode_checkbox_group == i18n.t('inference_mode_list.3s_fast_replication'):
         logging.info('get zero_shot inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
         for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream):
             yield (target_sr,  i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == '跨语种复刻':
+    elif mode_checkbox_group == i18n.t('inference_mode_list.crosslingual'):
         logging.info('get cross_lingual inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)

From c3517fde363afa83c53a1afc3e7033f44169c1dc Mon Sep 17 00:00:00 2001
From: KrugerCoder <KrugerCoder@joshmwilliams.com>
Date: Fri, 6 Sep 2024 15:12:51 -0500
Subject: [PATCH 7/7] Fix requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b56429d8..370a2105 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,4 +31,4 @@ wget==3.2
 fastapi==0.111.0
 fastapi-cli==0.0.4
 WeTextProcessing==1.0.3
-i8n==0.3.9
+python-i18n==0.3.9