Skip to content

Commit f775786

Browse files
author
LittleMouse
committed
[update] add en-au, en-br, en-india, en-us model. Format code.
1 parent 00c0533 commit f775786

File tree

7 files changed

+145
-63
lines changed

7 files changed

+145
-63
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"mode": "melotts-en-au",
3+
"type": "tts",
4+
"homepage": "https://github.com/ml-inory/melotts.axera/tree/main/model_convert",
5+
"compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder_en-au --output_name decoder-en-au.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0",
6+
"pulsar_version": "4.0-64a0e58f",
7+
"capabilities": [
8+
"tts",
9+
"English"
10+
],
11+
"input_type": [
12+
"tts.utf-8"
13+
],
14+
"output_type": [
15+
"tts.wav",
16+
"sys.play.0_1"
17+
],
18+
"mode_param": {
19+
"encoder": "encoder-en-au.ort",
20+
"decoder": "decoder-en-au.axmodel",
21+
"gbin": "g-en-au.bin",
22+
"tokens": "tokens-en.txt",
23+
"lexicon": "lexicon-en.txt",
24+
"tagger": "en_tn_tagger.fst",
25+
"verbalizer": "en_tn_verbalizer.fst",
26+
"spacker_speed": 1.2,
27+
"mode_rate": 44100,
28+
"audio_rate": 16000,
29+
"awake_delay": 1000
30+
}
31+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"mode": "melotts-en-br",
3+
"type": "tts",
4+
"homepage": "https://github.com/ml-inory/melotts.axera/tree/main/model_convert",
5+
"compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder_en-br --output_name decoder-en-br.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0",
6+
"pulsar_version": "4.0-64a0e58f",
7+
"capabilities": [
8+
"tts",
9+
"English"
10+
],
11+
"input_type": [
12+
"tts.utf-8"
13+
],
14+
"output_type": [
15+
"tts.wav",
16+
"sys.play.0_1"
17+
],
18+
"mode_param": {
19+
"encoder": "encoder-en-br.ort",
20+
"decoder": "decoder-en-br.axmodel",
21+
"gbin": "g-en-br.bin",
22+
"tokens": "tokens-en.txt",
23+
"lexicon": "lexicon-en.txt",
24+
"tagger": "en_tn_tagger.fst",
25+
"verbalizer": "en_tn_verbalizer.fst",
26+
"spacker_speed": 1.2,
27+
"mode_rate": 44100,
28+
"audio_rate": 16000,
29+
"awake_delay": 1000
30+
}
31+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"mode": "melotts-en-india",
3+
"type": "tts",
4+
"homepage": "https://github.com/ml-inory/melotts.axera/tree/main/model_convert",
5+
"compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder_en-india --output_name decoder-en-india.axmodel --target_hardware AX620E",
6+
"pulsar_version": "4.0-64a0e58f",
7+
"capabilities": [
8+
"tts",
9+
"English"
10+
],
11+
"input_type": [
12+
"tts.utf-8"
13+
],
14+
"output_type": [
15+
"tts.wav",
16+
"sys.play.0_1"
17+
],
18+
"mode_param": {
19+
"encoder": "encoder-en-india.ort",
20+
"decoder": "decoder-en-india.axmodel",
21+
"gbin": "g-en-india.bin",
22+
"tokens": "tokens-en.txt",
23+
"lexicon": "lexicon-en.txt",
24+
"tagger": "en_tn_tagger.fst",
25+
"verbalizer": "en_tn_verbalizer.fst",
26+
"spacker_speed": 1.2,
27+
"mode_rate": 44100,
28+
"audio_rate": 16000,
29+
"awake_delay": 1000
30+
}
31+
}

projects/llm_framework/main_melotts/mode_melotts-en-us.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
"mode": "melotts-en-us",
33
"type": "tts",
44
"homepage": "https://huggingface.co/myshell-ai/MeloTTS-English",
5-
"compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder-en --output_name decoder-en.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0",
6-
"pulsar_version": "3.4-3dfd5692",
5+
"compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder_en-us --output_name decoder-en-us.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0",
6+
"pulsar_version": "4.0-64a0e58f",
77
"capabilities": [
88
"tts",
99
"English"
@@ -16,14 +16,14 @@
1616
"sys.play.0_1"
1717
],
1818
"mode_param": {
19-
"encoder": "encoder-en.ort",
20-
"decoder": "decoder-en.axmodel",
21-
"gbin": "g-en.bin",
22-
"tokens": "tokens.txt",
23-
"lexicon": "lexicon.txt",
19+
"encoder": "encoder-en-us.ort",
20+
"decoder": "decoder-en-us.axmodel",
21+
"gbin": "g-en-us.bin",
22+
"tokens": "tokens-en.txt",
23+
"lexicon": "lexicon-en.txt",
2424
"tagger": "en_tn_tagger.fst",
2525
"verbalizer": "en_tn_verbalizer.fst",
26-
"spacker_speed": 1.0,
26+
"spacker_speed": 1.2,
2727
"mode_rate": 44100,
2828
"audio_rate": 16000,
2929
"awake_delay": 1000

projects/llm_framework/main_melotts/src/main.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -190,19 +190,19 @@ class llm_task {
190190
g_matrix.resize(256, 0);
191191
FILE *fp = fopen(mode_config_.gbin.c_str(), "rb");
192192
if (!fp) {
193-
printf("Open %s failed!\n", mode_config_.gbin.c_str());
193+
SLOGE("Open %s failed!", mode_config_.gbin.c_str());
194194
return -3;
195195
}
196196
fread(g_matrix.data(), sizeof(float), g_matrix.size(), fp);
197197
fclose(fp);
198198
encoder_ = std::make_unique<OnnxWrapper>();
199199
decoder_ = std::make_unique<EngineWrapper>();
200200
if (0 != encoder_->Init(mode_config_.encoder)) {
201-
printf("encoder init failed!\n");
201+
SLOGE("encoder init failed!");
202202
return -4;
203203
}
204204
if (0 != decoder_->Init(mode_config_.decoder.c_str())) {
205-
printf("Init decoder model failed!\n");
205+
SLOGE("Init decoder model failed!");
206206
return -5;
207207
}
208208
} catch (...) {
@@ -398,7 +398,6 @@ class llm_task {
398398
}
399399
}
400400

401-
402401
int aligned_start = audio_start + best_offset;
403402

404403
std::vector<float> crossfade_region(sola_buffer_frame);
@@ -457,15 +456,13 @@ class llm_task {
457456
pcmlist.resize(audio_len);
458457
}
459458

460-
461459
double src_ratio =
462460
static_cast<double>(mode_config_.audio_rate) / static_cast<double>(mode_config_.mode_rate);
463461
std::vector<float> tmp_pcm((pcmlist.size() * src_ratio + 1));
464462
int len;
465463

466464
resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio);
467465

468-
469466
wav_pcm_data.reserve(len);
470467
std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data),
471468
[](const auto val) { return static_cast<int16_t>(val * INT16_MAX); });

projects/llm_framework/main_melotts/src/runner/Lexicon.hpp

Lines changed: 37 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,6 @@
1010
#include "../../../../../SDK/components/utilities/include/sample_log.h"
1111
#include "processor/wetext_processor.h"
1212

13-
// Debug logging switch - set to true to enable debug logs
14-
static bool DEBUG_LOGGING = false;
15-
// Macro for debug logging
16-
#define DEBUG_LOG(fmt, ...) \
17-
do { \
18-
if (DEBUG_LOGGING) { \
19-
SLOGI(fmt, ##__VA_ARGS__); \
20-
} \
21-
} while (0)
2213
std::vector<std::string> split(const std::string& s, char delim)
2314
{
2415
std::vector<std::string> result;
@@ -31,6 +22,7 @@ std::vector<std::string> split(const std::string& s, char delim)
3122
}
3223
return result;
3324
}
25+
3426
class Lexicon {
3527
private:
3628
std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<int>>> lexicon;
@@ -41,18 +33,12 @@ class Lexicon {
4133
wetext::Processor* m_processor;
4234

4335
public:
44-
// Setter for debug logging
45-
static void setDebugLogging(bool enable)
46-
{
47-
DEBUG_LOGGING = enable;
48-
}
4936
Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename, const std::string& tagger_filename,
5037
const std::string& verbalizer_filename)
5138
: max_phrase_length(0)
5239
{
53-
DEBUG_LOG("Dictionary loading: %s Pronunciation table loading: %s tagger_filename: %s verbalizer_filename: %s",
54-
tokens_filename.c_str(), lexicon_filename.c_str(), tagger_filename.c_str(),
55-
verbalizer_filename.c_str());
40+
SLOGD("Dictionary loading: %s Pronunciation table loading: %s tagger_filename: %s verbalizer_filename: %s",
41+
tokens_filename.c_str(), lexicon_filename.c_str(), tagger_filename.c_str(), verbalizer_filename.c_str());
5642

5743
m_processor = new wetext::Processor(tagger_filename, verbalizer_filename);
5844

@@ -106,8 +92,8 @@ class Lexicon {
10692
lexicon[""] = lexicon["."];
10793
lexicon[""] = lexicon["!"];
10894
lexicon[""] = lexicon["?"];
109-
DEBUG_LOG("Dictionary loading complete, containing %zu entries, longest phrase length: %zu", lexicon.size(),
110-
max_phrase_length);
95+
SLOGD("Dictionary loading complete, containing %zu entries, longest phrase length: %zu", lexicon.size(),
96+
max_phrase_length);
11197
}
11298

11399
std::vector<std::string> splitEachChar(const std::string& text)
@@ -136,15 +122,17 @@ class Lexicon {
136122
{
137123
return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z'));
138124
}
125+
139126
bool is_english_token_char(const std::string& s)
140127
{
141128
if (s.size() != 1) return false;
142129
char c = s[0];
143130
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_';
144131
}
132+
145133
void process_unknown_english(const std::string& word, std::vector<int>& phones, std::vector<int>& tones)
146134
{
147-
DEBUG_LOG("Processing unknown term: %s", word.c_str());
135+
SLOGD("Processing unknown term: %s", word.c_str());
148136
std::string orig_word = word;
149137
std::vector<std::string> parts;
150138
std::vector<std::string> phonetic_parts;
@@ -163,7 +151,7 @@ class Lexicon {
163151
tones.insert(tones.end(), sub_tones.begin(), sub_tones.end());
164152
parts.push_back(sub_word);
165153
phonetic_parts.push_back(phonesToString(sub_phones));
166-
DEBUG_LOG(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
154+
SLOGD(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
167155
start += len;
168156
matched = true;
169157
break;
@@ -180,13 +168,13 @@ class Lexicon {
180168
tones.insert(tones.end(), char_tones.begin(), char_tones.end());
181169
parts.push_back(single_char);
182170
phonetic_parts.push_back(phonesToString(char_phones));
183-
DEBUG_LOG(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
171+
SLOGD(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
184172
} else {
185173
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
186174
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
187175
parts.push_back(single_char);
188176
phonetic_parts.push_back("_unknown_");
189-
DEBUG_LOG(" Unknown: '%s'", single_char.c_str());
177+
SLOGD(" Unknown: '%s'", single_char.c_str());
190178
}
191179
start++;
192180
}
@@ -200,26 +188,25 @@ class Lexicon {
200188
parts_str += parts[i];
201189
phonetic_str += phonetic_parts[i];
202190
}
203-
DEBUG_LOG("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(),
204-
phonetic_str.c_str());
191+
SLOGD("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(), phonetic_str.c_str());
205192
}
206193

207194
void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
208195
{
209-
DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str());
196+
SLOGD("\nStarting text processing: \"%s\"", text.c_str());
210197

211198
std::string taggedText = m_processor->Tag(text);
212-
DEBUG_LOG("\taggedText processing: \"%s\"", taggedText.c_str());
199+
// Label spelled out in full: the previous literal began with "\t", which the compiler reads as a TAB escape and so logged "<TAB>aggedText".
SLOGD("taggedText processing: \"%s\"", taggedText.c_str());
213200
std::string normalizedText = m_processor->Verbalize(taggedText);
214-
DEBUG_LOG("\normalizedText processing: \"%s\"", normalizedText.c_str());
201+
// Label spelled out in full: the previous literal began with "\n", which the compiler reads as a newline escape and so logged "ormalizedText" on a new line.
SLOGD("normalizedText processing: \"%s\"", normalizedText.c_str());
215202

216-
DEBUG_LOG("=======Matching Results=======");
217-
DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones");
218-
DEBUG_LOG("-----------------------------");
203+
SLOGD("=======Matching Results=======");
204+
SLOGD("Unit\t|\tPhonemes\t|\tTones");
205+
SLOGD("-----------------------------");
219206
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
220207
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
221-
DEBUG_LOG("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
222-
tonesToString(unknown_token.second).c_str());
208+
SLOGD("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
209+
tonesToString(unknown_token.second).c_str());
223210
auto chars = splitEachChar(normalizedText);
224211
int i = 0;
225212
while (i < chars.size()) {
@@ -236,8 +223,8 @@ class Lexicon {
236223
auto& [eng_phones, eng_tones] = lexicon[eng_word];
237224
phones.insert(phones.end(), eng_phones.begin(), eng_phones.end());
238225
tones.insert(tones.end(), eng_tones.begin(), eng_tones.end());
239-
DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
240-
tonesToString(eng_tones).c_str());
226+
SLOGD("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
227+
tonesToString(eng_tones).c_str());
241228
} else {
242229
process_unknown_english(orig_word, phones, tones);
243230
}
@@ -256,8 +243,8 @@ class Lexicon {
256243
auto& [phrase_phones, phrase_tones] = lexicon[phrase];
257244
phones.insert(phones.end(), phrase_phones.begin(), phrase_phones.end());
258245
tones.insert(tones.end(), phrase_tones.begin(), phrase_tones.end());
259-
DEBUG_LOG("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
260-
tonesToString(phrase_tones).c_str());
246+
SLOGD("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
247+
tonesToString(phrase_tones).c_str());
261248
i += len;
262249
matched = true;
263250
break;
@@ -279,25 +266,25 @@ class Lexicon {
279266
auto& [char_phones, char_tones] = lexicon[s];
280267
phones.insert(phones.end(), char_phones.begin(), char_phones.end());
281268
tones.insert(tones.end(), char_tones.begin(), char_tones.end());
282-
DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
283-
tonesToString(char_tones).c_str());
269+
SLOGD("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
270+
tonesToString(char_tones).c_str());
284271
} else {
285272
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
286273
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
287-
DEBUG_LOG("%s\t|\t%s (Not matched)\t|\t%s", orig_char.c_str(),
288-
phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str());
274+
SLOGD("%s\t|\t%s (Not matched)\t|\t%s", orig_char.c_str(),
275+
phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str());
289276
}
290277
}
291278
}
292279
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
293280
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
294-
DEBUG_LOG("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
295-
tonesToString(unknown_token.second).c_str());
296-
DEBUG_LOG("\nProcessing Summary:");
297-
DEBUG_LOG("Original text: %s", text.c_str());
298-
DEBUG_LOG("Phonemes: %s", phonesToString(phones).c_str());
299-
DEBUG_LOG("Tones: %s", tonesToString(tones).c_str());
300-
DEBUG_LOG("====================");
281+
SLOGD("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
282+
tonesToString(unknown_token.second).c_str());
283+
SLOGD("\nProcessing Summary:");
284+
SLOGD("Original text: %s", text.c_str());
285+
SLOGD("Phonemes: %s", phonesToString(phones).c_str());
286+
SLOGD("Tones: %s", tonesToString(tones).c_str());
287+
SLOGD("====================");
301288
}
302289

303290
private:
@@ -316,6 +303,7 @@ class Lexicon {
316303
phones.insert(phones.end(), phones_and_tones.first.begin(), phones_and_tones.first.end());
317304
tones.insert(tones.end(), phones_and_tones.second.begin(), phones_and_tones.second.end());
318305
}
306+
319307
std::string phonesToString(const std::vector<int>& phones)
320308
{
321309
std::string result;
@@ -329,6 +317,7 @@ class Lexicon {
329317
}
330318
return result;
331319
}
320+
332321
std::string tonesToString(const std::vector<int>& tones)
333322
{
334323
std::string result;

0 commit comments

Comments
 (0)