[update] Add en-au, en-br, en-india, and en-us models. Format code.

LittleMouse · LittleMouse · commit f77578603385 · 2025-05-20T14:19:44.000+08:00
diff --git a/projects/llm_framework/main_melotts/mode_melotts-en-au.json b/projects/llm_framework/main_melotts/mode_melotts-en-au.json
@@ -0,0 +1,31 @@
+{
+    "mode": "melotts-en-au",
+    "type": "tts",
+    "homepage": "https://github.com/ml-inory/melotts.axera/tree/main/model_convert",
+    "compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder_en-au --output_name decoder-en-au.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0",
+    "pulsar_version": "4.0-64a0e58f",
+    "capabilities": [
+        "tts",
+        "English"
+    ],
+    "input_type": [
+        "tts.utf-8"
+    ],
+    "output_type": [
+        "tts.wav",
+        "sys.play.0_1"
+    ],
+    "mode_param": {
+        "encoder": "encoder-en-au.ort",
+        "decoder": "decoder-en-au.axmodel",
+        "gbin": "g-en-au.bin",
+        "tokens": "tokens-en.txt",
+        "lexicon": "lexicon-en.txt",
+        "tagger": "en_tn_tagger.fst",
+        "verbalizer": "en_tn_verbalizer.fst",
+        "spacker_speed": 1.2,
+        "mode_rate": 44100,
+        "audio_rate": 16000,
+        "awake_delay": 1000
+    }
+}
diff --git a/projects/llm_framework/main_melotts/mode_melotts-en-br.json b/projects/llm_framework/main_melotts/mode_melotts-en-br.json
@@ -0,0 +1,31 @@
+{
+    "mode": "melotts-en-br",
+    "type": "tts",
+    "homepage": "https://github.com/ml-inory/melotts.axera/tree/main/model_convert",
+    "compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder_en-br --output_name decoder-en-br.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0",
+    "pulsar_version": "4.0-64a0e58f",
+    "capabilities": [
+        "tts",
+        "English"
+    ],
+    "input_type": [
+        "tts.utf-8"
+    ],
+    "output_type": [
+        "tts.wav",
+        "sys.play.0_1"
+    ],
+    "mode_param": {
+        "encoder": "encoder-en-br.ort",
+        "decoder": "decoder-en-br.axmodel",
+        "gbin": "g-en-br.bin",
+        "tokens": "tokens-en.txt",
+        "lexicon": "lexicon-en.txt",
+        "tagger": "en_tn_tagger.fst",
+        "verbalizer": "en_tn_verbalizer.fst",
+        "spacker_speed": 1.2,
+        "mode_rate": 44100,
+        "audio_rate": 16000,
+        "awake_delay": 1000
+    }
+}
diff --git a/projects/llm_framework/main_melotts/mode_melotts-en-india.json b/projects/llm_framework/main_melotts/mode_melotts-en-india.json
@@ -0,0 +1,31 @@
+{
+    "mode": "melotts-en-india",
+    "type": "tts",
+    "homepage": "https://github.com/ml-inory/melotts.axera/tree/main/model_convert",
+    "compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder_en-india --output_name decoder-en-india.axmodel --target_hardware AX620E",
+    "pulsar_version": "4.0-64a0e58f",
+    "capabilities": [
+        "tts",
+        "English"
+    ],
+    "input_type": [
+        "tts.utf-8"
+    ],
+    "output_type": [
+        "tts.wav",
+        "sys.play.0_1"
+    ],
+    "mode_param": {
+        "encoder": "encoder-en-india.ort",
+        "decoder": "decoder-en-india.axmodel",
+        "gbin": "g-en-india.bin",
+        "tokens": "tokens-en.txt",
+        "lexicon": "lexicon-en.txt",
+        "tagger": "en_tn_tagger.fst",
+        "verbalizer": "en_tn_verbalizer.fst",
+        "spacker_speed": 1.2,
+        "mode_rate": 44100,
+        "audio_rate": 16000,
+        "awake_delay": 1000
+    }
+}
diff --git a/projects/llm_framework/main_melotts/mode_melotts-en-us.json b/projects/llm_framework/main_melotts/mode_melotts-en-us.json
@@ -2,8 +2,8 @@
     "mode": "melotts-en-us",
     "type": "tts",
     "homepage": "https://huggingface.co/myshell-ai/MeloTTS-English",
-    "compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder-en --output_name decoder-en.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0",
-    "pulsar_version": "3.4-3dfd5692",
+    "compile_flage": "pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder_en-us --output_name decoder-en-us.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0",
+    "pulsar_version": "4.0-64a0e58f",
     "capabilities": [
         "tts",
         "English"
@@ -16,14 +16,14 @@
         "sys.play.0_1"
     ],
     "mode_param": {
-        "encoder": "encoder-en.ort",
-        "decoder": "decoder-en.axmodel",
-        "gbin": "g-en.bin",
-        "tokens": "tokens.txt",
-        "lexicon": "lexicon.txt",
+        "encoder": "encoder-en-us.ort",
+        "decoder": "decoder-en-us.axmodel",
+        "gbin": "g-en-us.bin",
+        "tokens": "tokens-en.txt",
+        "lexicon": "lexicon-en.txt",
         "tagger": "en_tn_tagger.fst",
         "verbalizer": "en_tn_verbalizer.fst",
-        "spacker_speed": 1.0,
+        "spacker_speed": 1.2,
         "mode_rate": 44100,
         "audio_rate": 16000,
         "awake_delay": 1000
diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp
@@ -190,19 +190,19 @@ class llm_task {
             g_matrix.resize(256, 0);
             FILE *fp = fopen(mode_config_.gbin.c_str(), "rb");
             if (!fp) {
-                printf("Open %s failed!\n", mode_config_.gbin.c_str());
+                SLOGE("Open %s failed!", mode_config_.gbin.c_str());
                 return -3;
             }
             fread(g_matrix.data(), sizeof(float), g_matrix.size(), fp);
             fclose(fp);
             encoder_ = std::make_unique<OnnxWrapper>();
             decoder_ = std::make_unique<EngineWrapper>();
             if (0 != encoder_->Init(mode_config_.encoder)) {
-                printf("encoder init failed!\n");
+                SLOGE("encoder init failed!");
                 return -4;
             }
             if (0 != decoder_->Init(mode_config_.decoder.c_str())) {
-                printf("Init decoder model failed!\n");
+                SLOGE("Init decoder model failed!");
                 return -5;
             }
         } catch (...) {
@@ -398,7 +398,6 @@ class llm_task {
                         }
                     }
 
-
                     int aligned_start = audio_start + best_offset;
 
                     std::vector<float> crossfade_region(sola_buffer_frame);
@@ -457,15 +456,13 @@ class llm_task {
                 pcmlist.resize(audio_len);
             }
 
-
             double src_ratio =
                 static_cast<double>(mode_config_.audio_rate) / static_cast<double>(mode_config_.mode_rate);
             std::vector<float> tmp_pcm((pcmlist.size() * src_ratio + 1));
             int len;
 
             resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio);
 
-
             wav_pcm_data.reserve(len);
             std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data),
                            [](const auto val) { return static_cast<int16_t>(val * INT16_MAX); });
diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp
@@ -10,15 +10,6 @@
 #include "../../../../../SDK/components/utilities/include/sample_log.h"
 #include "processor/wetext_processor.h"
 
-// Debug logging switch - set to true to enable debug logs
-static bool DEBUG_LOGGING = false;
-// Macro for debug logging
-#define DEBUG_LOG(fmt, ...)            \
-    do {                               \
-        if (DEBUG_LOGGING) {           \
-            SLOGI(fmt, ##__VA_ARGS__); \
-        }                              \
-    } while (0)
 std::vector<std::string> split(const std::string& s, char delim)
 {
     std::vector<std::string> result;
@@ -31,6 +22,7 @@ std::vector<std::string> split(const std::string& s, char delim)
     }
     return result;
 }
+
 class Lexicon {
 private:
     std::unordered_map<std::string, std::pair<std::vector<int>, std::vector<int>>> lexicon;
@@ -41,18 +33,12 @@ class Lexicon {
     wetext::Processor* m_processor;
 
 public:
-    // Setter for debug logging
-    static void setDebugLogging(bool enable)
-    {
-        DEBUG_LOGGING = enable;
-    }
     Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename, const std::string& tagger_filename,
             const std::string& verbalizer_filename)
         : max_phrase_length(0)
     {
-        DEBUG_LOG("Dictionary loading: %s Pronunciation table loading: %s tagger_filename: %s verbalizer_filename: %s",
-                  tokens_filename.c_str(), lexicon_filename.c_str(), tagger_filename.c_str(),
-                  verbalizer_filename.c_str());
+        SLOGD("Dictionary loading: %s Pronunciation table loading: %s tagger_filename: %s verbalizer_filename: %s",
+              tokens_filename.c_str(), lexicon_filename.c_str(), tagger_filename.c_str(), verbalizer_filename.c_str());
 
         m_processor = new wetext::Processor(tagger_filename, verbalizer_filename);
 
@@ -106,8 +92,8 @@ class Lexicon {
         lexicon["。"] = lexicon["."];
         lexicon["！"] = lexicon["!"];
         lexicon["？"] = lexicon["?"];
-        DEBUG_LOG("Dictionary loading complete, containing %zu entries, longest phrase length: %zu", lexicon.size(),
-                  max_phrase_length);
+        SLOGD("Dictionary loading complete, containing %zu entries, longest phrase length: %zu", lexicon.size(),
+              max_phrase_length);
     }
 
     std::vector<std::string> splitEachChar(const std::string& text)
@@ -136,15 +122,17 @@ class Lexicon {
     {
         return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z'));
     }
+
     bool is_english_token_char(const std::string& s)
     {
         if (s.size() != 1) return false;
         char c = s[0];
         return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_';
     }
+
     void process_unknown_english(const std::string& word, std::vector<int>& phones, std::vector<int>& tones)
     {
-        DEBUG_LOG("Processing unknown term: %s", word.c_str());
+        SLOGD("Processing unknown term: %s", word.c_str());
         std::string orig_word = word;
         std::vector<std::string> parts;
         std::vector<std::string> phonetic_parts;
@@ -163,7 +151,7 @@ class Lexicon {
                     tones.insert(tones.end(), sub_tones.begin(), sub_tones.end());
                     parts.push_back(sub_word);
                     phonetic_parts.push_back(phonesToString(sub_phones));
-                    DEBUG_LOG("  Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
+                    SLOGD("  Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
                     start += len;
                     matched = true;
                     break;
@@ -180,13 +168,13 @@ class Lexicon {
                     tones.insert(tones.end(), char_tones.begin(), char_tones.end());
                     parts.push_back(single_char);
                     phonetic_parts.push_back(phonesToString(char_phones));
-                    DEBUG_LOG("  Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
+                    SLOGD("  Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
                 } else {
                     phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
                     tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
                     parts.push_back(single_char);
                     phonetic_parts.push_back("_unknown_");
-                    DEBUG_LOG("  Unknown: '%s'", single_char.c_str());
+                    SLOGD("  Unknown: '%s'", single_char.c_str());
                 }
                 start++;
             }
@@ -200,26 +188,25 @@ class Lexicon {
             parts_str += parts[i];
             phonetic_str += phonetic_parts[i];
         }
-        DEBUG_LOG("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(),
-                  phonetic_str.c_str());
+        SLOGD("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(), phonetic_str.c_str());
     }
 
     void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
     {
-        DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str());
+        SLOGD("\nStarting text processing: \"%s\"", text.c_str());
 
         std::string taggedText = m_processor->Tag(text);
-        DEBUG_LOG("\taggedText processing: \"%s\"", taggedText.c_str());
+        SLOGD("taggedText processing: \"%s\"", taggedText.c_str());  // literal text, not a "\t" escape
         std::string normalizedText = m_processor->Verbalize(taggedText);
-        DEBUG_LOG("\normalizedText processing: \"%s\"", normalizedText.c_str());
+        SLOGD("normalizedText processing: \"%s\"", normalizedText.c_str());  // literal text, not a "\n" escape
 
-        DEBUG_LOG("=======Matching Results=======");
-        DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones");
-        DEBUG_LOG("-----------------------------");
+        SLOGD("=======Matching Results=======");
+        SLOGD("Unit\t|\tPhonemes\t|\tTones");
+        SLOGD("-----------------------------");
         phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
         tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
-        DEBUG_LOG("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
-                  tonesToString(unknown_token.second).c_str());
+        SLOGD("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
+              tonesToString(unknown_token.second).c_str());
         auto chars = splitEachChar(normalizedText);
         int i      = 0;
         while (i < chars.size()) {
@@ -236,8 +223,8 @@ class Lexicon {
                     auto& [eng_phones, eng_tones] = lexicon[eng_word];
                     phones.insert(phones.end(), eng_phones.begin(), eng_phones.end());
                     tones.insert(tones.end(), eng_tones.begin(), eng_tones.end());
-                    DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
-                              tonesToString(eng_tones).c_str());
+                    SLOGD("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
+                          tonesToString(eng_tones).c_str());
                 } else {
                     process_unknown_english(orig_word, phones, tones);
                 }
@@ -256,8 +243,8 @@ class Lexicon {
                     auto& [phrase_phones, phrase_tones] = lexicon[phrase];
                     phones.insert(phones.end(), phrase_phones.begin(), phrase_phones.end());
                     tones.insert(tones.end(), phrase_tones.begin(), phrase_tones.end());
-                    DEBUG_LOG("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
-                              tonesToString(phrase_tones).c_str());
+                    SLOGD("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
+                          tonesToString(phrase_tones).c_str());
                     i += len;
                     matched = true;
                     break;
@@ -279,25 +266,25 @@ class Lexicon {
                     auto& [char_phones, char_tones] = lexicon[s];
                     phones.insert(phones.end(), char_phones.begin(), char_phones.end());
                     tones.insert(tones.end(), char_tones.begin(), char_tones.end());
-                    DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
-                              tonesToString(char_tones).c_str());
+                    SLOGD("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
+                          tonesToString(char_tones).c_str());
                 } else {
                     phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
                     tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
-                    DEBUG_LOG("%s\t|\t%s (Not matched)\t|\t%s", orig_char.c_str(),
-                              phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str());
+                    SLOGD("%s\t|\t%s (Not matched)\t|\t%s", orig_char.c_str(),
+                          phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str());
                 }
             }
         }
         phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
         tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
-        DEBUG_LOG("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
-                  tonesToString(unknown_token.second).c_str());
-        DEBUG_LOG("\nProcessing Summary:");
-        DEBUG_LOG("Original text: %s", text.c_str());
-        DEBUG_LOG("Phonemes: %s", phonesToString(phones).c_str());
-        DEBUG_LOG("Tones: %s", tonesToString(tones).c_str());
-        DEBUG_LOG("====================");
+        SLOGD("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
+              tonesToString(unknown_token.second).c_str());
+        SLOGD("\nProcessing Summary:");
+        SLOGD("Original text: %s", text.c_str());
+        SLOGD("Phonemes: %s", phonesToString(phones).c_str());
+        SLOGD("Tones: %s", tonesToString(tones).c_str());
+        SLOGD("====================");
     }
 
 private:
@@ -316,6 +303,7 @@ class Lexicon {
         phones.insert(phones.end(), phones_and_tones.first.begin(), phones_and_tones.first.end());
         tones.insert(tones.end(), phones_and_tones.second.begin(), phones_and_tones.second.end());
     }
+
     std::string phonesToString(const std::vector<int>& phones)
     {
         std::string result;
@@ -329,6 +317,7 @@ class Lexicon {
         }
         return result;
     }
+
     std::string tonesToString(const std::vector<int>& tones)
     {
         std::string result;
diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py