From 52e90744e31ecee086c2dd450798f453882d9823 Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Thu, 3 Apr 2025 16:18:14 +0800 Subject: [PATCH 01/64] [fix] yolo11n-seg mode config error --- projects/llm_framework/main_yolo/mode_yolo11n-seg.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/llm_framework/main_yolo/mode_yolo11n-seg.json b/projects/llm_framework/main_yolo/mode_yolo11n-seg.json index a992a51..71c86c4 100644 --- a/projects/llm_framework/main_yolo/mode_yolo11n-seg.json +++ b/projects/llm_framework/main_yolo/mode_yolo11n-seg.json @@ -1,5 +1,5 @@ { - "mode":"yolo11s-seg", + "mode":"yolo11n-seg", "type":"cv", "homepage":"https://github.com/ultralytics/ultralytics", "capabilities":[ From a231d3967e554e1a91f7be432a564f3e4c7ab863 Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Thu, 3 Apr 2025 16:47:09 +0800 Subject: [PATCH 02/64] [update] llm-model-yolo11n-seg version --- projects/llm_framework/tools/llm_pack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 257db7f..09d0967 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -391,7 +391,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): 'llm-model-yolo11n':[create_data_deb,'llm-model-yolo11n', data_version, src_folder, revision], 'llm-model-yolo11n-pose':[create_data_deb,'llm-model-yolo11n-pose', '0.3', src_folder, revision], 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], - 'llm-model-yolo11n-seg':[create_data_deb,'llm-model-yolo11n-seg', data_version, src_folder, revision], + 'llm-model-yolo11n-seg':[create_data_deb,'llm-model-yolo11n-seg', '0.3', src_folder, revision], 'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.3', src_folder, revision], 
'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.3', src_folder, revision], 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.3', src_folder, revision], From 6d62b0ddd9d6d63fd9a682b6a8236b70d37204d4 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Mon, 7 Apr 2025 18:59:04 +0800 Subject: [PATCH 03/64] [update] Update llm_asr & llm_kws en docs --- doc/projects_llm_framework_doc/llm_asr_en.md | 8 ++++---- doc/projects_llm_framework_doc/llm_kws_en.md | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/projects_llm_framework_doc/llm_asr_en.md b/doc/projects_llm_framework_doc/llm_asr_en.md index adf681a..8e96f25 100644 --- a/doc/projects_llm_framework_doc/llm_asr_en.md +++ b/doc/projects_llm_framework_doc/llm_asr_en.md @@ -16,7 +16,7 @@ Send JSON: "action": "setup", "object": "asr.setup", "data": { - "model": "sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23", + "model": "sherpa-ncnn-streaming-zipformer-20M-2023-02-17", "response_format": "asr.utf-8.stream", "input": "sys.pcm", "enoutput": true, @@ -34,7 +34,7 @@ Send JSON: - work_id: For configuration units, it is `asr`. - action: The method to be called is `setup`. - object: The type of data being transmitted is `asr.setup`. -- model: The model used is the Chinese model `sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23`. +- model: The model used is the Chinese model `sherpa-ncnn-streaming-zipformer-20M-2023-02-17`. - response_format: The result format is `asr.utf-8.stream`, a UTF-8 stream output. - input: The input is `sys.pcm`, representing system audio. - enoutput: Whether to enable user result output. 
@@ -109,7 +109,7 @@ Example: "action": "setup", "object": "asr.setup", "data": { - "model": "sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23", + "model": "sherpa-ncnn-streaming-zipformer-20M-2023-02-17", "response_format": "asr.utf-8.stream", "input": [ "sys.pcm", @@ -310,7 +310,7 @@ Response JSON: "inputs_": [ "sys.pcm" ], - "model": "sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23", + "model": "sherpa-ncnn-streaming-zipformer-20M-2023-02-17", "response_format": "asr.utf-8-stream" }, "error": { diff --git a/doc/projects_llm_framework_doc/llm_kws_en.md b/doc/projects_llm_framework_doc/llm_kws_en.md index 7504e26..89a9cad 100644 --- a/doc/projects_llm_framework_doc/llm_kws_en.md +++ b/doc/projects_llm_framework_doc/llm_kws_en.md @@ -16,11 +16,11 @@ Send JSON: "action": "setup", "object": "kws.setup", "data": { - "model": "sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01", + "model": "sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01", "response_format": "kws.bool", "input": "sys.pcm", "enoutput": true, - "kws": "你好你好" + "kws": "HELLO" } } ``` @@ -29,7 +29,7 @@ Send JSON: - work_id: When configuring the unit, it is `kws`. - action: The method called is `setup`. - object: The type of data being transmitted is `kws.setup`. -- model: The model used is the Chinese model `sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01`. +- model: The model used is the Chinese model `sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01`. - response_format: The result returned is in `kws.bool` format. - input: The input is `sys.pcm`, representing system audio. - enoutput: Whether to enable user result output. 
@@ -204,7 +204,7 @@ Response JSON: "inputs_": [ "sys.pcm" ], - "model": "sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01", + "model": "sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01", "response_format": "kws.bool" }, "error": { From bed467e0e5291d3c073011ab4368cc65489007e4 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Mon, 7 Apr 2025 19:04:28 +0800 Subject: [PATCH 04/64] [update] Delete the old version of the dynamic library --- projects/llm_framework/main/SConstruct | 1 - projects/llm_framework/main_melotts/SConstruct | 2 +- projects/llm_framework/main_whisper/SConstruct | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/projects/llm_framework/main/SConstruct b/projects/llm_framework/main/SConstruct index 3284163..79a6012 100644 --- a/projects/llm_framework/main/SConstruct +++ b/projects/llm_framework/main/SConstruct @@ -24,7 +24,6 @@ STATIC_FILES += [AFile('../static_lib/sherpa/ncnn/libsherpa-ncnn-core.so'), AFile('../static_lib/sherpa/ncnn/libncnn.so'), AFile('../static_lib/libtts.so'), AFile('../static_lib/sherpa/ncnn/libkaldi-native-fbank-core.so'), - AFile('../static_lib/libonnxruntime.so.1.14.0') ] env['COMPONENTS'].append({'target':'static_file', diff --git a/projects/llm_framework/main_melotts/SConstruct b/projects/llm_framework/main_melotts/SConstruct index e54608b..358ddb2 100644 --- a/projects/llm_framework/main_melotts/SConstruct +++ b/projects/llm_framework/main_melotts/SConstruct @@ -21,7 +21,7 @@ DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] -REQUIREMENTS += ['onnxruntime', 'samplerate'] +REQUIREMENTS += ['samplerate'] INCLUDE += [ADir('../include')] INCLUDE += [ADir('src/runner'), ADir('../include/onnxruntime/core/session')] 
diff --git a/projects/llm_framework/main_whisper/SConstruct b/projects/llm_framework/main_whisper/SConstruct index 4dee5cf..c14cf6b 100644 --- a/projects/llm_framework/main_whisper/SConstruct +++ b/projects/llm_framework/main_whisper/SConstruct @@ -21,7 +21,7 @@ DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] -REQUIREMENTS += ['onnxruntime', 'samplerate'] +# REQUIREMENTS += ['onnxruntime', 'samplerate'] INCLUDE += [ADir('../include')] INCLUDE += [ADir('src/runner'), ADir('../include/onnxruntime/core/session')] From d6d8f3cb3dfaf17daa34cc85c93c93bc79eb93f0 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Mon, 7 Apr 2025 19:06:26 +0800 Subject: [PATCH 05/64] [update] main_kws & main_llm use their own python environment --- projects/llm_framework/main_kws/SConstruct | 21 +++++++++++++++++++ projects/llm_framework/main_kws/src/main.cpp | 4 ++-- projects/llm_framework/main_llm/SConstruct | 21 +++++++++++++++++++ projects/llm_framework/main_llm/src/main.cpp | 1 + .../llm_framework/main_openai_api/SConstruct | 2 +- 5 files changed, 46 insertions(+), 3 deletions(-) diff --git a/projects/llm_framework/main_kws/SConstruct b/projects/llm_framework/main_kws/SConstruct index 5a48407..f82c7a5 100644 --- a/projects/llm_framework/main_kws/SConstruct +++ b/projects/llm_framework/main_kws/SConstruct @@ -16,6 +16,8 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] +python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-kws-python-venv_v1.6.tar.gz", 'm5stack_llm-kws-python-venv_v1.6.tar.gz') + DEFINITIONS += ['-std=c++17'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', 
'-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] @@ -31,9 +33,28 @@ LDFLAGS += ['-l:libcargs.a', '-l:libonnxruntime.a', '-l:libsherpa-onnx-core.a', '-l:libkaldi-native-fbank-core.a', '-l:libkaldi-decoder-core.a', '-l:libssentencepiece_core.a'] +STATIC_FILES += [os.path.join(python_venv, 'sherpa-onnx')] STATIC_FILES += Glob('llm-kws_text2token.py') STATIC_FILES += Glob('mode_*.json') +IGNORE_FILES = [] +IGNORE_FILES += ['sherpa-onnx'] + +import json +if not os.path.exists('../dist'): + os.makedirs('../dist') +ignore = {'ignore':[]} +try: + with open('../dist/fileignore', 'a+') as f: + f.seek(0) + ignore = json.load(f) +except: + pass +ignore['ignore'] += IGNORE_FILES +ignore['ignore'] = list(set(ignore['ignore'])) +with open('../dist/fileignore', 'w') as f: + json.dump(ignore, f, indent=4) + env['COMPONENTS'].append({'target':'llm_kws', 'SRCS':SRCS, 'INCLUDE':INCLUDE, diff --git a/projects/llm_framework/main_kws/src/main.cpp b/projects/llm_framework/main_kws/src/main.cpp index 4c51002..1c6cfea 100644 --- a/projects/llm_framework/main_kws/src/main.cpp +++ b/projects/llm_framework/main_kws/src/main.cpp @@ -176,9 +176,9 @@ class llm_task { temp_awake_key.close(); std::ostringstream awake_key_compile_cmd; if (file_exists("/opt/m5stack/scripts/text2token.py")) - awake_key_compile_cmd << "/usr/bin/python3 /opt/m5stack/scripts/text2token.py "; + awake_key_compile_cmd << "PYTHONPATH=/opt/m5stack/lib/sherpa-onnx/site-packages /usr/bin/python3 /opt/m5stack/scripts/text2token.py "; else if (file_exists("/opt/m5stack/scripts/llm-kws_text2token.py")) - awake_key_compile_cmd << "/usr/bin/python3 /opt/m5stack/scripts/llm-kws_text2token.py "; + awake_key_compile_cmd << "PYTHONPATH=/opt/m5stack/lib/sherpa-onnx/site-packages /usr/bin/python3 /opt/m5stack/scripts/llm-kws_text2token.py "; else { SLOGE("text2token.py or llm-kws_text2token.py not found!"); } diff --git 
a/projects/llm_framework/main_llm/SConstruct b/projects/llm_framework/main_llm/SConstruct index e744507..ad02ce8 100644 --- a/projects/llm_framework/main_llm/SConstruct +++ b/projects/llm_framework/main_llm/SConstruct @@ -17,6 +17,8 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] +python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-llm-python-venv_v1.7.tar.gz", 'm5stack_llm-llm-python-venv_v1.7.tar.gz') + # REQUIREMENTS += ['Backward_cpp'] # DYNAMIC_LIB += [ AFile('../static_lib/libdw.so.1'), # AFile('../static_lib/libelf.so.1'), @@ -41,10 +43,29 @@ static_file = Glob('../static_lib/module-llm/libabsl_*') static_file += [AFile('../static_lib/module-llm/libre2.a'), AFile('../static_lib/module-llm/libsentencepiece.a'), AFile('../static_lib/module-llm/libsentencepiece_train.a')] STATIC_LIB += static_file * 4 +STATIC_FILES += [os.path.join(python_venv, 'llm')] STATIC_FILES += Glob('scripts/tokenizer_*.py') STATIC_FILES += Glob('models/mode_*.json') STATIC_FILES += [AFile('scripts/llm-llm_tokenizer_auto.py')] +IGNORE_FILES = [] +IGNORE_FILES += ['llm'] + +import json +if not os.path.exists('../dist'): + os.makedirs('../dist') +ignore = {'ignore':[]} +try: + with open('../dist/fileignore', 'a+') as f: + f.seek(0) + ignore = json.load(f) +except: + pass +ignore['ignore'] += IGNORE_FILES +ignore['ignore'] = list(set(ignore['ignore'])) +with open('../dist/fileignore', 'w') as f: + json.dump(ignore, f, indent=4) + env['COMPONENTS'].append({'target':'llm_llm', 'SRCS':SRCS, 'INCLUDE':INCLUDE, diff --git a/projects/llm_framework/main_llm/src/main.cpp b/projects/llm_framework/main_llm/src/main.cpp index c667209..8fd97a0 100644 --- a/projects/llm_framework/main_llm/src/main.cpp +++ b/projects/llm_framework/main_llm/src/main.cpp @@ -155,6 +155,7 @@ class llm_task { if (!tokenizer_server_flage_.load()) { tokenizer_pid_ = fork(); if (tokenizer_pid_ == 0) { + setenv("PYTHONPATH", "/opt/m5stack/lib/llm/site-packages", 1); 
execl("/usr/bin/python3", "python3", tokenizer_file.c_str(), "--host", "localhost", "--port", std::to_string(port_).c_str(), "--model_id", (base_model + "tokenizer").c_str(), "--content", ("'" + prompt_ + "'").c_str(), nullptr); diff --git a/projects/llm_framework/main_openai_api/SConstruct b/projects/llm_framework/main_openai_api/SConstruct index ac778df..35cbcee 100644 --- a/projects/llm_framework/main_openai_api/SConstruct +++ b/projects/llm_framework/main_openai_api/SConstruct @@ -19,7 +19,7 @@ STATIC_FILES = [] ModuleLLMOpenAIPluginPath = wget_github_commit('https://github.com/Abandon-ht/ModuleLLM-OpenAI-Plugin.git', '1077efbe201ea3f29517f5ce4a0cfc3b04c25d1d', True) -python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-openai-api-python-venv_v1.5.tar.gz", 'm5stack_llm-llm-openai-api-python-venv_v1.5.tar.gz') +python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-openai-api-python-venv_v1.5.tar.gz", 'm5stack_llm-openai-api-python-venv_v1.5.tar.gz') DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17'] From f5c82e45a7d0edcf1bf20e13929d20da6f415589 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Mon, 7 Apr 2025 19:09:26 +0800 Subject: [PATCH 06/64] [update] main_vlm updates the image encoding model and uses its own python environment --- projects/llm_framework/main_vlm/SConstruct | 21 ++++ projects/llm_framework/main_vlm/src/main.cpp | 32 +++--- .../llm_framework/main_vlm/src/runner/LLM.hpp | 100 ++++++++++-------- 3 files changed, 88 insertions(+), 65 deletions(-) diff --git a/projects/llm_framework/main_vlm/SConstruct b/projects/llm_framework/main_vlm/SConstruct index 3153957..4d9e16e 100644 --- a/projects/llm_framework/main_vlm/SConstruct +++ b/projects/llm_framework/main_vlm/SConstruct @@ -17,7 +17,9 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] +python_venv = 
check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-vlm-python-venv_v1.6.tar.gz", 'm5stack_llm-vlm-python-venv_v1.6.tar.gz') +DEFINITIONS += ['-O2'] DEFINITIONS += ['-std=c++17'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] @@ -49,9 +51,28 @@ static_file += [AFile('../static_lib/libopencv-4.6-aarch64-none/lib/libtegra_hal static_file += [AFile('../static_lib/libopencv-4.6-aarch64-none/lib/libzlib.a')] STATIC_LIB += static_file * 4 +STATIC_FILES += [os.path.join(python_venv, 'vlm')] STATIC_FILES += Glob('scripts/tokenizer_*.py') STATIC_FILES += Glob('models/mode_*.json') +IGNORE_FILES = [] +IGNORE_FILES += ['vlm'] + +import json +if not os.path.exists('../dist'): + os.makedirs('../dist') +ignore = {'ignore':[]} +try: + with open('../dist/fileignore', 'a+') as f: + f.seek(0) + ignore = json.load(f) +except: + pass +ignore['ignore'] += IGNORE_FILES +ignore['ignore'] = list(set(ignore['ignore'])) +with open('../dist/fileignore', 'w') as f: + json.dump(ignore, f, indent=4) + env['COMPONENTS'].append({'target':'llm_vlm', 'SRCS':SRCS, 'INCLUDE':INCLUDE, diff --git a/projects/llm_framework/main_vlm/src/main.cpp b/projects/llm_framework/main_vlm/src/main.cpp index 758dff2..50cbabe 100644 --- a/projects/llm_framework/main_vlm/src/main.cpp +++ b/projects/llm_framework/main_vlm/src/main.cpp @@ -50,8 +50,8 @@ class llm_task { std::string response_format_; std::vector inputs_; std::vector prompt_data_; - std::vector> image_datas_; - std::vector> img_embeds; + std::vector image_data_; + std::vector img_embed; std::string prompt_; task_callback_t out_callback_; bool enoutput_; @@ -125,6 +125,7 @@ class llm_task { CONFIG_AUTO_SET(file_body["mode_param"], b_eos); CONFIG_AUTO_SET(file_body["mode_param"], axmodel_num); 
CONFIG_AUTO_SET(file_body["mode_param"], tokens_embed_num); + CONFIG_AUTO_SET(file_body["mode_param"], img_token_id); CONFIG_AUTO_SET(file_body["mode_param"], tokens_embed_size); CONFIG_AUTO_SET(file_body["mode_param"], b_use_mmap_load_embed); CONFIG_AUTO_SET(file_body["mode_param"], b_dynamic_load_axmodel_layer); @@ -215,32 +216,25 @@ class llm_task { oss_prompt << input; break; } - SLOGI("prompt_complete:%s", oss_prompt.str().c_str()); + // SLOGI("prompt_complete:%s", oss_prompt.str().c_str()); return oss_prompt.str(); } void inference(const std::string &msg) { try { - if (image_datas_.empty()) { + if (image_data_.empty()) { lLaMa_->Encode(prompt_data_, prompt_complete(msg)); std::string out = lLaMa_->Run(prompt_data_); if (out_callback_) out_callback_(out, true); } else { - img_embeds.clear(); - for (auto &img_data : image_datas_) { - cv::Mat src = cv::imdecode(img_data, cv::IMREAD_COLOR); - if (src.empty()) continue; - std::vector embed; - lLaMa_->Encode(src, embed); - img_embeds.push_back(embed); - } - image_datas_.clear(); - if (!img_embeds.empty()) { - lLaMa_->Encode(img_embeds, prompt_data_, prompt_complete(msg)); - std::string out = lLaMa_->Run(prompt_data_); - if (out_callback_) out_callback_(out, true); - } + cv::Mat src = cv::imdecode(image_data_, cv::IMREAD_COLOR); + if (src.empty()) return; + image_data_.clear(); + lLaMa_->Encode(src, img_embed); + lLaMa_->Encode(img_embed, prompt_data_, prompt_complete(msg)); + std::string out = lLaMa_->Run(prompt_data_); + if (out_callback_) out_callback_(out, true); } } catch (...) 
{ SLOGW("lLaMa_->Run have error!"); @@ -404,7 +398,7 @@ class llm_llm : public StackFlow { next_data = &tmp_msg2; } if (object.find("jpeg") != std::string::npos) { - llm_task_obj->image_datas_.emplace_back(next_data->begin(), next_data->end()); + llm_task_obj->image_data_.assign(next_data->begin(), next_data->end()); return; } llm_task_obj->inference((*next_data)); diff --git a/projects/llm_framework/main_vlm/src/runner/LLM.hpp b/projects/llm_framework/main_vlm/src/runner/LLM.hpp index a1d34ed..2cbbf38 100644 --- a/projects/llm_framework/main_vlm/src/runner/LLM.hpp +++ b/projects/llm_framework/main_vlm/src/runner/LLM.hpp @@ -26,8 +26,6 @@ struct LLMAttrType { std::string filename_post_axmodel = "tinyllama-int8/tinyllama_post.axmodel"; - bool b_use_topk = false; - std::string filename_vpm_encoder_axmodedl = "minicpmv/vpm_resampler_version0_fp16.axmodel"; std::string filename_vpm_resampler_axmodedl = "minicpmv/vpm_resampler_version0_fp16.axmodel"; int vpm_width = 280; @@ -39,6 +37,7 @@ struct LLMAttrType { bool b_bos = true, b_eos = false; std::string filename_tokens_embed = "tinyllama.model.embed_tokens.weight.bfloat16.bin"; int tokens_embed_num = 32000; + int img_token_id = 151667; // InternVL2.5 int tokens_embed_size = 2048; int max_token_len = 127; // auto calc @@ -53,6 +52,9 @@ struct LLMAttrType { bool b_use_mmap_load_layer = true; + bool b_use_topk = false; + std::string post_config_path = "post_config.json"; + // bool b_live_print = true; LLMRuningCallback runing_callback = nullptr; void *reserve = nullptr; @@ -84,36 +86,17 @@ class LLM { bool b_stop = false; - int post_process(unsigned short *p, int n, std::vector &history, float *val = 0) + LLMPostprocess postprocess; + static int post_process(LLMPostprocess &postprocess, unsigned short *p, int n, std::vector &history, + float *val = 0) { std::vector logits(n); for (int i = 0; i < n; i++) { unsigned int proc = p[i] << 16; logits[i] = *reinterpret_cast(&proc); } - LLMPostprocess postprocess; - 
postprocess.set_temperature(true, _attr.temperature); - postprocess.set_repetition_penalty(true, 1.2f); - // postprocess.set_top_k_sampling(true, 40); - postprocess.set_top_p_sampling(true, _attr.top_p); return postprocess.apply(logits, history); - - // float max_val = -MAXFLOAT; - // int max_index = 0; - // for (int i = 0; i < n; i++) - // { - // unsigned int proc = p[i] << 16; - // float tmp = *reinterpret_cast(&proc); - // if (tmp > max_val) - // { - // max_val = tmp; - // max_index = i; - // } - // } - // if (val) - // *val = max_val; - // return max_index; } public: @@ -308,18 +291,24 @@ class LLM { vpm_encoder.inference(); AX_SYS_MinvalidateCache(vpm_encoder.get_output(0).phyAddr, vpm_encoder.get_output(0).pVirAddr, vpm_encoder.get_output(0).nSize); - memcpy(vpm_resampler.get_input("input").pVirAddr, vpm_encoder.get_output(0).pVirAddr, + memcpy(vpm_resampler.get_input(0).pVirAddr, vpm_encoder.get_output(0).pVirAddr, vpm_encoder.get_output(0).nSize); } else { - void *data = vpm_resampler.get_input("input").pVirAddr; + void *data = vpm_resampler.get_input(0).pVirAddr; memcpy(data, dst.data, dst.rows * dst.cols * 3); } vpm_resampler.inference(); - out_embed.resize(vpm_resampler.get_output("output").nSize / sizeof(unsigned short)); - AX_SYS_MinvalidateCache(vpm_resampler.get_output("output").phyAddr, vpm_resampler.get_output("output").pVirAddr, - vpm_resampler.get_output("output").nSize); - memcpy(out_embed.data(), vpm_resampler.get_output("output").pVirAddr, vpm_resampler.get_output("output").nSize); + out_embed.resize(vpm_resampler.get_output(0).nSize / sizeof(float)); + AX_SYS_MinvalidateCache(vpm_resampler.get_output(0).phyAddr, vpm_resampler.get_output(0).pVirAddr, + vpm_resampler.get_output(0).nSize); + + float *output_data = (float *)vpm_resampler.get_output(0).pVirAddr; + for (size_t i = 0; i < out_embed.size(); i++) { + out_embed[i] = bfloat16(output_data[i]).data; + } + + // memcpy(out_embed.data(), vpm_resampler.get_output(0).pVirAddr, 
vpm_resampler.get_output(0).nSize); ALOGI("image encode time : %f ms, size : %d", t.cost(), out_embed.size()); return 0; } @@ -337,27 +326,49 @@ class LLM { embed_selector.getByIndex(input_ids[i], out_embed.data() + i * _attr.tokens_embed_size); } - // memcpy(out_embed.data() + 5 * _attr.tokens_embed_size, vpm_resampler.get_output("output").pVirAddr, - // vpm_resampler.get_output("output").nSize); + // memcpy(out_embed.data() + 5 * _attr.tokens_embed_size, vpm_resampler.get_output(0).pVirAddr, + // vpm_resampler.get_output(0).nSize); return 0; } - int Encode(std::vector> &img_embeds, std::vector &out_embed, - std::string prompt = "What is in the images?") + int Encode(std::vector &img_embed, std::vector &out_embed, + std::string prompt = "What is in the image?") { std::vector input_ids = tokenizer->Encode(prompt, true); - constexpr int IMG_CONTEXT = 151667; // InternVL2.5 - std::vector img_positions; + // constexpr int img_token_id = 49190; // smolvlm + // constexpr int img_token_id = 151667; // InternVL2.5 + int offset = 0; + int img_context_count = 0; for (size_t i = 0; i < input_ids.size(); i++) { - if (input_ids[i] == IMG_CONTEXT) { - img_positions.push_back(i); + if (input_ids[i] == _attr.img_token_id) { + img_context_count++; + if (img_context_count == 1) { + offset = i; + } } } - if (img_positions.size() > _attr.prefill_token_num) { + if (offset == 0) { + ALOGE("offset == 0"); + return -1; + } + + if (img_context_count != img_embed.size() / _attr.tokens_embed_size) { + ALOGE("img_context_count(%d) != img_embed.size() / tokens_embed_size(%d)", img_context_count, + img_embed.size() / _attr.tokens_embed_size); + return -1; + } + + // for (size_t i = 0; i < input_ids.size(); i++) + // { + // printf("%d ", input_ids[i]); + // } + // printf("\n"); + + if (input_ids.size() > _attr.prefill_token_num) { ALOGE("input_ids(%d) > prefill_token_num(%d)", input_ids.size(), _attr.prefill_token_num); return -1; } @@ -366,11 +377,8 @@ class LLM { for (size_t i = 0; i < 
input_ids.size(); i++) { embed_selector.getByIndex(input_ids[i], out_embed.data() + i * _attr.tokens_embed_size); } - for (size_t img_idx = 0; img_idx < img_embeds.size(); img_idx++) { - // int pos = img_positions[img_idx]; - memcpy(out_embed.data() + (14 + img_idx * 64) * _attr.tokens_embed_size, img_embeds[img_idx].data(), - img_embeds[img_idx].size() * sizeof(unsigned short)); - } + memcpy(out_embed.data() + offset * _attr.tokens_embed_size, img_embed.data(), + img_embed.size() * sizeof(unsigned short)); return 0; } @@ -504,7 +512,7 @@ class LLM { AX_SYS_MinvalidateCache(output_post.phyAddr, output_post.pVirAddr, output_post.nSize); unsigned short *post_out = (unsigned short *)output_post.pVirAddr; float max_val = -MAXFLOAT; - max_index = post_process(post_out, _attr.tokens_embed_num, token_ids, &max_val); + max_index = post_process(postprocess, post_out, _attr.tokens_embed_num, token_ids, &max_val); } next_token = max_index; @@ -599,7 +607,7 @@ class LLM { AX_SYS_MinvalidateCache(output_post.phyAddr, output_post.pVirAddr, output_post.nSize); unsigned short *post_out = (unsigned short *)output_post.pVirAddr; float max_val = -MAXFLOAT; - max_index = post_process(post_out, _attr.tokens_embed_num, token_ids, &max_val); + max_index = post_process(postprocess, post_out, _attr.tokens_embed_num, token_ids, &max_val); } next_token = max_index; From 033313f67230b9cd5d666dca9e4f014ef0397449 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Mon, 7 Apr 2025 19:22:35 +0800 Subject: [PATCH 07/64] [update] Remove the system environment python package. Add internvl2.5-1B-364-ax630c, smolvlm-256M-ax630c model. 
--- .../mode_internvl2.5-1B-364-ax630c.json | 35 ++++++ .../models/mode_smolvlm-256M-ax630c.json | 35 ++++++ projects/llm_framework/tools/llm_pack.py | 119 ++++++++++-------- 3 files changed, 137 insertions(+), 52 deletions(-) create mode 100644 projects/llm_framework/main_vlm/models/mode_internvl2.5-1B-364-ax630c.json create mode 100644 projects/llm_framework/main_vlm/models/mode_smolvlm-256M-ax630c.json diff --git a/projects/llm_framework/main_vlm/models/mode_internvl2.5-1B-364-ax630c.json b/projects/llm_framework/main_vlm/models/mode_internvl2.5-1B-364-ax630c.json new file mode 100644 index 0000000..8f6a396 --- /dev/null +++ b/projects/llm_framework/main_vlm/models/mode_internvl2.5-1B-364-ax630c.json @@ -0,0 +1,35 @@ +{ + "mode":"internvl2.5-1B-364-ax630c", + "type":"vlm", + "homepage":"https://huggingface.co/AXERA-TECH/InternVL2_5-1B", + "capabilities":[ + "text_generation", + "chat" + ], + "input_type":[ + "vlm.chat_completion", + "vlm.chat_completion.stream" + ], + "output_type":[ + "vlm.utf-8", + "vlm.utf-8.stream" + ], + "mode_param":{ + "tokenizer_type":2, + "filename_tokenizer_model":"http://localhost:8080", + "filename_tokens_embed":"model.embed_tokens.weight.bfloat16.bin", + "filename_post_axmodel":"qwen2_post.axmodel", + "template_filename_axmodel":"qwen2_p256_l%d_together.axmodel", + "filename_vpm_resampler_axmodedl":"vit_intern_sim_space2depth.axmodel", + "b_use_topk":false, + "b_bos":false, + "b_eos":false, + "axmodel_num":24, + "tokens_embed_num":151674, + "img_token_id":151667, + "tokens_embed_size":896, + "b_use_mmap_load_embed":true, + "b_dynamic_load_axmodel_layer":false, + "ext_scripts":["tokenizer_internvl2.5-1B-364-ax630c.py"] + } +} \ No newline at end of file diff --git a/projects/llm_framework/main_vlm/models/mode_smolvlm-256M-ax630c.json b/projects/llm_framework/main_vlm/models/mode_smolvlm-256M-ax630c.json new file mode 100644 index 0000000..1d3a293 --- /dev/null +++ b/projects/llm_framework/main_vlm/models/mode_smolvlm-256M-ax630c.json 
@@ -0,0 +1,35 @@ +{ + "mode":"smolvlm-256M-ax630c", + "type":"vlm", + "homepage":"https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct", + "capabilities":[ + "text_generation", + "chat" + ], + "input_type":[ + "vlm.chat_completion", + "vlm.chat_completion.stream" + ], + "output_type":[ + "vlm.utf-8", + "vlm.utf-8.stream" + ], + "mode_param":{ + "tokenizer_type":2, + "filename_tokenizer_model":"http://localhost:8080", + "filename_tokens_embed":"model.embed_tokens.weight.bfloat16.bin", + "filename_post_axmodel":"llama_post.axmodel", + "template_filename_axmodel":"llama_p128_l%d_together.axmodel", + "filename_vpm_resampler_axmodedl":"SmolVLM-256M-Instruct_vision_nhwc.axmodel", + "b_use_topk":false, + "b_bos":false, + "b_eos":false, + "axmodel_num":30, + "tokens_embed_num":49280, + "img_token_id":49190, + "tokens_embed_size":576, + "b_use_mmap_load_embed":true, + "b_dynamic_load_axmodel_layer":false, + "ext_scripts":["tokenizer_smolvlm-256M-ax630c.py"] + } +} \ No newline at end of file diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 09d0967..dd22c5c 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -68,23 +68,23 @@ def create_lib_deb(package_name, version, src_folder, revision = 'm5stack1'): # if os.path.exists(zip_file_extrpath): # shutil.copytree(zip_file_extrpath, os.path.join(deb_folder, 'opt/m5stack/scripts')) - zip_file = 'm5stack_dist-packages.tar.gz' - down_url = 'https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_dist-packages.tar.gz' - zip_file_extrpath = 'm5stack_dist-packages' - if not os.path.exists(zip_file_extrpath): - # Downloading via HTTP (more common) - if not os.path.exists(zip_file): - response = requests.get(down_url) - if response.status_code == 200: - with open(zip_file, 'wb') as file: - file.write(response.content) - else: - print("{} down failed".format(down_url)) - with tarfile.open(zip_file, 'r:gz') as tar: 
- tar.extractall(path=zip_file_extrpath) - print("The {} download successful.".format(down_url)) - if os.path.exists(zip_file_extrpath): - shutil.copytree(zip_file_extrpath, os.path.join(deb_folder, 'usr/local/lib/python3.10/dist-packages')) + # zip_file = 'm5stack_dist-packages.tar.gz' + # down_url = 'https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_dist-packages.tar.gz' + # zip_file_extrpath = 'm5stack_dist-packages' + # if not os.path.exists(zip_file_extrpath): + # # Downloading via HTTP (more common) + # if not os.path.exists(zip_file): + # response = requests.get(down_url) + # if response.status_code == 200: + # with open(zip_file, 'wb') as file: + # file.write(response.content) + # else: + # print("{} down failed".format(down_url)) + # with tarfile.open(zip_file, 'r:gz') as tar: + # tar.extractall(path=zip_file_extrpath) + # print("The {} download successful.".format(down_url)) + # if os.path.exists(zip_file_extrpath): + # shutil.copytree(zip_file_extrpath, os.path.join(deb_folder, 'usr/local/lib/python3.10/dist-packages')) os.makedirs(os.path.join(deb_folder, 'DEBIAN'), exist_ok = True) with open(os.path.join(deb_folder, 'DEBIAN/control'),'w') as f: @@ -238,6 +238,18 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): openai_api_dir = os.path.join(src_folder, 'openai-api') if os.path.exists(openai_api_dir): shutil.copytree(openai_api_dir, os.path.join(deb_folder, 'opt/m5stack/lib/openai-api')) + if package_name == 'llm-kws': + sherpa_dir = os.path.join(src_folder, 'sherpa-onnx') + if os.path.exists(sherpa_dir): + shutil.copytree(sherpa_dir, os.path.join(deb_folder, 'opt/m5stack/lib/sherpa-onnx')) + if package_name == 'llm-llm': + llm_dir = os.path.join(src_folder, 'llm') + if os.path.exists(llm_dir): + shutil.copytree(llm_dir, os.path.join(deb_folder, 'opt/m5stack/lib/llm')) + if package_name == 'llm-vlm': + vlm_dir = os.path.join(src_folder, 'vlm') + if os.path.exists(vlm_dir): + shutil.copytree(vlm_dir, 
os.path.join(deb_folder, 'opt/m5stack/lib/vlm')) shutil.copy2(os.path.join(src_folder, package_name.replace("-", "_")), os.path.join(deb_folder, 'opt/m5stack/bin', package_name.replace("-", "_"))) ext_scripts_files = glob.glob(os.path.join(src_folder, package_name + "_*")) if ext_scripts_files: @@ -253,7 +265,8 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): f.write(f'Original-Maintainer: m5stack \n') f.write(f'Section: llm-module\n') f.write(f'Priority: optional\n') - f.write(f'Depends: lib-llm\n') + # f.write(f'Depends: lib-llm\n') + f.write(f'Depends: lib-llm (>= 1.7)\n') f.write(f'Homepage: https://www.m5stack.com\n') f.write(f'Description: llm-module\n') f.write(f' bsp.\n') @@ -363,12 +376,12 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): #################################################注意################################################ #################################################注意################################################ Tasks = { - 'lib-llm':[create_lib_deb,'lib-llm', 1.6, src_folder, revision], + 'lib-llm':[create_lib_deb,'lib-llm', 1.7, src_folder, revision], 'llm-sys':[create_bin_deb,'llm-sys', version, src_folder, revision], 'llm-audio':[create_bin_deb,'llm-audio', version, src_folder, revision], - 'llm-kws':[create_bin_deb,'llm-kws', version, src_folder, revision], + 'llm-kws':[create_bin_deb,'llm-kws', '1.6', src_folder, revision], 'llm-asr':[create_bin_deb,'llm-asr', version, src_folder, revision], - 'llm-llm':[create_bin_deb,'llm-llm', '1.6', src_folder, revision], + 'llm-llm':[create_bin_deb,'llm-llm', '1.7', src_folder, revision], 'llm-tts':[create_bin_deb,'llm-tts', version, src_folder, revision], 'llm-melotts':[create_bin_deb,'llm-melotts', version, src_folder, revision], 'llm-camera':[create_bin_deb,'llm-camera', '1.6', src_folder, revision], @@ -378,37 +391,39 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): 
'llm-depth-anything':[create_bin_deb,'llm-depth-anything', version, src_folder, revision], 'llm-vad':[create_bin_deb,'llm-vad', version, src_folder, revision], 'llm-whisper':[create_bin_deb,'llm-whisper', version, src_folder, revision], - 'llm-openai-api':[create_bin_deb,'llm-openai-api', version, src_folder, revision], - 'llm-model-audio-en-us':[create_data_deb,'llm-model-audio-en-us', data_version, src_folder, revision], - 'llm-model-audio-zh-cn':[create_data_deb,'llm-model-audio-zh-cn', data_version, src_folder, revision], - 'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17', data_version, src_folder, revision], - 'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23', data_version, src_folder, revision], - 'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01', '0.3', src_folder, revision], - 'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01', '0.3', src_folder, revision], - 'llm-model-single-speaker-english-fast':[create_data_deb,'llm-model-single-speaker-english-fast', data_version, src_folder, revision], - 'llm-model-single-speaker-fast':[create_data_deb,'llm-model-single-speaker-fast', data_version, src_folder, revision], - 'llm-model-melotts-zh-cn':[create_data_deb,'llm-model-melotts-zh-cn', '0.4', src_folder, revision], - 'llm-model-yolo11n':[create_data_deb,'llm-model-yolo11n', data_version, src_folder, revision], - 'llm-model-yolo11n-pose':[create_data_deb,'llm-model-yolo11n-pose', '0.3', src_folder, revision], - 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], - 'llm-model-yolo11n-seg':[create_data_deb,'llm-model-yolo11n-seg', '0.3', 
src_folder, revision], - 'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.3', src_folder, revision], - 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.3', src_folder, revision], - 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.3', src_folder, revision], - 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.3', src_folder, revision], - 'llm-model-qwen2.5-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2.5-0.5B-prefill-20e', data_version, src_folder, revision], - 'llm-model-qwen2.5-0.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-p256-ax630c', '0.4', src_folder, revision], - 'llm-model-qwen2.5-0.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-Int4-ax630c', '0.4', src_folder, revision], - 'llm-model-qwen2.5-1.5B-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-ax630c', '0.3', src_folder, revision], - 'llm-model-qwen2.5-1.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-p256-ax630c', '0.4', src_folder, revision], - 'llm-model-qwen2.5-1.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-Int4-ax630c', '0.4', src_folder, revision], - 'llm-model-qwen2.5-coder-0.5B-ax630c':[create_data_deb,'llm-model-qwen2.5-coder-0.5B-ax630c', data_version, src_folder, revision], - 'llm-model-llama3.2-1B-prefill-ax630c':[create_data_deb,'llm-model-llama3.2-1B-prefill-ax630c', data_version, src_folder, revision], - 'llm-model-llama3.2-1B-p256-ax630c':[create_data_deb,'llm-model-llama3.2-1B-p256-ax630c', '0.4', src_folder, revision], - 'llm-model-openbuddy-llama3.2-1B-ax630c':[create_data_deb,'llm-model-openbuddy-llama3.2-1B-ax630c', data_version, src_folder, revision], - 'llm-model-internvl2.5-1B-ax630c':[create_data_deb,'llm-model-internvl2.5-1B-ax630c', '0.4', src_folder, revision], - 'llm-model-deepseek-r1-1.5B-ax630c':[create_data_deb,'llm-model-deepseek-r1-1.5B-ax630c', '0.3', src_folder, revision], - 
'llm-model-deepseek-r1-1.5B-p256-ax630c':[create_data_deb,'llm-model-deepseek-r1-1.5B-p256-ax630c', '0.4', src_folder, revision], + 'llm-openai-api':[create_bin_deb,'llm-openai-api', '1.6', src_folder, revision], + # 'llm-model-audio-en-us':[create_data_deb,'llm-model-audio-en-us', data_version, src_folder, revision], + # 'llm-model-audio-zh-cn':[create_data_deb,'llm-model-audio-zh-cn', data_version, src_folder, revision], + # 'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17', data_version, src_folder, revision], + # 'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23', data_version, src_folder, revision], + # 'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01', '0.3', src_folder, revision], + # 'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01', '0.3', src_folder, revision], + # 'llm-model-single-speaker-english-fast':[create_data_deb,'llm-model-single-speaker-english-fast', data_version, src_folder, revision], + # 'llm-model-single-speaker-fast':[create_data_deb,'llm-model-single-speaker-fast', data_version, src_folder, revision], + # 'llm-model-melotts-zh-cn':[create_data_deb,'llm-model-melotts-zh-cn', '0.4', src_folder, revision], + # 'llm-model-yolo11n':[create_data_deb,'llm-model-yolo11n', data_version, src_folder, revision], + # 'llm-model-yolo11n-pose':[create_data_deb,'llm-model-yolo11n-pose', '0.3', src_folder, revision], + # 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], + # 'llm-model-yolo11n-seg':[create_data_deb,'llm-model-yolo11n-seg', '0.3', src_folder, revision], + # 
'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.3', src_folder, revision], + # 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.3', src_folder, revision], + # 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.3', src_folder, revision], + # 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.3', src_folder, revision], + # 'llm-model-qwen2.5-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2.5-0.5B-prefill-20e', data_version, src_folder, revision], + # 'llm-model-qwen2.5-0.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-p256-ax630c', '0.4', src_folder, revision], + # 'llm-model-qwen2.5-0.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-Int4-ax630c', '0.4', src_folder, revision], + # 'llm-model-qwen2.5-1.5B-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-ax630c', '0.3', src_folder, revision], + # 'llm-model-qwen2.5-1.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-p256-ax630c', '0.4', src_folder, revision], + # 'llm-model-qwen2.5-1.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-Int4-ax630c', '0.4', src_folder, revision], + # 'llm-model-qwen2.5-coder-0.5B-ax630c':[create_data_deb,'llm-model-qwen2.5-coder-0.5B-ax630c', data_version, src_folder, revision], + # 'llm-model-llama3.2-1B-prefill-ax630c':[create_data_deb,'llm-model-llama3.2-1B-prefill-ax630c', data_version, src_folder, revision], + # 'llm-model-llama3.2-1B-p256-ax630c':[create_data_deb,'llm-model-llama3.2-1B-p256-ax630c', '0.4', src_folder, revision], + # 'llm-model-openbuddy-llama3.2-1B-ax630c':[create_data_deb,'llm-model-openbuddy-llama3.2-1B-ax630c', data_version, src_folder, revision], + # 'llm-model-internvl2.5-1B-ax630c':[create_data_deb,'llm-model-internvl2.5-1B-ax630c', '0.4', src_folder, revision], + # 'llm-model-internvl2.5-1B-364-ax630c':[create_data_deb,'llm-model-internvl2.5-1B-364-ax630c', '0.4', src_folder, revision], + # 
'llm-model-deepseek-r1-1.5B-ax630c':[create_data_deb,'llm-model-deepseek-r1-1.5B-ax630c', '0.3', src_folder, revision], + # 'llm-model-deepseek-r1-1.5B-p256-ax630c':[create_data_deb,'llm-model-deepseek-r1-1.5B-p256-ax630c', '0.4', src_folder, revision], + 'llm-model-smolvlm-256M-ax630c':[create_data_deb,'llm-model-smolvlm-256M-ax630c', '0.4', src_folder, revision], # 'llm-model-qwen2-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2-0.5B-prefill-20e', data_version, src_folder, revision], # 'llm-model-qwen2-1.5B-prefill-20e':[create_data_deb,'llm-model-qwen2-1.5B-prefill-20e', data_version, src_folder, revision] } From a0222ca0323700abe5935ace4253b444b1cdf6c4 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 8 Apr 2025 10:08:28 +0800 Subject: [PATCH 08/64] [update] add smolvlm-500M-ax630c model & tokenizer server --- .../models/mode_smolvlm-500M-ax630c.json | 35 +++ .../tokenizer_internvl2.5-1B-364-ax630c.py | 138 ++++++++++ .../scripts/tokenizer_smolvlm-256M-ax630c.py | 248 ++++++++++++++++++ .../scripts/tokenizer_smolvlm-500M-ax630c.py | 248 ++++++++++++++++++ projects/llm_framework/tools/llm_pack.py | 63 ++--- 5 files changed, 701 insertions(+), 31 deletions(-) create mode 100644 projects/llm_framework/main_vlm/models/mode_smolvlm-500M-ax630c.json create mode 100644 projects/llm_framework/main_vlm/scripts/tokenizer_internvl2.5-1B-364-ax630c.py create mode 100644 projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-256M-ax630c.py create mode 100644 projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-500M-ax630c.py diff --git a/projects/llm_framework/main_vlm/models/mode_smolvlm-500M-ax630c.json b/projects/llm_framework/main_vlm/models/mode_smolvlm-500M-ax630c.json new file mode 100644 index 0000000..4c07e36 --- /dev/null +++ b/projects/llm_framework/main_vlm/models/mode_smolvlm-500M-ax630c.json @@ -0,0 +1,35 @@ +{ + "mode":"smolvlm-500M-ax630c", + "type":"vlm", + "homepage":"https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct", + 
"capabilities":[ + "text_generation", + "chat" + ], + "input_type":[ + "vlm.chat_completion", + "vlm.chat_completion.stream" + ], + "output_type":[ + "vlm.utf-8", + "vlm.utf-8.stream" + ], + "mode_param":{ + "tokenizer_type":2, + "filename_tokenizer_model":"http://localhost:8080", + "filename_tokens_embed":"model.embed_tokens.weight.bfloat16.bin", + "filename_post_axmodel":"llama_post.axmodel", + "template_filename_axmodel":"llama_p128_l%d_together.axmodel", + "filename_vpm_resampler_axmodedl":"SmolVLM-500M-Instruct_vision.axmodel", + "b_use_topk":false, + "b_bos":false, + "b_eos":false, + "axmodel_num":32, + "tokens_embed_num":49280, + "img_token_id":49190, + "tokens_embed_size":960, + "b_use_mmap_load_embed":true, + "b_dynamic_load_axmodel_layer":false, + "ext_scripts":["tokenizer_smolvlm-500M-ax630c.py"] + } +} \ No newline at end of file diff --git a/projects/llm_framework/main_vlm/scripts/tokenizer_internvl2.5-1B-364-ax630c.py b/projects/llm_framework/main_vlm/scripts/tokenizer_internvl2.5-1B-364-ax630c.py new file mode 100644 index 0000000..569c5da --- /dev/null +++ b/projects/llm_framework/main_vlm/scripts/tokenizer_internvl2.5-1B-364-ax630c.py @@ -0,0 +1,138 @@ +from transformers import AutoTokenizer, PreTrainedTokenizerFast +from http.server import HTTPServer, BaseHTTPRequestHandler +import json +import argparse + + +class Tokenizer_Http: + + def __init__(self, model_id): + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=True, use_fast=False + ) + + def encode(self, prompt, content): + prompt = f"<|im_start|>system\n{content}<|im_end|><|im_start|>user\n{prompt}<|im_end|><|im_start|>assistant\n" + input_ids = self.tokenizer.encode(prompt) + return input_ids + + def encode_vpm(self, prompt, content="Please describe the image shortly."): + prompt = f"<|im_start|>system\n{content}<|im_end|><|im_start|>user\n" + "" * 169 + f"\n{prompt}<|im_end|><|im_start|>assistant\n" + input_ids = self.tokenizer.encode(prompt) + return input_ids 
+ + def decode(self, token_ids): + return self.tokenizer.decode(token_ids, clean_up_tokenization_spaces=False) + + @property + def bos_id(self): + return self.tokenizer.bos_token_id + + @property + def eos_id(self): + return self.tokenizer.eos_token_id + + @property + def bos_token(self): + return self.tokenizer.bos_token + + @property + def eos_token(self): + return self.tokenizer.eos_token + +class Request(BaseHTTPRequestHandler): + # 通过类继承,新定义类 + timeout = 5 + server_version = "Apache" + + def do_GET(self): + print(self.path) + # 在新类中定义get的内容(当客户端向该服务端使用get请求时,本服务端将如下运行) + self.send_response(200) + self.send_header("type", "get") # 设置响应头,可省略或设置多个 + self.end_headers() + + if self.path == "/bos_id": + bos_id = tokenizer.bos_id + # print(bos_id) + # to json + if bos_id is None: + msg = json.dumps({"bos_id": -1}) + else: + msg = json.dumps({"bos_id": bos_id}) + elif self.path == "/eos_id": + eos_id = tokenizer.eos_id + if eos_id is None: + msg = json.dumps({"eos_id": -1}) + else: + msg = json.dumps({"eos_id": eos_id}) + else: + msg = "error" + + print(msg) + msg = str(msg).encode() # 转为str再转为byte格式 + + self.wfile.write(msg) # 将byte格式的信息返回给客户端 + + def do_POST(self): + # 在新类中定义post的内容(当客户端向该服务端使用post请求时,本服务端将如下运行) + data = self.rfile.read( + int(self.headers["content-length"]) + ) # 获取从客户端传入的参数(byte格式) + data = data.decode() # 将byte格式转为str格式 + + self.send_response(200) + self.send_header("type", "post") # 设置响应头,可省略或设置多个 + self.end_headers() + + if self.path == "/encode": + req = json.loads(data) + print(req) + prompt = req["text"] + b_img_prompt = False + if "img_prompt" in req: + b_img_prompt = req["img_prompt"] + if b_img_prompt: + token_ids = tokenizer.encode_vpm(prompt) + else: + token_ids = tokenizer.encode(prompt, args.content) + if token_ids is None: + msg = json.dumps({"token_ids": -1}) + else: + msg = json.dumps({"token_ids": token_ids}) + + elif self.path == "/decode": + req = json.loads(data) + token_ids = req["token_ids"] + text = 
tokenizer.decode(token_ids) + if text is None: + msg = json.dumps({"text": ""}) + else: + msg = json.dumps({"text": text}) + else: + msg = "error" + print(msg) + msg = str(msg).encode() # 转为str再转为byte格式 + + self.wfile.write(msg) # 将byte格式的信息返回给客户端 + + +if __name__ == "__main__": + + args = argparse.ArgumentParser() + args.add_argument("--host", type=str, default="localhost") + args.add_argument("--port", type=int, default=8080) + args.add_argument('--model_id', type=str, default='internvl2_tokenizer') + args.add_argument('--content', type=str, default='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。') + args = args.parse_args() + + tokenizer = Tokenizer_Http(args.model_id) + + + # print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token) + # print(tokenizer.encode("hello world", args.content)) + + host = (args.host, args.port) # 设定地址与端口号,'localhost'等价于'127.0.0.1' + print("http://%s:%s" % host) + server = HTTPServer(host, Request) # 根据地址端口号和新定义的类,创建服务器实例 + server.serve_forever() # 开启服务 diff --git a/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-256M-ax630c.py b/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-256M-ax630c.py new file mode 100644 index 0000000..5f99032 --- /dev/null +++ b/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-256M-ax630c.py @@ -0,0 +1,248 @@ +from transformers import AutoTokenizer, PreTrainedTokenizerFast +from transformers.tokenization_utils_base import AddedToken +from http.server import HTTPServer, BaseHTTPRequestHandler +import json +import argparse + +def _prompt_split_image( + image_seq_len, + image_rows, + image_cols, + fake_token_around_image, + image_token, + global_img_token, +): + """Prompt with expanded image tokens for when the image is split into patches.""" + text_split_images = "" + for n_h in range(image_rows): + for n_w in range(image_cols): + text_split_images += ( + f"{fake_token_around_image}" + + f"" + + f"{image_token}" * image_seq_len + ) + 
text_split_images += "\n" + + text_split_images += ( + f"\n{fake_token_around_image}" + + f"{global_img_token}" + + f"{image_token}" * image_seq_len + + f"{fake_token_around_image}" + ) + return text_split_images + + +def _prompt_single_image( + image_seq_len, fake_token_around_image, image_token, global_img_token +): + """Prompt with expanded image tokens for a single image.""" + return ( + f"{fake_token_around_image}" + + f"{global_img_token}" + + f"{image_token}" * image_seq_len + + f"{fake_token_around_image}" + ) + + +def get_image_prompt_string( + image_rows, + image_cols, + image_seq_len, + fake_token_around_image, + image_token, + global_img_token, +): + if image_rows == 0 and image_cols == 0: + return _prompt_single_image( + image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token, + ) + return _prompt_split_image( + image_seq_len, + image_rows, + image_cols, + fake_token_around_image, + image_token, + global_img_token, + ) + +class Tokenizer_Http: + + def __init__(self, model_id): + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=True, use_fast=False + ) + + def encode(self, prompt, content): + prompt = f"<|im_start|>User:{content}\nAssistant:" + input_ids = self.tokenizer(prompt) + return input_ids["input_ids"] + + def encode_vpm(self, prompt, content="Please describe the image shortly."): + prompt = f"<|im_start|>User:{content}\nAssistant:" + text = [prompt] + image_rows = [[0]] + image_cols = [[0]] + image_seq_len = 64 + image_token = "" + fake_image_token = "" + global_img_token = "" + prompt_strings = [] + for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): + # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` + image_prompt_strings = [] + for n_rows, n_cols in zip(sample_rows, sample_cols): + image_prompt_string = get_image_prompt_string( + n_rows, + n_cols, + 
image_seq_len, + image_token=image_token, + fake_token_around_image=fake_image_token, + global_img_token=global_img_token, + ) + image_prompt_strings.append(image_prompt_string) + + split_sample = sample.split(image_token) + if len(split_sample) == 0: + raise ValueError("The image token should be present in the text.") + + # Place in the image prompt strings where the image tokens are + sample = split_sample[0] + for i, image_prompt_string in enumerate(image_prompt_strings): + sample += image_prompt_string + split_sample[i + 1] + prompt_strings.append(sample) + + fake_image_token = AddedToken(fake_image_token, normalized=False, special=True) + image_token = AddedToken(image_token, normalized=False, special=True) + end_of_utterance_token = AddedToken( + "", normalized=False, special=True + ) + tokens_to_add = { + "additional_special_tokens": [ + fake_image_token, + image_token, + end_of_utterance_token, + ] + } + self.tokenizer.add_special_tokens(tokens_to_add) + + input_ids = self.tokenizer(prompt_strings)["input_ids"][0] + return input_ids + + def decode(self, token_ids): + return self.tokenizer.decode(token_ids, clean_up_tokenization_spaces=False) + + @property + def bos_id(self): + return self.tokenizer.bos_token_id + + @property + def eos_id(self): + return self.tokenizer.eos_token_id + + @property + def bos_token(self): + return self.tokenizer.bos_token + + @property + def eos_token(self): + return self.tokenizer.eos_token + +class Request(BaseHTTPRequestHandler): + # 通过类继承,新定义类 + timeout = 5 + server_version = "Apache" + + def do_GET(self): + print(self.path) + # 在新类中定义get的内容(当客户端向该服务端使用get请求时,本服务端将如下运行) + self.send_response(200) + self.send_header("type", "get") # 设置响应头,可省略或设置多个 + self.end_headers() + + if self.path == "/bos_id": + bos_id = tokenizer.bos_id + # print(bos_id) + # to json + if bos_id is None: + msg = json.dumps({"bos_id": -1}) + else: + msg = json.dumps({"bos_id": bos_id}) + elif self.path == "/eos_id": + eos_id = tokenizer.eos_id + if eos_id 
is None: + msg = json.dumps({"eos_id": -1}) + else: + msg = json.dumps({"eos_id": eos_id}) + else: + msg = "error" + + print(msg) + msg = str(msg).encode() # 转为str再转为byte格式 + + self.wfile.write(msg) # 将byte格式的信息返回给客户端 + + def do_POST(self): + # 在新类中定义post的内容(当客户端向该服务端使用post请求时,本服务端将如下运行) + data = self.rfile.read( + int(self.headers["content-length"]) + ) # 获取从客户端传入的参数(byte格式) + data = data.decode() # 将byte格式转为str格式 + + self.send_response(200) + self.send_header("type", "post") # 设置响应头,可省略或设置多个 + self.end_headers() + + if self.path == "/encode": + req = json.loads(data) + print(req) + prompt = req["text"] + b_img_prompt = False + if "img_prompt" in req: + b_img_prompt = req["img_prompt"] + if b_img_prompt: + token_ids = tokenizer.encode_vpm(prompt) + else: + token_ids = tokenizer.encode(prompt, args.content) + if token_ids is None: + msg = json.dumps({"token_ids": -1}) + else: + msg = json.dumps({"token_ids": token_ids}) + + elif self.path == "/decode": + req = json.loads(data) + token_ids = req["token_ids"] + text = tokenizer.decode(token_ids) + if text is None: + msg = json.dumps({"text": ""}) + else: + msg = json.dumps({"text": text}) + else: + msg = "error" + print(msg) + msg = str(msg).encode() # 转为str再转为byte格式 + + self.wfile.write(msg) # 将byte格式的信息返回给客户端 + + +if __name__ == "__main__": + + args = argparse.ArgumentParser() + args.add_argument("--host", type=str, default="localhost") + args.add_argument("--port", type=int, default=8080) + args.add_argument('--model_id', type=str, default='internvl2_tokenizer') + args.add_argument('--content', type=str, default='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。') + args = args.parse_args() + + tokenizer = Tokenizer_Http(args.model_id) + + + # print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token) + # print(tokenizer.encode("hello world", args.content)) + + host = (args.host, args.port) # 设定地址与端口号,'localhost'等价于'127.0.0.1' + print("http://%s:%s" % host) + server = 
HTTPServer(host, Request) # 根据地址端口号和新定义的类,创建服务器实例 + server.serve_forever() # 开启服务 diff --git a/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-500M-ax630c.py b/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-500M-ax630c.py new file mode 100644 index 0000000..5f99032 --- /dev/null +++ b/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-500M-ax630c.py @@ -0,0 +1,248 @@ +from transformers import AutoTokenizer, PreTrainedTokenizerFast +from transformers.tokenization_utils_base import AddedToken +from http.server import HTTPServer, BaseHTTPRequestHandler +import json +import argparse + +def _prompt_split_image( + image_seq_len, + image_rows, + image_cols, + fake_token_around_image, + image_token, + global_img_token, +): + """Prompt with expanded image tokens for when the image is split into patches.""" + text_split_images = "" + for n_h in range(image_rows): + for n_w in range(image_cols): + text_split_images += ( + f"{fake_token_around_image}" + + f"" + + f"{image_token}" * image_seq_len + ) + text_split_images += "\n" + + text_split_images += ( + f"\n{fake_token_around_image}" + + f"{global_img_token}" + + f"{image_token}" * image_seq_len + + f"{fake_token_around_image}" + ) + return text_split_images + + +def _prompt_single_image( + image_seq_len, fake_token_around_image, image_token, global_img_token +): + """Prompt with expanded image tokens for a single image.""" + return ( + f"{fake_token_around_image}" + + f"{global_img_token}" + + f"{image_token}" * image_seq_len + + f"{fake_token_around_image}" + ) + + +def get_image_prompt_string( + image_rows, + image_cols, + image_seq_len, + fake_token_around_image, + image_token, + global_img_token, +): + if image_rows == 0 and image_cols == 0: + return _prompt_single_image( + image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token, + ) + return _prompt_split_image( + image_seq_len, + image_rows, + image_cols, + 
fake_token_around_image, + image_token, + global_img_token, + ) + +class Tokenizer_Http: + + def __init__(self, model_id): + self.tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=True, use_fast=False + ) + + def encode(self, prompt, content): + prompt = f"<|im_start|>User:{content}\nAssistant:" + input_ids = self.tokenizer(prompt) + return input_ids["input_ids"] + + def encode_vpm(self, prompt, content="Please describe the image shortly."): + prompt = f"<|im_start|>User:{content}\nAssistant:" + text = [prompt] + image_rows = [[0]] + image_cols = [[0]] + image_seq_len = 64 + image_token = "" + fake_image_token = "" + global_img_token = "" + prompt_strings = [] + for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): + # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` + image_prompt_strings = [] + for n_rows, n_cols in zip(sample_rows, sample_cols): + image_prompt_string = get_image_prompt_string( + n_rows, + n_cols, + image_seq_len, + image_token=image_token, + fake_token_around_image=fake_image_token, + global_img_token=global_img_token, + ) + image_prompt_strings.append(image_prompt_string) + + split_sample = sample.split(image_token) + if len(split_sample) == 0: + raise ValueError("The image token should be present in the text.") + + # Place in the image prompt strings where the image tokens are + sample = split_sample[0] + for i, image_prompt_string in enumerate(image_prompt_strings): + sample += image_prompt_string + split_sample[i + 1] + prompt_strings.append(sample) + + fake_image_token = AddedToken(fake_image_token, normalized=False, special=True) + image_token = AddedToken(image_token, normalized=False, special=True) + end_of_utterance_token = AddedToken( + "", normalized=False, special=True + ) + tokens_to_add = { + "additional_special_tokens": [ + fake_image_token, + image_token, + end_of_utterance_token, + ] + } + 
self.tokenizer.add_special_tokens(tokens_to_add) + + input_ids = self.tokenizer(prompt_strings)["input_ids"][0] + return input_ids + + def decode(self, token_ids): + return self.tokenizer.decode(token_ids, clean_up_tokenization_spaces=False) + + @property + def bos_id(self): + return self.tokenizer.bos_token_id + + @property + def eos_id(self): + return self.tokenizer.eos_token_id + + @property + def bos_token(self): + return self.tokenizer.bos_token + + @property + def eos_token(self): + return self.tokenizer.eos_token + +class Request(BaseHTTPRequestHandler): + # 通过类继承,新定义类 + timeout = 5 + server_version = "Apache" + + def do_GET(self): + print(self.path) + # 在新类中定义get的内容(当客户端向该服务端使用get请求时,本服务端将如下运行) + self.send_response(200) + self.send_header("type", "get") # 设置响应头,可省略或设置多个 + self.end_headers() + + if self.path == "/bos_id": + bos_id = tokenizer.bos_id + # print(bos_id) + # to json + if bos_id is None: + msg = json.dumps({"bos_id": -1}) + else: + msg = json.dumps({"bos_id": bos_id}) + elif self.path == "/eos_id": + eos_id = tokenizer.eos_id + if eos_id is None: + msg = json.dumps({"eos_id": -1}) + else: + msg = json.dumps({"eos_id": eos_id}) + else: + msg = "error" + + print(msg) + msg = str(msg).encode() # 转为str再转为byte格式 + + self.wfile.write(msg) # 将byte格式的信息返回给客户端 + + def do_POST(self): + # 在新类中定义post的内容(当客户端向该服务端使用post请求时,本服务端将如下运行) + data = self.rfile.read( + int(self.headers["content-length"]) + ) # 获取从客户端传入的参数(byte格式) + data = data.decode() # 将byte格式转为str格式 + + self.send_response(200) + self.send_header("type", "post") # 设置响应头,可省略或设置多个 + self.end_headers() + + if self.path == "/encode": + req = json.loads(data) + print(req) + prompt = req["text"] + b_img_prompt = False + if "img_prompt" in req: + b_img_prompt = req["img_prompt"] + if b_img_prompt: + token_ids = tokenizer.encode_vpm(prompt) + else: + token_ids = tokenizer.encode(prompt, args.content) + if token_ids is None: + msg = json.dumps({"token_ids": -1}) + else: + msg = json.dumps({"token_ids": 
token_ids}) + + elif self.path == "/decode": + req = json.loads(data) + token_ids = req["token_ids"] + text = tokenizer.decode(token_ids) + if text is None: + msg = json.dumps({"text": ""}) + else: + msg = json.dumps({"text": text}) + else: + msg = "error" + print(msg) + msg = str(msg).encode() # 转为str再转为byte格式 + + self.wfile.write(msg) # 将byte格式的信息返回给客户端 + + +if __name__ == "__main__": + + args = argparse.ArgumentParser() + args.add_argument("--host", type=str, default="localhost") + args.add_argument("--port", type=int, default=8080) + args.add_argument('--model_id', type=str, default='internvl2_tokenizer') + args.add_argument('--content', type=str, default='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。') + args = args.parse_args() + + tokenizer = Tokenizer_Http(args.model_id) + + + # print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token) + # print(tokenizer.encode("hello world", args.content)) + + host = (args.host, args.port) # 设定地址与端口号,'localhost'等价于'127.0.0.1' + print("http://%s:%s" % host) + server = HTTPServer(host, Request) # 根据地址端口号和新定义的类,创建服务器实例 + server.serve_forever() # 开启服务 diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index dd22c5c..d0f1396 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -392,38 +392,39 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): 'llm-vad':[create_bin_deb,'llm-vad', version, src_folder, revision], 'llm-whisper':[create_bin_deb,'llm-whisper', version, src_folder, revision], 'llm-openai-api':[create_bin_deb,'llm-openai-api', '1.6', src_folder, revision], - # 'llm-model-audio-en-us':[create_data_deb,'llm-model-audio-en-us', data_version, src_folder, revision], - # 'llm-model-audio-zh-cn':[create_data_deb,'llm-model-audio-zh-cn', data_version, src_folder, revision], - # 
'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17', data_version, src_folder, revision], - # 'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23', data_version, src_folder, revision], - # 'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01', '0.3', src_folder, revision], - # 'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01', '0.3', src_folder, revision], - # 'llm-model-single-speaker-english-fast':[create_data_deb,'llm-model-single-speaker-english-fast', data_version, src_folder, revision], - # 'llm-model-single-speaker-fast':[create_data_deb,'llm-model-single-speaker-fast', data_version, src_folder, revision], - # 'llm-model-melotts-zh-cn':[create_data_deb,'llm-model-melotts-zh-cn', '0.4', src_folder, revision], - # 'llm-model-yolo11n':[create_data_deb,'llm-model-yolo11n', data_version, src_folder, revision], - # 'llm-model-yolo11n-pose':[create_data_deb,'llm-model-yolo11n-pose', '0.3', src_folder, revision], - # 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], - # 'llm-model-yolo11n-seg':[create_data_deb,'llm-model-yolo11n-seg', '0.3', src_folder, revision], - # 'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.3', src_folder, revision], - # 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.3', src_folder, revision], - # 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.3', src_folder, revision], - # 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.3', src_folder, revision], - # 
'llm-model-qwen2.5-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2.5-0.5B-prefill-20e', data_version, src_folder, revision], - # 'llm-model-qwen2.5-0.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-p256-ax630c', '0.4', src_folder, revision], - # 'llm-model-qwen2.5-0.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-Int4-ax630c', '0.4', src_folder, revision], - # 'llm-model-qwen2.5-1.5B-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-ax630c', '0.3', src_folder, revision], - # 'llm-model-qwen2.5-1.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-p256-ax630c', '0.4', src_folder, revision], - # 'llm-model-qwen2.5-1.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-Int4-ax630c', '0.4', src_folder, revision], - # 'llm-model-qwen2.5-coder-0.5B-ax630c':[create_data_deb,'llm-model-qwen2.5-coder-0.5B-ax630c', data_version, src_folder, revision], - # 'llm-model-llama3.2-1B-prefill-ax630c':[create_data_deb,'llm-model-llama3.2-1B-prefill-ax630c', data_version, src_folder, revision], - # 'llm-model-llama3.2-1B-p256-ax630c':[create_data_deb,'llm-model-llama3.2-1B-p256-ax630c', '0.4', src_folder, revision], - # 'llm-model-openbuddy-llama3.2-1B-ax630c':[create_data_deb,'llm-model-openbuddy-llama3.2-1B-ax630c', data_version, src_folder, revision], - # 'llm-model-internvl2.5-1B-ax630c':[create_data_deb,'llm-model-internvl2.5-1B-ax630c', '0.4', src_folder, revision], - # 'llm-model-internvl2.5-1B-364-ax630c':[create_data_deb,'llm-model-internvl2.5-1B-364-ax630c', '0.4', src_folder, revision], - # 'llm-model-deepseek-r1-1.5B-ax630c':[create_data_deb,'llm-model-deepseek-r1-1.5B-ax630c', '0.3', src_folder, revision], - # 'llm-model-deepseek-r1-1.5B-p256-ax630c':[create_data_deb,'llm-model-deepseek-r1-1.5B-p256-ax630c', '0.4', src_folder, revision], + 'llm-model-audio-en-us':[create_data_deb,'llm-model-audio-en-us', data_version, src_folder, revision], + 'llm-model-audio-zh-cn':[create_data_deb,'llm-model-audio-zh-cn', data_version, src_folder, 
revision], + 'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17', data_version, src_folder, revision], + 'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23', data_version, src_folder, revision], + 'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01', '0.3', src_folder, revision], + 'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01', '0.3', src_folder, revision], + 'llm-model-single-speaker-english-fast':[create_data_deb,'llm-model-single-speaker-english-fast', data_version, src_folder, revision], + 'llm-model-single-speaker-fast':[create_data_deb,'llm-model-single-speaker-fast', data_version, src_folder, revision], + 'llm-model-melotts-zh-cn':[create_data_deb,'llm-model-melotts-zh-cn', '0.4', src_folder, revision], + 'llm-model-yolo11n':[create_data_deb,'llm-model-yolo11n', data_version, src_folder, revision], + 'llm-model-yolo11n-pose':[create_data_deb,'llm-model-yolo11n-pose', '0.3', src_folder, revision], + 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], + 'llm-model-yolo11n-seg':[create_data_deb,'llm-model-yolo11n-seg', '0.3', src_folder, revision], + 'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.3', src_folder, revision], + 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.3', src_folder, revision], + 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.3', src_folder, revision], + 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.3', src_folder, revision], + 
'llm-model-qwen2.5-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2.5-0.5B-prefill-20e', data_version, src_folder, revision], + 'llm-model-qwen2.5-0.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-p256-ax630c', '0.4', src_folder, revision], + 'llm-model-qwen2.5-0.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-Int4-ax630c', '0.4', src_folder, revision], + 'llm-model-qwen2.5-1.5B-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-ax630c', '0.3', src_folder, revision], + 'llm-model-qwen2.5-1.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-p256-ax630c', '0.4', src_folder, revision], + 'llm-model-qwen2.5-1.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-Int4-ax630c', '0.4', src_folder, revision], + 'llm-model-qwen2.5-coder-0.5B-ax630c':[create_data_deb,'llm-model-qwen2.5-coder-0.5B-ax630c', data_version, src_folder, revision], + 'llm-model-llama3.2-1B-prefill-ax630c':[create_data_deb,'llm-model-llama3.2-1B-prefill-ax630c', data_version, src_folder, revision], + 'llm-model-llama3.2-1B-p256-ax630c':[create_data_deb,'llm-model-llama3.2-1B-p256-ax630c', '0.4', src_folder, revision], + 'llm-model-openbuddy-llama3.2-1B-ax630c':[create_data_deb,'llm-model-openbuddy-llama3.2-1B-ax630c', data_version, src_folder, revision], + 'llm-model-internvl2.5-1B-ax630c':[create_data_deb,'llm-model-internvl2.5-1B-ax630c', '0.4', src_folder, revision], + 'llm-model-internvl2.5-1B-364-ax630c':[create_data_deb,'llm-model-internvl2.5-1B-364-ax630c', '0.4', src_folder, revision], + 'llm-model-deepseek-r1-1.5B-ax630c':[create_data_deb,'llm-model-deepseek-r1-1.5B-ax630c', '0.3', src_folder, revision], + 'llm-model-deepseek-r1-1.5B-p256-ax630c':[create_data_deb,'llm-model-deepseek-r1-1.5B-p256-ax630c', '0.4', src_folder, revision], 'llm-model-smolvlm-256M-ax630c':[create_data_deb,'llm-model-smolvlm-256M-ax630c', '0.4', src_folder, revision], + 'llm-model-smolvlm-500M-ax630c':[create_data_deb,'llm-model-smolvlm-500M-ax630c', '0.4', src_folder, revision], # 
'llm-model-qwen2-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2-0.5B-prefill-20e', data_version, src_folder, revision], # 'llm-model-qwen2-1.5B-prefill-20e':[create_data_deb,'llm-model-qwen2-1.5B-prefill-20e', data_version, src_folder, revision] } From 8ae949d403772ba05dd661a5eac2da0fcb114640 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 8 Apr 2025 10:32:23 +0800 Subject: [PATCH 09/64] [update] update main_vlm version --- projects/llm_framework/main_vlm/src/main.cpp | 1 + projects/llm_framework/tools/llm_pack.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/llm_framework/main_vlm/src/main.cpp b/projects/llm_framework/main_vlm/src/main.cpp index 50cbabe..19867ad 100644 --- a/projects/llm_framework/main_vlm/src/main.cpp +++ b/projects/llm_framework/main_vlm/src/main.cpp @@ -154,6 +154,7 @@ class llm_task { if (!tokenizer_server_flage_.load()) { tokenizer_pid_ = fork(); if (tokenizer_pid_ == 0) { + setenv("PYTHONPATH", "/opt/m5stack/lib/vlm/site-packages", 1); execl("/usr/bin/python3", "python3", tokenizer_file.c_str(), "--host", "localhost", "--port", std::to_string(port_).c_str(), "--model_id", (base_model + "tokenizer").c_str(), "--content", ("'" + prompt_ + "'").c_str(), nullptr); diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index d0f1396..05dea8d 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -385,7 +385,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): 'llm-tts':[create_bin_deb,'llm-tts', version, src_folder, revision], 'llm-melotts':[create_bin_deb,'llm-melotts', version, src_folder, revision], 'llm-camera':[create_bin_deb,'llm-camera', '1.6', src_folder, revision], - 'llm-vlm':[create_bin_deb,'llm-vlm', version, src_folder, revision], + 'llm-vlm':[create_bin_deb,'llm-vlm', '1.6', src_folder, revision], 'llm-yolo':[create_bin_deb,'llm-yolo', '1.6', src_folder, revision], 
'llm-skel':[create_bin_deb,'llm-skel', version, src_folder, revision], 'llm-depth-anything':[create_bin_deb,'llm-depth-anything', version, src_folder, revision], From c39e6913577a94da1591f352c3d49e56587baf4d Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 8 Apr 2025 16:11:19 +0800 Subject: [PATCH 10/64] [fix] Fix tokenizer_smolvlm prompt bug. --- .../main_vlm/scripts/tokenizer_smolvlm-256M-ax630c.py | 4 ++-- .../main_vlm/scripts/tokenizer_smolvlm-500M-ax630c.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-256M-ax630c.py b/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-256M-ax630c.py index 5f99032..560a71f 100644 --- a/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-256M-ax630c.py +++ b/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-256M-ax630c.py @@ -81,7 +81,7 @@ def encode(self, prompt, content): return input_ids["input_ids"] def encode_vpm(self, prompt, content="Please describe the image shortly."): - prompt = f"<|im_start|>User:{content}\nAssistant:" + prompt = f"<|im_start|>User:{prompt}\nAssistant:" text = [prompt] image_rows = [[0]] image_cols = [[0]] @@ -233,7 +233,7 @@ def do_POST(self): args.add_argument("--host", type=str, default="localhost") args.add_argument("--port", type=int, default=8080) args.add_argument('--model_id', type=str, default='internvl2_tokenizer') - args.add_argument('--content', type=str, default='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。') + args.add_argument('--content', type=str, default='') args = args.parse_args() tokenizer = Tokenizer_Http(args.model_id) diff --git a/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-500M-ax630c.py b/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-500M-ax630c.py index 5f99032..560a71f 100644 --- a/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-500M-ax630c.py +++ 
b/projects/llm_framework/main_vlm/scripts/tokenizer_smolvlm-500M-ax630c.py @@ -81,7 +81,7 @@ def encode(self, prompt, content): return input_ids["input_ids"] def encode_vpm(self, prompt, content="Please describe the image shortly."): - prompt = f"<|im_start|>User:{content}\nAssistant:" + prompt = f"<|im_start|>User:{prompt}\nAssistant:" text = [prompt] image_rows = [[0]] image_cols = [[0]] @@ -233,7 +233,7 @@ def do_POST(self): args.add_argument("--host", type=str, default="localhost") args.add_argument("--port", type=int, default=8080) args.add_argument('--model_id', type=str, default='internvl2_tokenizer') - args.add_argument('--content', type=str, default='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。') + args.add_argument('--content', type=str, default='') args = args.parse_args() tokenizer = Tokenizer_Http(args.model_id) From a78367ea6e7412530e02071317c6b42957efacab Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Wed, 9 Apr 2025 17:59:12 +0800 Subject: [PATCH 11/64] [update] main_camera add AXERA VIN && add rtsp && add webstream --- projects/llm_framework/main_camera/SConstruct | 23 +- .../llm_framework/main_camera/camera.json | 112 ++- .../main_camera/src/axera_camera.c | 687 ++++++++++++++---- .../main_camera/src/axera_camera.h | 5 +- .../llm_framework/main_camera/src/main.cpp | 497 ++++++++++++- 5 files changed, 1135 insertions(+), 189 deletions(-) diff --git a/projects/llm_framework/main_camera/SConstruct b/projects/llm_framework/main_camera/SConstruct index 56707db..c506734 100644 --- a/projects/llm_framework/main_camera/SConstruct +++ b/projects/llm_framework/main_camera/SConstruct @@ -6,9 +6,9 @@ with open(env['PROJECT_TOOL_S']) as f: # SRCS = append_srcs_dir(ADir('src')) SRCS = Glob('src/*.c*') -INCLUDE = [ADir('include'), ADir('.')] +INCLUDE = [ADir('../include'), ADir('.')] PRIVATE_INCLUDE = [] -REQUIREMENTS = ['pthread', 'utilities', 'ax_msp', 'eventpp', 'StackFlow', 'single_header_libs'] +REQUIREMENTS = ['hv', 
'pthread', 'utilities', 'ax_msp', 'eventpp', 'StackFlow', 'single_header_libs'] STATIC_LIB = [] DYNAMIC_LIB = [] DEFINITIONS = [] @@ -17,11 +17,26 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] -DEFINITIONS += ['-std=c++17', '-O2'] +DEFINITIONS += ['-O2'] +DEFINITIONS += ['-std=c++17'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] # INCLUDE += [ADir('../include'), ADir('../include/cppzmq'), ADir('../include/libzmq')] LINK_SEARCH_PATH += [ADir('../static_lib')] +# REQUIREMENTS += ['Backward_cpp'] +# DYNAMIC_LIB += [ AFile('../static_lib/libdw.so.1'), +# AFile('../static_lib/libelf.so.1'), +# AFile('../static_lib/libz.so.1'), +# AFile('../static_lib/liblzma.so.5'), +# AFile('../static_lib/libbz2.so.1.0')] +# DEFINITIONS += ["-DENABLE_BACKWARD"] + +INCLUDE.append(os.path.join(env["MSP_PATH"], 'third-party/live/out/arm64/glibc/include/liveMedia')) +INCLUDE.append(os.path.join(env["MSP_PATH"], 'third-party/live/out/arm64/glibc/include/groupsock')) +INCLUDE.append(os.path.join(env["MSP_PATH"], 'third-party/live/out/arm64/glibc/include/UsageEnvironment')) +INCLUDE.append(os.path.join(env["MSP_PATH"], 'third-party/live/out/arm64/glibc/include/BasicUsageEnvironment')) +INCLUDE.append(os.path.join(env["MSP_PATH"], 'sample/rtsp')) +SRCS += Glob(os.path.join(env["MSP_PATH"], 'sample/rtsp/*.c*') ) # INCLUDE.append(ADir('ax_sample')) # SRCS += Glob('ax_sample/*.c*') @@ -30,7 +45,7 @@ REQUIREMENTS += ['ax_sys', 'ax_interpreter'] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys', 'ax_vo'] REQUIREMENTS += ['ax_ae', 'ax_skel', 'ax_venc', 'ax_mipi', 'ax_ives', 'ax_ivps', 'ax_proton', 'ax_audio', 'tinyalsa'] REQUIREMENTS += ['ax_audio_3a','samplerate', 'ax_fdk', 'fdk-aac', 'ax_awb', 'ax_af', 'ax_nt_stream', 'ax_nt_ctrl'] - +REQUIREMENTS += ['liveMedia', 'UsageEnvironment', 'groupsock', 'BasicUsageEnvironment'] INCLUDE += 
[ADir('../include/opencv4')] static_file = [] diff --git a/projects/llm_framework/main_camera/camera.json b/projects/llm_framework/main_camera/camera.json index 608b163..22fbd88 100644 --- a/projects/llm_framework/main_camera/camera.json +++ b/projects/llm_framework/main_camera/camera.json @@ -2,17 +2,119 @@ "mode": "None", "type": "camera", "capabilities": [ - "play", - "cap" + "camera cap" ], "input_type": [ - "rpc.camera.wav.base64", - "rpc.camera.pcm.base64" + "camera.v4l2_dev", + "camera.axera_dev" ], "output_type": [ - "camera.pcm.stream" + "image.yuyv422.base64", + "image.jpeg.base64" ], "cap_param": { "None": "None" + }, + "jpeg_config_param": { + "stVencAttr.enType": 26, + "stVencAttr.u32MaxPicWidth": 32768, + "stVencAttr.u32MaxPicHeight": 32768, + "stVencAttr.u32PicWidthSrc": 32768, + "stVencAttr.u32PicHeightSrc": 32768, + "stVencAttr.enLinkMode": 1 + }, + "h264_config_param": { + "stVencAttr.enType": 96, + "stVencAttr.u32MaxPicWidth": 3840, + "stVencAttr.u32MaxPicHeight": 2160, + "stVencAttr.enMemSource": 0, + "stVencAttr.u32BufSize": 16588800, + "stVencAttr.enProfile": 10, + "stVencAttr.enLevel": 52, + "stVencAttr.enTier": 0, + "stVencAttr.u32PicWidthSrc": 1280, + "stVencAttr.u32PicHeightSrc": 720, + "stVencAttr.stCropCfg.bEnable": 0, + "stVencAttr.stCropCfg.stRect.s32X": 0, + "stVencAttr.stCropCfg.stRect.s32Y": 0, + "stVencAttr.stCropCfg.stRect.u32Width": 0, + "stVencAttr.stCropCfg.stRect.u32Height": 0, + "stVencAttr.enRotation": 0, + "stVencAttr.enLinkMode": 1, + "stVencAttr.bDeBreathEffect": 0, + "stVencAttr.bRefRingbuf": 0, + "stVencAttr.s32StopWaitTime": 0, + "stVencAttr.u8InFifoDepth": 4, + "stVencAttr.u8OutFifoDepth": 4, + "stVencAttr.u32SliceNum": 0, + "stVencAttr.stAttrH265e.bRcnRefShareBuf": 0, + "stRcAttr.enRcMode": 1, + "stRcAttr.s32FirstFrameStartQp": -1, + "stRcAttr.stFrameRate.fSrcFrameRate": 30.0, + "stRcAttr.stFrameRate.fDstFrameRate": 30.0, + "stRcAttr.stH264Cbr.u32Gop": 120, + "stRcAttr.stH264Cbr.u32StatTime": 0, + 
"stRcAttr.stH264Cbr.u32BitRate": 2048, + "stRcAttr.stH264Cbr.u32MinQp": 10, + "stRcAttr.stH264Cbr.u32MaxQp": 51, + "stRcAttr.stH264Cbr.u32MinIQp": 10, + "stRcAttr.stH264Cbr.u32MaxIQp": 51, + "stRcAttr.stH264Cbr.u32MaxIprop": 40, + "stRcAttr.stH264Cbr.u32MinIprop": 10, + "stRcAttr.stH264Cbr.s32IntraQpDelta": -2, + "stRcAttr.stH264Cbr.s32DeBreathQpDelta": 0, + "stRcAttr.stH264Cbr.u32IdrQpDeltaRange": 0, + "stRcAttr.stH264Cbr.stQpmapInfo.enCtbRcMode": 0, + "stRcAttr.stH264Cbr.stQpmapInfo.enQpmapQpType": 0, + "stRcAttr.stH264Cbr.stQpmapInfo.enQpmapBlockType": 0, + "stRcAttr.stH264Cbr.stQpmapInfo.enQpmapBlockUnit": 0, + "stGopAttr.enGopMode": 0 + }, + "h265_config_param": { + "stVencAttr.enType": 265, + "stVencAttr.u32MaxPicWidth": 3840, + "stVencAttr.u32MaxPicHeight": 2160, + "stVencAttr.enMemSource": 0, + "stVencAttr.u32BufSize": 16588800, + "stVencAttr.enProfile": 0, + "stVencAttr.enLevel": 153, + "stVencAttr.enTier": 0, + "stVencAttr.u32PicWidthSrc": 1280, + "stVencAttr.u32PicHeightSrc": 720, + "stVencAttr.stCropCfg.bEnable": 0, + "stVencAttr.stCropCfg.stRect.s32X": 0, + "stVencAttr.stCropCfg.stRect.s32Y": 0, + "stVencAttr.stCropCfg.stRect.u32Width": 0, + "stVencAttr.stCropCfg.stRect.u32Height": 0, + "stVencAttr.enRotation": 0, + "stVencAttr.enLinkMode": 1, + "stVencAttr.bDeBreathEffect": 0, + "stVencAttr.bRefRingbuf": 0, + "stVencAttr.s32StopWaitTime": 0, + "stVencAttr.u8InFifoDepth": 4, + "stVencAttr.u8OutFifoDepth": 4, + "stVencAttr.u32SliceNum": 0, + "stVencAttr.stAttrH265e.bRcnRefShareBuf": 0, + "stRcAttr.enRcMode": 11, + "stRcAttr.s32FirstFrameStartQp": -1, + "stRcAttr.stFrameRate.fSrcFrameRate": 30.0, + "stRcAttr.stFrameRate.fDstFrameRate": 30.0, + "stRcAttr.stH265Cbr.u32Gop": 120, + "stRcAttr.stH265Cbr.u32StatTime": 0, + "stRcAttr.stH265Cbr.u32BitRate": 2048, + "stRcAttr.stH265Cbr.u32MinQp": 10, + "stRcAttr.stH265Cbr.u32MaxQp": 51, + "stRcAttr.stH265Cbr.u32MinIQp": 10, + "stRcAttr.stH265Cbr.u32MaxIQp": 51, + "stRcAttr.stH265Cbr.u32MaxIprop": 40, + 
"stRcAttr.stH265Cbr.u32MinIprop": 30, + "stRcAttr.stH265Cbr.s32IntraQpDelta": -2, + "stRcAttr.stH265Cbr.s32DeBreathQpDelta": 0, + "stRcAttr.stH265Cbr.u32IdrQpDeltaRange": 0, + "stRcAttr.stH265Cbr.stQpmapInfo.enCtbRcMode": 0, + "stRcAttr.stH265Cbr.stQpmapInfo.enQpmapQpType": 0, + "stRcAttr.stH265Cbr.stQpmapInfo.enQpmapBlockType": 0, + "stRcAttr.stH265Cbr.stQpmapInfo.enQpmapBlockUnit": 0, + "stGopAttr.enGopMode": 0 } } \ No newline at end of file diff --git a/projects/llm_framework/main_camera/src/axera_camera.c b/projects/llm_framework/main_camera/src/axera_camera.c index d929a21..841a6b5 100644 --- a/projects/llm_framework/main_camera/src/axera_camera.c +++ b/projects/llm_framework/main_camera/src/axera_camera.c @@ -16,7 +16,10 @@ #include #include #include - +#include +#include +#include "AXRtspWrapper.h" +#include "ax_venc_api.h" #include "ax_global_type.h" #include "common_isp.h" #include "common_sys.h" @@ -28,13 +31,16 @@ #include "ax_ivps_api.h" #ifndef ALIGN_UP -#define ALIGN_UP(x, a) ((((x) + ((a) - 1)) / a) * a) +#define ALIGN_UP(x, a) ((((x) + ((a) - 1)) / a) * a) #endif +#define USER_OUTPUT_CHN 2 +#define RTSP_OUTPUT_CHN 1 + AX_MIPI_RX_ATTR_T gSc850slMipiAttr = { - .ePhyMode = AX_MIPI_PHY_TYPE_DPHY, - .eLaneNum = AX_MIPI_DATA_LANE_4, - .nDataRate = 80, + .ePhyMode = AX_MIPI_PHY_TYPE_DPHY, + .eLaneNum = AX_MIPI_DATA_LANE_4, + .nDataRate = 80, .nDataLaneMap[0] = 0, .nDataLaneMap[1] = 1, .nDataLaneMap[2] = 3, @@ -44,30 +50,30 @@ AX_MIPI_RX_ATTR_T gSc850slMipiAttr = { }; AX_SNS_ATTR_T gSc850slSnsAttr = { - .nWidth = 3840, - .nHeight = 2160, - .fFrameRate = 30, - .eSnsMode = AX_SNS_LINEAR_MODE, - .eRawType = AX_RT_RAW10, - .eBayerPattern = AX_BP_RGGB, + .nWidth = 3840, + .nHeight = 2160, + .fFrameRate = 30, + .eSnsMode = AX_SNS_LINEAR_MODE, + .eRawType = AX_RT_RAW10, + .eBayerPattern = AX_BP_RGGB, .bTestPatternEnable = AX_FALSE, // .nSettingIndex = 12, }; AX_SNS_CLK_ATTR_T gSc850slSnsClkAttr = { - .nSnsClkIdx = 0, + .nSnsClkIdx = 0, .eSnsClkRate = 
AX_SNS_CLK_24M, }; AX_VIN_DEV_ATTR_T gSc850slDevAttr = { - .bImgDataEnable = AX_TRUE, + .bImgDataEnable = AX_TRUE, .bNonImgDataEnable = AX_FALSE, - .eDevMode = AX_VIN_DEV_ONLINE, - .eSnsIntfType = AX_SNS_INTF_TYPE_MIPI_RAW, - .tDevImgRgn[0] = {0, 0, 3840, 2160}, - .tDevImgRgn[1] = {0, 0, 3840, 2160}, - .tDevImgRgn[2] = {0, 0, 3840, 2160}, - .tDevImgRgn[3] = {0, 0, 3840, 2160}, + .eDevMode = AX_VIN_DEV_ONLINE, + .eSnsIntfType = AX_SNS_INTF_TYPE_MIPI_RAW, + .tDevImgRgn[0] = {0, 0, 3840, 2160}, + .tDevImgRgn[1] = {0, 0, 3840, 2160}, + .tDevImgRgn[2] = {0, 0, 3840, 2160}, + .tDevImgRgn[3] = {0, 0, 3840, 2160}, /* When users transfer special data, they need to configure VC&DT for szImgVc/szImgDt/szInfoVc/szInfoDt */ //.tMipiIntfAttr.szImgVc[0] = 0, @@ -79,33 +85,33 @@ AX_VIN_DEV_ATTR_T gSc850slDevAttr = { //.tMipiIntfAttr.szInfoDt[0] = 63, //.tMipiIntfAttr.szInfoDt[1] = 63, - .ePixelFmt = AX_FORMAT_BAYER_RAW_10BPP_PACKED, - .eBayerPattern = AX_BP_RGGB, - .eSnsMode = AX_SNS_LINEAR_MODE, + .ePixelFmt = AX_FORMAT_BAYER_RAW_10BPP_PACKED, + .eBayerPattern = AX_BP_RGGB, + .eSnsMode = AX_SNS_LINEAR_MODE, .eSnsOutputMode = AX_SNS_NORMAL, - .tCompressInfo = {AX_COMPRESS_MODE_NONE, 0}, - .tFrameRateCtrl= {AX_INVALID_FRMRATE, AX_INVALID_FRMRATE}, + .tCompressInfo = {AX_COMPRESS_MODE_NONE, 0}, + .tFrameRateCtrl = {AX_INVALID_FRMRATE, AX_INVALID_FRMRATE}, }; AX_VIN_PIPE_ATTR_T gSc850slPipeAttr = { - .ePipeWorkMode = AX_VIN_PIPE_NORMAL_MODE1, - .tPipeImgRgn = {0, 0, 3840, 2160}, - .nWidthStride = 3840, - .eBayerPattern = AX_BP_RGGB, - .ePixelFmt = AX_FORMAT_BAYER_RAW_10BPP_PACKED, - .eSnsMode = AX_SNS_LINEAR_MODE, - .tCompressInfo = {AX_COMPRESS_MODE_LOSSY, 0}, - .tNrAttr = {{0, {AX_COMPRESS_MODE_LOSSLESS, 0}}, {0, {AX_COMPRESS_MODE_NONE, 0}}}, + .ePipeWorkMode = AX_VIN_PIPE_NORMAL_MODE1, + .tPipeImgRgn = {0, 0, 3840, 2160}, + .nWidthStride = 3840, + .eBayerPattern = AX_BP_RGGB, + .ePixelFmt = AX_FORMAT_BAYER_RAW_10BPP_PACKED, + .eSnsMode = AX_SNS_LINEAR_MODE, + .tCompressInfo = 
{AX_COMPRESS_MODE_LOSSY, 0}, + .tNrAttr = {{0, {AX_COMPRESS_MODE_LOSSLESS, 0}}, {0, {AX_COMPRESS_MODE_NONE, 0}}}, .tFrameRateCtrl = {AX_INVALID_FRMRATE, AX_INVALID_FRMRATE}, }; AX_VIN_CHN_ATTR_T gSc850slChn0Attr = { - .nWidth = 3840, - .nHeight = 2160, - .nWidthStride = 3840, - .eImgFormat = AX_FORMAT_YUV420_SEMIPLANAR, - .nDepth = 1, - .tCompressInfo = {AX_COMPRESS_MODE_LOSSY, 4}, + .nWidth = 3840, + .nHeight = 2160, + .nWidthStride = 3840, + .eImgFormat = AX_FORMAT_YUV420_SEMIPLANAR, + .nDepth = 1, + .tCompressInfo = {AX_COMPRESS_MODE_LOSSY, 4}, .tFrameRateCtrl = {AX_INVALID_FRMRATE, AX_INVALID_FRMRATE}, }; @@ -131,28 +137,25 @@ typedef enum { SAMPLE_VIN_BUTT } SAMPLE_VIN_CASE_E; -struct axera_camera_index_t -{ +struct axera_camera_index_t { char name[48]; SAMPLE_VIN_CASE_E index; -}axera_camera_index[] = { - {"axera_single_dummy", SAMPLE_VIN_SINGLE_DUMMY}, - {"axera_single_os04a10", SAMPLE_VIN_SINGLE_OS04A10}, - {"axera_double_os04a10", SAMPLE_VIN_DOUBLE_OS04A10}, - {"axera_single_sc450ai", SAMPLE_VIN_SINGLE_SC450AI}, - {"axera_double_sc450ai", SAMPLE_VIN_DOUBLE_SC450AI}, - {"axera_double_os04a10_and_bt656", SAMPLE_VIN_DOUBLE_OS04A10_AND_BT656}, - {"axera_single_s5kjn1sq03", SAMPLE_VIN_SINGLE_S5KJN1SQ03}, - {"axera_single_os04a10_dcg_hdr", SAMPLE_VIN_SINGLE_OS04A10_DCG_HDR}, - {"axera_single_os04a10_dcg_vs_hdr", SAMPLE_VIN_SINGLE_OS04A10_DCG_VS_HDR}, - {"axera_single_dvp", SYS_CASE_SINGLE_DVP}, - {"axera_single_bt601", SYS_CASE_SINGLE_BT601}, - {"axera_single_bt656", SYS_CASE_SINGLE_BT656}, - {"axera_single_bt1120", SYS_CASE_SINGLE_BT1120}, - {"axera_single_lvds", SYS_CASE_SINGLE_LVDS}, - {"axera_single_os04a10_online", SYS_CASE_SINGLE_OS04A10_ONLINE}, - {"axera_single_sc850sl", SAMPLE_VIN_SINGLE_SC850SL} -}; +} axera_camera_index[] = {{"axera_single_dummy", SAMPLE_VIN_SINGLE_DUMMY}, + {"axera_single_os04a10", SAMPLE_VIN_SINGLE_OS04A10}, + {"axera_double_os04a10", SAMPLE_VIN_DOUBLE_OS04A10}, + {"axera_single_sc450ai", SAMPLE_VIN_SINGLE_SC450AI}, + 
{"axera_double_sc450ai", SAMPLE_VIN_DOUBLE_SC450AI}, + {"axera_double_os04a10_and_bt656", SAMPLE_VIN_DOUBLE_OS04A10_AND_BT656}, + {"axera_single_s5kjn1sq03", SAMPLE_VIN_SINGLE_S5KJN1SQ03}, + {"axera_single_os04a10_dcg_hdr", SAMPLE_VIN_SINGLE_OS04A10_DCG_HDR}, + {"axera_single_os04a10_dcg_vs_hdr", SAMPLE_VIN_SINGLE_OS04A10_DCG_VS_HDR}, + {"axera_single_dvp", SYS_CASE_SINGLE_DVP}, + {"axera_single_bt601", SYS_CASE_SINGLE_BT601}, + {"axera_single_bt656", SYS_CASE_SINGLE_BT656}, + {"axera_single_bt1120", SYS_CASE_SINGLE_BT1120}, + {"axera_single_lvds", SYS_CASE_SINGLE_LVDS}, + {"axera_single_os04a10_online", SYS_CASE_SINGLE_OS04A10_ONLINE}, + {"axera_single_sc850sl", SAMPLE_VIN_SINGLE_SC850SL}}; typedef struct { SAMPLE_VIN_CASE_E eSysCase; @@ -260,7 +263,7 @@ COMMON_SYS_POOL_CFG_T gtPrivatePoolSingleOs450aiSdr[] = { /*************************************/ COMMON_SYS_POOL_CFG_T gtSysCommPoolSingleSc850SlSdr[] = { - {3840, 2160, 3840, AX_FORMAT_YUV420_SEMIPLANAR, 4, AX_COMPRESS_MODE_LOSSY, 4}, /* vin nv21/nv21 use */ + {3840, 2160, 3840, AX_FORMAT_YUV420_SEMIPLANAR, 8, AX_COMPRESS_MODE_LOSSY, 8}, /* vin nv21/nv21 use */ {2688, 1520, 2688, AX_FORMAT_YUV420_SEMIPLANAR, 4}, /* vin nv21/nv21 use */ {1920, 1080, 1920, AX_FORMAT_YUV420_SEMIPLANAR, 3}, /* vin nv21/nv21 use */ {720, 576, 720, AX_FORMAT_YUV420_SEMIPLANAR, 3}, /* vin nv21/nv21 use */ @@ -385,24 +388,54 @@ static AX_VOID __set_vin_attr(AX_CAMERA_T *pCam, SAMPLE_SNS_TYPE_E eSnsType, AX_ return; } - -AX_S32 CUSTOM_COMMON_VIN_GetSnsConfig(SAMPLE_SNS_TYPE_E eSnsType, - AX_MIPI_RX_ATTR_T *ptMipiAttr, AX_SNS_ATTR_T *ptSnsAttr, - AX_SNS_CLK_ATTR_T *ptSnsClkAttr, AX_VIN_DEV_ATTR_T *pDevAttr, - AX_VIN_PIPE_ATTR_T *pPipeAttr, AX_VIN_CHN_ATTR_T *pChnAttr) { - if(eSnsType == SMARTSENS_SC850SL) - { +AX_S32 CUSTOM_COMMON_VIN_GetSnsConfig(SAMPLE_SNS_TYPE_E eSnsType, AX_MIPI_RX_ATTR_T *ptMipiAttr, + AX_SNS_ATTR_T *ptSnsAttr, AX_SNS_CLK_ATTR_T *ptSnsClkAttr, + AX_VIN_DEV_ATTR_T *pDevAttr, AX_VIN_PIPE_ATTR_T *pPipeAttr, + 
AX_VIN_CHN_ATTR_T *pChnAttr) +{ + if (eSnsType == SMARTSENS_SC850SL) { memcpy(ptMipiAttr, &gSc850slMipiAttr, sizeof(AX_MIPI_RX_ATTR_T)); memcpy(ptSnsAttr, &gSc850slSnsAttr, sizeof(AX_SNS_ATTR_T)); memcpy(ptSnsClkAttr, &gSc850slSnsClkAttr, sizeof(AX_SNS_CLK_ATTR_T)); memcpy(pDevAttr, &gSc850slDevAttr, sizeof(AX_VIN_DEV_ATTR_T)); memcpy(pPipeAttr, &gSc850slPipeAttr, sizeof(AX_VIN_PIPE_ATTR_T)); memcpy(&pChnAttr[0], &gSc850slChn0Attr, sizeof(AX_VIN_CHN_ATTR_T)); + return 0; } - return COMMON_VIN_GetSnsConfig(eSnsType, ptMipiAttr, ptSnsAttr, - ptSnsClkAttr, pDevAttr, pPipeAttr, pChnAttr); + return COMMON_VIN_GetSnsConfig(eSnsType, ptMipiAttr, ptSnsAttr, ptSnsClkAttr, pDevAttr, pPipeAttr, pChnAttr); } +AX_SENSOR_REGISTER_FUNC_T *CUSTOM_COMMON_ISP_GetSnsObj(SAMPLE_SNS_TYPE_E eSnsType) +{ + if (eSnsType == SMARTSENS_SC850SL) { + AX_SENSOR_REGISTER_FUNC_T *ptSnsHdl = NULL; + void *handler = NULL; + AX_CHAR *err = NULL; + AX_U16 i = 0; + AX_CHAR *pSnsPath = "/opt/lib/libsns_sc850sl.so"; + AX_CHAR *pObjName = "gSnssc850slObj"; + + if ((NULL != pSnsPath) && (NULL != pObjName)) { + handler = dlopen((void *)pSnsPath, RTLD_LAZY); + if (NULL == handler) { + COMM_ISP_PRT("open %s fail!---%s\n", pSnsPath, dlerror()); + return NULL; + } + ptSnsHdl = (AX_SENSOR_REGISTER_FUNC_T *)dlsym(handler, pObjName); + err = dlerror(); + if (NULL != err) { + ptSnsHdl = NULL; + COMM_ISP_PRT("dlsym %s fail!\n", pObjName); + } + } else { + ptSnsHdl = NULL; + COMM_ISP_PRT("not find eSnsType = %d\n", eSnsType); + } + + return ptSnsHdl; + } + return COMMON_ISP_GetSnsObj(eSnsType); +} static AX_U32 __sample_case_single_dummy(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYPE_E eSnsType, SAMPLE_VIN_PARAM_T *pVinParam, COMMON_SYS_ARGS_T *pCommonArgs) @@ -419,14 +452,14 @@ static AX_U32 __sample_case_single_dummy(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYPE_ pCam = &pCamList[i]; pCam->nPipeId = 0; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - 
&pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = 0; pCam->nRxDev = 0; pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); pCam->eLoadRawNode = eLoadRawNode; pCam->eInputMode = AX_INPUT_MODE_MIPI; @@ -452,13 +485,13 @@ static AX_U32 __sample_case_single_dvp(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYPE_E pCam = &pCamList[0]; pCam->nPipeId = 0; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = 0; pCam->nRxDev = 0; pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); pCam->eInputMode = AX_INPUT_MODE_DVP; __set_pipe_hdr_mode(&pCam->tDevBindPipe.nHDRSel[0], eHdrMode); @@ -482,13 +515,13 @@ static AX_U32 __sample_case_single_bt656(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYPE_ pCam = &pCamList[0]; pCam->nPipeId = 2; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = 2; pCam->nRxDev = 2; pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = 
COMMON_ISP_GetSnsBusType(eSnsType); pCam->eInputMode = AX_INPUT_MODE_BT656; pCam->tPipeAttr[pCam->nPipeId].ePipeWorkMode = AX_VIN_PIPE_ISP_BYPASS_MODE; @@ -510,13 +543,13 @@ static AX_U32 __sample_case_single_bt1120(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYPE pCam = &pCamList[0]; pCam->nPipeId = 0; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = 2; pCam->nRxDev = 2; pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); pCam->eInputMode = AX_INPUT_MODE_BT1120; pCam->tPipeAttr[pCam->nPipeId].ePipeWorkMode = AX_VIN_PIPE_ISP_BYPASS_MODE; @@ -538,13 +571,13 @@ static AX_U32 __sample_case_single_lvds(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYPE_E pCam = &pCamList[0]; pCam->nPipeId = 0; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = 0; pCam->nRxDev = 0; pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); pCam->eInputMode = AX_INPUT_MODE_LVDS; __set_pipe_hdr_mode(&pCam->tDevBindPipe.nHDRSel[0], eHdrMode); @@ -570,14 +603,14 @@ static AX_U32 __sample_case_single_os04a10(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYP pCam = &pCamList[0]; pCam->nPipeId = 0; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, 
&pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = 0; pCam->nRxDev = 0; pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; pCam->eLoadRawNode = eLoadRawNode; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); pCam->eInputMode = AX_INPUT_MODE_MIPI; __set_pipe_hdr_mode(&pCam->tDevBindPipe.nHDRSel[0], eHdrMode); @@ -602,14 +635,14 @@ static AX_U32 __sample_case_single_sc850sl(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYP pCam = &pCamList[0]; pCam->nPipeId = 0; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = 0; pCam->nRxDev = 0; pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; pCam->eLoadRawNode = eLoadRawNode; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); pCam->eInputMode = AX_INPUT_MODE_MIPI; __set_pipe_hdr_mode(&pCam->tDevBindPipe.nHDRSel[0], eHdrMode); @@ -619,8 +652,7 @@ static AX_U32 __sample_case_single_sc850sl(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYP pCam->tPipeInfo[j].bAiispEnable = pVinParam->bAiispEnable; if (pCam->tPipeInfo[j].bAiispEnable) { if (eHdrMode <= AX_SNS_LINEAR_MODE) { - strncpy(pCam->tPipeInfo[j].szBinPath, "/opt/etc/sc850sl_sdr.bin", - sizeof(pCam->tPipeInfo[j].szBinPath)); + strncpy(pCam->tPipeInfo[j].szBinPath, "/opt/etc/sc850sl_sdr.bin", sizeof(pCam->tPipeInfo[j].szBinPath)); } else { strncpy(pCam->tPipeInfo[j].szBinPath, "/opt/etc/sc850sl_hdr_2x.bin", 
sizeof(pCam->tPipeInfo[j].szBinPath)); @@ -646,7 +678,7 @@ static AX_U32 __sample_case_double_os04a10(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYP pCam->nNumber = i; pCam->nPipeId = i; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = i; if (i == 0) { @@ -659,7 +691,7 @@ static AX_U32 __sample_case_double_os04a10(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYP pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); if (eHdrMode == AX_SNS_LINEAR_MODE) pCam->tSnsAttr.nSettingIndex = 33; @@ -694,14 +726,14 @@ static AX_U32 __sample_case_single_sc450ai(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYP pCam = &pCamList[0]; pCam->nPipeId = 0; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = 0; pCam->nRxDev = 0; pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->eLoadRawNode = eLoadRawNode; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); pCam->eInputMode = AX_INPUT_MODE_MIPI; __set_pipe_hdr_mode(&pCam->tDevBindPipe.nHDRSel[0], eHdrMode); @@ -738,7 +770,7 @@ static AX_U32 __sample_case_double_sc450ai(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYP pCam->nNumber = i; pCam->nPipeId = i; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - 
&pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = i; if (i == 0) { @@ -751,7 +783,7 @@ static AX_U32 __sample_case_double_sc450ai(AX_CAMERA_T *pCamList, SAMPLE_SNS_TYP pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); if (eHdrMode == AX_SNS_LINEAR_MODE) pCam->tSnsAttr.nSettingIndex = 33; @@ -804,7 +836,7 @@ static AX_U32 __sample_case_double_os04a10_and_bt656(AX_CAMERA_T *pCamList, SAMP pCam->nNumber = i; pCam->nPipeId = i; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = i; pCam->nRxDev = i; @@ -816,7 +848,7 @@ static AX_U32 __sample_case_double_os04a10_and_bt656(AX_CAMERA_T *pCamList, SAMP pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); if (eHdrMode == AX_SNS_LINEAR_MODE) pCam->tSnsAttr.nSettingIndex = 33; @@ -861,14 +893,14 @@ static AX_U32 __sample_case_single_s5kjn1sq03(AX_CAMERA_T *pCamList, SAMPLE_SNS_ pCam = &pCamList[0]; pCam->nPipeId = 0; CUSTOM_COMMON_VIN_GetSnsConfig(eSnsType, &pCam->tMipiAttr, &pCam->tSnsAttr, &pCam->tSnsClkAttr, &pCam->tDevAttr, - &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); + &pCam->tPipeAttr[pCam->nPipeId], pCam->tChnAttr); pCam->nDevId = 0; pCam->nRxDev = 0; pCam->tSnsClkAttr.nSnsClkIdx = 0; pCam->tDevBindPipe.nNum = 1; pCam->tDevBindPipe.nPipeId[0] = pCam->nPipeId; pCam->eLoadRawNode 
= eLoadRawNode; - pCam->ptSnsHdl[pCam->nPipeId] = COMMON_ISP_GetSnsObj(eSnsType); + pCam->ptSnsHdl[pCam->nPipeId] = CUSTOM_COMMON_ISP_GetSnsObj(eSnsType); pCam->eBusType = COMMON_ISP_GetSnsBusType(eSnsType); pCam->eInputMode = AX_INPUT_MODE_MIPI; __set_pipe_hdr_mode(&pCam->tDevBindPipe.nHDRSel[0], eHdrMode); @@ -881,12 +913,8 @@ static AX_U32 __sample_case_single_s5kjn1sq03(AX_CAMERA_T *pCamList, SAMPLE_SNS_ return 0; } - - - - -static AX_U32 __sample_case_config(AX_CAMERA_T *gCams, SAMPLE_VIN_PARAM_T *pVinParam, - COMMON_SYS_ARGS_T *pCommonArgs, COMMON_SYS_ARGS_T *pPrivArgs) +static AX_U32 __sample_case_config(AX_CAMERA_T *gCams, SAMPLE_VIN_PARAM_T *pVinParam, COMMON_SYS_ARGS_T *pCommonArgs, + COMMON_SYS_ARGS_T *pPrivArgs) { AX_CAMERA_T *pCamList = gCams; SAMPLE_SNS_TYPE_E eSnsType = OMNIVISION_OS04A10; @@ -1067,20 +1095,20 @@ static AX_U32 __sample_case_config(AX_CAMERA_T *gCams, SAMPLE_VIN_PARAM_T *pVinP /* cams config */ __sample_case_single_sc450ai(pCamList, eSnsType, pVinParam, pCommonArgs); break; - case SAMPLE_VIN_SINGLE_SC850SL: - eSnsType = SMARTSENS_SC850SL; - /* comm pool config */ - __cal_dump_pool(gtSysCommPoolSingleSc850SlSdr, pVinParam->eHdrMode, pVinParam->nDumpFrameNum); - pCommonArgs->nPoolCfgCnt = sizeof(gtSysCommPoolSingleSc850SlSdr) / sizeof(gtSysCommPoolSingleSc850SlSdr[0]); - pCommonArgs->pPoolCfg = gtSysCommPoolSingleSc850SlSdr; - - /* private pool config */ - __cal_dump_pool(gtPrivatePoolSingleSc850SlSdr, pVinParam->eHdrMode, pVinParam->nDumpFrameNum); - pPrivArgs->nPoolCfgCnt = sizeof(gtPrivatePoolSingleSc850SlSdr) / sizeof(gtPrivatePoolSingleSc850SlSdr[0]); - pPrivArgs->pPoolCfg = gtPrivatePoolSingleSc850SlSdr; - - /* cams config */ - __sample_case_single_sc850sl(pCamList, eSnsType, pVinParam, pCommonArgs); + case SAMPLE_VIN_SINGLE_SC850SL: + eSnsType = SMARTSENS_SC850SL; + /* comm pool config */ + __cal_dump_pool(gtSysCommPoolSingleSc850SlSdr, pVinParam->eHdrMode, pVinParam->nDumpFrameNum); + pCommonArgs->nPoolCfgCnt = 
sizeof(gtSysCommPoolSingleSc850SlSdr) / sizeof(gtSysCommPoolSingleSc850SlSdr[0]); + pCommonArgs->pPoolCfg = gtSysCommPoolSingleSc850SlSdr; + + /* private pool config */ + __cal_dump_pool(gtPrivatePoolSingleSc850SlSdr, pVinParam->eHdrMode, pVinParam->nDumpFrameNum); + pPrivArgs->nPoolCfgCnt = sizeof(gtPrivatePoolSingleSc850SlSdr) / sizeof(gtPrivatePoolSingleSc850SlSdr[0]); + pPrivArgs->pPoolCfg = gtPrivatePoolSingleSc850SlSdr; + + /* cams config */ + __sample_case_single_sc850sl(pCamList, eSnsType, pVinParam, pCommonArgs); break; case SAMPLE_VIN_DOUBLE_OS04A10_AND_BT656: /* comm pool config */ @@ -1165,9 +1193,15 @@ struct axera_camera_t { AX_IMG_INFO_T ax_img; AX_VIDEO_FRAME_T out_img; int Chn; -} axera_obj; - -static int camera_capture_callback_set(struct camera_t* camera, vcamera_frame_get pcallback) + AX_VENC_CHN_ATTR_T stVencChnAttr; + AX_IVPS_PIPELINE_ATTR_T stPipelineAttr; + AX_RTSP_HANDLE pRtspHandle; + AX_RTSP_ATTR_T stRtspAttr[MAX_RTSP_MAX_CHANNEL_NUM]; + pthread_t venc_thread_id_; + int venc_run_; +} axera_obj = {0}; + +static int camera_capture_callback_set(struct camera_t *camera, vcamera_frame_get pcallback) { if (camera->state_ == CAMERA_SATTE_CAP) { SLOGW("Set capture callback failed"); @@ -1177,10 +1211,10 @@ static int camera_capture_callback_set(struct camera_t* camera, vcamera_frame_ge return 0; } -static void* camera_capture_thread(void* param) +static void *camera_capture_thread(void *param) { int Ret = -1; - camera_t* camera = (camera_t*)param; + camera_t *camera = (camera_t *)param; struct v4l2_buffer EnQueueBuf; struct v4l2_buffer DeQueueBuf; @@ -1188,21 +1222,20 @@ static void* camera_capture_thread(void* param) while (camera->state_ & AX_SENSOR_GET_FRAME_THREAD) { AX_S32 axRet = AX_VIN_GetYuvFrame(axera_obj.gCams.nPipeId, axera_obj.Chn, &axera_obj.ax_img, 500); - if (axRet == 0) - { - // axera_obj.ax_img.tFrameInfo.stVFrame.u64VirAddr[0] = (AX_U64)AX_POOL_GetBlockVirAddr(axera_obj.ax_img.tFrameInfo.stVFrame.u32BlkId[0]); - // 
axera_obj.ax_img.tFrameInfo.stVFrame.u64PhyAddr[0] = AX_POOL_Handle2PhysAddr(axera_obj.ax_img.tFrameInfo.stVFrame.u32BlkId[0]); - // AX_S32 AX_IVPS_CropResizeTdp(const AX_VIDEO_FRAME_T *ptSrc, AX_VIDEO_FRAME_T *ptDst, + if (axRet == 0) { + // axera_obj.ax_img.tFrameInfo.stVFrame.u64VirAddr[0] = + // (AX_U64)AX_POOL_GetBlockVirAddr(axera_obj.ax_img.tFrameInfo.stVFrame.u32BlkId[0]); + // axera_obj.ax_img.tFrameInfo.stVFrame.u64PhyAddr[0] = + // AX_POOL_Handle2PhysAddr(axera_obj.ax_img.tFrameInfo.stVFrame.u32BlkId[0]); AX_S32 + // AX_IVPS_CropResizeTdp(const AX_VIDEO_FRAME_T *ptSrc, AX_VIDEO_FRAME_T *ptDst, // const AX_IVPS_CROP_RESIZE_ATTR_T *ptAttr); - AX_IVPS_CROP_RESIZE_ATTR_T tAttr = {0}; - AX_IVPS_CropResizeTdp(&axera_obj.ax_img.tFrameInfo.stVFrame, &axera_obj.out_img, &tAttr); - AX_VIN_ReleaseYuvFrame(axera_obj.gCams.nPipeId, axera_obj.Chn, &axera_obj.ax_img); - camera->pcallback_((void*)axera_obj.out_img.u64VirAddr[0], axera_obj.out_img.u32Width, axera_obj.out_img.u32Height, - axera_obj.out_img.u32FrameSize, camera->ctx_); - } - else - { - // ALOGD("get ax img error! code:0x%x", axRet); + // AX_IVPS_CROP_RESIZE_ATTR_T tAttr = {0}; + // AX_IVPS_CropResizeTdp(&axera_obj.ax_img.tFrameInfo.stVFrame, &axera_obj.out_img, &tAttr); + // AX_VIN_ReleaseYuvFrame(axera_obj.gCams.nPipeId, axera_obj.Chn, &axera_obj.ax_img); + // camera->pcallback_((void *)axera_obj.out_img.u64VirAddr[0], axera_obj.out_img.u32Width, + // axera_obj.out_img.u32Height, axera_obj.out_img.u32FrameSize, camera->ctx_); + } else { + SLOGI("get ax img error! 
code:0x%x", axRet); usleep(10 * 1000); } } @@ -1212,7 +1245,49 @@ static void* camera_capture_thread(void* param) return NULL; } -static int camera_capture_start(struct camera_t* camera) +static AX_VOID *IVPS_GetFrameThread(AX_VOID *pArg) +{ + AX_S32 ret = 0; + AX_VIDEO_FRAME_T tSrcFrame; + AX_BLK BlkId; + struct camera_t *camera = (struct camera_t *)pArg; + SLOGI("IVPS Grp: %d, Chn: %d", 0, USER_OUTPUT_CHN - 1); + uint64_t totalGetStream = 0; + while (camera->state_ & AX_SENSOR_GET_FRAME_THREAD) { + // sleep(1); + ret = AX_IVPS_GetChnFrame(0, USER_OUTPUT_CHN - 1, &tSrcFrame, 1000); + if (ret) { + /* reach EOF */ + SLOGI("[WARN] CHN[%d] is empty ret:0x%x\n", 2, ret); + usleep(10000); + continue; + } + + // SLOGI( + // "AX_IVPS_GetChnFrame(%lld): Chn:%d, (%d x %d) Stride:%d, FrameSize:%d, Phy:%llx, UserData:%llx, PTS:%llx, + // " "BlockId:%x\n", tSrcFrame.u64SeqNum, USER_OUTPUT_CHN-1, tSrcFrame.u32Width, tSrcFrame.u32Height, + // tSrcFrame.u32PicStride[0], tSrcFrame.u32FrameSize, tSrcFrame.u64PhyAddr[0], tSrcFrame.u64UserData, + // tSrcFrame.u64PTS, tSrcFrame.u32BlkId[0]); + void *pFrame = (void *)AX_POOL_GetBlockVirAddr(tSrcFrame.u32BlkId[0]); + camera->pcallback_(pFrame, tSrcFrame.u32Width, tSrcFrame.u32Height, + tSrcFrame.u32Width * tSrcFrame.u32Height * 2, camera->ctx_); + if ((totalGetStream % 60) == 0) { + memset(pFrame, 128, tSrcFrame.u32Width * tSrcFrame.u32Height * 2); + } + ret = AX_IVPS_ReleaseChnFrame(0, USER_OUTPUT_CHN - 1, &tSrcFrame); + if (ret) { + SLOGI("AX_IVPS_ReleaseFrame fail, ret=0x%x", ret); + usleep(10000); + continue; + } + // SLOGI("AX_IVPS_GetFrame"); + totalGetStream++; + } + SLOGI("IVPS END"); + return NULL; +} + +static int camera_capture_start(struct camera_t *camera) { SLOGI("Start capture thread"); if (!camera->pcallback_) { @@ -1224,12 +1299,11 @@ static int camera_capture_start(struct camera_t* camera) return -1; } camera->state_ |= AX_SENSOR_GET_FRAME_THREAD; - pthread_create(&camera->capture_thread_id_, NULL, 
camera_capture_thread, camera); - + pthread_create(&camera->capture_thread_id_, NULL, IVPS_GetFrameThread, camera); return 0; } -static int camera_capture_stop(struct camera_t* camera) +static int camera_capture_stop(struct camera_t *camera) { SLOGI("Stop capture thread"); camera->state_ &= ~((int)AX_SENSOR_GET_FRAME_THREAD); @@ -1238,12 +1312,301 @@ static int camera_capture_stop(struct camera_t* camera) return 0; } -static void camera_set_ctx(struct camera_t* camera, void* ctx) +static void camera_set_ctx(struct camera_t *camera, void *ctx) { camera->ctx_ = ctx; } -int axera_camera_open_from(camera_t* camera) +/* venc get stream task */ +static void *VencGetStreamProc(void *arg) +{ + AX_S32 s32Ret = -1; + AX_VENC_RECV_PIC_PARAM_T stRecvParam; + AX_VENC_STREAM_T stStream = {0}; + AX_S16 syncType = -1; + FILE *pStrm = NULL; + int totalGetStream = 0; + s32Ret = AX_VENC_StartRecvFrame(RTSP_OUTPUT_CHN - 1, &stRecvParam); + if (AX_SUCCESS != s32Ret) { + SLOGI("AX_VENC_StartRecvFrame failed, s32Ret:0x%x", s32Ret); + return NULL; + } + while (axera_obj.venc_run_) { + s32Ret = AX_VENC_GetStream(RTSP_OUTPUT_CHN - 1, &stStream, 1000); + if (AX_SUCCESS == s32Ret) { + totalGetStream++; + /* Send to RTSP */ + AX_BOOL bIFrame = (AX_VENC_INTRA_FRAME == stStream.stPack.enCodingType) ? 
AX_TRUE : AX_FALSE; + AX_Rtsp_SendNalu(axera_obj.pRtspHandle, 0, stStream.stPack.pu8Addr, stStream.stPack.u32Len, + stStream.stPack.u64PTS, bIFrame); + // SLOGI("VencChn %d: u64PTS:%lld pu8Addr:%p u32Len:%d enCodingType:%d", 2, stStream.stPack.u64PTS, + // stStream.stPack.pu8Addr, stStream.stPack.u32Len, stStream.stPack.enCodingType); + s32Ret = AX_VENC_ReleaseStream(RTSP_OUTPUT_CHN - 1, &stStream); + if (AX_SUCCESS != s32Ret) { + SLOGE("VencChn %d: AX_VENC_ReleaseStream failed!s32Ret:0x%x", 0, s32Ret); + usleep(10000); + continue; + } + } else if (AX_ERR_VENC_FLOW_END == s32Ret) { + SLOGE("VencChn %d: AX_VENC_GetStream end flow,exit!", 0); + usleep(10000); + continue; + } + } +EXIT: + s32Ret = AX_VENC_StopRecvFrame(RTSP_OUTPUT_CHN - 1); + if (0 != s32Ret) { + SLOGE("VencChn %d:AX_VENC_StopRecvFrame failed,s32Ret:0x%x", 0, s32Ret); + return NULL; + } + SLOGI("VencChn %d: Total get %u encoded frames. getStream Exit!", 0, totalGetStream); + return NULL; +} + +static AX_S32 SAMPLE_VENC_Init() +{ + AX_S32 VencChn = 0, s32Ret = 0; + AX_VENC_MOD_ATTR_T stModAttr = { + .enVencType = AX_VENC_MULTI_ENCODER, + .stModThdAttr.u32TotalThreadNum = 1, + .stModThdAttr.bExplicitSched = AX_FALSE, + }; + s32Ret = AX_VENC_Init(&stModAttr); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_VENC_Init failed, s32Ret:0x%x", s32Ret); + return s32Ret; + } + AX_S32 ret = AX_VENC_CreateChn(VencChn, &axera_obj.stVencChnAttr); + if (AX_SUCCESS != ret) { + SLOGE("VencChn %d: AX_VENC_CreateChn failed, s32Ret:0x%x", VencChn, ret); + return -1; + } + AX_MOD_INFO_T srcMod, dstMod; + srcMod.enModId = AX_ID_IVPS; + srcMod.s32GrpId = 0; + srcMod.s32ChnId = RTSP_OUTPUT_CHN - 1; + dstMod.enModId = AX_ID_VENC; + dstMod.s32GrpId = 0; + dstMod.s32ChnId = RTSP_OUTPUT_CHN - 1; + AX_SYS_Link(&srcMod, &dstMod); + axera_obj.stRtspAttr[0].nChannel = 0; + axera_obj.stRtspAttr[0].stVideoAttr.bEnable = AX_TRUE; + axera_obj.stRtspAttr[0].stVideoAttr.ePt = axera_obj.stVencChnAttr.stVencAttr.enType; + 
AX_Rtsp_Init(&axera_obj.pRtspHandle, &axera_obj.stRtspAttr[0], 1, 0); + AX_Rtsp_Start(axera_obj.pRtspHandle); + pthread_create(&axera_obj.venc_thread_id_, NULL, VencGetStreamProc, NULL); + SLOGI("RTSP pRtspHandle:%p nChnNum:%d", axera_obj.pRtspHandle, 1); + return 0; +} + +static AX_S32 SAMPLE_VENC_DeInit() +{ + AX_S32 VencChn = 0, s32Ret = 0, s32Retry = 5; + + if (axera_obj.stVencChnAttr.stVencAttr.enType == PT_PCMU) { + return s32Ret; + } + axera_obj.venc_run_ = 0; + pthread_join(axera_obj.venc_thread_id_, NULL); + AX_Rtsp_Stop(axera_obj.pRtspHandle); + AX_Rtsp_Deinit(axera_obj.pRtspHandle); + AX_MOD_INFO_T srcMod, dstMod; + srcMod.enModId = AX_ID_IVPS; + srcMod.s32GrpId = 0; + srcMod.s32ChnId = RTSP_OUTPUT_CHN - 1; + dstMod.enModId = AX_ID_VENC; + dstMod.s32GrpId = 0; + dstMod.s32ChnId = RTSP_OUTPUT_CHN - 1; + AX_SYS_UnLink(&srcMod, &dstMod); + + s32Retry = 5; + do { + s32Ret = AX_VENC_DestroyChn(VencChn); + if (AX_ERR_VENC_BUSY == s32Ret) { + SLOGE("VencChn %d:AX_VENC_DestroyChn return AX_ERR_VENC_BUSY,retry...", VencChn); + --s32Retry; + usleep(100 * 1000); + } else { + break; + } + } while (s32Retry >= 0); + + if (s32Retry == -1 || AX_SUCCESS != s32Ret) { + SLOGE("VencChn %d: AX_VENC_DestroyChn failed, s32Retry=%d, s32Ret=0x%x\n", VencChn, s32Retry, s32Ret); + } + s32Ret = AX_VENC_Deinit(); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_VENC_Deinit failed, s32Ret=0x%x", s32Ret); + return s32Ret; + } + return 0; +} + +void init_rtsp(AX_VENC_CHN_ATTR_T *stVencChnAttr) +{ + axera_obj.stVencChnAttr = *stVencChnAttr; + axera_obj.venc_run_ = 1; +} + +static int SAMPLE_IVPS_Init(AX_S32 nGrpId, camera_t *camera) +{ + AX_S32 s32Ret = 0, nChn; + AX_IVPS_GRP_ATTR_T stGrpAttr = {0}; + AX_IVPS_PIPELINE_ATTR_T stPipelineAttr = {0}; + + s32Ret = AX_IVPS_Init(); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_IVPS_Init failed,s32Ret:0x%x", s32Ret); + return s32Ret; + } + + stGrpAttr.nInFifoDepth = 2; + stGrpAttr.ePipeline = AX_IVPS_PIPELINE_DEFAULT; + s32Ret = AX_IVPS_CreateGrp(nGrpId, 
&stGrpAttr); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_IVPS_CreateGrp failed,nGrp %d,s32Ret:0x%x", nGrpId, s32Ret); + return s32Ret; + } + stPipelineAttr.nOutChnNum = 3; + // vin 输入 + stPipelineAttr.tFilter[0][0].bEngage = AX_TRUE; + if (axera_obj.stVencChnAttr.stVencAttr.u32PicWidthSrc > 1920) { + stPipelineAttr.tFilter[0][0].nDstPicWidth = axera_obj.gCams.tChnAttr[0].nWidth; + stPipelineAttr.tFilter[0][0].nDstPicHeight = axera_obj.gCams.tChnAttr[0].nHeight; + stPipelineAttr.tFilter[0][0].nDstPicStride = axera_obj.gCams.tChnAttr[0].nWidthStride; + } else { + stPipelineAttr.tFilter[0][0].nDstPicWidth = 1920; + stPipelineAttr.tFilter[0][0].nDstPicHeight = 1080; + stPipelineAttr.tFilter[0][0].nDstPicStride = 1920; + } + + stPipelineAttr.tFilter[0][0].eDstPicFormat = AX_FORMAT_YUV420_SEMIPLANAR; + stPipelineAttr.tFilter[0][0].eEngine = AX_IVPS_ENGINE_VPP; + // rtps输出 chn0 + stPipelineAttr.tFilter[RTSP_OUTPUT_CHN][0].bEngage = AX_TRUE; + if (axera_obj.stVencChnAttr.stVencAttr.enType != PT_PCMU) { + stPipelineAttr.tFilter[RTSP_OUTPUT_CHN][0].nDstPicWidth = axera_obj.stVencChnAttr.stVencAttr.u32PicWidthSrc; + stPipelineAttr.tFilter[RTSP_OUTPUT_CHN][0].nDstPicHeight = axera_obj.stVencChnAttr.stVencAttr.u32PicHeightSrc; + } else { + stPipelineAttr.tFilter[RTSP_OUTPUT_CHN][0].nDstPicWidth = 1280; + stPipelineAttr.tFilter[RTSP_OUTPUT_CHN][0].nDstPicHeight = 720; + } + stPipelineAttr.tFilter[RTSP_OUTPUT_CHN][0].nDstPicStride = + ALIGN_UP(stPipelineAttr.tFilter[RTSP_OUTPUT_CHN][0].nDstPicWidth, 16); + stPipelineAttr.tFilter[RTSP_OUTPUT_CHN][0].eDstPicFormat = AX_FORMAT_YUV420_SEMIPLANAR; + stPipelineAttr.tFilter[RTSP_OUTPUT_CHN][0].eEngine = AX_IVPS_ENGINE_VPP; + + // 设备获取 chn2 + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].bEngage = AX_TRUE; + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].nDstPicWidth = camera->width_; + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].nDstPicHeight = camera->height_; + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].nDstPicStride = + 
ALIGN_UP(stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].nDstPicWidth, 16); + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].eDstPicFormat = AX_FORMAT_YUV422_INTERLEAVED_YUYV; + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].eEngine = AX_IVPS_ENGINE_TDP; + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].tTdpCfg.eRotation = AX_IVPS_ROTATION_0; + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].tAspectRatio.eMode = AX_IVPS_ASPECT_RATIO_MANUAL; + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].tAspectRatio.tRect.nX = 0; + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].tAspectRatio.tRect.nY = + (int)(stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].nDstPicHeight - + (((stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].nDstPicWidth * 1.0f) / + (stPipelineAttr.tFilter[0][0].nDstPicWidth * 1.0f)) * + (stPipelineAttr.tFilter[0][0].nDstPicHeight * 1.0f))); + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].tAspectRatio.tRect.nW = camera->width_; + stPipelineAttr.tFilter[USER_OUTPUT_CHN][0].tAspectRatio.tRect.nH = camera->height_; + stPipelineAttr.nOutFifoDepth[USER_OUTPUT_CHN - 1] = 2; + + // // jpeg输出 chn1 + // memcpy(&stPipelineAttr.tFilter[2][0], &stPipelineAttr.tFilter[3][0], sizeof(AX_IVPS_FILTER_T)); + // stPipelineAttr.tFilter[2][0].eDstPicFormat = AX_FORMAT_YUV420_SEMIPLANAR; + +#ifdef SAMPLE_IVPS_CROPRESIZE_ENABLE + AX_S32 nChnGetId = 0; + stPipelineAttr.nOutFifoDepth[nChnGetId] = 1; +#endif + s32Ret = AX_IVPS_SetPipelineAttr(nGrpId, &stPipelineAttr); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_IVPS_SetPipelineAttr failed,nGrp %d,s32Ret:0x%x", nGrpId, s32Ret); + return s32Ret; + } + for (nChn = 0; nChn < stPipelineAttr.nOutChnNum; nChn++) { + s32Ret = AX_IVPS_EnableChn(nGrpId, nChn); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_IVPS_EnableChn failed,nGrp %d,nChn %d,s32Ret:0x%x", nGrpId, nChn, s32Ret); + return s32Ret; + } + } + s32Ret = AX_IVPS_StartGrp(nGrpId); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_IVPS_StartGrp failed,nGrp %d,s32Ret:0x%x", nGrpId, s32Ret); + return s32Ret; + } +#ifdef SAMPLE_IVPS_CROPRESIZE_ENABLE 
+ s32Ret = IVPS_CropResizeThreadStart(nGrpId, nChnGetId); + if (AX_SUCCESS != s32Ret) { + SLOGE("IVPS_CropResizeThreadStart failed,nGrp %d,s32Ret:0x%x", nGrpId, s32Ret); + return s32Ret; + } +#endif + AX_MOD_INFO_T srcMod, dstMod; + srcMod.enModId = AX_ID_VIN; + srcMod.s32GrpId = 0; + srcMod.s32ChnId = 0; + + dstMod.enModId = AX_ID_IVPS; + dstMod.s32GrpId = nGrpId; + dstMod.s32ChnId = 0; + AX_SYS_Link(&srcMod, &dstMod); + return 0; +} + +static AX_S32 SAMPLE_IVPS_DeInit(AX_S32 nGrpId) +{ + AX_S32 s32Ret = 0, nChn = 0; + + AX_MOD_INFO_T srcMod, dstMod; + srcMod.enModId = AX_ID_VIN; + srcMod.s32GrpId = 0; + srcMod.s32ChnId = 0; + dstMod.enModId = AX_ID_IVPS; + dstMod.s32GrpId = nGrpId; + dstMod.s32ChnId = 0; + AX_SYS_UnLink(&srcMod, &dstMod); + +#ifdef SAMPLE_IVPS_CROPRESIZE_ENABLE + IVPS_CropResizeThreadStop(); +#endif + + s32Ret = AX_IVPS_StopGrp(nGrpId); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_IVPS_StopGrp failed,nGrp %d,s32Ret:0x%x", nGrpId, s32Ret); + return s32Ret; + } + + for (nChn = 0; nChn < 3; nChn++) { + s32Ret = AX_IVPS_DisableChn(nGrpId, nChn); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_IVPS_DisableChn failed,nGrp %d,nChn %d,s32Ret:0x%x", nGrpId, nChn, s32Ret); + return s32Ret; + } + } + + s32Ret = AX_IVPS_DestoryGrp(nGrpId); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_IVPS_DestoryGrp failed,nGrp %d,s32Ret:0x%x", nGrpId, s32Ret); + return s32Ret; + } + + s32Ret = AX_IVPS_Deinit(); + if (AX_SUCCESS != s32Ret) { + SLOGE("AX_IVPS_Deinit failed,s32Ret:0x%x", s32Ret); + return s32Ret; + } + + return 0; +} + +int axera_camera_open_from(camera_t *camera) { int Ret = -1; AX_S32 axRet; @@ -1255,28 +1618,29 @@ int axera_camera_open_from(camera_t* camera) goto ErrorHandle; } axera_obj.VinParam.eSysCase = SAMPLE_VIN_BUTT; - for (int i = 0; i < sizeof(axera_camera_index) / sizeof(axera_camera_index[0]); i++) - { - if(strcmp(axera_camera_index[i].name, camera->dev_name_) == 0) - { + for (int i = 0; i < sizeof(axera_camera_index) / sizeof(axera_camera_index[0]); i++) { 
+ if (strcmp(axera_camera_index[i].name, camera->dev_name_) == 0) { axera_obj.VinParam.eSysCase = axera_camera_index[i].index; break; } } - if(axera_obj.VinParam.eSysCase == SAMPLE_VIN_BUTT) - { + if (axera_obj.VinParam.eSysCase == SAMPLE_VIN_BUTT) { SLOGE("Error: camera not support %s", camera->dev_name_); return -10; } - axera_obj.VinParam.eSysMode = COMMON_VIN_SENSOR; - axera_obj.VinParam.eHdrMode = AX_SNS_LINEAR_MODE; + axera_obj.VinParam.eSysMode = COMMON_VIN_SENSOR; + axera_obj.VinParam.eHdrMode = AX_SNS_LINEAR_MODE; axera_obj.VinParam.bAiispEnable = AX_TRUE; // axera_obj.gCams.tChnAttr __sample_case_config(&axera_obj.gCams, &axera_obj.VinParam, &axera_obj.tCommonArgs, &axera_obj.tPrivArgs); COMMON_SYS_Init(&axera_obj.tCommonArgs); COMMON_NPU_Init(); - AX_IVPS_Init(); + SAMPLE_IVPS_Init(0, camera); + if (axera_obj.venc_run_) { + SAMPLE_VENC_Init(); + } + axRet = COMMON_CAM_Init(); if (axRet) { COMM_ISP_PRT("COMMON_CAM_Init fail, ret:0x%x", axRet); @@ -1296,14 +1660,15 @@ int axera_camera_open_from(camera_t* camera) return -3; } axera_obj.Chn = AX_VIN_CHN_ID_MAIN; - camera->state_ |= AX_SENSOR_CAM_OPEN; + camera->state_ |= AX_SENSOR_CAM_OPEN; - axera_obj.out_img.u32Width = camera->width_; - axera_obj.out_img.u32Height = camera->height_; - axera_obj.out_img.u32PicStride[0] = ALIGN_UP(camera->width_, 16); - axera_obj.out_img.enImgFormat = AX_FORMAT_YUV420_SEMIPLANAR; - axera_obj.out_img.u32FrameSize = camera->width_ * camera->height_ * 3 / 2; - AX_SYS_MemAlloc(&axera_obj.out_img.u64PhyAddr[0], (AX_VOID **)&axera_obj.out_img.u64VirAddr[0], ALIGN_UP(axera_obj.out_img.u32FrameSize, 0x100), 0x100, (AX_S8 *)"StackFlow_camera_output_buff"); + // axera_obj.out_img.u32Width = camera->width_; + // axera_obj.out_img.u32Height = camera->height_; + // axera_obj.out_img.u32PicStride[0] = ALIGN_UP(camera->width_, 16); + // axera_obj.out_img.enImgFormat = AX_FORMAT_YUV420_SEMIPLANAR; + // axera_obj.out_img.u32FrameSize = camera->width_ * camera->height_ * 3 / 2; + // 
AX_SYS_MemAlloc(&axera_obj.out_img.u64PhyAddr[0], (AX_VOID **)&axera_obj.out_img.u64VirAddr[0], + // ALIGN_UP(axera_obj.out_img.u32FrameSize, 0x100), 0x100, (AX_S8 *)"StackFlowCameraBuff"); camera->camera_capture_callback_set = camera_capture_callback_set; camera->camera_capture_start = camera_capture_start; @@ -1317,10 +1682,10 @@ int axera_camera_open_from(camera_t* camera) return -1; } -camera_t* axera_camera_open(const char* pdev_name, int width, int height, int fps) +camera_t *axera_camera_open(const char *pdev_name, int width, int height, int fps) { int Ret = -1; - camera_t* camera = (camera_t*)malloc(sizeof(camera_t)); + camera_t *camera = (camera_t *)malloc(sizeof(camera_t)); if (camera == NULL) return NULL; memset(camera, 0, sizeof(camera_t)); camera->buffer_cnt_ = CONFIG_CAPTURE_BUF_CNT; @@ -1342,6 +1707,7 @@ camera_t* axera_camera_open(const char* pdev_name, int width, int height, int fp goto ErrorHandle; } camera->is_alloc_ = 1; + SLOGI("camera %s open success", camera->dev_name_); return camera; ErrorHandle: @@ -1350,20 +1716,21 @@ camera_t* axera_camera_open(const char* pdev_name, int width, int height, int fp return NULL; } -int axera_camera_close(camera_t* camera) +int axera_camera_close(camera_t *camera) { if (camera == NULL) return -1; if (camera->state_ & AX_SENSOR_CAM_OPEN) { COMMON_CAM_Close(&axera_obj.gCams, axera_obj.tCommonArgs.nCamCnt); camera->state_ &= ~((int)AX_SENSOR_CAM_OPEN); } - + if (camera->state_ & AX_SENSOR_CAM_ENABLE) { COMMON_CAM_Deinit(); camera->state_ &= ~((int)AX_SENSOR_CAM_ENABLE); } camera->state_ = AX_SENSOR_NONT; - AX_IVPS_Deinit(); + SAMPLE_IVPS_DeInit(0); + SAMPLE_VENC_DeInit(); COMMON_SYS_DeInit(); if (camera->is_alloc_) free(camera); diff --git a/projects/llm_framework/main_camera/src/axera_camera.h b/projects/llm_framework/main_camera/src/axera_camera.h index 6035c8c..6b8644d 100644 --- a/projects/llm_framework/main_camera/src/axera_camera.h +++ b/projects/llm_framework/main_camera/src/axera_camera.h @@ -5,7 +5,7 
@@ */ #ifndef AXERA_CAMERA_H #define AXERA_CAMERA_H - +#include "common_venc.h" #if __cplusplus extern "C" { #endif @@ -29,6 +29,9 @@ int axera_camera_open_from(camera_t* camera); */ int axera_camera_close(camera_t* camera); +void init_rtsp(AX_VENC_CHN_ATTR_T *stVencChnAttr); +void init_jpeg(); + #if __cplusplus } #endif diff --git a/projects/llm_framework/main_camera/src/main.cpp b/projects/llm_framework/main_camera/src/main.cpp index b53467f..26bbd77 100644 --- a/projects/llm_framework/main_camera/src/main.cpp +++ b/projects/llm_framework/main_camera/src/main.cpp @@ -16,9 +16,44 @@ #include "axera_camera.h" #include #include +#include "hv/TcpServer.h" +#include +#include +#include +// #include + +#ifdef ENABLE_BACKWARD +#define BACKWARD_HAS_DW 1 +#include "backward.hpp" +#include "backward.h" +#endif + +#define MAX_TASK_NUM 1 using namespace StackFlows; int main_exit_flage = 0; + +const char *http_response = + "HTTP/1.0 200 OK\n" + "Server: BaseHTTP/0.6 Python/3.10.12\n" + "Date: %s\n" + "Cache-Control: no-store, no-cache, must-revalidate, pre-check=0, post-check=0, max-age=0\n" + "Connection: close\n" + "Content-Type: multipart/x-mixed-replace;boundary=--boundarydonotcross\n" + "Expires: Mon, 1 Jan 2130 00:00:00 GMT\n" + "Pragma: no-cache\n" + "Access-Control-Allow-Origin: *\n"; +const char *http_jpeg_response = + "\n" + "--boundarydonotcross\n" + "X-Timestamp: %lf\n" + "Content-Length: %d\n" + "Content-Type: image/jpeg\n" + "\n"; + +char http_response_buff[1024]; +char http_response_buff1[1024]; + static void __sigint(int iSigNo) { main_exit_flage = 1; @@ -26,26 +61,29 @@ static void __sigint(int iSigNo) typedef std::function task_callback_t; -typedef camera_t* (*hal_camera_open_fun)(const char* pdev_name, int width, int height, int fps); -typedef int (*hal_camera_close_fun)(camera_t* camera); - +typedef camera_t *(*hal_camera_open_fun)(const char *pdev_name, int width, int height, int fps); +typedef int (*hal_camera_close_fun)(camera_t *camera); -#define 
CONFIG_AUTO_SET(obj, key) \ - if (config_body.contains(#key)) \ - mode_config_.key = config_body[#key]; \ - else if (obj.contains(#key)) \ - mode_config_.key = obj[#key]; +#define CONFIG_AUTO_SET(obj, key) \ + if (config_body.contains(#key)) \ + stVencChnAttr.key = config_body[#key]; \ + else if (obj.contains(#key)) \ + stVencChnAttr.key = obj[#key]; class llm_task { private: camera_t *cam; hal_camera_open_fun hal_camera_open; hal_camera_close_fun hal_camera_close; + public: std::string response_format_; task_callback_t out_callback_; bool enoutput_; bool enstream_; + bool enjpegout_; + std::string rtsp_config_; + bool enable_webstream_; std::atomic_int cap_status_; std::unique_ptr camera_cap_thread_; std::atomic_bool camera_clear_flage_; @@ -55,6 +93,8 @@ class llm_task { int frame_height_; cv::Mat yuv_dist_; + std::unique_ptr hv_tcpserver_; + static void on_cap_fream(void *pData, uint32_t width, uint32_t height, uint32_t Length, void *ctx) { llm_task *self = static_cast(ctx); @@ -127,21 +167,360 @@ class llm_task { devname_ = config_body.at("input"); frame_width_ = config_body.at("frame_width"); frame_height_ = config_body.at("frame_height"); + if (config_body.contains("rtsp")) { + rtsp_config_ = config_body.at("rtsp"); + } + if (config_body.contains("enable_webstream")) { + enable_webstream_ = config_body.at("enable_webstream"); + } else { + enable_webstream_ = false; + } } catch (...) 
{ return true; } - enstream_ = (response_format_.find("stream") != std::string::npos); - yuv_dist_ = cv::Mat(frame_height_, frame_width_, CV_8UC2, cv::Scalar(0, 128)); - if(devname_.find("/dev/video") != std::string::npos){ - hal_camera_open = camera_open; + enstream_ = (response_format_.find("stream") != std::string::npos); + enjpegout_ = (response_format_.find("jpeg") != std::string::npos); + yuv_dist_ = cv::Mat(frame_height_, frame_width_, CV_8UC2, cv::Scalar(0, 128)); + if (devname_.find("/dev/video") != std::string::npos) { + hal_camera_open = camera_open; hal_camera_close = camera_close; - }else if(devname_.find("axera_") != std::string::npos){ - hal_camera_open = axera_camera_open; + } else if (devname_.find("axera_") != std::string::npos) { + hal_camera_open = axera_camera_open; hal_camera_close = axera_camera_close; - }else { + if (!rtsp_config_.empty()) { + nlohmann::json error_body; + nlohmann::json file_body; + std::string base_model_path; + std::string base_model_config_path; + std::list config_file_paths = + get_config_file_paths(base_model_path, base_model_config_path, "camera"); + try { + for (auto file_name : config_file_paths) { + std::ifstream config_file(file_name); + if (!config_file.is_open()) { + SLOGW("config file :%s miss", file_name.c_str()); + continue; + } + SLOGI("config file :%s read", file_name.c_str()); + config_file >> file_body; + config_file.close(); + break; + } + if (file_body.empty()) { + SLOGE("all config file miss"); + return true; + } + AX_VENC_CHN_ATTR_T stVencChnAttr; + memset(&stVencChnAttr, 0, sizeof(AX_VENC_CHN_ATTR_T)); + if (rtsp_config_.find("h264") != std::string::npos) { + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.enType); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.u32MaxPicWidth); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.u32MaxPicHeight); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.enMemSource); + 
CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.u32BufSize); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.enProfile); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.enLevel); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.enTier); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.u32PicWidthSrc); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.u32PicHeightSrc); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.stCropCfg.bEnable); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.stCropCfg.stRect.s32X); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.stCropCfg.stRect.s32Y); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.stCropCfg.stRect.u32Width); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.stCropCfg.stRect.u32Height); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.enRotation); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.enLinkMode); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.bDeBreathEffect); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.bRefRingbuf); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.s32StopWaitTime); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.u8InFifoDepth); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.u8OutFifoDepth); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.u32SliceNum); + CONFIG_AUTO_SET(file_body["h264_config_param"], stVencAttr.stAttrH265e.bRcnRefShareBuf); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.enRcMode); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.s32FirstFrameStartQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stFrameRate.fSrcFrameRate); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stFrameRate.fDstFrameRate); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32Gop); + 
CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32BitRate); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32MinQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32MaxQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32MinIQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32MaxIQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32MaxIprop); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32MinIprop); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.s32IntraQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.s32DeBreathQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.u32IdrQpDeltaRange); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Cbr.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264Cbr.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264Cbr.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.u32Gop); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.u32MaxBitRate); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.enVQ); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.u32MaxQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.u32MinQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.u32MaxIQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.u32MinIQp); + 
CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.s32IntraQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.s32DeBreathQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.u32IdrQpDeltaRange); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264Vbr.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264Vbr.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264Vbr.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.u32Gop); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.u32MaxBitRate); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.u32MaxQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.u32MinQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.u32MaxIQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.u32MinIQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.s32IntraQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.s32DeBreathQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.u32IdrQpDeltaRange); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264AVbr.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264AVbr.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264AVbr.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264QVbr.u32Gop); + 
CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264QVbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264QVbr.u32TargetBitRate); + + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32Gop); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32MaxQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32MinQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32MaxIQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32MinIQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32MinQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32MaxQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.s32DeBreathQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32IdrQpDeltaRange); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32MaxIprop); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32MinIprop); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32MaxBitRate); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32ShortTermStatTime); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32LongTermStatTime); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32LongTermMaxBitrate); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32LongTermMinBitrate); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32ExtraBitPercent); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.u32LongTermStatTimeUnit); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.s32IntraQpDelta); + CONFIG_AUTO_SET(file_body["h264_config_param"], 
stRcAttr.stH264CVbr.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264CVbr.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264CVbr.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264CVbr.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264FixQp.u32Gop); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264FixQp.u32IQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264FixQp.u32PQp); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264FixQp.u32BQp); + + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264QpMap.u32Gop); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264QpMap.u32StatTime); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264QpMap.u32TargetBitRate); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264QpMap.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h264_config_param"], stRcAttr.stH264QpMap.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264QpMap.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stRcAttr.stH264QpMap.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h264_config_param"], stGopAttr.enGopMode); + CONFIG_AUTO_SET(file_body["h264_config_param"], stGopAttr.stNormalP.stPicConfig.s32QpOffset); + CONFIG_AUTO_SET(file_body["h264_config_param"], stGopAttr.stNormalP.stPicConfig.f32QpFactor); + CONFIG_AUTO_SET(file_body["h264_config_param"], stGopAttr.stOneLTR.stPicConfig.s32QpOffset); + CONFIG_AUTO_SET(file_body["h264_config_param"], stGopAttr.stOneLTR.stPicConfig.f32QpFactor); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stGopAttr.stOneLTR.stPicSpecialConfig.s32QpOffset); + CONFIG_AUTO_SET(file_body["h264_config_param"], + stGopAttr.stOneLTR.stPicSpecialConfig.f32QpFactor); + 
CONFIG_AUTO_SET(file_body["h264_config_param"], + stGopAttr.stOneLTR.stPicSpecialConfig.s32Interval); + CONFIG_AUTO_SET(file_body["h264_config_param"], stGopAttr.stSvcT.u32GopSize); + } else if (rtsp_config_.find("h265") != std::string::npos) { + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.enType); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.u32MaxPicWidth); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.u32MaxPicHeight); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.enMemSource); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.u32BufSize); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.enProfile); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.enLevel); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.enTier); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.u32PicWidthSrc); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.u32PicHeightSrc); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.stCropCfg.bEnable); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.stCropCfg.stRect.s32X); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.stCropCfg.stRect.s32Y); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.stCropCfg.stRect.u32Width); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.stCropCfg.stRect.u32Height); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.enRotation); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.enLinkMode); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.bDeBreathEffect); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.bRefRingbuf); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.s32StopWaitTime); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.u8InFifoDepth); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.u8OutFifoDepth); + 
CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.u32SliceNum); + CONFIG_AUTO_SET(file_body["h265_config_param"], stVencAttr.stAttrH265e.bRcnRefShareBuf); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.enRcMode); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.s32FirstFrameStartQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stFrameRate.fSrcFrameRate); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stFrameRate.fDstFrameRate); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32Gop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32BitRate); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32MinQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32MaxQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32MinIQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32MaxIQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32MaxIprop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32MinIprop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.s32IntraQpDelta); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.s32DeBreathQpDelta); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.u32IdrQpDeltaRange); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Cbr.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265Cbr.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265Cbr.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.u32Gop); + 
CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.u32MaxBitRate); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.enVQ); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.u32MaxQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.u32MinQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.u32MaxIQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.u32MinIQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.s32IntraQpDelta); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.s32DeBreathQpDelta); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.u32IdrQpDeltaRange); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265Vbr.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265Vbr.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265Vbr.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.u32Gop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.u32MaxBitRate); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.u32MaxQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.u32MinQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.u32MaxIQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.u32MinIQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.s32IntraQpDelta); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.s32DeBreathQpDelta); + 
CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.u32IdrQpDeltaRange); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265AVbr.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265AVbr.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265AVbr.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265QVbr.u32Gop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265QVbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265QVbr.u32TargetBitRate); + + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32Gop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32StatTime); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32MaxQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32MinQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32MaxIQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32MinIQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32MinQpDelta); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32MaxQpDelta); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.s32DeBreathQpDelta); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32IdrQpDeltaRange); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32MaxIprop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32MinIprop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32MaxBitRate); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32ShortTermStatTime); + CONFIG_AUTO_SET(file_body["h265_config_param"], 
stRcAttr.stH265CVbr.u32LongTermStatTime); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32LongTermMaxBitrate); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32LongTermMinBitrate); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32ExtraBitPercent); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.u32LongTermStatTimeUnit); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.s32IntraQpDelta); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265CVbr.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265CVbr.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265CVbr.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265FixQp.u32Gop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265FixQp.u32IQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265FixQp.u32PQp); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265FixQp.u32BQp); + + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265QpMap.u32Gop); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265QpMap.u32StatTime); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265QpMap.u32TargetBitRate); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265QpMap.stQpmapInfo.enCtbRcMode); + CONFIG_AUTO_SET(file_body["h265_config_param"], stRcAttr.stH265QpMap.stQpmapInfo.enQpmapQpType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265QpMap.stQpmapInfo.enQpmapBlockType); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stRcAttr.stH265QpMap.stQpmapInfo.enQpmapBlockUnit); + + CONFIG_AUTO_SET(file_body["h265_config_param"], stGopAttr.enGopMode); + 
CONFIG_AUTO_SET(file_body["h265_config_param"], stGopAttr.stNormalP.stPicConfig.s32QpOffset); + CONFIG_AUTO_SET(file_body["h265_config_param"], stGopAttr.stNormalP.stPicConfig.f32QpFactor); + CONFIG_AUTO_SET(file_body["h265_config_param"], stGopAttr.stOneLTR.stPicConfig.s32QpOffset); + CONFIG_AUTO_SET(file_body["h265_config_param"], stGopAttr.stOneLTR.stPicConfig.f32QpFactor); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stGopAttr.stOneLTR.stPicSpecialConfig.s32QpOffset); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stGopAttr.stOneLTR.stPicSpecialConfig.f32QpFactor); + CONFIG_AUTO_SET(file_body["h265_config_param"], + stGopAttr.stOneLTR.stPicSpecialConfig.s32Interval); + CONFIG_AUTO_SET(file_body["h265_config_param"], stGopAttr.stSvcT.u32GopSize); + } + try { + std::regex pattern(R"(rtsp\.(\d+)[xX-](\d+)\.h(264|265))"); + std::smatch matches; + if (std::regex_search(rtsp_config_, matches, pattern)) { + if (matches.size() >= 3) { + stVencChnAttr.stVencAttr.u32PicWidthSrc = std::stoi(matches[1].str()); + stVencChnAttr.stVencAttr.u32PicHeightSrc = std::stoi(matches[2].str()); + } + } + } catch (...) { + return true; + } + if ((stVencChnAttr.stVencAttr.u32PicWidthSrc < frame_width_) || + (stVencChnAttr.stVencAttr.u32PicHeightSrc < frame_height_)) { + return true; + } + init_rtsp(&stVencChnAttr); + } catch (...) 
{ + return true; + } + } + } else { return true; } + return false; } @@ -177,13 +556,26 @@ class llm_task { cam = NULL; } - ~llm_task() + void start() + { + } + + void stop() { if (cam) { cam->camera_capture_stop(cam); hal_camera_close(cam); cam = NULL; } + if (hv_tcpserver_) { + hv_tcpserver_->stop(); + hv_tcpserver_.reset(); + } + } + + ~llm_task() + { + stop(); } }; @@ -222,27 +614,92 @@ class llm_camera : public StackFlow { if (!(llm_task_obj && llm_channel)) { return; } + std::vector jpeg_image; + // StackFlow output std::string out_data((char *)data, size); llm_channel->send_raw_to_pub(out_data); + // user output if (llm_task_obj->enoutput_) { std::string base64_data; - int ret = StackFlows::encode_base64(out_data, base64_data); + if (llm_task_obj->enjpegout_) { + cv::Mat yuv_image(llm_task_obj->frame_height_, llm_task_obj->frame_width_, CV_8UC2, (void *)data); + cv::Mat bgr_image; + cv::cvtColor(yuv_image, bgr_image, cv::COLOR_YUV2BGR_YUYV); + cv::imencode(".jpg", bgr_image, jpeg_image); + std::string in_data((char *)jpeg_image.data(), jpeg_image.size()); + StackFlows::encode_base64(in_data, base64_data); + } else { + StackFlows::encode_base64(out_data, base64_data); + } std::string out_json_str; - out_json_str.reserve(llm_channel->request_id_.size() + llm_channel->work_id_.size() + base64_data.size() + 108); + out_json_str.reserve(llm_channel->request_id_.size() + llm_channel->work_id_.size() + base64_data.size() + + 128); out_json_str += R"({"request_id":")"; out_json_str += llm_channel->request_id_; out_json_str += R"(","work_id":")"; out_json_str += llm_channel->work_id_; - out_json_str += R"(","object":"image.yuvraw.base64","error":{"code":0, "message":""},"data":")"; + out_json_str += R"(","object":")"; + out_json_str += llm_task_obj->response_format_; + out_json_str += R"(","error":{"code":0, "message":""},"data":")"; out_json_str += base64_data; out_json_str += "\"}\n"; llm_channel->send_raw_to_usr(out_json_str); } + // webstream output + if 
(llm_task_obj->enable_webstream_) { + if (!llm_task_obj->hv_tcpserver_) { + llm_task_obj->hv_tcpserver_ = std::make_unique(); + int listenfd = llm_task_obj->hv_tcpserver_->createsocket(8989); + if (listenfd < 0) { + llm_task_obj->hv_tcpserver_.reset(); + return; + } + llm_task_obj->hv_tcpserver_->onConnection = [](const hv::SocketChannelPtr &channel) { + std::string peeraddr = channel->peeraddr(); + if (channel->isConnected()) { + memset(http_response_buff, 0, 1024); + time_t current_time; + struct tm *time_info; + time(¤t_time); + time_info = gmtime(¤t_time); + char time_str[30]; + strftime(time_str, sizeof(time_str), "%a, %d %b %Y %H:%M:%S GMT", time_info); + sprintf(http_response_buff, http_response, time_str); + channel->write(http_response_buff); + } + }; + llm_task_obj->hv_tcpserver_->onMessage = [](const hv::SocketChannelPtr &channel, hv::Buffer *buf) {}; + llm_task_obj->hv_tcpserver_->setThreadNum(1); + llm_task_obj->hv_tcpserver_->start(); + } + llm_task_obj->hv_tcpserver_->foreachChannel([&](const hv::SocketChannelPtr &channel) { + if (jpeg_image.empty()) { + cv::Mat yuv_image(llm_task_obj->frame_height_, llm_task_obj->frame_width_, CV_8UC2, (void *)data); + cv::Mat bgr_image; + cv::cvtColor(yuv_image, bgr_image, cv::COLOR_YUV2BGR_YUYV); + cv::imencode(".jpg", bgr_image, jpeg_image); + } + char tmpsdas[256]; + struct timeval tv; + gettimeofday(&tv, NULL); + double timestamp = (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; + memset(http_response_buff1, 0, 1024); + sprintf(http_response_buff1, http_jpeg_response, timestamp, jpeg_image.size()); + channel->write(http_response_buff1); + channel->write(jpeg_image.data(), jpeg_image.size()); + }); + } } int setup(const std::string &work_id, const std::string &object, const std::string &data) override { nlohmann::json error_body; + if ((llm_task_channel_.size() - 1) == MAX_TASK_NUM) { + error_body["code"] = -21; + error_body["message"] = "task full"; + send("None", "None", error_body, "llm"); + return -1; + 
} int work_id_num = sample_get_work_id_num(work_id); auto llm_channel = get_channel(work_id); auto llm_task_obj = std::make_shared(work_id); @@ -310,6 +767,7 @@ class llm_camera : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); llm_task_.erase(work_id_num); @@ -324,6 +782,7 @@ class llm_camera : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); get_channel(iteam->first)->stop_subscriber(""); iteam->second.reset(); llm_task_.erase(iteam->first); From 9842ffbed1119db37ae68ec90e2c9277255ed608 Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Wed, 9 Apr 2025 17:59:54 +0800 Subject: [PATCH 12/64] [update] llm_task add start and stop --- projects/llm_framework/main/src/main.cpp | 13 ++++++++++++- projects/llm_framework/main_asr/src/main.cpp | 12 ++++++++++++ .../llm_framework/main_audio/src/main.cpp | 1 + .../main_depth_anything/src/main.cpp | 12 ++++++++++++ projects/llm_framework/main_kws/src/main.cpp | 12 ++++++++++++ projects/llm_framework/main_llm/src/main.cpp | 16 ++++++++-------- .../llm_framework/main_melotts/src/main.cpp | 19 ++++++++++++++++--- .../llm_framework/main_sys/src/event_loop.cpp | 2 +- projects/llm_framework/main_tts/src/main.cpp | 12 ++++++++++++ projects/llm_framework/main_vad/src/main.cpp | 12 ++++++++++++ projects/llm_framework/main_vlm/src/main.cpp | 14 ++++++++++++++ .../llm_framework/main_whisper/src/main.cpp | 12 ++++++++++++ projects/llm_framework/main_yolo/src/main.cpp | 1 + 13 files changed, 125 insertions(+), 13 deletions(-) diff --git a/projects/llm_framework/main/src/main.cpp b/projects/llm_framework/main/src/main.cpp index 116a52d..89423c9 100644 --- a/projects/llm_framework/main/src/main.cpp +++ b/projects/llm_framework/main/src/main.cpp @@ -73,8 +73,17 @@ class llm_task { { } + void start() + { + } + + void stop() + { + } + ~llm_task() { + 
stop(); } }; @@ -229,7 +238,7 @@ class llm_llm : public StackFlow { req_body["model"] = llm_task_obj->model_; req_body["response_format"] = llm_task_obj->response_format_; req_body["enoutput"] = llm_task_obj->enoutput_; - req_body["inputs"] = llm_task_obj->inputs_; + req_body["inputs"] = llm_task_obj->inputs_; send("llm.taskinfo", req_body, LLM_NO_ERROR, work_id); } } @@ -244,6 +253,7 @@ class llm_llm : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); llm_task_.erase(work_id_num); @@ -258,6 +268,7 @@ class llm_llm : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); get_channel(iteam->first)->stop_subscriber(""); iteam->second.reset(); llm_task_.erase(iteam->first); diff --git a/projects/llm_framework/main_asr/src/main.cpp b/projects/llm_framework/main_asr/src/main.cpp index 0648433..2d3de1c 100644 --- a/projects/llm_framework/main_asr/src/main.cpp +++ b/projects/llm_framework/main_asr/src/main.cpp @@ -101,6 +101,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; @@ -247,8 +248,17 @@ class llm_task { pcmdata = buffer_create(); } + void start() + { + } + + void stop() + { + } + ~llm_task() { + stop(); if (recognizer_stream_) { recognizer_stream_.reset(); } @@ -643,6 +653,7 @@ class llm_asr : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); if (llm_task_[work_id_num]->audio_flage_) { @@ -660,6 +671,7 @@ class llm_asr : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); if (iteam->second->audio_flage_) { unit_call("audio", "cap_stop", "None"); } diff --git 
a/projects/llm_framework/main_audio/src/main.cpp b/projects/llm_framework/main_audio/src/main.cpp index 3e023ea..32407d3 100644 --- a/projects/llm_framework/main_audio/src/main.cpp +++ b/projects/llm_framework/main_audio/src/main.cpp @@ -151,6 +151,7 @@ class llm_audio : public StackFlow { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; diff --git a/projects/llm_framework/main_depth_anything/src/main.cpp b/projects/llm_framework/main_depth_anything/src/main.cpp index 4e9369c..0d119d7 100644 --- a/projects/llm_framework/main_depth_anything/src/main.cpp +++ b/projects/llm_framework/main_depth_anything/src/main.cpp @@ -98,6 +98,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; @@ -245,8 +246,17 @@ class llm_task { _ax_init(); } + void start() + { + } + + void stop() + { + } + ~llm_task() { + stop(); _ax_deinit(); } }; @@ -536,6 +546,7 @@ class llm_depth_anything : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); llm_task_.erase(work_id_num); @@ -550,6 +561,7 @@ class llm_depth_anything : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); get_channel(iteam->first)->stop_subscriber(""); iteam->second.reset(); llm_task_.erase(iteam->first); diff --git a/projects/llm_framework/main_kws/src/main.cpp b/projects/llm_framework/main_kws/src/main.cpp index 4c51002..4a96fc1 100644 --- a/projects/llm_framework/main_kws/src/main.cpp +++ b/projects/llm_framework/main_kws/src/main.cpp @@ -107,6 +107,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; 
config_file.close(); break; @@ -255,8 +256,17 @@ class llm_task { pcmdata = buffer_create(); } + void start() + { + } + + void stop() + { + } + ~llm_task() { + stop(); if (spotter_stream_) { spotter_stream_.reset(); } @@ -509,6 +519,7 @@ class llm_kws : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); if (llm_task_[work_id_num]->audio_flage_) { @@ -526,6 +537,7 @@ class llm_kws : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); if (iteam->second->audio_flage_) { unit_call("audio", "cap_stop", "None"); } diff --git a/projects/llm_framework/main_llm/src/main.cpp b/projects/llm_framework/main_llm/src/main.cpp index c667209..98a8aba 100644 --- a/projects/llm_framework/main_llm/src/main.cpp +++ b/projects/llm_framework/main_llm/src/main.cpp @@ -23,6 +23,9 @@ using namespace StackFlows; #include "backward.hpp" #include "backward.h" #endif + +#define MAX_TASK_NUM 2 + int main_exit_flage = 0; static void __sigint(int iSigNo) { @@ -108,6 +111,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; @@ -287,8 +291,7 @@ class llm_task { bool pause() { - if(lLaMa_) - lLaMa_->Stop(); + if (lLaMa_) lLaMa_->Stop(); return true; } @@ -331,12 +334,11 @@ class llm_task { if (inference_run_) { std::string par; async_list_.put(par); - if(lLaMa_) - lLaMa_->Stop(); + if (lLaMa_) lLaMa_->Stop(); inference_run_->join(); inference_run_.reset(); } - } + } ~llm_task() { @@ -357,13 +359,11 @@ std::atomic llm_task::next_port_{8080}; class llm_llm : public StackFlow { private: - int task_count_; std::unordered_map> llm_task_; public: llm_llm() : StackFlow("llm") { - task_count_ = 2; } void task_output(const std::weak_ptr llm_task_obj_weak, @@ -499,7 +499,7 @@ class llm_llm : public StackFlow 
{ int setup(const std::string &work_id, const std::string &object, const std::string &data) override { nlohmann::json error_body; - if ((llm_task_channel_.size() - 1) == task_count_) { + if ((llm_task_channel_.size() - 1) == MAX_TASK_NUM) { error_body["code"] = -21; error_body["message"] = "task full"; send("None", "None", error_body, "llm"); diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp index 6b9c89e..f8610ef 100644 --- a/projects/llm_framework/main_melotts/src/main.cpp +++ b/projects/llm_framework/main_melotts/src/main.cpp @@ -140,6 +140,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; @@ -240,9 +241,9 @@ class llm_task { if (msg_str.empty()) { SLOGI("empty"); if (out_callback_) { - std::string output = wav_pcm_data.empty() ? - std::string() : - std::string((char *)wav_pcm_data.data(), wav_pcm_data.size() * sizeof(int16_t)); + std::string output = wav_pcm_data.empty() ? 
std::string() + : std::string((char *)wav_pcm_data.data(), + wav_pcm_data.size() * sizeof(int16_t)); out_callback_(output, finish); } return false; @@ -350,10 +351,20 @@ class llm_task { _ax_init(); } + void start() + { + } + + void stop() + { + } + ~llm_task() { + stop(); if (decoder_) { decoder_->Release(); + // decoder_.reset(); } _ax_deinit(); } @@ -671,6 +682,7 @@ class llm_tts : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); llm_task_.erase(work_id_num); @@ -685,6 +697,7 @@ class llm_tts : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); get_channel(iteam->first)->stop_subscriber(""); iteam->second.reset(); llm_task_.erase(iteam->first); diff --git a/projects/llm_framework/main_sys/src/event_loop.cpp b/projects/llm_framework/main_sys/src/event_loop.cpp index 00b9d9b..4f8d312 100644 --- a/projects/llm_framework/main_sys/src/event_loop.cpp +++ b/projects/llm_framework/main_sys/src/event_loop.cpp @@ -707,7 +707,7 @@ int sys_reset(int com_id, const nlohmann::json &json_obj) int sys_version(int com_id, const nlohmann::json &json_obj) { - usr_out(json_obj["request_id"], json_obj["work_id"], std::string("v1.4"), com_id); + usr_out(json_obj["request_id"], json_obj["work_id"], std::string("v1.5"), com_id); int out = 0; return out; } diff --git a/projects/llm_framework/main_tts/src/main.cpp b/projects/llm_framework/main_tts/src/main.cpp index 055412e..88ef4a8 100644 --- a/projects/llm_framework/main_tts/src/main.cpp +++ b/projects/llm_framework/main_tts/src/main.cpp @@ -111,6 +111,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; @@ -166,8 +167,17 @@ class llm_task { { } + void start() + { + } + + void stop() + { + } + ~llm_task() { + stop(); } }; @@ -495,6 
+505,7 @@ class llm_tts : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); llm_task_.erase(work_id_num); @@ -509,6 +520,7 @@ class llm_tts : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); get_channel(iteam->first)->stop_subscriber(""); iteam->second.reset(); llm_task_.erase(iteam->first); diff --git a/projects/llm_framework/main_vad/src/main.cpp b/projects/llm_framework/main_vad/src/main.cpp index d9c1f7d..6189257 100644 --- a/projects/llm_framework/main_vad/src/main.cpp +++ b/projects/llm_framework/main_vad/src/main.cpp @@ -106,6 +106,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; @@ -216,8 +217,17 @@ class llm_task { pcmdata = buffer_create(); } + void start() + { + } + + void stop() + { + } + ~llm_task() { + stop(); if (vad_) { vad_.reset(); } @@ -553,6 +563,7 @@ class llm_vad : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); if (llm_task_[work_id_num]->audio_flage_) { @@ -570,6 +581,7 @@ class llm_vad : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); if (iteam->second->audio_flage_) { unit_call("audio", "cap_stop", "None"); } diff --git a/projects/llm_framework/main_vlm/src/main.cpp b/projects/llm_framework/main_vlm/src/main.cpp index 758dff2..3ea904c 100644 --- a/projects/llm_framework/main_vlm/src/main.cpp +++ b/projects/llm_framework/main_vlm/src/main.cpp @@ -102,6 +102,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; @@ -279,14 
+280,25 @@ class llm_task { { } + void start() + { + } + + void stop() + { + } + ~llm_task() { + stop(); if (tokenizer_pid_ != -1) { kill(tokenizer_pid_, SIGTERM); waitpid(tokenizer_pid_, nullptr, WNOHANG); + // tokenizer_pid_ = -1; } if (lLaMa_) { lLaMa_->Deinit(); + // lLaMa_.reset(); } } }; @@ -605,6 +617,7 @@ class llm_llm : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); task_pause(llm_task_[work_id_num], get_channel(work_id_num)); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); @@ -620,6 +633,7 @@ class llm_llm : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); get_channel(iteam->first)->stop_subscriber(""); iteam->second.reset(); llm_task_.erase(iteam->first); diff --git a/projects/llm_framework/main_whisper/src/main.cpp b/projects/llm_framework/main_whisper/src/main.cpp index 768e387..e98a301 100644 --- a/projects/llm_framework/main_whisper/src/main.cpp +++ b/projects/llm_framework/main_whisper/src/main.cpp @@ -212,6 +212,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; @@ -533,8 +534,17 @@ class llm_task { _ax_init(); } + void start() + { + } + + void stop() + { + } + ~llm_task() { + stop(); _ax_deinit(); buffer_destroy(pcmdata); } @@ -957,6 +967,7 @@ class llm_whisper : public StackFlow { send("None", "None", error_body, work_id); return -1; } + llm_task_[work_id_num]->stop(); auto llm_channel = get_channel(work_id_num); llm_channel->stop_subscriber(""); if (llm_task_[work_id_num]->audio_flage_) { @@ -974,6 +985,7 @@ class llm_whisper : public StackFlow { if (iteam == llm_task_.end()) { break; } + iteam->second->stop(); if (iteam->second->audio_flage_) { unit_call("audio", "cap_stop", "None"); } diff --git a/projects/llm_framework/main_yolo/src/main.cpp 
b/projects/llm_framework/main_yolo/src/main.cpp index 9e30f5f..8f3284b 100644 --- a/projects/llm_framework/main_yolo/src/main.cpp +++ b/projects/llm_framework/main_yolo/src/main.cpp @@ -111,6 +111,7 @@ class llm_task { SLOGW("config file :%s miss", file_name.c_str()); continue; } + SLOGI("config file :%s read", file_name.c_str()); config_file >> file_body; config_file.close(); break; From 9f257bca57621154115e7f8bdee67c3af07fed86 Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Wed, 9 Apr 2025 18:09:54 +0800 Subject: [PATCH 13/64] [update] main_camera doc --- .../llm_camera_en.md | 27 ++++++++++--------- .../llm_camera_zh.md | 14 ++++++---- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/doc/projects_llm_framework_doc/llm_camera_en.md b/doc/projects_llm_framework_doc/llm_camera_en.md index f7a93f9..25a0f05 100644 --- a/doc/projects_llm_framework_doc/llm_camera_en.md +++ b/doc/projects_llm_framework_doc/llm_camera_en.md @@ -15,25 +15,28 @@ Send JSON: "action": "setup", "object": "camera.setup", "data": { - "response_format": "camera.raw", + "response_format": "image.yuyv422.base64", "input": "/dev/video0", "enoutput": false, "frame_width": 320, - "frame_height": 320 + "frame_height": 320, + "enable_webstream":false, + "rtsp":"rtsp.1280x720.h265" } } ``` -- request_id: Reference basic data explanation. +- request_id: Refer to the basic data explanation. - work_id: When configuring the unit, it is `camera`. -- action: The method being called is `setup`. -- object: The data type being transmitted is `camera.setup`. -- response_format: The returned result is `camera.raw`, which is in YUV422 format. -- input: The name of the device being read. -- frame_width: The output video frame width. -- frame_height: The output video frame height. -- enoutput: Whether to enable the user result output. If camera images are not needed, do not enable this parameter, as - the video stream will increase the communication load of the channel. 
+- action: The method called is `setup`. +- object: The type of data transmitted is `camera.setup`. +- response_format: The output format is `image.yuyv422.base64`, which is in yuyv422 format. An optional format is image.jpeg.base64. +- input: The device name to be read. Example: "/dev/video0", "axera_single_sc850sl" +- frame_width: The width of the video frame output. +- frame_height: The height of the video frame output. +- enoutput: Whether to enable user result output. If you do not need to obtain camera images, do not enable this parameter, as the video stream will increase the communication pressure on the channel. +- enable_webstream: Whether to enable webstream output, webstream will listen on tcp:8989 port, and once a client connection is received, it will push jpeg images in HTTP protocol multipart/x-mixed-replace type. +- rtsp: Whether to enable rtsp stream output, rtsp will establish an RTSP TCP server at rtsp://{DevIp}:8554/axstream0, and you can pull the video stream from this port using the RTSP protocol. The video stream format is 1280x720 H265. Note that this video stream is only valid on the AX630C MIPI camera, and the UVC camera cannot use RTSP. 
Response JSON: @@ -137,7 +140,7 @@ Response JSON: "created": 1731652344, "data": { "enoutput": false, - "response_format": "camera.raw", + "response_format": "image.yuyv422.base64", "input": "/dev/video0", "frame_width": 320, "frame_height": 320 diff --git a/doc/projects_llm_framework_doc/llm_camera_zh.md b/doc/projects_llm_framework_doc/llm_camera_zh.md index 3610a81..97cacbe 100644 --- a/doc/projects_llm_framework_doc/llm_camera_zh.md +++ b/doc/projects_llm_framework_doc/llm_camera_zh.md @@ -15,11 +15,13 @@ "action": "setup", "object": "camera.setup", "data": { - "response_format": "camera.raw", + "response_format": "image.yuyv422.base64", "input": "/dev/video0", "enoutput": false, "frame_width": 320, - "frame_height": 320 + "frame_height": 320, + "enable_webstream":false, + "rtsp":"rtsp.1280x720.h265" } } ``` @@ -28,11 +30,13 @@ - work_id:配置单元时,为 `camera`。 - action:调用的方法为 `setup`。 - object:传输的数据类型为 `camera.setup`。 -- response_format:返回结果为 `camera.raw`,是 yuv422 格式。 -- input:读取的设备名。 +- response_format:返回结果为 `image.yuyv422.base64`,是 yuyv422 格式。可选 image.jpeg.base64 格式输出。 +- input:读取的设备名。示例:"/dev/video0", "axera_single_sc850sl" - frame_width:输出的视频帧宽。 - frame_height:输出的视频帧高。 - enoutput:是否起用用户结果输出。如果不需要获取摄像头图片,请不要开启该参数,视频流会增加信道的通信压力。 +- enable_webstream:是否启用 webstream 流输出,webstream 会监听 tcp:8989 端口,一但收到客户端连接,将会以 HTTP 协议 multipart/x-mixed-replace 类型推送 jpeg 图片。 +- rtsp:是否启用 rtsp 流输出,rtsp 会建立一个 rtsp://{DevIp}:8554/axstream0 RTSP TCP 服务端,可使用RTSP 协议向该端口拉取视频流。视频流的格式为 1280x720 H265。注意,该视频流只在 AX630C MIPI 摄像头上有效,UVC 摄像头无法使用 RTSP。 响应 json: @@ -136,7 +140,7 @@ error::code 为 0 表示执行成功。 "created": 1731652344, "data": { "enoutput": false, - "response_format": "camera.raw", + "response_format": "image.yuyv422.base64", "input": "/dev/video0", "frame_width": 320, "frame_height": 320 From 3e367c1536e777bc2edde829dd2510dbd5c86bb6 Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Thu, 10 Apr 2025 12:25:44 +0800 Subject: [PATCH 14/64] [update] kws doc add enwake_audio --- 
doc/projects_llm_framework_doc/llm_kws_en.md | 4 +++- doc/projects_llm_framework_doc/llm_kws_zh.md | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/projects_llm_framework_doc/llm_kws_en.md b/doc/projects_llm_framework_doc/llm_kws_en.md index 89a9cad..4ab85b4 100644 --- a/doc/projects_llm_framework_doc/llm_kws_en.md +++ b/doc/projects_llm_framework_doc/llm_kws_en.md @@ -20,7 +20,8 @@ Send JSON: "response_format": "kws.bool", "input": "sys.pcm", "enoutput": true, - "kws": "HELLO" + "kws": "HELLO", + "enwake_audio": true } } ``` @@ -34,6 +35,7 @@ Send JSON: - input: The input is `sys.pcm`, representing system audio. - enoutput: Whether to enable user result output. - kws: The Chinese wake-up word is `"你好你好"`. +- enwake_audio: Whether to enable wake-up audio output. Default is true. Response JSON: diff --git a/doc/projects_llm_framework_doc/llm_kws_zh.md b/doc/projects_llm_framework_doc/llm_kws_zh.md index 86dcbfa..5bda61b 100644 --- a/doc/projects_llm_framework_doc/llm_kws_zh.md +++ b/doc/projects_llm_framework_doc/llm_kws_zh.md @@ -19,7 +19,8 @@ "response_format": "kws.bool", "input": "sys.pcm", "enoutput": true, - "kws": "你好你好" + "kws": "你好你好", + "enwake_audio": true } } ``` @@ -33,6 +34,7 @@ - input:输入的为 `sys.pcm`,代表的是系统音频。 - enoutput:是否起用用户结果输出。 - kws:中文唤醒词为 `"你好你好"`。 +- enwake_audio:是否起用唤醒音频输出。默认是 true 响应 json: From b8108f313491641ba0e02ee1409ba109fd2d32b6 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 15 Apr 2025 12:20:08 +0800 Subject: [PATCH 15/64] [update] Compatible with OpenAI API calls --- projects/llm_framework/main_whisper/src/main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/llm_framework/main_whisper/src/main.cpp b/projects/llm_framework/main_whisper/src/main.cpp index e98a301..c2bd3d0 100644 --- a/projects/llm_framework/main_whisper/src/main.cpp +++ b/projects/llm_framework/main_whisper/src/main.cpp @@ -821,6 +821,7 @@ class llm_whisper : public StackFlow { }); llm_task_obj->audio_flage_ = true; } 
else if (input.find("whisper") != std::string::npos) { + if (input.find("base64.stream") != std::string::npos) llm_task_obj->delay_audio_frame_ = 0; llm_channel->subscriber_work_id( "", std::bind(&llm_whisper::task_user_data, this, std::weak_ptr(llm_task_obj), std::weak_ptr(llm_channel), std::placeholders::_1, From a6bc38bf6f94b28718459c710e9a4496ee6e9731 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 15 Apr 2025 19:37:42 +0800 Subject: [PATCH 16/64] [update] Update log printing. Update fields --- .../llm_framework/main_whisper/src/main.cpp | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/projects/llm_framework/main_whisper/src/main.cpp b/projects/llm_framework/main_whisper/src/main.cpp index c2bd3d0..9421662 100644 --- a/projects/llm_framework/main_whisper/src/main.cpp +++ b/projects/llm_framework/main_whisper/src/main.cpp @@ -260,7 +260,7 @@ class llm_task { positional_embedding.resize(mode_config_.whisper_n_text_ctx * WHISPER_N_TEXT_STATE); FILE *fp = fopen(mode_config_.positional_embedding.c_str(), "rb"); if (!fp) { - printf("Open %s failed!\n", mode_config_.positional_embedding.c_str()); + SLOGE("Open %s failed!\n", mode_config_.positional_embedding.c_str()); return -3; } fread(positional_embedding.data(), sizeof(float), mode_config_.whisper_n_text_ctx * WHISPER_N_TEXT_STATE, @@ -282,15 +282,15 @@ class llm_task { decoder_main_ = std::make_unique(); decoder_loop_ = std::make_unique(); if (0 != encoder_->Init(mode_config_.encoder.c_str())) { - printf("encoder init failed!\n"); + SLOGE("encoder init failed!\n"); return -4; } if (0 != decoder_main_->Init(mode_config_.decoder_main.c_str())) { - printf("Init decoder_main model failed!\n"); + SLOGE("Init decoder_main model failed!\n"); return -5; } if (0 != decoder_loop_->Init(mode_config_.decoder_loop.c_str())) { - printf("Init decoder_main model failed!\n"); + SLOGE("Init decoder_main model failed!\n"); return -6; } } catch (...) 
{ @@ -391,7 +391,7 @@ class llm_task { return; } end = get_current_time(); - printf("Encoder run take %.2f ms\n", (end - start)); + SLOGI("Encoder run take %.2f ms\n", (end - start)); // detect language SOT_SEQUENCE[1] = detect_language(language_); @@ -417,7 +417,7 @@ class llm_task { supress_tokens(logits, true); max_token_id = argmax(logits); - printf("First token: %d \t take %.2fms\n", max_token_id, (end - start)); + SLOGI("First token: %d \t take %.2fms\n", max_token_id, (end - start)); mode_config_.neg_inf = -std::numeric_limits::infinity(); std::vector mask(mode_config_.whisper_n_text_ctx); for (int n = 0; n < mode_config_.whisper_n_text_ctx - offset - 1; n++) { @@ -446,7 +446,7 @@ class llm_task { ret = decoder_loop_->Run(); if (ret) { - printf("decoder_loop run failed!\n"); + SLOGE("decoder_loop run failed!\n"); return; } @@ -461,11 +461,11 @@ class llm_task { max_token_id = argmax(logits); end = get_current_time(); - printf("Next Token: %d \t take %.2fms\n", max_token_id, (end - start)); + SLOGI("Next Token: %d \t take %.2fms\n", max_token_id, (end - start)); } end_all = get_current_time(); - printf("All take %.2f ms\n", (end_all - start_all)); + SLOGI("All take %.2f ms\n", (end_all - start_all)); std::string s; for (const auto i : results) { @@ -476,12 +476,10 @@ class llm_task { } if (mode_config_.language == "en" || mode_config_.language == "ja") { - printf("Result: %s\n", s.c_str()); if (out_callback_) out_callback_(s, true); } else { const opencc::SimpleConverter converter(mode_config_.t2s.c_str()); std::string simple_str = converter.Convert(s); - printf("Result: %s\n", simple_str.c_str()); if ((!simple_str.empty()) && out_callback_) { out_callback_(simple_str, true); } @@ -821,7 +819,7 @@ class llm_whisper : public StackFlow { }); llm_task_obj->audio_flage_ = true; } else if (input.find("whisper") != std::string::npos) { - if (input.find("base64.stream") != std::string::npos) llm_task_obj->delay_audio_frame_ = 0; + if (input.find("stream.base64") != 
std::string::npos) llm_task_obj->delay_audio_frame_ = 0; llm_channel->subscriber_work_id( "", std::bind(&llm_whisper::task_user_data, this, std::weak_ptr(llm_task_obj), std::weak_ptr(llm_channel), std::placeholders::_1, From 4274e8c320c5ab9969613a8cad3896aaf3cf7760 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 16 Apr 2025 11:13:39 +0800 Subject: [PATCH 17/64] [fix] Fix buffer data overwrite --- projects/llm_framework/main_whisper/src/main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/llm_framework/main_whisper/src/main.cpp b/projects/llm_framework/main_whisper/src/main.cpp index 9421662..3dc05e8 100644 --- a/projects/llm_framework/main_whisper/src/main.cpp +++ b/projects/llm_framework/main_whisper/src/main.cpp @@ -316,6 +316,7 @@ class llm_task { if (endpoint_flage_) return; } endpoint_flage_ = true; + buffer_resize(pcmdata, 0); buffer_write_char(pcmdata, raw.c_str(), raw.length()); buffer_position_set(pcmdata, 0); count = 0; From f952bb64e6277daca5292abfb0b591d909916690 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 16 Apr 2025 14:47:10 +0800 Subject: [PATCH 18/64] [update] Added support for 650N. Enable bLoadModelUseCmm. 
--- .../llm_framework/main_whisper/SConstruct | 2 +- .../main_whisper/src/runner/EngineWrapper.cpp | 106 +++++++++++++++++- 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/projects/llm_framework/main_whisper/SConstruct b/projects/llm_framework/main_whisper/SConstruct index c14cf6b..c8c205f 100644 --- a/projects/llm_framework/main_whisper/SConstruct +++ b/projects/llm_framework/main_whisper/SConstruct @@ -17,7 +17,7 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] -DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17'] +DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17', '-DCHIP_AX630C'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] diff --git a/projects/llm_framework/main_whisper/src/runner/EngineWrapper.cpp b/projects/llm_framework/main_whisper/src/runner/EngineWrapper.cpp index 4bc6aed..cd490ee 100644 --- a/projects/llm_framework/main_whisper/src/runner/EngineWrapper.cpp +++ b/projects/llm_framework/main_whisper/src/runner/EngineWrapper.cpp @@ -12,8 +12,13 @@ #include +#if defined(CHIP_AX650) +static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"3.6T", "7.2T", "18T"}; +#endif +#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"HalfOCM", "FullOCM"}; +#endif /// @brief npu type @@ -26,7 +31,104 @@ typedef enum axNPU_TYPE_E { AX_BL_VNPU_2 = (1 << 4) /* running under BIG-LITTLE VNPU2 */ } AX_NPU_TYPE_E; +#if defined(CHIP_AX650) +static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_TYPE_T &eModelType, const AX_S32 &nNpuType, AX_U32 &nNpuSet) { + AX_ENGINE_NPU_ATTR_T stNpuAttr; + memset(&stNpuAttr, 0x00, sizeof(stNpuAttr)); + + auto ret = AX_ENGINE_GetVNPUAttr(&stNpuAttr); + if (ret == 0) { + // VNPU DISABLE 
+ if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_DISABLE) { + nNpuSet = 0x01; // NON-VNPU (0b111) + // printf("%s will run under VNPU-DISABLE [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); + } + // STD VNPU + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_STD) { + // 7.2T & 10.8T no allow + if (eModelType == AX_ENGINE_MODEL_TYPE1 + || eModelType == AX_ENGINE_MODEL_TYPE2) { + // printf("%s model type%d: [%s], no allow run under STD VNPU\n", strModel.c_str(), eModelType, strAlgoModelType[eModelType]); + return -1; + } + // default STD VNPU2 + if (nNpuType == 0) { + nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default STD-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); + } + else { + if (nNpuType & AX_STD_VNPU_1) { + nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under STD-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); + } + if (nNpuType & AX_STD_VNPU_2) { + nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under STD-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); + } + if (nNpuType & AX_STD_VNPU_3) { + nNpuSet |= 0x04; // VNPU3 (0b100) + // printf("%s will run under STD-VNPU3 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); + } + } + } + // BL VNPU + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BIG_LITTLE) { + // 10.8T no allow + if (eModelType == AX_ENGINE_MODEL_TYPE2) { + // printf("%s model type%d: [%s], no allow run under BL VNPU\n", strModel.c_str(), eModelType, strAlgoModelType[eModelType]); + return -1; + } + + // default BL VNPU + if (nNpuType == 0) { + // 7.2T default BL VNPU1 + if (eModelType == AX_ENGINE_MODEL_TYPE1) { + nNpuSet = 0x01; // VNPU1 (0b001) + // printf("%s will run under default BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); + } + // 3.6T default BL VNPU2 + else { + nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default BL-VNPU2 [%s]\n", strModel.c_str(), 
strAlgoModelType[eModelType]); + } + } + else { + // 7.2T + if (eModelType == AX_ENGINE_MODEL_TYPE1) { + // no allow set to BL VNPU2 + if (nNpuType & AX_BL_VNPU_2) { + // printf("%s model type%d: [%s], no allow run under BL VNPU2\n", strModel.c_str(), eModelType, strAlgoModelType[eModelType]); + return -1; + } + if (nNpuType & AX_BL_VNPU_1) { + nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); + } + } + // 3.6T + else { + if (nNpuType & AX_BL_VNPU_1) { + nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); + } + if (nNpuType & AX_BL_VNPU_2) { + nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under BL-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); + } + } + } + } + } + else { + printf("AX_ENGINE_GetVNPUAttr fail ret = %x\n", ret); + } + + return ret; +} +#endif + +#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_TYPE_T &eModelType, const AX_S32 &nNpuType, AX_U32 &nNpuSet) { AX_ENGINE_NPU_ATTR_T stNpuAttr; memset(&stNpuAttr, 0x00, sizeof(stNpuAttr)); @@ -69,13 +171,15 @@ static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_ return ret; } +#endif + int EngineWrapper::Init(const char* strModelPath, uint32_t nNpuType) { AX_S32 ret = 0; // 1. load model - AX_BOOL bLoadModelUseCmm = AX_FALSE; + AX_BOOL bLoadModelUseCmm = AX_TRUE; AX_CHAR *pModelBufferVirAddr = nullptr; AX_U64 u64ModelBufferPhyAddr = 0; AX_U32 nModelBufferSize = 0; From 98adcc20cc4347987d6e3eee9be606c35c6694d5 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 16 Apr 2025 17:08:08 +0800 Subject: [PATCH 19/64] [update] depth_anything, melotts, yolo. Added support for 650N. 
--- .../main_depth_anything/SConstruct | 2 +- .../main_depth_anything/src/EngineWrapper.cpp | 172 +++++++++++++----- .../main_depth_anything/src/EngineWrapper.hpp | 33 ++-- .../main_depth_anything/src/main.cpp | 3 +- .../llm_framework/main_melotts/SConstruct | 2 +- .../llm_framework/main_melotts/src/main.cpp | 11 +- .../main_melotts/src/runner/EngineWrapper.cpp | 88 ++++++++- .../main_melotts/src/runner/EngineWrapper.hpp | 4 +- projects/llm_framework/main_yolo/SConstruct | 2 +- .../main_yolo/src/EngineWrapper.cpp | 146 +++++++++++---- .../main_yolo/src/EngineWrapper.hpp | 12 +- projects/llm_framework/main_yolo/src/main.cpp | 2 +- 12 files changed, 347 insertions(+), 130 deletions(-) diff --git a/projects/llm_framework/main_depth_anything/SConstruct b/projects/llm_framework/main_depth_anything/SConstruct index 8500311..7ad6376 100644 --- a/projects/llm_framework/main_depth_anything/SConstruct +++ b/projects/llm_framework/main_depth_anything/SConstruct @@ -16,7 +16,7 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] -DEFINITIONS += ['-std=c++17'] +DEFINITIONS += ['-O3', '-std=c++17', '-DCHIP_AX630C'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] diff --git a/projects/llm_framework/main_depth_anything/src/EngineWrapper.cpp b/projects/llm_framework/main_depth_anything/src/EngineWrapper.cpp index 09d7067..7947417 100644 --- a/projects/llm_framework/main_depth_anything/src/EngineWrapper.cpp +++ b/projects/llm_framework/main_depth_anything/src/EngineWrapper.cpp @@ -7,13 +7,18 @@ * written consent of Axera Semiconductor (Ningbo) Co., Ltd. 
* **************************************************************************************************/ -#include "base/detection.hpp" -#define UNUSE_STRUCT_OBJECT #include "EngineWrapper.hpp" #include "utils/io.hpp" + #include -static const char* strAlgoModelType[AX_ENGINE_VIRTUAL_NPU_BUTT] = {"1.6T", "3.2T"}; +#if defined(CHIP_AX650) +static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"3.6T", "7.2T", "18T"}; +#endif + +#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"HalfOCM", "FullOCM"}; +#endif /// @brief npu type typedef enum axNPU_TYPE_E { @@ -25,8 +30,9 @@ typedef enum axNPU_TYPE_E { AX_BL_VNPU_2 = (1 << 4) /* running under BIG-LITTLE VNPU2 */ } AX_NPU_TYPE_E; -static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_TYPE_T& eModelType, - const AX_S32& nNpuType, AX_U32& nNpuSet) +#if defined(CHIP_AX650) +static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_TYPE_T &eModelType, + const AX_S32 &nNpuType, AX_U32 &nNpuSet) { AX_ENGINE_NPU_ATTR_T stNpuAttr; memset(&stNpuAttr, 0x00, sizeof(stNpuAttr)); @@ -36,33 +42,42 @@ static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_ // VNPU DISABLE if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_DISABLE) { nNpuSet = 0x01; // NON-VNPU (0b111) + // printf("%s will run under VNPU-DISABLE [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } // STD VNPU - else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BUTT) { + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_STD) { // 7.2T & 10.8T no allow - if (eModelType == AX_ENGINE_MODEL_TYPE1 || eModelType == AX_ENGINE_MODEL_TYPE1) { + if (eModelType == AX_ENGINE_MODEL_TYPE1 || eModelType == AX_ENGINE_MODEL_TYPE2) { + // printf("%s model type%d: [%s], no allow run under STD VNPU\n", strModel.c_str(), eModelType, + // strAlgoModelType[eModelType]); return -1; } // default STD VNPU2 if (nNpuType == 0) { nNpuSet = 
0x02; // VNPU2 (0b010) + // printf("%s will run under default STD-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } else { if (nNpuType & AX_STD_VNPU_1) { nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under STD-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } if (nNpuType & AX_STD_VNPU_2) { nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under STD-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } if (nNpuType & AX_STD_VNPU_3) { nNpuSet |= 0x04; // VNPU3 (0b100) + // printf("%s will run under STD-VNPU3 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } } // BL VNPU - else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BUTT) { + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BIG_LITTLE) { // 10.8T no allow - if (eModelType == AX_ENGINE_MODEL_TYPE1) { + if (eModelType == AX_ENGINE_MODEL_TYPE2) { + // printf("%s model type%d: [%s], no allow run under BL VNPU\n", strModel.c_str(), eModelType, + // strAlgoModelType[eModelType]); return -1; } @@ -71,29 +86,38 @@ static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_ // 7.2T default BL VNPU1 if (eModelType == AX_ENGINE_MODEL_TYPE1) { nNpuSet = 0x01; // VNPU1 (0b001) + // printf("%s will run under default BL-VNPU1 [%s]\n", strModel.c_str(), + // strAlgoModelType[eModelType]); } // 3.6T default BL VNPU2 else { nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default BL-VNPU2 [%s]\n", strModel.c_str(), + // strAlgoModelType[eModelType]); } } else { // 7.2T if (eModelType == AX_ENGINE_MODEL_TYPE1) { // no allow set to BL VNPU2 if (nNpuType & AX_BL_VNPU_2) { + // printf("%s model type%d: [%s], no allow run under BL VNPU2\n", strModel.c_str(), eModelType, + // strAlgoModelType[eModelType]); return -1; } if (nNpuType & AX_BL_VNPU_1) { nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } // 3.6T else { if (nNpuType & 
AX_BL_VNPU_1) { nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } if (nNpuType & AX_BL_VNPU_2) { nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under BL-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } } @@ -104,21 +128,69 @@ static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_ return ret; } +#endif + +#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_TYPE_T &eModelType, + const AX_S32 &nNpuType, AX_U32 &nNpuSet) +{ + AX_ENGINE_NPU_ATTR_T stNpuAttr; + memset(&stNpuAttr, 0x00, sizeof(stNpuAttr)); + + auto ret = AX_ENGINE_GetVNPUAttr(&stNpuAttr); + if (ret == 0) { + // VNPU DISABLE + if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_DISABLE) { + nNpuSet = 0x01; // NON-VNPU (0b111) + // ALOGN("%s will run under VNPU-DISABLE [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + // STD VNPU + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_ENABLE) { + // full ocm model was no allowned + if (eModelType == AX_ENGINE_MODEL_TYPE1) { + // printf("%s model type%d: [%s], no allow run under STD VNPU", strModel.c_str(), eModelType, + // strAlgoModelType[eModelType]); + return -1; + } + + // default STD VNPU2 + if (nNpuType == 0) { + nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default STD-VNPU2 [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } else { + if (nNpuType & AX_STD_VNPU_1) { + nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under STD-VNPU1 [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + if (nNpuType & AX_STD_VNPU_2) { + nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under STD-VNPU2 [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + } + } + } else { + printf("AX_ENGINE_GetVNPUAttr fail ret = %x", ret); + } + + return ret; +} +#endif -int EngineWrapper::Init(const 
char* strModelPath, uint32_t nNpuType) +int EngineWrapper::Init(const char *strModelPath, uint32_t nNpuType) { AX_S32 ret = 0; // 1. load model - AX_BOOL bLoadModelUseCmm = AX_FALSE; - AX_CHAR* pModelBufferVirAddr = nullptr; + AX_BOOL bLoadModelUseCmm = AX_TRUE; + AX_CHAR *pModelBufferVirAddr = nullptr; AX_U64 u64ModelBufferPhyAddr = 0; AX_U32 nModelBufferSize = 0; std::vector model_buffer; if (bLoadModelUseCmm) { - if (!utils::read_file(strModelPath, (AX_VOID**)&pModelBufferVirAddr, u64ModelBufferPhyAddr, nModelBufferSize)) { + if (!utils::read_file(strModelPath, (AX_VOID **)&pModelBufferVirAddr, u64ModelBufferPhyAddr, + nModelBufferSize)) { printf("ALGO read model(%s) fail\n", strModelPath); return -1; } @@ -220,7 +292,7 @@ int EngineWrapper::Init(const char* strModelPath, uint32_t nNpuType) // 6. prepare io // AX_U32 nIoDepth = (stCtx.vecOutputBufferFlag.size() == 0) ? 1 : stCtx.vecOutputBufferFlag.size(); - ret = utils::prepare_io(strModelPath, m_io_info, m_io, utils::IO_BUFFER_STRATEGY_DEFAULT); + ret = utils::prepare_io(strModelPath, m_io_info, m_io, utils::IO_BUFFER_STRATEGY_CACHED); if (0 != ret) { printf("prepare io failed!\n"); utils::free_io(m_io); @@ -233,12 +305,12 @@ int EngineWrapper::Init(const char* strModelPath, uint32_t nNpuType) return 0; } -int EngineWrapper::SetInput(void* pInput, int index) +int EngineWrapper::SetInput(void *pInput, int index) { return utils::push_io_input(pInput, index, m_io); } -int EngineWrapper::RunSync() +int EngineWrapper::Run() { if (!m_hasInit) return -1; @@ -252,12 +324,43 @@ int EngineWrapper::RunSync() return 0; } -void post_process(AX_ENGINE_IO_INFO_T* io_info, AX_ENGINE_IO_T* io_data, const cv::Mat& mat, std::string& model_type, - std::string& byteString) +int EngineWrapper::GetOutput(void *pOutput, int index) +{ + return utils::push_io_output(pOutput, index, m_io); +} + +int EngineWrapper::GetInputSize(int index) +{ + return m_io.pInputs[index].nSize; +} + +int EngineWrapper::GetOutputSize(int index) +{ + return 
m_io.pOutputs[index].nSize; +} + +void *EngineWrapper::GetOutputPtr(int index) +{ + utils::cache_io_flush(&m_io.pOutputs[index]); + return m_io.pOutputs[index].pVirAddr; +} + +int EngineWrapper::Release() +{ + if (m_handle) { + utils::free_io(m_io); + AX_ENGINE_DestroyHandle(m_handle); + m_handle = nullptr; + } + return 0; +} + +void post_process(AX_ENGINE_IO_INFO_T *io_info, AX_ENGINE_IO_T *io_data, const cv::Mat &mat, std::string &model_type, + std::string &byteString) { if (model_type == "segment") { - auto& output = io_data->pOutputs[0]; - auto& info = io_info->pOutputs[0]; + auto &output = io_data->pOutputs[0]; + auto &info = io_info->pOutputs[0]; cv::Mat feature(info.pShape[2], info.pShape[3], CV_32FC1, output.pVirAddr); double minVal, maxVal; @@ -280,33 +383,8 @@ void post_process(AX_ENGINE_IO_INFO_T* io_info, AX_ENGINE_IO_T* io_data, const c } } -int EngineWrapper::Post_Process(cv::Mat& mat, std::string& model_type, std::string& byteString) +int EngineWrapper::Post_Process(cv::Mat &mat, std::string &model_type, std::string &byteString) { post_process(m_io_info, &m_io, mat, model_type, byteString); return 0; -} - -int EngineWrapper::GetOutput(void* pOutput, int index) -{ - return utils::push_io_output(pOutput, index, m_io); -} - -int EngineWrapper::GetInputSize(int index) -{ - return m_io.pInputs[index].nSize; -} - -int EngineWrapper::GetOutputSize(int index) -{ - return m_io.pOutputs[index].nSize; -} - -int EngineWrapper::Release() -{ - if (m_handle) { - utils::free_io(m_io); - AX_ENGINE_DestroyHandle(m_handle); - m_handle = nullptr; - } - return 0; -} +} \ No newline at end of file diff --git a/projects/llm_framework/main_depth_anything/src/EngineWrapper.hpp b/projects/llm_framework/main_depth_anything/src/EngineWrapper.hpp index 520a313..c3f848e 100644 --- a/projects/llm_framework/main_depth_anything/src/EngineWrapper.hpp +++ b/projects/llm_framework/main_depth_anything/src/EngineWrapper.hpp @@ -10,27 +10,14 @@ #pragma once +#include +#include +#include 
+#include #include #include -#include "ax_engine_api.h" - -#ifndef UNUSE_STRUCT_OBJECT -namespace detection { -typedef struct Object { - cv::Rect_ rect; - int label; - float prob; - cv::Point2f landmark[5]; - /* for yolov5-seg */ - cv::Mat mask; - std::vector mask_feat; - std::vector kps_feat; - /* for yolov8-obb */ - float angle; -} Object; -} // namespace detection -#endif +#include "ax_engine_api.h" class EngineWrapper { public: @@ -47,21 +34,23 @@ class EngineWrapper { int SetInput(void* pInput, int index); - int RunSync(); - - int Post_Process(cv::Mat& mat, std::string& model_type, std::string& byteString); + int Run(); int GetOutput(void* pOutput, int index); int GetInputSize(int index); int GetOutputSize(int index); + void* GetOutputPtr(int index); + int Release(); + int Post_Process(cv::Mat& mat, std::string& model_type, std::string& byteString); + protected: bool m_hasInit; AX_ENGINE_HANDLE m_handle; AX_ENGINE_IO_INFO_T* m_io_info{}; AX_ENGINE_IO_T m_io{}; int m_input_num{}, m_output_num{}; -}; +}; \ No newline at end of file diff --git a/projects/llm_framework/main_depth_anything/src/main.cpp b/projects/llm_framework/main_depth_anything/src/main.cpp index 0d119d7..ee7d9e8 100644 --- a/projects/llm_framework/main_depth_anything/src/main.cpp +++ b/projects/llm_framework/main_depth_anything/src/main.cpp @@ -198,11 +198,10 @@ class llm_task { common::get_input_data_no_letterbox(src, image, mode_config_.img_h, mode_config_.img_w, bgr2rgb); cv::Mat img_mat(mode_config_.img_h, mode_config_.img_w, CV_8UC3, image.data()); depth_anything_->SetInput((void *)image.data(), 0); - if (0 != depth_anything_->RunSync()) { + if (0 != depth_anything_->Run()) { SLOGE("Run depth_anything model failed!\n"); throw std::string("depth_anything_ RunSync error"); } - std::vector objects; std::string depth_anything_output; depth_anything_->Post_Process(img_mat, mode_config_.model_type, depth_anything_output); if (out_callback_) out_callback_(depth_anything_output, true); diff --git 
a/projects/llm_framework/main_melotts/SConstruct b/projects/llm_framework/main_melotts/SConstruct index 358ddb2..0fd84fb 100644 --- a/projects/llm_framework/main_melotts/SConstruct +++ b/projects/llm_framework/main_melotts/SConstruct @@ -17,7 +17,7 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] -DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17'] +DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17', '-DCHIP_AX630C'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp index f8610ef..9890fac 100644 --- a/projects/llm_framework/main_melotts/src/main.cpp +++ b/projects/llm_framework/main_melotts/src/main.cpp @@ -29,7 +29,7 @@ using namespace StackFlows; int main_exit_flage = 0; static void __sigint(int iSigNo) { - SLOGW("llm_sys will be exit!"); + SLOGW("llm_melotts will be exit!"); main_exit_flage = 1; } @@ -277,7 +277,7 @@ class llm_task { } decoder_->SetInput(zp.data(), 0); decoder_->SetInput(g_matrix.data(), 1); - if (0 != decoder_->RunSync()) { + if (0 != decoder_->Run()) { printf("Run decoder model failed!\n"); throw std::string("decoder_ RunSync error"); } @@ -362,10 +362,7 @@ class llm_task { ~llm_task() { stop(); - if (decoder_) { - decoder_->Release(); - // decoder_.reset(); - } + if (decoder_) decoder_->Release(); _ax_deinit(); } }; @@ -672,7 +669,7 @@ class llm_tts : public StackFlow { int exit(const std::string &work_id, const std::string &object, const std::string &data) override { - SLOGI("llm_tts::exit:%s", data.c_str()); + SLOGI("llm_melotts::exit:%s", data.c_str()); nlohmann::json error_body; int work_id_num = sample_get_work_id_num(work_id); diff --git 
a/projects/llm_framework/main_melotts/src/runner/EngineWrapper.cpp b/projects/llm_framework/main_melotts/src/runner/EngineWrapper.cpp index d604104..cd490ee 100644 --- a/projects/llm_framework/main_melotts/src/runner/EngineWrapper.cpp +++ b/projects/llm_framework/main_melotts/src/runner/EngineWrapper.cpp @@ -12,7 +12,14 @@ #include -static const char *strAlgoModelType[AX_ENGINE_VIRTUAL_NPU_BUTT] = {"1.6T", "3.2T"}; +#if defined(CHIP_AX650) +static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"3.6T", "7.2T", "18T"}; +#endif + +#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"HalfOCM", "FullOCM"}; +#endif + /// @brief npu type typedef enum axNPU_TYPE_E { @@ -24,6 +31,7 @@ typedef enum axNPU_TYPE_E { AX_BL_VNPU_2 = (1 << 4) /* running under BIG-LITTLE VNPU2 */ } AX_NPU_TYPE_E; +#if defined(CHIP_AX650) static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_TYPE_T &eModelType, const AX_S32 &nNpuType, AX_U32 &nNpuSet) { AX_ENGINE_NPU_ATTR_T stNpuAttr; memset(&stNpuAttr, 0x00, sizeof(stNpuAttr)); @@ -33,35 +41,42 @@ static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_ // VNPU DISABLE if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_DISABLE) { nNpuSet = 0x01; // NON-VNPU (0b111) + // printf("%s will run under VNPU-DISABLE [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } // STD VNPU - else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BUTT) { + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_STD) { // 7.2T & 10.8T no allow if (eModelType == AX_ENGINE_MODEL_TYPE1 - || eModelType == AX_ENGINE_MODEL_TYPE1) { + || eModelType == AX_ENGINE_MODEL_TYPE2) { + // printf("%s model type%d: [%s], no allow run under STD VNPU\n", strModel.c_str(), eModelType, strAlgoModelType[eModelType]); return -1; } // default STD VNPU2 if (nNpuType == 0) { nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default STD-VNPU2 [%s]\n", 
strModel.c_str(), strAlgoModelType[eModelType]); } else { if (nNpuType & AX_STD_VNPU_1) { nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under STD-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } if (nNpuType & AX_STD_VNPU_2) { nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under STD-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } if (nNpuType & AX_STD_VNPU_3) { nNpuSet |= 0x04; // VNPU3 (0b100) + // printf("%s will run under STD-VNPU3 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } } // BL VNPU - else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BUTT) { + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BIG_LITTLE) { // 10.8T no allow - if (eModelType == AX_ENGINE_MODEL_TYPE1) { + if (eModelType == AX_ENGINE_MODEL_TYPE2) { + // printf("%s model type%d: [%s], no allow run under BL VNPU\n", strModel.c_str(), eModelType, strAlgoModelType[eModelType]); return -1; } @@ -70,10 +85,12 @@ static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_ // 7.2T default BL VNPU1 if (eModelType == AX_ENGINE_MODEL_TYPE1) { nNpuSet = 0x01; // VNPU1 (0b001) + // printf("%s will run under default BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } // 3.6T default BL VNPU2 else { nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default BL-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } else { @@ -81,19 +98,23 @@ static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_ if (eModelType == AX_ENGINE_MODEL_TYPE1) { // no allow set to BL VNPU2 if (nNpuType & AX_BL_VNPU_2) { + // printf("%s model type%d: [%s], no allow run under BL VNPU2\n", strModel.c_str(), eModelType, strAlgoModelType[eModelType]); return -1; } if (nNpuType & AX_BL_VNPU_1) { nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } // 3.6T else { if (nNpuType & 
AX_BL_VNPU_1) { nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } if (nNpuType & AX_BL_VNPU_2) { nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under BL-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } } @@ -105,6 +126,52 @@ static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_ return ret; } +#endif + +#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_TYPE_T &eModelType, const AX_S32 &nNpuType, AX_U32 &nNpuSet) { + AX_ENGINE_NPU_ATTR_T stNpuAttr; + memset(&stNpuAttr, 0x00, sizeof(stNpuAttr)); + + auto ret = AX_ENGINE_GetVNPUAttr(&stNpuAttr); + if (ret == 0) { + // VNPU DISABLE + if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_DISABLE) { + nNpuSet = 0x01; // NON-VNPU (0b111) + // ALOGN("%s will run under VNPU-DISABLE [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + // STD VNPU + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_ENABLE) { + // full ocm model was no allowned + if (eModelType == AX_ENGINE_MODEL_TYPE1) { + // printf("%s model type%d: [%s], no allow run under STD VNPU", strModel.c_str(), eModelType, strAlgoModelType[eModelType]); + return -1; + } + + // default STD VNPU2 + if (nNpuType == 0) { + nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default STD-VNPU2 [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + else { + if (nNpuType & AX_STD_VNPU_1) { + nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under STD-VNPU1 [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + if (nNpuType & AX_STD_VNPU_2) { + nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under STD-VNPU2 [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + } + } + } + else { + printf("AX_ENGINE_GetVNPUAttr fail ret = %x", ret); + } + + return ret; +} +#endif int EngineWrapper::Init(const char* 
strModelPath, uint32_t nNpuType) @@ -112,7 +179,7 @@ int EngineWrapper::Init(const char* strModelPath, uint32_t nNpuType) AX_S32 ret = 0; // 1. load model - AX_BOOL bLoadModelUseCmm = AX_FALSE; + AX_BOOL bLoadModelUseCmm = AX_TRUE; AX_CHAR *pModelBufferVirAddr = nullptr; AX_U64 u64ModelBufferPhyAddr = 0; AX_U32 nModelBufferSize = 0; @@ -224,7 +291,7 @@ int EngineWrapper::Init(const char* strModelPath, uint32_t nNpuType) // 6. prepare io // AX_U32 nIoDepth = (stCtx.vecOutputBufferFlag.size() == 0) ? 1 : stCtx.vecOutputBufferFlag.size(); - ret = utils::prepare_io(strModelPath, m_io_info, m_io, utils::IO_BUFFER_STRATEGY_DEFAULT); + ret = utils::prepare_io(strModelPath, m_io_info, m_io, utils::IO_BUFFER_STRATEGY_CACHED); if (0 != ret) { printf("prepare io failed!\n"); utils::free_io(m_io); @@ -241,7 +308,7 @@ int EngineWrapper::SetInput(void* pInput, int index) { return utils::push_io_input(pInput, index, m_io); } -int EngineWrapper::RunSync() +int EngineWrapper::Run() { if (!m_hasInit) return -1; @@ -268,6 +335,11 @@ int EngineWrapper::GetOutputSize(int index) { return m_io.pOutputs[index].nSize; } +void* EngineWrapper::GetOutputPtr(int index) { + utils::cache_io_flush(&m_io.pOutputs[index]); + return m_io.pOutputs[index].pVirAddr; +} + int EngineWrapper::Release() { if (m_handle) { diff --git a/projects/llm_framework/main_melotts/src/runner/EngineWrapper.hpp b/projects/llm_framework/main_melotts/src/runner/EngineWrapper.hpp index e249d54..c0d53ba 100644 --- a/projects/llm_framework/main_melotts/src/runner/EngineWrapper.hpp +++ b/projects/llm_framework/main_melotts/src/runner/EngineWrapper.hpp @@ -33,13 +33,15 @@ class EngineWrapper { int SetInput(void* pInput, int index); - int RunSync(); + int Run(); int GetOutput(void* pOutput, int index); int GetInputSize(int index); int GetOutputSize(int index); + void* GetOutputPtr(int index); + int Release(); protected: diff --git a/projects/llm_framework/main_yolo/SConstruct b/projects/llm_framework/main_yolo/SConstruct index 
8400aa9..8b8f356 100644 --- a/projects/llm_framework/main_yolo/SConstruct +++ b/projects/llm_framework/main_yolo/SConstruct @@ -16,7 +16,7 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] -DEFINITIONS += ['-std=c++17', '-O2'] +DEFINITIONS += ['-std=c++17', '-O2', '-DCHIP_AX630C'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] diff --git a/projects/llm_framework/main_yolo/src/EngineWrapper.cpp b/projects/llm_framework/main_yolo/src/EngineWrapper.cpp index 035e259..0280180 100644 --- a/projects/llm_framework/main_yolo/src/EngineWrapper.cpp +++ b/projects/llm_framework/main_yolo/src/EngineWrapper.cpp @@ -13,7 +13,13 @@ #include "utils/io.hpp" #include -static const char* strAlgoModelType[AX_ENGINE_VIRTUAL_NPU_BUTT] = {"1.6T", "3.2T"}; +#if defined(CHIP_AX650) +static const char* strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"3.6T", "7.2T", "18T"}; +#endif + +#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +static const char* strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"HalfOCM", "FullOCM"}; +#endif /// @brief npu type typedef enum axNPU_TYPE_E { @@ -25,6 +31,7 @@ typedef enum axNPU_TYPE_E { AX_BL_VNPU_2 = (1 << 4) /* running under BIG-LITTLE VNPU2 */ } AX_NPU_TYPE_E; +#if defined(CHIP_AX650) static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_TYPE_T& eModelType, const AX_S32& nNpuType, AX_U32& nNpuSet) { @@ -36,33 +43,42 @@ static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_ // VNPU DISABLE if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_DISABLE) { nNpuSet = 0x01; // NON-VNPU (0b111) + // printf("%s will run under VNPU-DISABLE [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } // STD VNPU - else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BUTT) 
{ + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_STD) { // 7.2T & 10.8T no allow - if (eModelType == AX_ENGINE_MODEL_TYPE1 || eModelType == AX_ENGINE_MODEL_TYPE1) { + if (eModelType == AX_ENGINE_MODEL_TYPE1 || eModelType == AX_ENGINE_MODEL_TYPE2) { + // printf("%s model type%d: [%s], no allow run under STD VNPU\n", strModel.c_str(), eModelType, + // strAlgoModelType[eModelType]); return -1; } // default STD VNPU2 if (nNpuType == 0) { nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default STD-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } else { if (nNpuType & AX_STD_VNPU_1) { nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under STD-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } if (nNpuType & AX_STD_VNPU_2) { nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under STD-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } if (nNpuType & AX_STD_VNPU_3) { nNpuSet |= 0x04; // VNPU3 (0b100) + // printf("%s will run under STD-VNPU3 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } } // BL VNPU - else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BUTT) { + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_BIG_LITTLE) { // 10.8T no allow - if (eModelType == AX_ENGINE_MODEL_TYPE1) { + if (eModelType == AX_ENGINE_MODEL_TYPE2) { + // printf("%s model type%d: [%s], no allow run under BL VNPU\n", strModel.c_str(), eModelType, + // strAlgoModelType[eModelType]); return -1; } @@ -71,29 +87,38 @@ static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_ // 7.2T default BL VNPU1 if (eModelType == AX_ENGINE_MODEL_TYPE1) { nNpuSet = 0x01; // VNPU1 (0b001) + // printf("%s will run under default BL-VNPU1 [%s]\n", strModel.c_str(), + // strAlgoModelType[eModelType]); } // 3.6T default BL VNPU2 else { nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default BL-VNPU2 [%s]\n", strModel.c_str(), + // strAlgoModelType[eModelType]); } } else 
{ // 7.2T if (eModelType == AX_ENGINE_MODEL_TYPE1) { // no allow set to BL VNPU2 if (nNpuType & AX_BL_VNPU_2) { + // printf("%s model type%d: [%s], no allow run under BL VNPU2\n", strModel.c_str(), eModelType, + // strAlgoModelType[eModelType]); return -1; } if (nNpuType & AX_BL_VNPU_1) { nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } // 3.6T else { if (nNpuType & AX_BL_VNPU_1) { nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under BL-VNPU1 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } if (nNpuType & AX_BL_VNPU_2) { nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under BL-VNPU2 [%s]\n", strModel.c_str(), strAlgoModelType[eModelType]); } } } @@ -104,13 +129,60 @@ static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_ return ret; } +#endif + +#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_TYPE_T& eModelType, + const AX_S32& nNpuType, AX_U32& nNpuSet) +{ + AX_ENGINE_NPU_ATTR_T stNpuAttr; + memset(&stNpuAttr, 0x00, sizeof(stNpuAttr)); + + auto ret = AX_ENGINE_GetVNPUAttr(&stNpuAttr); + if (ret == 0) { + // VNPU DISABLE + if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_DISABLE) { + nNpuSet = 0x01; // NON-VNPU (0b111) + // ALOGN("%s will run under VNPU-DISABLE [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + // STD VNPU + else if (stNpuAttr.eHardMode == AX_ENGINE_VIRTUAL_NPU_ENABLE) { + // full ocm model was no allowned + if (eModelType == AX_ENGINE_MODEL_TYPE1) { + // printf("%s model type%d: [%s], no allow run under STD VNPU", strModel.c_str(), eModelType, + // strAlgoModelType[eModelType]); + return -1; + } + + // default STD VNPU2 + if (nNpuType == 0) { + nNpuSet = 0x02; // VNPU2 (0b010) + // printf("%s will run under default STD-VNPU2 [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } else { + if (nNpuType & 
AX_STD_VNPU_1) { + nNpuSet |= 0x01; // VNPU1 (0b001) + // printf("%s will run under STD-VNPU1 [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + if (nNpuType & AX_STD_VNPU_2) { + nNpuSet |= 0x02; // VNPU2 (0b010) + // printf("%s will run under STD-VNPU2 [%s]", strModel.c_str(), strAlgoModelType[eModelType]); + } + } + } + } else { + printf("AX_ENGINE_GetVNPUAttr fail ret = %x", ret); + } + + return ret; +} +#endif int EngineWrapper::Init(const char* strModelPath, uint32_t nNpuType) { AX_S32 ret = 0; // 1. load model - AX_BOOL bLoadModelUseCmm = AX_FALSE; + AX_BOOL bLoadModelUseCmm = AX_TRUE; AX_CHAR* pModelBufferVirAddr = nullptr; AX_U64 u64ModelBufferPhyAddr = 0; AX_U32 nModelBufferSize = 0; @@ -220,7 +292,7 @@ int EngineWrapper::Init(const char* strModelPath, uint32_t nNpuType) // 6. prepare io // AX_U32 nIoDepth = (stCtx.vecOutputBufferFlag.size() == 0) ? 1 : stCtx.vecOutputBufferFlag.size(); - ret = utils::prepare_io(strModelPath, m_io_info, m_io, utils::IO_BUFFER_STRATEGY_DEFAULT); + ret = utils::prepare_io(strModelPath, m_io_info, m_io, utils::IO_BUFFER_STRATEGY_CACHED); if (0 != ret) { printf("prepare io failed!\n"); utils::free_io(m_io); @@ -238,7 +310,7 @@ int EngineWrapper::SetInput(void* pInput, int index) return utils::push_io_input(pInput, index, m_io); } -int EngineWrapper::RunSync() +int EngineWrapper::Run() { if (!m_hasInit) return -1; @@ -252,6 +324,37 @@ int EngineWrapper::RunSync() return 0; } +int EngineWrapper::GetOutput(void* pOutput, int index) +{ + return utils::push_io_output(pOutput, index, m_io); +} + +int EngineWrapper::GetInputSize(int index) +{ + return m_io.pInputs[index].nSize; +} + +int EngineWrapper::GetOutputSize(int index) +{ + return m_io.pOutputs[index].nSize; +} + +void* EngineWrapper::GetOutputPtr(int index) +{ + utils::cache_io_flush(&m_io.pOutputs[index]); + return m_io.pOutputs[index].pVirAddr; +} + +int EngineWrapper::Release() +{ + if (m_handle) { + utils::free_io(m_io); + AX_ENGINE_DestroyHandle(m_handle); + 
m_handle = nullptr; + } + return 0; +} + const char* CLASS_NAMES[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", @@ -375,29 +478,4 @@ int EngineWrapper::Post_Process(cv::Mat& mat, int& input_w, int& input_h, int& c post_process(m_io_info, &m_io, mat, input_w, input_h, cls_num, point_num, pron_threshold, nms_threshold, objects, model_type); return 0; -} - -int EngineWrapper::GetOutput(void* pOutput, int index) -{ - return utils::push_io_output(pOutput, index, m_io); -} - -int EngineWrapper::GetInputSize(int index) -{ - return m_io.pInputs[index].nSize; -} - -int EngineWrapper::GetOutputSize(int index) -{ - return m_io.pOutputs[index].nSize; -} - -int EngineWrapper::Release() -{ - if (m_handle) { - utils::free_io(m_io); - AX_ENGINE_DestroyHandle(m_handle); - m_handle = nullptr; - } - return 0; -} +} \ No newline at end of file diff --git a/projects/llm_framework/main_yolo/src/EngineWrapper.hpp b/projects/llm_framework/main_yolo/src/EngineWrapper.hpp index 1a5b9e0..5d42a07 100644 --- a/projects/llm_framework/main_yolo/src/EngineWrapper.hpp +++ b/projects/llm_framework/main_yolo/src/EngineWrapper.hpp @@ -47,22 +47,24 @@ class EngineWrapper { int SetInput(void* pInput, int index); - int RunSync(); - - int Post_Process(cv::Mat& mat, int& input_w, int& input_, int& cls_num, int& point_num, float& pron_threshold, - float& nms_threshold, std::vector& objects, std::string& model_type); + int Run(); int GetOutput(void* pOutput, int index); int GetInputSize(int index); int GetOutputSize(int index); + void* GetOutputPtr(int index); + int Release(); + int Post_Process(cv::Mat& mat, int& input_w, int& input_, int& cls_num, int& point_num, float& pron_threshold, + float& nms_threshold, std::vector& objects, std::string& model_type); + protected: bool m_hasInit; AX_ENGINE_HANDLE m_handle; AX_ENGINE_IO_INFO_T* m_io_info{}; AX_ENGINE_IO_T m_io{}; int m_input_num{}, 
m_output_num{}; -}; +}; \ No newline at end of file diff --git a/projects/llm_framework/main_yolo/src/main.cpp b/projects/llm_framework/main_yolo/src/main.cpp index 8f3284b..ddf67bb 100644 --- a/projects/llm_framework/main_yolo/src/main.cpp +++ b/projects/llm_framework/main_yolo/src/main.cpp @@ -228,7 +228,7 @@ class llm_task { common::get_input_data_letterbox(src, image, mode_config_.img_h, mode_config_.img_w, bgr2rgb); cv::Mat img_mat(mode_config_.img_h, mode_config_.img_w, CV_8UC3, image.data()); yolo_->SetInput((void *)image.data(), 0); - if (0 != yolo_->RunSync()) { + if (0 != yolo_->Run()) { SLOGE("Run yolo model failed!\n"); throw std::string("yolo_ RunSync error"); } From c63bb098e8e5ea97897b6767d65b6993497fe4fa Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 16 Apr 2025 18:08:56 +0800 Subject: [PATCH 20/64] [fix] Fixed the issue that CMM cannot be released after the class EngineWrapper is destroyed --- projects/llm_framework/main_whisper/src/main.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/llm_framework/main_whisper/src/main.cpp b/projects/llm_framework/main_whisper/src/main.cpp index 3dc05e8..776408b 100644 --- a/projects/llm_framework/main_whisper/src/main.cpp +++ b/projects/llm_framework/main_whisper/src/main.cpp @@ -544,6 +544,9 @@ class llm_task { ~llm_task() { stop(); + if (encoder_) encoder_->Release(); + if (decoder_main_) decoder_main_->Release(); + if (decoder_loop_) decoder_loop_->Release(); _ax_deinit(); buffer_destroy(pcmdata); } From 8d18ef321345d44728bf39e4a61cc25fcc3d6f5b Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Wed, 16 Apr 2025 18:34:33 +0800 Subject: [PATCH 21/64] [update] mode config add compile_flage --- .../main_llm/models/mode_qwen2.5-0.5B-p256-ax630c.json | 2 ++ .../main_llm/models/mode_qwen2.5-0.5B-prefill-20e.json | 2 ++ .../llm_framework/main_vlm/models/mode_smolvlm-500M-ax630c.json | 2 ++ projects/llm_framework/main_yolo/mode_yolo11n-hand-pose.json | 2 ++ 
projects/llm_framework/main_yolo/mode_yolo11n-pose.json | 2 ++ projects/llm_framework/main_yolo/mode_yolo11n-seg.json | 2 ++ projects/llm_framework/main_yolo/mode_yolo11n.json | 2 ++ 7 files changed, 14 insertions(+) diff --git a/projects/llm_framework/main_llm/models/mode_qwen2.5-0.5B-p256-ax630c.json b/projects/llm_framework/main_llm/models/mode_qwen2.5-0.5B-p256-ax630c.json index dc98810..cd16fec 100644 --- a/projects/llm_framework/main_llm/models/mode_qwen2.5-0.5B-p256-ax630c.json +++ b/projects/llm_framework/main_llm/models/mode_qwen2.5-0.5B-p256-ax630c.json @@ -2,6 +2,8 @@ "mode":"qwen2.5-0.5B-p256-ax630c", "type":"llm", "homepage":"https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct", + "compile_flage":"pulsar2 llm_build --input_path Qwen/Qwen2-0.5B-Instruct/ --output_path Qwen/Qwen2-0.5B-w8a16/ --kv_cache_len 1023 --hidden_state_type bf16 --prefill_len 256 --chip AX620E", + "pulsar_version":"3.4-983bb35e", "capabilities":[ "text_generation", "chat" diff --git a/projects/llm_framework/main_llm/models/mode_qwen2.5-0.5B-prefill-20e.json b/projects/llm_framework/main_llm/models/mode_qwen2.5-0.5B-prefill-20e.json index 86bda88..b5502b5 100644 --- a/projects/llm_framework/main_llm/models/mode_qwen2.5-0.5B-prefill-20e.json +++ b/projects/llm_framework/main_llm/models/mode_qwen2.5-0.5B-prefill-20e.json @@ -2,6 +2,8 @@ "mode":"qwen2.5-0.5B-prefill-20e", "type":"llm", "homepage":"https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct", + "compile_flage":"pulsar2 llm_build --input_path Qwen/Qwen2-0.5B-Instruct/ --output_path Qwen/Qwen2-0.5B-w8a16/ --kv_cache_len 1023 --hidden_state_type bf16 --prefill_len 128 --chip AX620E;./tools/embed_process.sh Qwen/Qwen2-0.5B-Instruct/ Qwen/Qwen2-0.5B-w8a16/", + "pulsar_version":"3.4-983bb35e", "capabilities":[ "text_generation", "chat" diff --git a/projects/llm_framework/main_vlm/models/mode_smolvlm-500M-ax630c.json b/projects/llm_framework/main_vlm/models/mode_smolvlm-500M-ax630c.json index 4c07e36..3ce09a0 100644 --- 
a/projects/llm_framework/main_vlm/models/mode_smolvlm-500M-ax630c.json +++ b/projects/llm_framework/main_vlm/models/mode_smolvlm-500M-ax630c.json @@ -2,6 +2,8 @@ "mode":"smolvlm-500M-ax630c", "type":"vlm", "homepage":"https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct", + "compile_flage":"pulsar2 build --input HuggingFaceTB/SmolVLM-500M-w8a16/SmolVLM-500M-Instruct_vision.onnx --config AXERA/SmolVLM-256M-Instruct.axera/model_convert/config.json --output_dir HuggingFaceTB/SmolVLM-500M-w8a16/build-output --output_name SmolVLM-500M-Instruct_vision.axmodel --target_hardware AX620E --compiler.check 0 --npu_mode NPU2", + "pulsar_version":"3.4-983bb35e", "capabilities":[ "text_generation", "chat" diff --git a/projects/llm_framework/main_yolo/mode_yolo11n-hand-pose.json b/projects/llm_framework/main_yolo/mode_yolo11n-hand-pose.json index a75f0c0..051518c 100644 --- a/projects/llm_framework/main_yolo/mode_yolo11n-hand-pose.json +++ b/projects/llm_framework/main_yolo/mode_yolo11n-hand-pose.json @@ -2,6 +2,8 @@ "mode":"yolo11n-hand-pose", "type":"cv", "homepage":"https://github.com/ultralytics/ultralytics", + "compile_flage":"pulsar2 build --target_hardware AX620E --input yolo11n-hand.onnx --output_dir output --config yolo11n-hand_config.json", + "pulsar_version":"3.4-983bb35e", "capabilities":[ "Pose" ], diff --git a/projects/llm_framework/main_yolo/mode_yolo11n-pose.json b/projects/llm_framework/main_yolo/mode_yolo11n-pose.json index 7879c01..785135b 100644 --- a/projects/llm_framework/main_yolo/mode_yolo11n-pose.json +++ b/projects/llm_framework/main_yolo/mode_yolo11n-pose.json @@ -2,6 +2,8 @@ "mode":"yolo11n-pose", "type":"cv", "homepage":"https://github.com/ultralytics/ultralytics", + "compile_flage":"pulsar2 build --target_hardware AX620E --input yolo11n-pose.onnx --output_dir output --config yolo11n-pose_config.json", + "pulsar_version":"3.4-983bb35e", "capabilities":[ "Pose" ], diff --git a/projects/llm_framework/main_yolo/mode_yolo11n-seg.json 
b/projects/llm_framework/main_yolo/mode_yolo11n-seg.json index 71c86c4..9da34ef 100644 --- a/projects/llm_framework/main_yolo/mode_yolo11n-seg.json +++ b/projects/llm_framework/main_yolo/mode_yolo11n-seg.json @@ -2,6 +2,8 @@ "mode":"yolo11n-seg", "type":"cv", "homepage":"https://github.com/ultralytics/ultralytics", + "compile_flage":"pulsar2 build --target_hardware AX620E --input yolo11n-seg.onnx --output_dir output --config yolo11n-seg_config.json", + "pulsar_version":"3.4-983bb35e", "capabilities":[ "Segmentation" ], diff --git a/projects/llm_framework/main_yolo/mode_yolo11n.json b/projects/llm_framework/main_yolo/mode_yolo11n.json index e8bc8dc..b2c6cf4 100644 --- a/projects/llm_framework/main_yolo/mode_yolo11n.json +++ b/projects/llm_framework/main_yolo/mode_yolo11n.json @@ -2,6 +2,8 @@ "mode":"yolo11n", "type":"cv", "homepage":"https://github.com/ultralytics/ultralytics", + "compile_flage":"pulsar2 build --target_hardware AX620E --input yolo11n.onnx --output_dir output --config yolo11n_config.json", + "pulsar_version":"3.4-983bb35e", "capabilities":[ "Detection" ], From 8f673e4fef6130951bcffe86e9934fc296abe6bb Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 16 Apr 2025 18:40:21 +0800 Subject: [PATCH 22/64] [fix] Fix CMM cannot be released --- projects/llm_framework/main_asr/src/main.cpp | 3 --- projects/llm_framework/main_depth_anything/src/main.cpp | 1 + projects/llm_framework/main_kws/src/main.cpp | 9 ++++----- projects/llm_framework/main_vad/src/main.cpp | 3 --- projects/llm_framework/main_yolo/src/main.cpp | 1 + 5 files changed, 6 insertions(+), 11 deletions(-) diff --git a/projects/llm_framework/main_asr/src/main.cpp b/projects/llm_framework/main_asr/src/main.cpp index 2d3de1c..f09ade0 100644 --- a/projects/llm_framework/main_asr/src/main.cpp +++ b/projects/llm_framework/main_asr/src/main.cpp @@ -259,9 +259,6 @@ class llm_task { ~llm_task() { stop(); - if (recognizer_stream_) { - recognizer_stream_.reset(); - } buffer_destroy(pcmdata); } }; diff 
--git a/projects/llm_framework/main_depth_anything/src/main.cpp b/projects/llm_framework/main_depth_anything/src/main.cpp index ee7d9e8..b7bb591 100644 --- a/projects/llm_framework/main_depth_anything/src/main.cpp +++ b/projects/llm_framework/main_depth_anything/src/main.cpp @@ -256,6 +256,7 @@ class llm_task { ~llm_task() { stop(); + if (depth_anything_) depth_anything_->Release(); _ax_deinit(); } }; diff --git a/projects/llm_framework/main_kws/src/main.cpp b/projects/llm_framework/main_kws/src/main.cpp index 91d51ed..fc4f31c 100644 --- a/projects/llm_framework/main_kws/src/main.cpp +++ b/projects/llm_framework/main_kws/src/main.cpp @@ -177,9 +177,11 @@ class llm_task { temp_awake_key.close(); std::ostringstream awake_key_compile_cmd; if (file_exists("/opt/m5stack/scripts/text2token.py")) - awake_key_compile_cmd << "PYTHONPATH=/opt/m5stack/lib/sherpa-onnx/site-packages /usr/bin/python3 /opt/m5stack/scripts/text2token.py "; + awake_key_compile_cmd << "PYTHONPATH=/opt/m5stack/lib/sherpa-onnx/site-packages /usr/bin/python3 " + "/opt/m5stack/scripts/text2token.py "; else if (file_exists("/opt/m5stack/scripts/llm-kws_text2token.py")) - awake_key_compile_cmd << "PYTHONPATH=/opt/m5stack/lib/sherpa-onnx/site-packages /usr/bin/python3 /opt/m5stack/scripts/llm-kws_text2token.py "; + awake_key_compile_cmd << "PYTHONPATH=/opt/m5stack/lib/sherpa-onnx/site-packages /usr/bin/python3 " + "/opt/m5stack/scripts/llm-kws_text2token.py "; else { SLOGE("text2token.py or llm-kws_text2token.py not found!"); } @@ -267,9 +269,6 @@ class llm_task { ~llm_task() { stop(); - if (spotter_stream_) { - spotter_stream_.reset(); - } buffer_destroy(pcmdata); } }; diff --git a/projects/llm_framework/main_vad/src/main.cpp b/projects/llm_framework/main_vad/src/main.cpp index 6189257..f0ab519 100644 --- a/projects/llm_framework/main_vad/src/main.cpp +++ b/projects/llm_framework/main_vad/src/main.cpp @@ -228,9 +228,6 @@ class llm_task { ~llm_task() { stop(); - if (vad_) { - vad_.reset(); - } 
buffer_destroy(pcmdata); } }; diff --git a/projects/llm_framework/main_yolo/src/main.cpp b/projects/llm_framework/main_yolo/src/main.cpp index ddf67bb..453ad34 100644 --- a/projects/llm_framework/main_yolo/src/main.cpp +++ b/projects/llm_framework/main_yolo/src/main.cpp @@ -340,6 +340,7 @@ class llm_task { ~llm_task() { stop(); + if (yolo_) yolo_->Release(); _ax_deinit(); } }; From 91c5bbd08def7d113495298825851c1bcf514740 Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Wed, 16 Apr 2025 18:51:16 +0800 Subject: [PATCH 23/64] [update] change bsp define --- ext_components/ax_msp/Kconfig | 4 ++++ projects/llm_framework/main_depth_anything/SConstruct | 2 +- .../main_depth_anything/src/EngineWrapper.cpp | 8 +++++--- projects/llm_framework/main_melotts/SConstruct | 2 +- .../main_melotts/src/runner/EngineWrapper.cpp | 7 ++++--- projects/llm_framework/main_whisper/SConstruct | 2 +- .../main_whisper/src/runner/EngineWrapper.cpp | 8 +++++--- projects/llm_framework/main_yolo/SConstruct | 2 +- projects/llm_framework/main_yolo/src/EngineWrapper.cpp | 8 +++++--- 9 files changed, 27 insertions(+), 16 deletions(-) diff --git a/ext_components/ax_msp/Kconfig b/ext_components/ax_msp/Kconfig index ba264e3..7382424 100644 --- a/ext_components/ax_msp/Kconfig +++ b/ext_components/ax_msp/Kconfig @@ -12,6 +12,10 @@ menuconfig AX_MSP_ENABLED bool "enable ax620e bsp" help build by AXERA! + config AX_650N_MSP_ENABLED + bool "enable ax650n bsp" + help + build by AXERA! 
config AX_520_MSP_ENABLED bool "enable ax520 bsp" help diff --git a/projects/llm_framework/main_depth_anything/SConstruct b/projects/llm_framework/main_depth_anything/SConstruct index 7ad6376..ea26da6 100644 --- a/projects/llm_framework/main_depth_anything/SConstruct +++ b/projects/llm_framework/main_depth_anything/SConstruct @@ -16,7 +16,7 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] -DEFINITIONS += ['-O3', '-std=c++17', '-DCHIP_AX630C'] +DEFINITIONS += ['-O3', '-std=c++17'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] diff --git a/projects/llm_framework/main_depth_anything/src/EngineWrapper.cpp b/projects/llm_framework/main_depth_anything/src/EngineWrapper.cpp index 7947417..7a7ec61 100644 --- a/projects/llm_framework/main_depth_anything/src/EngineWrapper.cpp +++ b/projects/llm_framework/main_depth_anything/src/EngineWrapper.cpp @@ -12,11 +12,13 @@ #include -#if defined(CHIP_AX650) +#include + +#if defined(CONFIG_AX_650N_MSP_ENABLED) static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"3.6T", "7.2T", "18T"}; #endif -#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +#if defined(CONFIG_AX_620E_MSP_ENABLED) || defined(CONFIG_AX_620Q_MSP_ENABLED) static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"HalfOCM", "FullOCM"}; #endif @@ -130,7 +132,7 @@ static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_ } #endif -#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +#if defined(CONFIG_AX_620E_MSP_ENABLED) || defined(CONFIG_AX_620Q_MSP_ENABLED) static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_TYPE_T &eModelType, const AX_S32 &nNpuType, AX_U32 &nNpuSet) { diff --git a/projects/llm_framework/main_melotts/SConstruct 
b/projects/llm_framework/main_melotts/SConstruct index 0fd84fb..358ddb2 100644 --- a/projects/llm_framework/main_melotts/SConstruct +++ b/projects/llm_framework/main_melotts/SConstruct @@ -17,7 +17,7 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] -DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17', '-DCHIP_AX630C'] +DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] diff --git a/projects/llm_framework/main_melotts/src/runner/EngineWrapper.cpp b/projects/llm_framework/main_melotts/src/runner/EngineWrapper.cpp index cd490ee..0dda3e7 100644 --- a/projects/llm_framework/main_melotts/src/runner/EngineWrapper.cpp +++ b/projects/llm_framework/main_melotts/src/runner/EngineWrapper.cpp @@ -11,12 +11,13 @@ #include "utils/io.hpp" #include +#include -#if defined(CHIP_AX650) +#if defined(CONFIG_AX_650N_MSP_ENABLED) static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"3.6T", "7.2T", "18T"}; #endif -#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +#if defined(CONFIG_AX_620E_MSP_ENABLED) || defined(CONFIG_AX_620Q_MSP_ENABLED) static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"HalfOCM", "FullOCM"}; #endif @@ -128,7 +129,7 @@ static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_ } #endif -#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +#if defined(CONFIG_AX_620E_MSP_ENABLED) || defined(CONFIG_AX_620Q_MSP_ENABLED) static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_TYPE_T &eModelType, const AX_S32 &nNpuType, AX_U32 &nNpuSet) { AX_ENGINE_NPU_ATTR_T stNpuAttr; memset(&stNpuAttr, 0x00, sizeof(stNpuAttr)); diff --git a/projects/llm_framework/main_whisper/SConstruct 
b/projects/llm_framework/main_whisper/SConstruct index c8c205f..c14cf6b 100644 --- a/projects/llm_framework/main_whisper/SConstruct +++ b/projects/llm_framework/main_whisper/SConstruct @@ -17,7 +17,7 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] -DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17', '-DCHIP_AX630C'] +DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] diff --git a/projects/llm_framework/main_whisper/src/runner/EngineWrapper.cpp b/projects/llm_framework/main_whisper/src/runner/EngineWrapper.cpp index cd490ee..6d21964 100644 --- a/projects/llm_framework/main_whisper/src/runner/EngineWrapper.cpp +++ b/projects/llm_framework/main_whisper/src/runner/EngineWrapper.cpp @@ -12,11 +12,13 @@ #include -#if defined(CHIP_AX650) +#include + +#if defined(CONFIG_AX_650N_MSP_ENABLED) static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"3.6T", "7.2T", "18T"}; #endif -#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +#if defined(CONFIG_AX_620E_MSP_ENABLED) || defined(CONFIG_AX_620Q_MSP_ENABLED) static const char *strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"HalfOCM", "FullOCM"}; #endif @@ -128,7 +130,7 @@ static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_ } #endif -#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +#if defined(CONFIG_AX_620E_MSP_ENABLED) || defined(CONFIG_AX_620Q_MSP_ENABLED) static AX_S32 CheckModelVNpu(const std::string &strModel, const AX_ENGINE_MODEL_TYPE_T &eModelType, const AX_S32 &nNpuType, AX_U32 &nNpuSet) { AX_ENGINE_NPU_ATTR_T stNpuAttr; memset(&stNpuAttr, 0x00, sizeof(stNpuAttr)); diff --git a/projects/llm_framework/main_yolo/SConstruct b/projects/llm_framework/main_yolo/SConstruct index 8b8f356..8400aa9 
100644 --- a/projects/llm_framework/main_yolo/SConstruct +++ b/projects/llm_framework/main_yolo/SConstruct @@ -16,7 +16,7 @@ LDFLAGS = [] LINK_SEARCH_PATH = [] STATIC_FILES = [] -DEFINITIONS += ['-std=c++17', '-O2', '-DCHIP_AX630C'] +DEFINITIONS += ['-std=c++17', '-O2'] LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib/gcc-10.3', '-Wl,-rpath=/opt/lib', '-Wl,-rpath=/opt/usr/lib', '-Wl,-rpath=./'] LINK_SEARCH_PATH += [ADir('../static_lib')] REQUIREMENTS += ['ax_engine', 'ax_interpreter', 'ax_sys'] diff --git a/projects/llm_framework/main_yolo/src/EngineWrapper.cpp b/projects/llm_framework/main_yolo/src/EngineWrapper.cpp index 0280180..84085db 100644 --- a/projects/llm_framework/main_yolo/src/EngineWrapper.cpp +++ b/projects/llm_framework/main_yolo/src/EngineWrapper.cpp @@ -13,11 +13,13 @@ #include "utils/io.hpp" #include -#if defined(CHIP_AX650) +#include + +#if defined(CONFIG_AX_650N_MSP_ENABLED) static const char* strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"3.6T", "7.2T", "18T"}; #endif -#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +#if defined(CONFIG_AX_620E_MSP_ENABLED) || defined(CONFIG_AX_620Q_MSP_ENABLED) static const char* strAlgoModelType[AX_ENGINE_MODEL_TYPE_BUTT] = {"HalfOCM", "FullOCM"}; #endif @@ -131,7 +133,7 @@ static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_ } #endif -#if defined(CHIP_AX630C) || defined(CHIP_AX620Q) +#if defined(CONFIG_AX_620E_MSP_ENABLED) || defined(CONFIG_AX_620Q_MSP_ENABLED) static AX_S32 CheckModelVNpu(const std::string& strModel, const AX_ENGINE_MODEL_TYPE_T& eModelType, const AX_S32& nNpuType, AX_U32& nNpuSet) { From 9276df6d830ef4bd3c4692b8ed55f4bd6f8b4c55 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 16 Apr 2025 19:12:39 +0800 Subject: [PATCH 24/64] [update] Release whisper-small model --- .../main_whisper/mode_whisper-small.json | 44 +++++++++++++++++++ projects/llm_framework/tools/llm_pack.py | 1 + 2 files changed, 
45 insertions(+) create mode 100644 projects/llm_framework/main_whisper/mode_whisper-small.json diff --git a/projects/llm_framework/main_whisper/mode_whisper-small.json b/projects/llm_framework/main_whisper/mode_whisper-small.json new file mode 100644 index 0000000..db13bef --- /dev/null +++ b/projects/llm_framework/main_whisper/mode_whisper-small.json @@ -0,0 +1,44 @@ +{ + "mode": "whisper-small", + "type": "asr", + "homepage":"https://huggingface.co/openai/whisper-small", + "compile_flage":"pulsar2 build --input small-encoder.onnx --config config_whisper_encoder_u16.json --output_dir small_encoder --output_name small-encoder.axmodel --target_hardware AX620E --compiler.check 0 --npu_mode NPU2", + "pulsar_version":"3.3-f0b32d03", + "capabilities": [ + "Automatic_Speech_Recognition", + "English", + "Chinese", + "Japanese" + ], + "input_type": [ + "sys.pcm" + ], + "output_type": [ + "asr.utf-8" + ], + "mode_param": { + "model_type": "small", + "language": "en", + "encoder": "small-encoder.axmodel", + "decoder_main": "small-decoder-main.axmodel", + "decoder_loop": "small-decoder-loop.axmodel", + "positional_embedding": "small-positional_embedding.bin", + "tokens": "small-tokens.txt", + "t2s": "t2s.json", + "whisper_sample_rate": 16000, + "whisper_n_fft": 400, + "awake_delay": 1000, + "whisper_hop_length": 160, + "whisper_chunk_size": 30, + "whisper_n_mels": 80, + "whisper_sot": 50258, + "whisper_eot": 50257, + "whisper_blank": 220, + "whisper_no_timestamps": 50363, + "whisper_no_speech": 50362, + "whisper_translate": 50358, + "whisper_transcribe": 50359, + "whisper_vocab_size": 51865, + "whisper_n_text_ctx": 448 + } +} \ No newline at end of file diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 05dea8d..80ad7ae 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -408,6 +408,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): 
'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.3', src_folder, revision], 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.3', src_folder, revision], 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.3', src_folder, revision], + 'llm-model-whisper-small':[create_data_deb,'llm-model-whisper-small', '0.3', src_folder, revision], 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.3', src_folder, revision], 'llm-model-qwen2.5-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2.5-0.5B-prefill-20e', data_version, src_folder, revision], 'llm-model-qwen2.5-0.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-p256-ax630c', '0.4', src_folder, revision], From 3203fea4a027653b7e9f97293de65cee620041db Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Thu, 17 Apr 2025 09:39:39 +0800 Subject: [PATCH 25/64] [update] Update ModuleLLM-OpenAI-Plugin version --- projects/llm_framework/main_openai_api/SConstruct | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/llm_framework/main_openai_api/SConstruct b/projects/llm_framework/main_openai_api/SConstruct index 35cbcee..7d072fd 100644 --- a/projects/llm_framework/main_openai_api/SConstruct +++ b/projects/llm_framework/main_openai_api/SConstruct @@ -18,7 +18,7 @@ LINK_SEARCH_PATH = [] STATIC_FILES = [] -ModuleLLMOpenAIPluginPath = wget_github_commit('https://github.com/Abandon-ht/ModuleLLM-OpenAI-Plugin.git', '1077efbe201ea3f29517f5ce4a0cfc3b04c25d1d', True) +ModuleLLMOpenAIPluginPath = wget_github_commit('https://github.com/m5stack/ModuleLLM-OpenAI-Plugin.git', '33477071ca362d20cd65cfad43f5b05480724711', True) python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-openai-api-python-venv_v1.5.tar.gz", 'm5stack_llm-openai-api-python-venv_v1.5.tar.gz') From 52f1a484bd94265274d9d0f1452e8da5978bc05d Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Thu, 17 Apr 2025 
09:40:23 +0800 Subject: [PATCH 26/64] [update] add benchmark test --- benchmark/README.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 benchmark/README.md diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..f32dcdb --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,7 @@ +benchmodulellm can be used to test llm unit inference performance + +Only the llm unit definition files (model json) are required. + +If no model specified, it would benchmark default list. More model networks may be added later. + +Usage \ No newline at end of file From c85e4d63fe0b1a149e890a4265ff8a8a6d70e487 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Thu, 17 Apr 2025 17:36:35 +0800 Subject: [PATCH 27/64] [update] add llm unit test. --- benchmark/README.md | 5 +- benchmark/benchmodulellm.py | 126 ++++++++++++++++++++++++ benchmark/default.yaml | 31 ++++++ benchmark/utils/llm.py | 174 ++++++++++++++++++++++++++++++++++ benchmark/utils/token_calc.py | 20 ++++ 5 files changed, 355 insertions(+), 1 deletion(-) create mode 100644 benchmark/benchmodulellm.py create mode 100644 benchmark/default.yaml create mode 100644 benchmark/utils/llm.py create mode 100644 benchmark/utils/token_calc.py diff --git a/benchmark/README.md b/benchmark/README.md index f32dcdb..e21ed4a 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -4,4 +4,7 @@ Only the llm unit definition files (model json) are required. If no model specified, it would benchmark default list. More model networks may be added later. 
-Usage \ No newline at end of file +Usage +```shell +python benchmodulellm.py --host 192.168.20.100 --port 10001 --test-items default.yaml +``` \ No newline at end of file diff --git a/benchmark/benchmodulellm.py b/benchmark/benchmodulellm.py new file mode 100644 index 0000000..00dfcd0 --- /dev/null +++ b/benchmark/benchmodulellm.py @@ -0,0 +1,126 @@ +import argparse +import os +import sys + +import yaml +import logging + +from pathlib import Path + +from utils.llm import LLMClient + +FILE = Path(__file__).resolve() +ROOT = FILE.parents[0] +if str(ROOT) not in sys.path: + sys.path.append(str(ROOT)) +ROOT = Path(os.path.relpath(ROOT, Path.cwd())) + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +def parse_opt(known=False): + """ + Parse command-line options. + """ + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="127.0.0.1", help="ModuleLLM IP Address") + parser.add_argument("--port", type=int, default=10001, help="ModuleLLM TCP Port") + parser.add_argument("--test-items", type=str, default=ROOT / "default.yaml", help="testitems.yaml path") + + args = parser.parse_known_args()[0] if known else parser.parse_args() + + return args + +def read_yaml(file_path): + """ + Read a YAML file and return its content. + """ + if not os.path.exists(file_path): + logging.error(f"YAML file '{file_path}' does not exist.") + sys.exit(1) + + try: + with open(file_path, "r") as file: + data = yaml.safe_load(file) + if data is None: + logging.warning(f"YAML file '{file_path}' is empty.") + return {} + + logging.info(f"YAML file '{file_path}' read successfully.") + + if "items" in data: + return data["items"] + else: + logging.warning(f"'items' not found in YAML file.") + return [] + except Exception as e: + logging.error(f"Failed to read YAML file '{file_path}': {e}") + sys.exit(1) + +def write_yaml(file_path, data): + """ + Write data to a YAML file. 
+ """ + try: + with open(file_path, "w") as file: + yaml.safe_dump(data, file) + logging.info(f"YAML file '{file_path}' written successfully.") + except Exception as e: + logging.error(f"Failed to write YAML file '{file_path}': {e}") + sys.exit(1) + +def categorize_and_deduplicate(items): + """ + Categorize items by 'type' and remove duplicate 'model_name'. + """ + categorized = {} + for item in items: + item_type = item.get("type") + model_name = item.get("model_name") + if not item_type or not model_name: + continue + + if item_type not in categorized: + categorized[item_type] = set() + + categorized[item_type].add(model_name) + + # Convert sets back to lists for easier usage + return {key: list(value) for key, value in categorized.items()} + +def main(opt): + items = read_yaml(opt.test_items) + if not items: + logging.warning(f"No items found in YAML file '{opt.test_items}'.") + return + + categorized_items = categorize_and_deduplicate(items) + + logging.info("Categorized items:") + for item_type, models in categorized_items.items(): + logging.info(f"Type: {item_type}, Models: {models}") + + if item_type == "llm": + logging.info("Initializing LLMClient...") + llm_client = LLMClient(opt.host, opt.port) + + for model_name in models: + logging.info(f"Testing model: {model_name}") + input_text = "This is a test input for the LLM." 
+ try: + result = llm_client.test(model_name, input_text) + logging.info(f"Test result for model '{model_name}': {result}") + except Exception as e: + logging.error(f"Error testing model '{model_name}': {e}") + + del llm_client + logging.info("LLMClient deleted successfully.") + + return categorized_items + +if __name__ == "__main__": + opt = parse_opt() + main(opt) diff --git a/benchmark/default.yaml b/benchmark/default.yaml new file mode 100644 index 0000000..b10f7f2 --- /dev/null +++ b/benchmark/default.yaml @@ -0,0 +1,31 @@ +items: +- model_name: qwen2.5-0.5B-p256-ax630c + type: llm +- model_name: internvl2.5-1B-364-ax630c + type: vlm +- model_name: whisper-tiny + type: whisper +- model_name: whisper-base + type: whisper +- model_name: whisper-small + type: whisper +- model_name: sherpa-ncnn-streaming-zipformer-20M-2023-02-17 + type: asr +- model_name: sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23 + type: asr +- model_name: sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01 + type: kws +- model_name: sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 + type: kws +- model_name: melotts-zh-cn + type: melotts +- model_name: single_speaker_english_fast + type: tts +- model_name: single_speaker_fast + type: tts +- model_name: yolo11n + type: yolo +- model_name: yolo11n-seg + type: yolo +- model_name: yolo11n-pose + type: yolo \ No newline at end of file diff --git a/benchmark/utils/llm.py b/benchmark/utils/llm.py new file mode 100644 index 0000000..e2d6a0b --- /dev/null +++ b/benchmark/utils/llm.py @@ -0,0 +1,174 @@ +import socket +import json +import time +import logging +import uuid +from .token_calc import calculate_token_length + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +class LLMClient: + def __init__(self, host, port): + self.host = host + self.port = port + self.work_id = None + self.response_format = None + self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + 
self.sock.connect((self.host, self.port)) + + def generate_request_id(self): + return str(uuid.uuid4()) + + def send_request_stream(self, request): + self.sock.sendall(json.dumps(request).encode('utf-8')) + response = b"" + parsed_responses = [] + output_text = "" + + start_time = time.time() + first_packet_time = None + + while True: + chunk = self.sock.recv(4096) + logging.info(f"Received chunk: {chunk}") + response += chunk + + while b'\n' in response: + line, response = response.split(b'\n', 1) + try: + parsed_response = json.loads(line.decode('utf-8')) + parsed_responses.append(parsed_response) + + if "data" in parsed_response and "delta" in parsed_response["data"]: + if first_packet_time is None: + first_packet_time = time.time() + output_text += parsed_response["data"]["delta"] + + if "data" in parsed_response and parsed_response["data"].get("finish", False): + end_time = time.time() + total_time = end_time - start_time + first_packet_latency = first_packet_time - start_time if first_packet_time else None + + token_count = calculate_token_length(output_text) + token_speed = token_count / total_time if total_time > 0 else 0 + + logging.info("Stream reception completed.") + logging.info("First packet latency: %.2f seconds", first_packet_latency if first_packet_latency else 0) + logging.info("Total reception time: %.2f seconds", total_time) + logging.info("Total tokens received: %d", token_count) + logging.info("Token reception speed: %.2f tokens/second", token_speed) + logging.info("Total output text length: %d characters", len(output_text)) + + return { + "responses": parsed_responses, + "output_text": output_text, + "token_count": token_count, + "first_packet_latency": first_packet_latency, + "total_time": total_time, + "token_speed": token_speed + } + except json.JSONDecodeError: + logging.warning("Failed to decode JSON, skipping line.") + continue + + def send_request_non_stream(self, request): + self.sock.sendall(json.dumps(request).encode('utf-8')) + 
response = b"" + while True: + chunk = self.sock.recv(4096) + response += chunk + if b'\n' in chunk: + break + return json.loads(response.decode('utf-8')) + + def setup(self, model): + setup_request = { + "request_id": self.generate_request_id(), + "work_id": "llm", + "action": "setup", + "object": "llm.setup", + "data": { + "model": model, + "response_format": "llm.utf-8.stream", + "input": "llm.utf-8", + "enoutput": True, + "max_token_len": 256, + "prompt": "You are a knowledgeable assistant capable of answering various questions and providing information." + } + } + response = self.send_request_non_stream(setup_request) + self.work_id = response.get("work_id") + self.response_format = setup_request["data"]["response_format"] + return response + + def inference(self, input_text): + if not self.work_id: + raise ValueError("work_id is not set. Please call setup() first.") + + inference_request = { + "request_id": self.generate_request_id(), + "work_id": self.work_id, + "action": "inference", + "object": self.response_format, + "data": { + "delta": input_text, + "index": 0, + "finish": True + } + } + if "stream" in self.response_format: + logging.info("Sending stream request...") + result = self.send_request_stream(inference_request) + return { + "output_text": result["output_text"], + "token_count": result["token_count"], + "first_packet_latency": result["first_packet_latency"], + "total_time": result["total_time"], + "token_speed": result["token_speed"] + } + else: + logging.info("Sending non-stream request...") + response = self.send_request_non_stream(inference_request) + return { + "output_text": response.get("data", ""), + "token_count": len(response.get("data", "").split()) + } + + def exit(self): + if not self.work_id: + raise ValueError("work_id is not set. 
Please call setup() first.") + + exit_request = { + "request_id": self.generate_request_id(), + "work_id": self.work_id, + "action": "exit" + } + response = self.send_request_non_stream(exit_request) + return response + + def test(self, model, input_text): + logging.info("Setting up...") + setup_response = self.setup(model) + logging.info("Setup response: %s", setup_response) + + logging.info("Running inference...") + inference_result = self.inference(input_text) + logging.info("Inference result: %s", inference_result) + + logging.info("Exiting...") + exit_response = self.exit() + logging.info("Exit response: %s", exit_response) + + return { + "setup_response": setup_response, + "inference_result": inference_result, + "exit_response": exit_response + } + +if __name__ == "__main__": + host = "192.168.20.186" + port = 10001 + client = LLMClient(host, port) + model_name = "qwen2.5-0.5B-p256-ax630c" + input_text = "This is a test input for the LLM." + client.test(model_name, input_text) \ No newline at end of file diff --git a/benchmark/utils/token_calc.py b/benchmark/utils/token_calc.py new file mode 100644 index 0000000..47154bb --- /dev/null +++ b/benchmark/utils/token_calc.py @@ -0,0 +1,20 @@ +import tiktoken + +def calculate_token_length(input_string: str) -> int: + """ + Calculate the token length of a given string using tiktoken. + + Args: + input_string (str): The input string to calculate token length for. + + Returns: + int: The length of the tokens. 
+ """ + # Initialize the tokenizer (you can specify a model if needed, e.g., 'gpt-4') + tokenizer = tiktoken.get_encoding("cl100k_base") + + # Encode the input string to tokens + tokens = tokenizer.encode(input_string) + + # Return the length of the tokens + return len(tokens) \ No newline at end of file From 1dd2b14790f0386bcba1c35f0d4104595c86d88f Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Fri, 18 Apr 2025 15:30:36 +0800 Subject: [PATCH 28/64] [update] StackFlow bin add version id --- projects/llm_framework/main/SConstruct | 2 +- projects/llm_framework/main_asr/SConstruct | 2 +- projects/llm_framework/main_audio/SConstruct | 2 +- projects/llm_framework/main_camera/SConstruct | 2 +- .../main_depth_anything/SConstruct | 2 +- projects/llm_framework/main_kws/SConstruct | 2 +- projects/llm_framework/main_llm/SConstruct | 2 +- .../llm_framework/main_melotts/SConstruct | 2 +- .../llm_framework/main_openai_api/SConstruct | 2 +- projects/llm_framework/main_skel/SConstruct | 2 +- projects/llm_framework/main_sys/SConstruct | 2 +- .../llm_framework/main_sys/src/event_loop.cpp | 24 +++++- projects/llm_framework/main_tts/SConstruct | 2 +- projects/llm_framework/main_vad/SConstruct | 2 +- projects/llm_framework/main_vlm/SConstruct | 2 +- .../llm_framework/main_whisper/SConstruct | 2 +- projects/llm_framework/main_yolo/SConstruct | 2 +- projects/llm_framework/tools/llm_pack.py | 82 +++++++------------ 18 files changed, 67 insertions(+), 71 deletions(-) diff --git a/projects/llm_framework/main/SConstruct b/projects/llm_framework/main/SConstruct index 79a6012..7723729 100644 --- a/projects/llm_framework/main/SConstruct +++ b/projects/llm_framework/main/SConstruct @@ -26,7 +26,7 @@ STATIC_FILES += [AFile('../static_lib/sherpa/ncnn/libsherpa-ncnn-core.so'), AFile('../static_lib/sherpa/ncnn/libkaldi-native-fbank-core.so'), ] -env['COMPONENTS'].append({'target':'static_file', +env['COMPONENTS'].append({'target':'static_file-1.0', 'SRCS':SRCS, 
'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_asr/SConstruct b/projects/llm_framework/main_asr/SConstruct index c947a9c..496838c 100644 --- a/projects/llm_framework/main_asr/SConstruct +++ b/projects/llm_framework/main_asr/SConstruct @@ -26,7 +26,7 @@ REQUIREMENTS += ['ncnn', 'sherpa-ncnn-core'] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_asr', +env['COMPONENTS'].append({'target':'llm_asr-1.5', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_audio/SConstruct b/projects/llm_framework/main_audio/SConstruct index afbad26..25b3f1a 100644 --- a/projects/llm_framework/main_audio/SConstruct +++ b/projects/llm_framework/main_audio/SConstruct @@ -29,7 +29,7 @@ REQUIREMENTS += ['tinyalsa', 'opus', 'samplerate', 'fdk-aac'] STATIC_FILES += [AFile('audio.json')] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_audio', +env['COMPONENTS'].append({'target':'llm_audio-1.5', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_camera/SConstruct b/projects/llm_framework/main_camera/SConstruct index c506734..db43b04 100644 --- a/projects/llm_framework/main_camera/SConstruct +++ b/projects/llm_framework/main_camera/SConstruct @@ -68,7 +68,7 @@ STATIC_LIB += static_file * 4 STATIC_FILES += [AFile('camera.json')] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_camera', +env['COMPONENTS'].append({'target':'llm_camera-1.7', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_depth_anything/SConstruct b/projects/llm_framework/main_depth_anything/SConstruct index ea26da6..de14d98 100644 --- a/projects/llm_framework/main_depth_anything/SConstruct +++ b/projects/llm_framework/main_depth_anything/SConstruct @@ -30,7 +30,7 @@ STATIC_LIB += static_file * 2 STATIC_FILES += Glob('mode_*.json') 
-env['COMPONENTS'].append({'target':'llm_depth_anything', +env['COMPONENTS'].append({'target':'llm_depth_anything-1.5', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_kws/SConstruct b/projects/llm_framework/main_kws/SConstruct index f82c7a5..9b4dee4 100644 --- a/projects/llm_framework/main_kws/SConstruct +++ b/projects/llm_framework/main_kws/SConstruct @@ -55,7 +55,7 @@ ignore['ignore'] = list(set(ignore['ignore'])) with open('../dist/fileignore', 'w') as f: json.dump(ignore, f, indent=4) -env['COMPONENTS'].append({'target':'llm_kws', +env['COMPONENTS'].append({'target':'llm_kws-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_llm/SConstruct b/projects/llm_framework/main_llm/SConstruct index ad02ce8..52b0778 100644 --- a/projects/llm_framework/main_llm/SConstruct +++ b/projects/llm_framework/main_llm/SConstruct @@ -66,7 +66,7 @@ ignore['ignore'] = list(set(ignore['ignore'])) with open('../dist/fileignore', 'w') as f: json.dump(ignore, f, indent=4) -env['COMPONENTS'].append({'target':'llm_llm', +env['COMPONENTS'].append({'target':'llm_llm-1.7', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_melotts/SConstruct b/projects/llm_framework/main_melotts/SConstruct index 358ddb2..d3c3cba 100644 --- a/projects/llm_framework/main_melotts/SConstruct +++ b/projects/llm_framework/main_melotts/SConstruct @@ -31,7 +31,7 @@ LDFLAGS += ['-l:libcargs.a', '-l:libonnxruntime.a'] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_melotts', +env['COMPONENTS'].append({'target':'llm_melotts-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_openai_api/SConstruct b/projects/llm_framework/main_openai_api/SConstruct index 35cbcee..9ad7a8b 100644 --- a/projects/llm_framework/main_openai_api/SConstruct +++ 
b/projects/llm_framework/main_openai_api/SConstruct @@ -52,7 +52,7 @@ ignore['ignore'] = list(set(ignore['ignore'])) with open('../dist/fileignore', 'w') as f: json.dump(ignore, f, indent=4) -env['COMPONENTS'].append({'target':'llm_openai_api', +env['COMPONENTS'].append({'target':'llm_openai_api-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_skel/SConstruct b/projects/llm_framework/main_skel/SConstruct index f40e42d..bae9b43 100644 --- a/projects/llm_framework/main_skel/SConstruct +++ b/projects/llm_framework/main_skel/SConstruct @@ -31,7 +31,7 @@ STATIC_LIB += static_file * 2 STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_skel', +env['COMPONENTS'].append({'target':'llm_skel-0.1', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_sys/SConstruct b/projects/llm_framework/main_sys/SConstruct index 1168d25..0093d21 100644 --- a/projects/llm_framework/main_sys/SConstruct +++ b/projects/llm_framework/main_sys/SConstruct @@ -37,7 +37,7 @@ LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '- STATIC_FILES += [AFile('sys_config.json')] REQUIREMENTS += ['simdjson_component'] -env['COMPONENTS'].append({'target':'llm_sys', +env['COMPONENTS'].append({'target':'llm_sys-1.5', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_sys/src/event_loop.cpp b/projects/llm_framework/main_sys/src/event_loop.cpp index 4f8d312..16aa2a0 100644 --- a/projects/llm_framework/main_sys/src/event_loop.cpp +++ b/projects/llm_framework/main_sys/src/event_loop.cpp @@ -38,7 +38,7 @@ #include "remote_action.h" #include #include "hv/ifconfig.h" - +#include #include "StackFlowUtil.h" void usr_print_error(const std::string &request_id, const std::string &work_id, const std::string &error_msg, @@ -707,7 +707,26 @@ int sys_reset(int com_id, const nlohmann::json &json_obj) 
int sys_version(int com_id, const nlohmann::json &json_obj) { - usr_out(json_obj["request_id"], json_obj["work_id"], std::string("v1.5"), com_id); + usr_out(json_obj["request_id"], json_obj["work_id"], std::string("v1.6"), com_id); + + int out = 0; + return out; +} + +int sys_version2(int com_id, const nlohmann::json &json_obj) +{ + nlohmann::json data_body = nlohmann::json::array(); + glob_t glob_result; + int ret = glob("/opt/m5stack/bin/llm_*-*", GLOB_TILDE, NULL, &glob_result); // 匹配所有.txt文件 + if (ret == 0) { + for (size_t i = 0; i < glob_result.gl_pathc; i++) { + const char *separator = strrchr(glob_result.gl_pathv[i], '/'); + const char *filename = (separator != NULL) ? separator + 1 : glob_result.gl_pathv[i]; + data_body.push_back(std::string(filename)); + } + } + globfree(&glob_result); + usr_out(json_obj["request_id"], json_obj["work_id"], data_body, com_id); int out = 0; return out; } @@ -739,6 +758,7 @@ void server_work() key_sql["sys.rmmode"] = sys_rmmode; key_sql["sys.unit_call"] = sys_unit_call; key_sql["sys.cmminfo"] = sys_cmminfo; + key_sql["sys.version2"] = sys_version2; } void server_stop_work() diff --git a/projects/llm_framework/main_tts/SConstruct b/projects/llm_framework/main_tts/SConstruct index c21e0aa..bba7997 100644 --- a/projects/llm_framework/main_tts/SConstruct +++ b/projects/llm_framework/main_tts/SConstruct @@ -27,7 +27,7 @@ INCLUDE += [ADir('src/runner/eigen-3.4.0'), ADir('src/runner/src/tn/header'), AD STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_tts', +env['COMPONENTS'].append({'target':'llm_tts-1.5', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_vad/SConstruct b/projects/llm_framework/main_vad/SConstruct index f7ad093..0551cae 100644 --- a/projects/llm_framework/main_vad/SConstruct +++ b/projects/llm_framework/main_vad/SConstruct @@ -28,7 +28,7 @@ LDFLAGS += ['-l:libsherpa-onnx-core.a', STATIC_FILES += Glob('mode_*.json') 
-env['COMPONENTS'].append({'target':'llm_vad', +env['COMPONENTS'].append({'target':'llm_vad-1.5', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_vlm/SConstruct b/projects/llm_framework/main_vlm/SConstruct index 4d9e16e..a42d7fb 100644 --- a/projects/llm_framework/main_vlm/SConstruct +++ b/projects/llm_framework/main_vlm/SConstruct @@ -73,7 +73,7 @@ ignore['ignore'] = list(set(ignore['ignore'])) with open('../dist/fileignore', 'w') as f: json.dump(ignore, f, indent=4) -env['COMPONENTS'].append({'target':'llm_vlm', +env['COMPONENTS'].append({'target':'llm_vlm-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_whisper/SConstruct b/projects/llm_framework/main_whisper/SConstruct index c14cf6b..40ab63d 100644 --- a/projects/llm_framework/main_whisper/SConstruct +++ b/projects/llm_framework/main_whisper/SConstruct @@ -33,7 +33,7 @@ LDFLAGS += ['-l:libopencc.a', '-l:libmarisa.a'] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_whisper', +env['COMPONENTS'].append({'target':'llm_whisper-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_yolo/SConstruct b/projects/llm_framework/main_yolo/SConstruct index 8400aa9..e432321 100644 --- a/projects/llm_framework/main_yolo/SConstruct +++ b/projects/llm_framework/main_yolo/SConstruct @@ -39,7 +39,7 @@ STATIC_FILES += Glob('mode_*.json') # AFile('../static_lib/libbz2.so.1.0')] # DEFINITIONS += ["-DENABLE_BACKWARD"] -env['COMPONENTS'].append({'target':'llm_yolo', +env['COMPONENTS'].append({'target':'llm_yolo-1.7', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 05dea8d..d740f43 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -9,6 +9,7 @@ import 
concurrent.futures import json import glob +from datetime import datetime ''' {package_name}_{version}-{revision}_{architecture}.deb lib-llm_1.0-m5stack1_arm64.deb @@ -96,6 +97,7 @@ def create_lib_deb(package_name, version, src_folder, revision = 'm5stack1'): f.write(f'Section: llm-module\n') f.write(f'Priority: optional\n') f.write(f'Homepage: https://www.m5stack.com\n') + f.write(f'Packaged-Date: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n') f.write(f'Description: llm-module\n') f.write(f' bsp.\n') with open(os.path.join(deb_folder, 'DEBIAN/postinst'),'w') as f: @@ -148,7 +150,7 @@ def create_lib_deb(package_name, version, src_folder, revision = 'm5stack1'): shutil.rmtree(deb_folder) return package_name + " creat success!" -def create_data_deb(package_name, version, src_folder, revision = 'm5stack1'): +def create_data_deb(package_name, version, src_folder, revision = 'm5stack1', depends = 'lib-llm (>= 1.6)'): deb_file = f"{package_name}_{version}-{revision}_arm64.deb" deb_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'debian-{}'.format(package_name)) if os.path.exists(deb_folder): @@ -201,12 +203,13 @@ def create_data_deb(package_name, version, src_folder, revision = 'm5stack1'): f.write(f'Original-Maintainer: m5stack \n') f.write(f'Section: llm-module\n') f.write(f'Priority: optional\n') - f.write(f'Depends: lib-llm (>= 1.6)\n') + f.write(f'Depends: {depends}\n') f.write(f'Homepage: https://www.m5stack.com\n') if deb_file.startswith('llm-model-'): deb_name = deb_file[:deb_file.find('_')] old_deb_name = deb_name.replace('model-','').lower() - f.write(f'Conflicts: {old_deb_name}\n') + f.write(f'Conflicts: {old_deb_name}\n') + f.write(f'Packaged-Date: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n') f.write(f'Description: llm-module\n') f.write(f' bsp.\n') with open(os.path.join(deb_folder, 'DEBIAN/postinst'),'w') as f: @@ -222,7 +225,17 @@ def create_data_deb(package_name, version, src_folder, revision = 'm5stack1'): 
shutil.rmtree(deb_folder) return package_name + " creat success!" -def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): +def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', depends = 'lib-llm (>= 1.7)'): + bin_files = glob.glob(os.path.join(src_folder, package_name.replace("-", "_") + "-*")) + version_info = 0.0 + print(os.path.join(src_folder, package_name + "-*")) + if bin_files: + for bin_file in bin_files: + version_info = float(bin_file.split('-')[-1]) + if float(bin_file.split('-')[-1]) > version_info: + version_info = float(bin_file.split('-')[-1]) + version = str(version_info) + deb_file = f"{package_name}_{version}-{revision}_arm64.deb" deb_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'debian-{}'.format(package_name)) # os.makedirs(deb_folder, exist_ok=True) @@ -250,7 +263,11 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): vlm_dir = os.path.join(src_folder, 'vlm') if os.path.exists(vlm_dir): shutil.copytree(vlm_dir, os.path.join(deb_folder, 'opt/m5stack/lib/vlm')) - shutil.copy2(os.path.join(src_folder, package_name.replace("-", "_")), os.path.join(deb_folder, 'opt/m5stack/bin', package_name.replace("-", "_"))) + + bin_file_name = package_name.replace("-", "_") + if version_info != 0.0: + bin_file_name = package_name.replace("-", "_") + f'-{version}' + shutil.copy2(os.path.join(src_folder, bin_file_name), os.path.join(deb_folder, 'opt/m5stack/bin', bin_file_name)) ext_scripts_files = glob.glob(os.path.join(src_folder, package_name + "_*")) if ext_scripts_files: os.makedirs(os.path.join(deb_folder, 'opt/m5stack/scripts'), exist_ok = True) @@ -266,8 +283,9 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): f.write(f'Section: llm-module\n') f.write(f'Priority: optional\n') # f.write(f'Depends: lib-llm\n') - f.write(f'Depends: lib-llm (>= 1.7)\n') + f.write(f'Depends: {depends}\n') f.write(f'Homepage: https://www.m5stack.com\n') 
+ f.write(f'Packaged-Date: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n') f.write(f'Description: llm-module\n') f.write(f' bsp.\n') with open(os.path.join(deb_folder, 'DEBIAN/postinst'),'w') as f: @@ -289,7 +307,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): f.write(f'Requires=llm-sys.service\n') f.write(f'\n') f.write(f'[Service]\n') - f.write(f'ExecStart=/opt/m5stack/bin/{package_name.replace("-", "_")}\n') + f.write(f'ExecStart=/opt/m5stack/bin/{bin_file_name}\n') f.write(f'WorkingDirectory=/opt/m5stack\n') f.write(f'Restart=always\n') f.write(f'RestartSec=1\n') @@ -333,64 +351,22 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1'): else: cpu_count = cpu_count - 2 # cpu_count = 50 -#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ -# 添加新模型版本号从 0.1 版本号开始累加 -# 当单元和前单元不兼容时提升大版本号 -# 当模型和前模型不兼容时提升大版本号 -# 加速单元和模型单元的大版本号保持一致,以有的更新暂不改变,从2025年 04月 03日开始 -# Start adding new model version numbers from the 0.1 version number. -# Increment the major version number when units and previous units are incompatible -# Increment the major version number when models and previous models are incompatible -# Keep the major version numbers of acceleration units and model units consistent, with some updates not changing them, starting from April 3, 2025. 
-#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ -# 添加新模型版本号从 0.1 版本号开始累加 -# 当单元和前单元不兼容时提升大版本号 -# 当模型和前模型不兼容时提升大版本号 -# 加速单元和模型单元的大版本号保持一致,以有的更新暂不改变,从2025年 04月 03日开始 -# Start adding new model version numbers from the 0.1 version number. -# Increment the major version number when units and previous units are incompatible -# Increment the major version number when models and previous models are incompatible -# Keep the major version numbers of acceleration units and model units consistent, with some updates not changing them, starting from April 3, 2025. -#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ -# 添加新模型版本号从 0.1 版本号开始累加 -# 当单元和前单元不兼容时提升大版本号 -# 当模型和前模型不兼容时提升大版本号 -# 加速单元和模型单元的大版本号保持一致,以有的更新暂不改变,从2025年 04月 03日开始 -# Start adding new model version numbers from the 0.1 version number. -# Increment the major version number when units and previous units are incompatible -# Increment the major version number when models and previous models are incompatible -# Keep the major version numbers of acceleration units and model units consistent, with some updates not changing them, starting from April 3, 2025. 
-#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ -#################################################注意################################################ Tasks = { - 'lib-llm':[create_lib_deb,'lib-llm', 1.7, src_folder, revision], + 'lib-llm':[create_lib_deb,'lib-llm', '1.7', src_folder, revision], 'llm-sys':[create_bin_deb,'llm-sys', version, src_folder, revision], 'llm-audio':[create_bin_deb,'llm-audio', version, src_folder, revision], 'llm-kws':[create_bin_deb,'llm-kws', '1.6', src_folder, revision], 'llm-asr':[create_bin_deb,'llm-asr', version, src_folder, revision], 'llm-llm':[create_bin_deb,'llm-llm', '1.7', src_folder, revision], 'llm-tts':[create_bin_deb,'llm-tts', version, src_folder, revision], - 'llm-melotts':[create_bin_deb,'llm-melotts', version, src_folder, revision], - 'llm-camera':[create_bin_deb,'llm-camera', '1.6', src_folder, revision], + 'llm-melotts':[create_bin_deb,'llm-melotts', '1.6', src_folder, revision], + 'llm-camera':[create_bin_deb,'llm-camera', '1.7', src_folder, revision, 'lib-llm'], 'llm-vlm':[create_bin_deb,'llm-vlm', '1.6', src_folder, revision], 'llm-yolo':[create_bin_deb,'llm-yolo', '1.6', src_folder, revision], 'llm-skel':[create_bin_deb,'llm-skel', version, src_folder, revision], 'llm-depth-anything':[create_bin_deb,'llm-depth-anything', version, src_folder, revision], 'llm-vad':[create_bin_deb,'llm-vad', version, src_folder, revision], - 'llm-whisper':[create_bin_deb,'llm-whisper', version, src_folder, revision], + 'llm-whisper':[create_bin_deb,'llm-whisper', '1.6', src_folder, revision], 'llm-openai-api':[create_bin_deb,'llm-openai-api', '1.6', src_folder, revision], 
'llm-model-audio-en-us':[create_data_deb,'llm-model-audio-en-us', data_version, src_folder, revision], 'llm-model-audio-zh-cn':[create_data_deb,'llm-model-audio-zh-cn', data_version, src_folder, revision], From 23cc53d9ef2fb5cd7b0c21412aad268ce02ad0f5 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Fri, 18 Apr 2025 18:48:40 +0800 Subject: [PATCH 29/64] [update] Update import package method --- benchmark/benchmodulellm.py | 2 +- benchmark/utils/__init__.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 benchmark/utils/__init__.py diff --git a/benchmark/benchmodulellm.py b/benchmark/benchmodulellm.py index 00dfcd0..7934b58 100644 --- a/benchmark/benchmodulellm.py +++ b/benchmark/benchmodulellm.py @@ -7,7 +7,7 @@ from pathlib import Path -from utils.llm import LLMClient +from utils import LLMClient FILE = Path(__file__).resolve() ROOT = FILE.parents[0] diff --git a/benchmark/utils/__init__.py b/benchmark/utils/__init__.py new file mode 100644 index 0000000..5714a53 --- /dev/null +++ b/benchmark/utils/__init__.py @@ -0,0 +1,3 @@ +from .llm import LLMClient + +__all__ = ["LLMClient"] \ No newline at end of file From 66a47708d1f650972fead3bffc18cd988ef34f36 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Mon, 21 Apr 2025 11:51:16 +0800 Subject: [PATCH 30/64] [update] Update model name --- benchmark/utils/llm.py | 10 +--- .../llm_depth_anything_en.md | 2 +- .../llm_depth_anything_zh.md | 2 +- .../llm_melotts_en.md | 8 ++-- .../llm_melotts_zh.md | 8 ++-- doc/projects_llm_framework_doc/llm_tts_en.md | 47 +++++++++++++++++-- doc/projects_llm_framework_doc/llm_tts_zh.md | 46 ++++++++++++++++-- .../mode_depth-anything-ax630c.json | 4 +- .../main_melotts/mode_melotts-zh-cn.json | 4 +- .../mode_single-speaker-english-fast.json | 2 +- .../main_tts/mode_single-speaker-fast.json | 2 +- .../main_vad/mode_silero-vad.json | 2 +- projects/llm_framework/tools/llm_pack.py | 40 ++++++++-------- 13 files changed, 123 insertions(+), 54 deletions(-) diff --git 
a/benchmark/utils/llm.py b/benchmark/utils/llm.py index e2d6a0b..bf77670 100644 --- a/benchmark/utils/llm.py +++ b/benchmark/utils/llm.py @@ -30,7 +30,6 @@ def send_request_stream(self, request): while True: chunk = self.sock.recv(4096) - logging.info(f"Received chunk: {chunk}") response += chunk while b'\n' in response: @@ -149,21 +148,14 @@ def exit(self): def test(self, model, input_text): logging.info("Setting up...") setup_response = self.setup(model) - logging.info("Setup response: %s", setup_response) logging.info("Running inference...") inference_result = self.inference(input_text) - logging.info("Inference result: %s", inference_result) logging.info("Exiting...") exit_response = self.exit() - logging.info("Exit response: %s", exit_response) - return { - "setup_response": setup_response, - "inference_result": inference_result, - "exit_response": exit_response - } + return {} if __name__ == "__main__": host = "192.168.20.186" diff --git a/doc/projects_llm_framework_doc/llm_depth_anything_en.md b/doc/projects_llm_framework_doc/llm_depth_anything_en.md index 301cc54..b1d0b51 100644 --- a/doc/projects_llm_framework_doc/llm_depth_anything_en.md +++ b/doc/projects_llm_framework_doc/llm_depth_anything_en.md @@ -27,7 +27,7 @@ Send JSON: - work_id: When configuring the unit, it is `depth_anything`. - action: The method called is `setup`. - object: The data type being transmitted is `depth_anything.setup`. -- model: The model used is the `depth_anything` model. +- model: The model used is the `depth-anything-ax630c` model. - response_format: The return result is `jpeg.base64.stream`. - input: The input is `camera.1001`, which refers to the input from the camera unit, as detailed in the camera unit documentation. 
diff --git a/doc/projects_llm_framework_doc/llm_depth_anything_zh.md b/doc/projects_llm_framework_doc/llm_depth_anything_zh.md index 846ca34..85f07f4 100644 --- a/doc/projects_llm_framework_doc/llm_depth_anything_zh.md +++ b/doc/projects_llm_framework_doc/llm_depth_anything_zh.md @@ -27,7 +27,7 @@ depth_anything 视觉单元,用于提供图片深度信息。 - work_id:配置单元时,为 `depth_anything`。 - action:调用的方法为 `setup`。 - object:传输的数据类型为 `depth_anything.setup`。 -- model:使用的模型为 `depth_anything` 模型。 +- model:使用的模型为 `depth-anything-ax630c` 模型。 - response_format:返回结果为 `jpeg.base64.stream`。 - input:输入的为 `camera.1001`,代表的是从 camera 单元内部输入,详见 camera 单位文档。 - enoutput:是否启用用户结果输出。 diff --git a/doc/projects_llm_framework_doc/llm_melotts_en.md b/doc/projects_llm_framework_doc/llm_melotts_en.md index f45eb0e..d488deb 100644 --- a/doc/projects_llm_framework_doc/llm_melotts_en.md +++ b/doc/projects_llm_framework_doc/llm_melotts_en.md @@ -16,7 +16,7 @@ Send JSON: "action": "setup", "object": "melotts.setup", "data": { - "model": "melotts_zh-cn", + "model": "melotts-zh-cn", "response_format": "sys.pcm", "input": "tts.utf-8", "enoutput": false @@ -28,7 +28,7 @@ Send JSON: - work_id: For configuration, it is `melotts`. - action: The method to be called is `setup`. - object: The data type being transmitted is `melotts.setup`. -- model: The model being used is the Chinese model `melotts_zh-cn`. +- model: The model being used is the Chinese model `melotts-zh-cn`. - response_format: The result is returned as `sys.pcm`, system audio data, which is directly sent to the llm-audio module for playback. - input: The input is `tts.utf-8`, representing user input. 
@@ -139,7 +139,7 @@ Example: "action": "setup", "object": "melotts.setup", "data": { - "model": "melotts_zh-cn", + "model": "melotts-zh-cn", "response_format": "sys.pcm", "input": [ "tts.utf-8", @@ -335,7 +335,7 @@ Response JSON: "inputs_": [ "tts.utf-8" ], - "model": "melotts_zh-cn", + "model": "melotts-zh-cn", "response_format": "sys.pcm" }, "error": { diff --git a/doc/projects_llm_framework_doc/llm_melotts_zh.md b/doc/projects_llm_framework_doc/llm_melotts_zh.md index 7d23b51..8c86e82 100644 --- a/doc/projects_llm_framework_doc/llm_melotts_zh.md +++ b/doc/projects_llm_framework_doc/llm_melotts_zh.md @@ -15,7 +15,7 @@ "action": "setup", "object": "melotts.setup", "data": { - "model": "melotts_zh-cn", + "model": "melotts-zh-cn", "response_format": "sys.pcm", "input": "tts.utf-8", "enoutput": false @@ -27,7 +27,7 @@ - work_id:配置单元时,为 `melotts`。 - action:调用的方法为 `setup`。 - object:传输的数据类型为 `melotts.setup`。 -- model:使用的模型为 `melotts_zh-cn` 中文模型。 +- model:使用的模型为 `melotts-zh-cn` 中文模型。 - response_format:返回结果为 `sys.pcm`, 系统音频数据,并直接发送到 llm-audio 模块进行播放。 - input:输入的为 `tts.utf-8`,代表的是从用户输入。 - enoutput:是否起用用户结果输出。 @@ -134,7 +134,7 @@ error::code 为 0 表示执行成功。 "action": "setup", "object": "melotts.setup", "data": { - "model": "melotts_zh-cn", + "model": "melotts-zh-cn", "response_format": "sys.pcm", "input": [ "tts.utf-8", @@ -328,7 +328,7 @@ error::code 为 0 表示执行成功。 "inputs_": [ "tts.utf-8" ], - "model": "melotts_zh-cn", + "model": "melotts-zh-cn", "response_format": "sys.pcm" }, "error": { diff --git a/doc/projects_llm_framework_doc/llm_tts_en.md b/doc/projects_llm_framework_doc/llm_tts_en.md index 7833828..61d0e45 100644 --- a/doc/projects_llm_framework_doc/llm_tts_en.md +++ b/doc/projects_llm_framework_doc/llm_tts_en.md @@ -16,7 +16,7 @@ Send JSON: "action": "setup", "object": "tts.setup", "data": { - "model": "single_speaker_fast", + "model": "single-speaker-english-fast", "response_format": "sys.pcm", "input": "tts.utf-8", "enoutput": false @@ -28,7 +28,7 @@ Send JSON: - 
work_id: For configuring the unit, it is `tts`. - action: The method to call is `setup`. - object: The type of data being transmitted is `tts.setup`. -- model: The model used is the `single_speaker_fast` Chinese model. +- model: The model used is the `single-speaker-english-fast` English model. - response_format: The returned result is `sys.pcm`, system audio data, which is directly sent to the llm-audio module for playback. - input: Input is `tts.utf-8`, representing user input. @@ -46,13 +46,50 @@ Response JSON: }, "object": "None", "request_id": "2", - "work_id": "llm.1003" + "work_id": "tts.1003" } ``` - created: Message creation time, in Unix time. - work_id: The successfully created work_id unit. +## inference + +### streaming input + +```json +{ + "request_id": "2", + "work_id": "tts.1003", + "action": "inference", + "object": "tts.utf-8.stream", + "data": { + "delta": "What's ur name?", + "index": 0, + "finish": true + } +} +``` +- object: The data type transmitted is tts.utf-8.stream, indicating a streaming input from the user's UTF-8. +- delta: Segment data of the streaming input. +- index: Index of the segment in the streaming input. +- finish: A flag indicating whether the streaming input has completed. + +### non-streaming input + +```json +{ + "request_id": "2", + "work_id": "tts.1003", + "action": "inference", + "object": "tts.utf-8", + "data": "What's ur name?" +} +``` + +- object: The data type transmitted is tts.utf-8, indicating a non-streaming input from the user's UTF-8. +- data: Data for non-streaming input. + ## link Link the output of the upper unit. 
@@ -102,7 +139,7 @@ Example: "action": "setup", "object": "tts.setup", "data": { - "model": "single_speaker_fast", + "model": "single-speaker-fast", "response_format": "sys.pcm", "input": [ "tts.utf-8", @@ -298,7 +335,7 @@ Response JSON: "inputs_": [ "tts.utf-8" ], - "model": "single_speaker_fast", + "model": "single-speaker-fast", "response_format": "sys.pcm" }, "error": { diff --git a/doc/projects_llm_framework_doc/llm_tts_zh.md b/doc/projects_llm_framework_doc/llm_tts_zh.md index 719d4fb..7e4ab18 100644 --- a/doc/projects_llm_framework_doc/llm_tts_zh.md +++ b/doc/projects_llm_framework_doc/llm_tts_zh.md @@ -15,7 +15,7 @@ "action": "setup", "object": "tts.setup", "data": { - "model": "single_speaker_fast", + "model": "single-speaker-fast", "response_format": "sys.pcm", "input": "tts.utf-8", "enoutput": false @@ -27,7 +27,7 @@ - work_id:配置单元时,为 `tts`。 - action:调用的方法为 `setup`。 - object:传输的数据类型为 `tts.setup`。 -- model:使用的模型为 `single_speaker_fast` 中文模型。 +- model:使用的模型为 `single-speaker-fast` 中文模型。 - response_format:返回结果为 `sys.pcm`, 系统音频数据,并直接发送到 llm-audio 模块进行播放。 - input:输入的为 `tts.utf-8`,代表的是从用户输入。 - enoutput:是否起用用户结果输出。 @@ -44,13 +44,49 @@ }, "object": "None", "request_id": "2", - "work_id": "llm.1003" + "work_id": "tts.1003" } ``` - created:消息创建时间,unix 时间。 - work_id:返回成功创建的 work_id 单元。 +## inference + +### 流式输入 + +```json +{ + "request_id": "2", + "work_id": "tts.1003", + "action": "inference", + "object": "tts.utf-8.stream", + "data": { + "delta": "今天天气真好!", + "index": 0, + "finish": true + } +} +``` +- object:传输的数据类型为 `tts.utf-8.stream` 代表的是从用户 utf-8 的流式输入 +- delta:流式输入的分段数据 +- index:流式输入的分段索引 +- finish:流式输入是否完成的标志位 + +### 非流式输入 + +```json +{ + "request_id": "2", + "work_id": "tts.1003", + "action": "inference", + "object": "tts.utf-8", + "data": "今天天气真好!" 
+} +``` +- object:传输的数据类型为 `tts.utf-8` 代表的是从用户 utf-8 的非流式输入 +- data:非流式输入的数据 + ## link 链接上级单元的输出。 @@ -98,7 +134,7 @@ error::code 为 0 表示执行成功。 "action": "setup", "object": "tts.setup", "data": { - "model": "single_speaker_fast", + "model": "single-speaker-fast", "response_format": "sys.pcm", "input": [ "tts.utf-8", @@ -294,7 +330,7 @@ error::code 为 0 表示执行成功。 "inputs_": [ "tts.utf-8" ], - "model": "single_speaker_fast", + "model": "single-speaker-fast", "response_format": "sys.pcm" }, "error": { diff --git a/projects/llm_framework/main_depth_anything/mode_depth-anything-ax630c.json b/projects/llm_framework/main_depth_anything/mode_depth-anything-ax630c.json index ddf0a6c..fd91eb3 100644 --- a/projects/llm_framework/main_depth_anything/mode_depth-anything-ax630c.json +++ b/projects/llm_framework/main_depth_anything/mode_depth-anything-ax630c.json @@ -1,7 +1,9 @@ { - "mode":"depth_anything", + "mode":"depth-anything-ax630c", "type":"cv", "homepage":"https://github.com/DepthAnything/Depth-Anything-V2", + "compile_flage":"pulsar2 build --input depth_anything.onnx --config config.json --output_dir output --output_name depth_anything.axmodel --target_hardware AX620E", + "pulsar_version":"2.5-2d5d0fa3", "capabilities":[ "Segmentation" ], diff --git a/projects/llm_framework/main_melotts/mode_melotts-zh-cn.json b/projects/llm_framework/main_melotts/mode_melotts-zh-cn.json index e7af2c5..ee9f57c 100644 --- a/projects/llm_framework/main_melotts/mode_melotts-zh-cn.json +++ b/projects/llm_framework/main_melotts/mode_melotts-zh-cn.json @@ -1,7 +1,9 @@ { - "mode": "melotts_zh-cn", + "mode": "melotts-zh-cn", "type": "tts", "homepage":"https://huggingface.co/myshell-ai/MeloTTS-Chinese", + "compile_flage":"pulsar2 build --input decoder.onnx --config config_decoder_u16.json --output_dir decoder --output_name decoder.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0", + "pulsar_version":"3.2-99f14d0a", "capabilities": [ "tts", "Chinese" diff --git 
a/projects/llm_framework/main_tts/mode_single-speaker-english-fast.json b/projects/llm_framework/main_tts/mode_single-speaker-english-fast.json index e311a4a..cbaccaa 100644 --- a/projects/llm_framework/main_tts/mode_single-speaker-english-fast.json +++ b/projects/llm_framework/main_tts/mode_single-speaker-english-fast.json @@ -1,5 +1,5 @@ { - "mode": "single_speaker_english_fast", + "mode": "single-speaker-english-fast", "type": "tts", "homepage":"https://github.com/huakunyang/SummerTTS", "capabilities": [ diff --git a/projects/llm_framework/main_tts/mode_single-speaker-fast.json b/projects/llm_framework/main_tts/mode_single-speaker-fast.json index ca1ede7..6fadf2f 100644 --- a/projects/llm_framework/main_tts/mode_single-speaker-fast.json +++ b/projects/llm_framework/main_tts/mode_single-speaker-fast.json @@ -1,5 +1,5 @@ { - "mode": "single_speaker_fast", + "mode": "single-speaker-fast", "type": "tts", "homepage":"https://github.com/huakunyang/SummerTTS", "capabilities": [ diff --git a/projects/llm_framework/main_vad/mode_silero-vad.json b/projects/llm_framework/main_vad/mode_silero-vad.json index d262af4..a4d8eae 100644 --- a/projects/llm_framework/main_vad/mode_silero-vad.json +++ b/projects/llm_framework/main_vad/mode_silero-vad.json @@ -1,5 +1,5 @@ { - "mode": "silero_vad", + "mode": "silero-vad", "type": "vad", "homepage":"https://github.com/snakers4/silero-vad", "capabilities": [ diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 38ca64e..21c3831 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -69,23 +69,23 @@ def create_lib_deb(package_name, version, src_folder, revision = 'm5stack1'): # if os.path.exists(zip_file_extrpath): # shutil.copytree(zip_file_extrpath, os.path.join(deb_folder, 'opt/m5stack/scripts')) - # zip_file = 'm5stack_dist-packages.tar.gz' - # down_url = 
'https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_dist-packages.tar.gz' - # zip_file_extrpath = 'm5stack_dist-packages' - # if not os.path.exists(zip_file_extrpath): - # # Downloading via HTTP (more common) - # if not os.path.exists(zip_file): - # response = requests.get(down_url) - # if response.status_code == 200: - # with open(zip_file, 'wb') as file: - # file.write(response.content) - # else: - # print("{} down failed".format(down_url)) - # with tarfile.open(zip_file, 'r:gz') as tar: - # tar.extractall(path=zip_file_extrpath) - # print("The {} download successful.".format(down_url)) - # if os.path.exists(zip_file_extrpath): - # shutil.copytree(zip_file_extrpath, os.path.join(deb_folder, 'usr/local/lib/python3.10/dist-packages')) + zip_file = 'm5stack_dist-packages.tar.gz' + down_url = 'https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_dist-packages.tar.gz' + zip_file_extrpath = 'm5stack_dist-packages' + if not os.path.exists(zip_file_extrpath): + # Downloading via HTTP (more common) + if not os.path.exists(zip_file): + response = requests.get(down_url) + if response.status_code == 200: + with open(zip_file, 'wb') as file: + file.write(response.content) + else: + print("{} down failed".format(down_url)) + with tarfile.open(zip_file, 'r:gz') as tar: + tar.extractall(path=zip_file_extrpath) + print("The {} download successful.".format(down_url)) + if os.path.exists(zip_file_extrpath): + shutil.copytree(zip_file_extrpath, os.path.join(deb_folder, 'usr/local/lib/python3.10/dist-packages')) os.makedirs(os.path.join(deb_folder, 'DEBIAN'), exist_ok = True) with open(os.path.join(deb_folder, 'DEBIAN/control'),'w') as f: @@ -374,9 +374,9 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep 'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23', data_version, src_folder, revision], 
'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01', '0.3', src_folder, revision], 'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01':[create_data_deb,'llm-model-sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01', '0.3', src_folder, revision], - 'llm-model-single-speaker-english-fast':[create_data_deb,'llm-model-single-speaker-english-fast', data_version, src_folder, revision], - 'llm-model-single-speaker-fast':[create_data_deb,'llm-model-single-speaker-fast', data_version, src_folder, revision], - 'llm-model-melotts-zh-cn':[create_data_deb,'llm-model-melotts-zh-cn', '0.4', src_folder, revision], + 'llm-model-single-speaker-english-fast':[create_data_deb,'llm-model-single-speaker-english-fast', '0.3', src_folder, revision], + 'llm-model-single-speaker-fast':[create_data_deb,'llm-model-single-speaker-fast', '0.3', src_folder, revision], + 'llm-model-melotts-zh-cn':[create_data_deb,'llm-model-melotts-zh-cn', '0.5', src_folder, revision], 'llm-model-yolo11n':[create_data_deb,'llm-model-yolo11n', data_version, src_folder, revision], 'llm-model-yolo11n-pose':[create_data_deb,'llm-model-yolo11n-pose', '0.3', src_folder, revision], 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], From 783200e804dbbd266766524e4a2149b328809ed6 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Mon, 21 Apr 2025 15:23:17 +0800 Subject: [PATCH 31/64] [update] Update openai_api version --- projects/llm_framework/main_openai_api/SConstruct | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/llm_framework/main_openai_api/SConstruct b/projects/llm_framework/main_openai_api/SConstruct index 17adecc..db38738 100644 --- a/projects/llm_framework/main_openai_api/SConstruct +++ b/projects/llm_framework/main_openai_api/SConstruct @@ -18,8 +18,8 @@ LINK_SEARCH_PATH = [] STATIC_FILES = [] 
-ModuleLLMOpenAIPluginPath = wget_github_commit('https://github.com/m5stack/ModuleLLM-OpenAI-Plugin.git', '33477071ca362d20cd65cfad43f5b05480724711', True) -python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-openai-api-python-venv_v1.5.tar.gz", 'm5stack_llm-openai-api-python-venv_v1.5.tar.gz') +ModuleLLMOpenAIPluginPath = wget_github_commit('https://github.com/m5stack/ModuleLLM-OpenAI-Plugin.git', '6f783440eb2b4ff6566c171ffc2815cb64024928', True) +python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-openai-api-python-venv_v1.6.tar.gz", 'm5stack_llm-openai-api-python-venv_v1.6.tar.gz') DEFINITIONS += ['-O3', '-fopenmp', '-std=c++17'] From 8f2643d27076c449171122df5f32d02c668e7580 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Mon, 21 Apr 2025 15:49:18 +0800 Subject: [PATCH 32/64] [update] update llm-asr doc --- doc/projects_llm_framework_doc/llm_asr_en.md | 6 +++--- doc/projects_llm_framework_doc/llm_asr_zh.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/projects_llm_framework_doc/llm_asr_en.md b/doc/projects_llm_framework_doc/llm_asr_en.md index 8e96f25..282bb68 100644 --- a/doc/projects_llm_framework_doc/llm_asr_en.md +++ b/doc/projects_llm_framework_doc/llm_asr_en.md @@ -119,9 +119,9 @@ Example: "endpoint_config.rule1.min_trailing_silence": 2.4, "endpoint_config.rule2.min_trailing_silence": 1.2, "endpoint_config.rule3.min_trailing_silence": 30.1, - "endpoint_config.rule1.must_contain_nonsilence": false, - "endpoint_config.rule2.must_contain_nonsilence": false, - "endpoint_config.rule3.must_contain_nonsilence": false + "endpoint_config.rule1.must_contain_nonsilence": true, + "endpoint_config.rule2.must_contain_nonsilence": true, + "endpoint_config.rule3.must_contain_nonsilence": true } } ``` diff --git a/doc/projects_llm_framework_doc/llm_asr_zh.md b/doc/projects_llm_framework_doc/llm_asr_zh.md index e139f28..97c8e90 100644 
--- a/doc/projects_llm_framework_doc/llm_asr_zh.md +++ b/doc/projects_llm_framework_doc/llm_asr_zh.md @@ -117,9 +117,9 @@ error::code 为 0 表示执行成功。 "endpoint_config.rule1.min_trailing_silence": 2.4, "endpoint_config.rule2.min_trailing_silence": 1.2, "endpoint_config.rule3.min_trailing_silence": 30.1, - "endpoint_config.rule1.must_contain_nonsilence": false, - "endpoint_config.rule2.must_contain_nonsilence": false, - "endpoint_config.rule3.must_contain_nonsilence": false + "endpoint_config.rule1.must_contain_nonsilence": true, + "endpoint_config.rule2.must_contain_nonsilence": true, + "endpoint_config.rule3.must_contain_nonsilence": true } } ``` From 14d7e5b01b31b804b08d618fa98be1652cb2bb76 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Mon, 21 Apr 2025 16:55:48 +0800 Subject: [PATCH 33/64] [fix] llm_whisper --- projects/llm_framework/main_whisper/src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/llm_framework/main_whisper/src/main.cpp b/projects/llm_framework/main_whisper/src/main.cpp index 776408b..73515f1 100644 --- a/projects/llm_framework/main_whisper/src/main.cpp +++ b/projects/llm_framework/main_whisper/src/main.cpp @@ -316,7 +316,7 @@ class llm_task { if (endpoint_flage_) return; } endpoint_flage_ = true; - buffer_resize(pcmdata, 0); + if (delay_audio_frame_ == 0) buffer_resize(pcmdata, 0); buffer_write_char(pcmdata, raw.c_str(), raw.length()); buffer_position_set(pcmdata, 0); count = 0; From 4b477eeac3ee351f3d8fb83ecc5d90c83c220230 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 22 Apr 2025 15:50:00 +0800 Subject: [PATCH 34/64] [update] Add melotts-en-us model --- .../main_melotts/mode_melotts-en-us.json | 29 +++++++++++++++++++ .../llm_framework/main_melotts/src/main.cpp | 13 ++++++--- projects/llm_framework/tools/llm_pack.py | 1 + 3 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 projects/llm_framework/main_melotts/mode_melotts-en-us.json diff --git 
a/projects/llm_framework/main_melotts/mode_melotts-en-us.json b/projects/llm_framework/main_melotts/mode_melotts-en-us.json new file mode 100644 index 0000000..6a375c9 --- /dev/null +++ b/projects/llm_framework/main_melotts/mode_melotts-en-us.json @@ -0,0 +1,29 @@ +{ + "mode": "melotts-en-us", + "type": "tts", + "homepage":"https://huggingface.co/myshell-ai/MeloTTS-English", + "compile_flage":"pulsar2 build --input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder-en --output_name decoder-en.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0", + "pulsar_version":"3.4-3dfd5692", + "capabilities": [ + "tts", + "English" + ], + "input_type": [ + "tts.utf-8" + ], + "output_type": [ + "tts.wav", + "sys.play.0_1" + ], + "mode_param": { + "encoder": "encoder-en.ort", + "decoder": "decoder-en.axmodel", + "gbin": "g-en.bin", + "tokens": "tokens.txt", + "lexicon": "lexicon.txt", + "spacker_speed": 1.0, + "mode_rate": 44100, + "audio_rate": 16000, + "awake_delay": 1000 + } +} \ No newline at end of file diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp index 9890fac..b5a27cb 100644 --- a/projects/llm_framework/main_melotts/src/main.cpp +++ b/projects/llm_framework/main_melotts/src/main.cpp @@ -37,6 +37,7 @@ static std::string base_model_path_; static std::string base_model_config_path_; typedef struct { + std::string mode; std::string encoder; std::string decoder; std::string lexicon; @@ -112,6 +113,9 @@ class llm_task { return false; } + std::unordered_map MELOTTS_LANG_IDS_MAP{ + {"melotts-ja-jp", 1}, {"melotts-en-us", 2}, {"melotts_zh-cn", 3}, {"melotts-zh-cn", 3}}; + std::vector intersperse(const std::vector &lst, int item) { std::vector result(lst.size() * 2 + 1, item); @@ -251,10 +255,11 @@ class llm_task { std::vector phones_bef, tones_bef; lexicon_->convert(msg_str, phones_bef, tones_bef); // Add blank between words - auto phones = intersperse(phones_bef, 0); - auto 
tones = intersperse(tones_bef, 0); - int phone_len = phones.size(); - std::vector langids(phone_len, 3); + auto phones = intersperse(phones_bef, 0); + auto tones = intersperse(tones_bef, 0); + int phone_len = phones.size(); + int MELOTTS_LANG_IDS = MELOTTS_LANG_IDS_MAP[mode_config_.mode]; + std::vector langids(phone_len, MELOTTS_LANG_IDS); auto encoder_output = encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w, mode_config_.get_length_scale(), mode_config_.sdp_ratio); diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 21c3831..808771e 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -377,6 +377,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep 'llm-model-single-speaker-english-fast':[create_data_deb,'llm-model-single-speaker-english-fast', '0.3', src_folder, revision], 'llm-model-single-speaker-fast':[create_data_deb,'llm-model-single-speaker-fast', '0.3', src_folder, revision], 'llm-model-melotts-zh-cn':[create_data_deb,'llm-model-melotts-zh-cn', '0.5', src_folder, revision], + 'llm-model-melotts-en-us':[create_data_deb,'llm-model-melotts-en-us', '0.5', src_folder, revision], 'llm-model-yolo11n':[create_data_deb,'llm-model-yolo11n', data_version, src_folder, revision], 'llm-model-yolo11n-pose':[create_data_deb,'llm-model-yolo11n-pose', '0.3', src_folder, revision], 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], From aa3441d307aad9725e6293f523b4b9b545d95f82 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 22 Apr 2025 17:14:49 +0800 Subject: [PATCH 35/64] [update] Update package version --- projects/llm_framework/main_asr/SConstruct | 2 +- .../main_depth_anything/SConstruct | 2 +- projects/llm_framework/main_kws/SConstruct | 2 +- projects/llm_framework/main_melotts/SConstruct | 2 +- 
.../llm_framework/main_openai_api/SConstruct | 2 +- projects/llm_framework/main_vad/SConstruct | 2 +- projects/llm_framework/main_vlm/SConstruct | 2 +- projects/llm_framework/main_whisper/SConstruct | 2 +- projects/llm_framework/tools/llm_pack.py | 18 +++++++++--------- 9 files changed, 17 insertions(+), 17 deletions(-) diff --git a/projects/llm_framework/main_asr/SConstruct b/projects/llm_framework/main_asr/SConstruct index 496838c..344162b 100644 --- a/projects/llm_framework/main_asr/SConstruct +++ b/projects/llm_framework/main_asr/SConstruct @@ -26,7 +26,7 @@ REQUIREMENTS += ['ncnn', 'sherpa-ncnn-core'] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_asr-1.5', +env['COMPONENTS'].append({'target':'llm_asr-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_depth_anything/SConstruct b/projects/llm_framework/main_depth_anything/SConstruct index de14d98..a90887f 100644 --- a/projects/llm_framework/main_depth_anything/SConstruct +++ b/projects/llm_framework/main_depth_anything/SConstruct @@ -30,7 +30,7 @@ STATIC_LIB += static_file * 2 STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_depth_anything-1.5', +env['COMPONENTS'].append({'target':'llm_depth_anything-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_kws/SConstruct b/projects/llm_framework/main_kws/SConstruct index 9b4dee4..c09ca41 100644 --- a/projects/llm_framework/main_kws/SConstruct +++ b/projects/llm_framework/main_kws/SConstruct @@ -55,7 +55,7 @@ ignore['ignore'] = list(set(ignore['ignore'])) with open('../dist/fileignore', 'w') as f: json.dump(ignore, f, indent=4) -env['COMPONENTS'].append({'target':'llm_kws-1.6', +env['COMPONENTS'].append({'target':'llm_kws-1.7', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_melotts/SConstruct 
b/projects/llm_framework/main_melotts/SConstruct index d3c3cba..6663ca3 100644 --- a/projects/llm_framework/main_melotts/SConstruct +++ b/projects/llm_framework/main_melotts/SConstruct @@ -31,7 +31,7 @@ LDFLAGS += ['-l:libcargs.a', '-l:libonnxruntime.a'] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_melotts-1.6', +env['COMPONENTS'].append({'target':'llm_melotts-1.7', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_openai_api/SConstruct b/projects/llm_framework/main_openai_api/SConstruct index db38738..490cebc 100644 --- a/projects/llm_framework/main_openai_api/SConstruct +++ b/projects/llm_framework/main_openai_api/SConstruct @@ -52,7 +52,7 @@ ignore['ignore'] = list(set(ignore['ignore'])) with open('../dist/fileignore', 'w') as f: json.dump(ignore, f, indent=4) -env['COMPONENTS'].append({'target':'llm_openai_api-1.6', +env['COMPONENTS'].append({'target':'llm_openai_api-1.7', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_vad/SConstruct b/projects/llm_framework/main_vad/SConstruct index 0551cae..2493e15 100644 --- a/projects/llm_framework/main_vad/SConstruct +++ b/projects/llm_framework/main_vad/SConstruct @@ -28,7 +28,7 @@ LDFLAGS += ['-l:libsherpa-onnx-core.a', STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_vad-1.5', +env['COMPONENTS'].append({'target':'llm_vad-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_vlm/SConstruct b/projects/llm_framework/main_vlm/SConstruct index a42d7fb..d1046f2 100644 --- a/projects/llm_framework/main_vlm/SConstruct +++ b/projects/llm_framework/main_vlm/SConstruct @@ -73,7 +73,7 @@ ignore['ignore'] = list(set(ignore['ignore'])) with open('../dist/fileignore', 'w') as f: json.dump(ignore, f, indent=4) -env['COMPONENTS'].append({'target':'llm_vlm-1.6', 
+env['COMPONENTS'].append({'target':'llm_vlm-1.7', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_whisper/SConstruct b/projects/llm_framework/main_whisper/SConstruct index 40ab63d..4c61edc 100644 --- a/projects/llm_framework/main_whisper/SConstruct +++ b/projects/llm_framework/main_whisper/SConstruct @@ -33,7 +33,7 @@ LDFLAGS += ['-l:libopencc.a', '-l:libmarisa.a'] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_whisper-1.6', +env['COMPONENTS'].append({'target':'llm_whisper-1.7', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 808771e..d32adfd 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -355,19 +355,19 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep 'lib-llm':[create_lib_deb,'lib-llm', '1.7', src_folder, revision], 'llm-sys':[create_bin_deb,'llm-sys', version, src_folder, revision], 'llm-audio':[create_bin_deb,'llm-audio', version, src_folder, revision], - 'llm-kws':[create_bin_deb,'llm-kws', '1.6', src_folder, revision], - 'llm-asr':[create_bin_deb,'llm-asr', version, src_folder, revision], + 'llm-kws':[create_bin_deb,'llm-kws', '1.7', src_folder, revision], + 'llm-asr':[create_bin_deb,'llm-asr', '1.6', src_folder, revision], 'llm-llm':[create_bin_deb,'llm-llm', '1.7', src_folder, revision], 'llm-tts':[create_bin_deb,'llm-tts', version, src_folder, revision], - 'llm-melotts':[create_bin_deb,'llm-melotts', '1.6', src_folder, revision], + 'llm-melotts':[create_bin_deb,'llm-melotts', '1.7', src_folder, revision], 'llm-camera':[create_bin_deb,'llm-camera', '1.7', src_folder, revision, 'lib-llm'], - 'llm-vlm':[create_bin_deb,'llm-vlm', '1.6', src_folder, revision], - 'llm-yolo':[create_bin_deb,'llm-yolo', '1.6', src_folder, revision], + 
'llm-vlm':[create_bin_deb,'llm-vlm', '1.7', src_folder, revision], + 'llm-yolo':[create_bin_deb,'llm-yolo', '1.7', src_folder, revision], 'llm-skel':[create_bin_deb,'llm-skel', version, src_folder, revision], - 'llm-depth-anything':[create_bin_deb,'llm-depth-anything', version, src_folder, revision], - 'llm-vad':[create_bin_deb,'llm-vad', version, src_folder, revision], - 'llm-whisper':[create_bin_deb,'llm-whisper', '1.6', src_folder, revision], - 'llm-openai-api':[create_bin_deb,'llm-openai-api', '1.6', src_folder, revision], + 'llm-depth-anything':[create_bin_deb,'llm-depth-anything', '1.6', src_folder, revision], + 'llm-vad':[create_bin_deb,'llm-vad', '1.6', src_folder, revision], + 'llm-whisper':[create_bin_deb,'llm-whisper', '1.7', src_folder, revision], + 'llm-openai-api':[create_bin_deb,'llm-openai-api', '1.7', src_folder, revision], 'llm-model-audio-en-us':[create_data_deb,'llm-model-audio-en-us', data_version, src_folder, revision], 'llm-model-audio-zh-cn':[create_data_deb,'llm-model-audio-zh-cn', data_version, src_folder, revision], 'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17':[create_data_deb,'llm-model-sherpa-ncnn-streaming-zipformer-20M-2023-02-17', data_version, src_folder, revision], From 8e263d0f44290033d8202e36e4fb48c5352191e5 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 22 Apr 2025 17:36:12 +0800 Subject: [PATCH 36/64] [update] Update other package version --- projects/llm_framework/main_audio/SConstruct | 2 +- projects/llm_framework/main_camera/SConstruct | 2 +- projects/llm_framework/main_llm/SConstruct | 2 +- projects/llm_framework/main_sys/SConstruct | 2 +- projects/llm_framework/main_tts/SConstruct | 2 +- projects/llm_framework/main_yolo/SConstruct | 2 +- projects/llm_framework/tools/llm_pack.py | 12 ++++++------ 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/projects/llm_framework/main_audio/SConstruct b/projects/llm_framework/main_audio/SConstruct index 25b3f1a..27a04b6 100644 --- 
a/projects/llm_framework/main_audio/SConstruct +++ b/projects/llm_framework/main_audio/SConstruct @@ -29,7 +29,7 @@ REQUIREMENTS += ['tinyalsa', 'opus', 'samplerate', 'fdk-aac'] STATIC_FILES += [AFile('audio.json')] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_audio-1.5', +env['COMPONENTS'].append({'target':'llm_audio-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_camera/SConstruct b/projects/llm_framework/main_camera/SConstruct index db43b04..eb5190c 100644 --- a/projects/llm_framework/main_camera/SConstruct +++ b/projects/llm_framework/main_camera/SConstruct @@ -68,7 +68,7 @@ STATIC_LIB += static_file * 4 STATIC_FILES += [AFile('camera.json')] STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_camera-1.7', +env['COMPONENTS'].append({'target':'llm_camera-1.8', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_llm/SConstruct b/projects/llm_framework/main_llm/SConstruct index 52b0778..1b12643 100644 --- a/projects/llm_framework/main_llm/SConstruct +++ b/projects/llm_framework/main_llm/SConstruct @@ -66,7 +66,7 @@ ignore['ignore'] = list(set(ignore['ignore'])) with open('../dist/fileignore', 'w') as f: json.dump(ignore, f, indent=4) -env['COMPONENTS'].append({'target':'llm_llm-1.7', +env['COMPONENTS'].append({'target':'llm_llm-1.8', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_sys/SConstruct b/projects/llm_framework/main_sys/SConstruct index 0093d21..d9a4830 100644 --- a/projects/llm_framework/main_sys/SConstruct +++ b/projects/llm_framework/main_sys/SConstruct @@ -37,7 +37,7 @@ LDFLAGS+=['-Wl,-rpath=/opt/m5stack/lib', '-Wl,-rpath=/usr/local/m5stack/lib', '- STATIC_FILES += [AFile('sys_config.json')] REQUIREMENTS += ['simdjson_component'] -env['COMPONENTS'].append({'target':'llm_sys-1.5', 
+env['COMPONENTS'].append({'target':'llm_sys-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_tts/SConstruct b/projects/llm_framework/main_tts/SConstruct index bba7997..a38e56f 100644 --- a/projects/llm_framework/main_tts/SConstruct +++ b/projects/llm_framework/main_tts/SConstruct @@ -27,7 +27,7 @@ INCLUDE += [ADir('src/runner/eigen-3.4.0'), ADir('src/runner/src/tn/header'), AD STATIC_FILES += Glob('mode_*.json') -env['COMPONENTS'].append({'target':'llm_tts-1.5', +env['COMPONENTS'].append({'target':'llm_tts-1.6', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/main_yolo/SConstruct b/projects/llm_framework/main_yolo/SConstruct index e432321..99f1e1e 100644 --- a/projects/llm_framework/main_yolo/SConstruct +++ b/projects/llm_framework/main_yolo/SConstruct @@ -39,7 +39,7 @@ STATIC_FILES += Glob('mode_*.json') # AFile('../static_lib/libbz2.so.1.0')] # DEFINITIONS += ["-DENABLE_BACKWARD"] -env['COMPONENTS'].append({'target':'llm_yolo-1.7', +env['COMPONENTS'].append({'target':'llm_yolo-1.8', 'SRCS':SRCS, 'INCLUDE':INCLUDE, 'PRIVATE_INCLUDE':PRIVATE_INCLUDE, diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index d32adfd..370c8b2 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -353,16 +353,16 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep # cpu_count = 50 Tasks = { 'lib-llm':[create_lib_deb,'lib-llm', '1.7', src_folder, revision], - 'llm-sys':[create_bin_deb,'llm-sys', version, src_folder, revision], - 'llm-audio':[create_bin_deb,'llm-audio', version, src_folder, revision], + 'llm-sys':[create_bin_deb,'llm-sys', '1.6', src_folder, revision], + 'llm-audio':[create_bin_deb,'llm-audio', '1.6', src_folder, revision], 'llm-kws':[create_bin_deb,'llm-kws', '1.7', src_folder, revision], 'llm-asr':[create_bin_deb,'llm-asr', 
'1.6', src_folder, revision], - 'llm-llm':[create_bin_deb,'llm-llm', '1.7', src_folder, revision], - 'llm-tts':[create_bin_deb,'llm-tts', version, src_folder, revision], + 'llm-llm':[create_bin_deb,'llm-llm', '1.8', src_folder, revision], + 'llm-tts':[create_bin_deb,'llm-tts', '1.6', src_folder, revision], 'llm-melotts':[create_bin_deb,'llm-melotts', '1.7', src_folder, revision], - 'llm-camera':[create_bin_deb,'llm-camera', '1.7', src_folder, revision, 'lib-llm'], + 'llm-camera':[create_bin_deb,'llm-camera', '1.8', src_folder, revision, 'lib-llm'], 'llm-vlm':[create_bin_deb,'llm-vlm', '1.7', src_folder, revision], - 'llm-yolo':[create_bin_deb,'llm-yolo', '1.7', src_folder, revision], + 'llm-yolo':[create_bin_deb,'llm-yolo', '1.8', src_folder, revision], 'llm-skel':[create_bin_deb,'llm-skel', version, src_folder, revision], 'llm-depth-anything':[create_bin_deb,'llm-depth-anything', '1.6', src_folder, revision], 'llm-vad':[create_bin_deb,'llm-vad', '1.6', src_folder, revision], From efa978bd773e01daacf41241161171f1692d5eea Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 22 Apr 2025 17:38:17 +0800 Subject: [PATCH 37/64] [update] update melotts doc --- doc/projects_llm_framework_doc/llm_melotts_en.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/projects_llm_framework_doc/llm_melotts_en.md b/doc/projects_llm_framework_doc/llm_melotts_en.md index d488deb..08222df 100644 --- a/doc/projects_llm_framework_doc/llm_melotts_en.md +++ b/doc/projects_llm_framework_doc/llm_melotts_en.md @@ -16,7 +16,7 @@ Send JSON: "action": "setup", "object": "melotts.setup", "data": { - "model": "melotts-zh-cn", + "model": "melotts-en-us", "response_format": "sys.pcm", "input": "tts.utf-8", "enoutput": false @@ -28,7 +28,7 @@ Send JSON: - work_id: For configuration, it is `melotts`. - action: The method to be called is `setup`. - object: The data type being transmitted is `melotts.setup`. -- model: The model being used is the Chinese model `melotts-zh-cn`. 
+- model: The model being used is the English model `melotts-en-us`. - response_format: The result is returned as `sys.pcm`, system audio data, which is directly sent to the llm-audio module for playback. - input: The input is `tts.utf-8`, representing user input. @@ -139,7 +139,7 @@ Example: "action": "setup", "object": "melotts.setup", "data": { - "model": "melotts-zh-cn", + "model": "melotts-en-us", "response_format": "sys.pcm", "input": [ "tts.utf-8", @@ -335,7 +335,7 @@ Response JSON: "inputs_": [ "tts.utf-8" ], - "model": "melotts-zh-cn", + "model": "melotts-en-us", "response_format": "sys.pcm" }, "error": { From 16dfe700359107a2ec074bdfa0630449577ef56e Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 22 Apr 2025 18:16:32 +0800 Subject: [PATCH 38/64] [update] update model version --- projects/llm_framework/tools/llm_pack.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 370c8b2..14cab95 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -382,11 +382,11 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep 'llm-model-yolo11n-pose':[create_data_deb,'llm-model-yolo11n-pose', '0.3', src_folder, revision], 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], 'llm-model-yolo11n-seg':[create_data_deb,'llm-model-yolo11n-seg', '0.3', src_folder, revision], - 'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.3', src_folder, revision], + 'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.4', src_folder, revision], 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.3', src_folder, revision], 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.3', src_folder, revision], 
'llm-model-whisper-small':[create_data_deb,'llm-model-whisper-small', '0.3', src_folder, revision], - 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.3', src_folder, revision], + 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.4', src_folder, revision], 'llm-model-qwen2.5-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2.5-0.5B-prefill-20e', data_version, src_folder, revision], 'llm-model-qwen2.5-0.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-p256-ax630c', '0.4', src_folder, revision], 'llm-model-qwen2.5-0.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-Int4-ax630c', '0.4', src_folder, revision], From 43c34385c93bfbb744e57d4344d348b5259e704f Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 23 Apr 2025 09:05:27 +0800 Subject: [PATCH 39/64] [fix] Fix non-utf-8 characters --- .../llm_framework/main_whisper/src/main.cpp | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/projects/llm_framework/main_whisper/src/main.cpp b/projects/llm_framework/main_whisper/src/main.cpp index 73515f1..cc59a97 100644 --- a/projects/llm_framework/main_whisper/src/main.cpp +++ b/projects/llm_framework/main_whisper/src/main.cpp @@ -192,6 +192,34 @@ class llm_task { return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; } + bool is_valid_utf8(const std::string &str) + { + int bytes = 0; + for (unsigned char c : str) { + if (bytes == 0) { + if ((c >> 5) == 0b110) + bytes = 1; + else if ((c >> 4) == 0b1110) + bytes = 2; + else if ((c >> 3) == 0b11110) + bytes = 3; + else if ((c >> 7)) + return false; + } else { + if ((c >> 6) != 0b10) return false; + bytes--; + } + } + return bytes == 0; + } + + void fix_utf8_string(std::string &s) + { + while (!s.empty() && !is_valid_utf8(s)) { + s.pop_back(); + } + } + int load_model(const nlohmann::json &config_body) { if (parse_config(config_body)) { @@ -475,7 +503,7 @@ class llm_task { (uint32)mode_config_.token_tables[i].size(), str); s += str; } - + 
fix_utf8_string(s); if (mode_config_.language == "en" || mode_config_.language == "ja") { if (out_callback_) out_callback_(s, true); } else { From dce7e4d034ecb6913563cbb0f8053e9bc41adabd Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 23 Apr 2025 14:51:06 +0800 Subject: [PATCH 40/64] [update] Update OpenAI-Plugin --- projects/llm_framework/main_openai_api/SConstruct | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/llm_framework/main_openai_api/SConstruct b/projects/llm_framework/main_openai_api/SConstruct index 490cebc..9098a67 100644 --- a/projects/llm_framework/main_openai_api/SConstruct +++ b/projects/llm_framework/main_openai_api/SConstruct @@ -18,7 +18,7 @@ LINK_SEARCH_PATH = [] STATIC_FILES = [] -ModuleLLMOpenAIPluginPath = wget_github_commit('https://github.com/m5stack/ModuleLLM-OpenAI-Plugin.git', '6f783440eb2b4ff6566c171ffc2815cb64024928', True) +ModuleLLMOpenAIPluginPath = wget_github_commit('https://github.com/m5stack/ModuleLLM-OpenAI-Plugin.git', '9612b20800eb1708e648744e3cb333c4f743811c', True) python_venv = check_wget_down("https://m5stack.oss-cn-shenzhen.aliyuncs.com/resource/linux/llm/m5stack_llm-openai-api-python-venv_v1.6.tar.gz", 'm5stack_llm-openai-api-python-venv_v1.6.tar.gz') From 076ecace17df6cd6e21a875ed46fe185b27821b7 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Thu, 24 Apr 2025 11:30:53 +0800 Subject: [PATCH 41/64] [update] Update LLM VLM STT benchmark --- benchmark/RESULTS.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 benchmark/RESULTS.md diff --git a/benchmark/RESULTS.md b/benchmark/RESULTS.md new file mode 100644 index 0000000..9c49b35 --- /dev/null +++ b/benchmark/RESULTS.md @@ -0,0 +1,33 @@ +# Results + +## ModuleLLM (AX630C) + +### LLM +| model | ttft (ms) | avg-token/s | model version | llm version | +|---------------------------------|------------|-------------|---------------|-------------| +| qwen2.5-0.5B-prefill-20e | 359.8 | 10.32 | v0.2 | v1.8 
| +| qwen2.5-0.5B-p256-ax630c | 1126.19 | 10.30 | v0.4 | v1.8 | +| qwen2.5-0.5B-Int4-ax630c | 442.95 | 12.52 | v0.4 | v1.8 | +| qwen2.5-coder-0.5B-ax630c | 361.81 | 10.28 | v0.2 | v1.8 | +| qwen2.5-1.5B-ax630c | 1029.41 | 3.59 | v0.3 | v1.8 | +| qwen2.5-1.5B-p256-ax630c | 3056.54 | 3.57 | v0.4 | v1.8 | +| qwen2.5-1.5B-Int4-ax630c | 1219.54 | 4.63 | v0.4 | v1.8 | +| deepseek-r1-1.5B-ax630c | 1075.04 | 3.57 | v0.3 | v1.8 | +| deepseek-r1-1.5B-p256-ax630c | 3056.86 | 3.57 | v0.4 | v1.8 | +| llama3.2-1B-prefill-ax630c | 891.00 | 4.48 | v0.2 | v1.8 | +| llama3.2-1B-p256-ax630c | 2601.11 | 4.49 | v0.4 | v1.8 | +| openbuddy-llama3.2-1B-ax630c | 891.02 | 4.52 | v0.2 | v1.8 | + +### VLM +| model | ttft (ms) | avg-token/s | image encode (ms) | model version | vlm version | +|---------------------------------|------------|-------------|-------------------|---------------|-------------| +| internvl2.5-1B-364-ax630c | 1117.27 | 10.56 | 1164.61 | v0.4 | v1.7 | +| smolvlm-256M-ax630c | 185.75 | 30.16 | 799.11 | v0.4 | v1.7 | +| smolvlm-500M-ax630c | 365.69 | 13.14 | 838.30 | v0.4 | v1.7 | + +### STT +| model | encode (ms) | avg-decode (ms) | model version | whisper version | +|--------------------|-------------|-----------------|---------------|-----------------| +| whisper-tiny | 248.0 | 32.54 | v0.4 | v1.7 | +| whisper-base | 660.31 | 51.11 | v0.4 | v1.7 | +| whisper-small | 1606.08 | 148.92 | v0.4 | v1.7 | \ No newline at end of file From 3bdf8222d6ff5cf1f8c511ee5e59d7d7791fec0c Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Thu, 24 Apr 2025 11:42:06 +0800 Subject: [PATCH 42/64] [update] Update benchmark --- benchmark/RESULTS.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmark/RESULTS.md b/benchmark/RESULTS.md index 9c49b35..da44e3f 100644 --- a/benchmark/RESULTS.md +++ b/benchmark/RESULTS.md @@ -18,6 +18,8 @@ | llama3.2-1B-p256-ax630c | 2601.11 | 4.49 | v0.4 | v1.8 | | openbuddy-llama3.2-1B-ax630c | 891.02 | 4.52 | v0.2 | v1.8 | +`The input 
text used by the llm test is "hello!“` + ### VLM | model | ttft (ms) | avg-token/s | image encode (ms) | model version | vlm version | |---------------------------------|------------|-------------|-------------------|---------------|-------------| @@ -25,9 +27,13 @@ | smolvlm-256M-ax630c | 185.75 | 30.16 | 799.11 | v0.4 | v1.7 | | smolvlm-500M-ax630c | 365.69 | 13.14 | 838.30 | v0.4 | v1.7 | +`The image encoding test uses a jpg image with a size of 810*1080` + ### STT | model | encode (ms) | avg-decode (ms) | model version | whisper version | |--------------------|-------------|-----------------|---------------|-----------------| | whisper-tiny | 248.0 | 32.54 | v0.4 | v1.7 | | whisper-base | 660.31 | 51.11 | v0.4 | v1.7 | -| whisper-small | 1606.08 | 148.92 | v0.4 | v1.7 | \ No newline at end of file +| whisper-small | 1606.08 | 148.92 | v0.4 | v1.7 | + +`The STT test uses a 30-second wav English audio` \ No newline at end of file From d636d4d7b37a236034bd2fccd20df1a445c48ba2 Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Tue, 29 Apr 2025 11:21:35 +0800 Subject: [PATCH 43/64] [update] add pzmq pzmq_data class --- .../StackFlow/stackflow/StackFlow.cpp | 62 +++++++++------ .../StackFlow/stackflow/StackFlow.h | 26 ++++--- .../StackFlow/stackflow/StackFlowUtil.cpp | 2 +- ext_components/StackFlow/stackflow/pzmq.hpp | 76 ++++++++++++++----- projects/llm_framework/main_asr/src/main.cpp | 12 +-- .../llm_framework/main_audio/src/main.cpp | 35 +++++---- .../llm_framework/main_camera/src/main.cpp | 7 +- .../main_depth_anything/src/main.cpp | 8 +- projects/llm_framework/main_kws/src/main.cpp | 8 +- .../main_sys/src/remote_action.cpp | 2 +- .../main_sys/src/remote_server.cpp | 22 +++--- .../llm_framework/main_sys/src/zmq_bus.cpp | 2 +- projects/llm_framework/main_vad/src/main.cpp | 12 +-- .../llm_framework/main_whisper/src/main.cpp | 12 +-- projects/llm_framework/main_yolo/src/main.cpp | 8 +- 15 files changed, 178 insertions(+), 116 deletions(-) diff --git 
a/ext_components/StackFlow/stackflow/StackFlow.cpp b/ext_components/StackFlow/stackflow/StackFlow.cpp index 236cc7a..e06f478 100644 --- a/ext_components/StackFlow/stackflow/StackFlow.cpp +++ b/ext_components/StackFlow/stackflow/StackFlow.cpp @@ -14,7 +14,8 @@ std::string llm_channel_obj::uart_push_url; #define RPC_PARSE_TO_PARAM_OLD(obj) \ sample_json_str_get(obj, "zmq_com"), sample_unescapeString(sample_json_str_get(obj, "raw_data")) -#define RPC_PARSE_TO_PARAM(obj) RPC_PARSE_TO_FIRST(obj), RPC_PARSE_TO_SECOND(obj) +#define RPC_PARSE_TO_PARAM(obj) RPC_PARSE_TO_FIRST(obj), RPC_PARSE_TO_SECOND(obj) +#define PTR_RPC_PARSE_TO_PARAM(obj) PTR_RPC_PARSE_TO_FIRST(obj), PTR_RPC_PARSE_TO_SECOND(obj) llm_channel_obj::llm_channel_obj(const std::string &_publisher_url, const std::string &inference_url, const std::string &unit_name) @@ -30,23 +31,24 @@ llm_channel_obj::~llm_channel_obj() } void llm_channel_obj::subscriber_event_call(const std::function &call, - pzmq *_pzmq, const std::string &raw) + pzmq *_pzmq, const std::shared_ptr &raw) { + auto _raw = raw->string(); const char *user_inference_flage_str = "\"action\""; - std::size_t pos = raw.find(user_inference_flage_str); + std::size_t pos = _raw.find(user_inference_flage_str); while (true) { if (pos == std::string::npos) { break; - } else if ((pos > 0) && (raw[pos - 1] != '\\')) { - std::string zmq_com = sample_json_str_get(raw, "zmq_com"); + } else if ((pos > 0) && (_raw[pos - 1] != '\\')) { + std::string zmq_com = sample_json_str_get(_raw, "zmq_com"); if (!zmq_com.empty()) set_push_url(zmq_com); - request_id_ = sample_json_str_get(raw, "request_id"); - work_id_ = sample_json_str_get(raw, "work_id"); + request_id_ = sample_json_str_get(_raw, "request_id"); + work_id_ = sample_json_str_get(_raw, "work_id"); break; } - pos = raw.find(user_inference_flage_str, pos + sizeof(user_inference_flage_str)); + pos = _raw.find(user_inference_flage_str, pos + sizeof(user_inference_flage_str)); } - call(sample_json_str_get(raw, 
"object"), sample_json_str_get(raw, "data")); + call(sample_json_str_get(_raw, "object"), sample_json_str_get(_raw, "data")); } int llm_channel_obj::subscriber_work_id(const std::string &work_id, @@ -236,9 +238,10 @@ void StackFlow::_sys_init(const std::string &zmq_url, const std::string &data) // todo:... } -std::string StackFlow::_rpc_setup(pzmq *_pzmq, const std::string &data) +std::string StackFlow::_rpc_setup(pzmq *_pzmq, const std::shared_ptr &data) { - event_queue_.enqueue(EVENT_SETUP, RPC_PARSE_TO_PARAM(data)); + auto _data = data->string(); + event_queue_.enqueue(EVENT_SETUP, RPC_PARSE_TO_PARAM(_data)); return std::string("None"); } @@ -270,9 +273,10 @@ int StackFlow::setup(const std::string &work_id, const std::string &object, cons return -1; } -std::string StackFlow::_rpc_link(pzmq *_pzmq, const std::string &data) +std::string StackFlow::_rpc_link(pzmq *_pzmq, const std::shared_ptr &data) { - event_queue_.enqueue(EVENT_LINK, RPC_PARSE_TO_PARAM(data)); + auto _data = data->string(); + event_queue_.enqueue(EVENT_LINK, RPC_PARSE_TO_PARAM(_data)); return std::string("None"); } @@ -301,9 +305,10 @@ void StackFlow::link(const std::string &work_id, const std::string &object, cons send("None", "None", error_body, work_id); } -std::string StackFlow::_rpc_unlink(pzmq *_pzmq, const std::string &data) +std::string StackFlow::_rpc_unlink(pzmq *_pzmq, const std::shared_ptr &data) { - event_queue_.enqueue(EVENT_UNLINK, RPC_PARSE_TO_PARAM(data)); + auto _data = data->string(); + event_queue_.enqueue(EVENT_UNLINK, RPC_PARSE_TO_PARAM(_data)); return std::string("None"); } @@ -332,9 +337,11 @@ void StackFlow::unlink(const std::string &work_id, const std::string &object, co send("None", "None", error_body, work_id); } -std::string StackFlow::_rpc_work(pzmq *_pzmq, const std::string &data) +std::string StackFlow::_rpc_work(pzmq *_pzmq, const std::shared_ptr &data) { - event_queue_.enqueue(EVENT_WORK, RPC_PARSE_TO_PARAM(data)); + + auto _data = data->string(); + 
event_queue_.enqueue(EVENT_WORK, RPC_PARSE_TO_PARAM(_data)); return std::string("None"); } @@ -363,9 +370,12 @@ void StackFlow::work(const std::string &work_id, const std::string &object, cons send("None", "None", error_body, work_id); } -std::string StackFlow::_rpc_exit(pzmq *_pzmq, const std::string &data) +std::string StackFlow::_rpc_exit(pzmq *_pzmq, const std::shared_ptr &data) { - event_queue_.enqueue(EVENT_EXIT, RPC_PARSE_TO_PARAM(data)); + + + auto _data = data->string(); + event_queue_.enqueue(EVENT_EXIT, RPC_PARSE_TO_PARAM(_data)); return std::string("None"); } @@ -397,9 +407,12 @@ int StackFlow::exit(const std::string &work_id, const std::string &object, const return 0; } -std::string StackFlow::_rpc_pause(pzmq *_pzmq, const std::string &data) +std::string StackFlow::_rpc_pause(pzmq *_pzmq, const std::shared_ptr &data) { - event_queue_.enqueue(EVENT_PAUSE, RPC_PARSE_TO_PARAM(data)); + + + auto _data = data->string(); + event_queue_.enqueue(EVENT_PAUSE, RPC_PARSE_TO_PARAM(_data)); return std::string("None"); } @@ -428,9 +441,12 @@ void StackFlow::pause(const std::string &work_id, const std::string &object, con send("None", "None", error_body, work_id); } -std::string StackFlow::_rpc_taskinfo(pzmq *_pzmq, const std::string &data) +std::string StackFlow::_rpc_taskinfo(pzmq *_pzmq, const std::shared_ptr &data) { - event_queue_.enqueue(EVENT_TASKINFO, RPC_PARSE_TO_PARAM(data)); + + + auto _data = data->string(); + event_queue_.enqueue(EVENT_TASKINFO, RPC_PARSE_TO_PARAM(_data)); return std::string("None"); } diff --git a/ext_components/StackFlow/stackflow/StackFlow.h b/ext_components/StackFlow/stackflow/StackFlow.h index 2098458..6827753 100644 --- a/ext_components/StackFlow/stackflow/StackFlow.h +++ b/ext_components/StackFlow/stackflow/StackFlow.h @@ -98,7 +98,7 @@ class llm_channel_obj { return enstream_; } void subscriber_event_call(const std::function &call, pzmq *_pzmq, - const std::string &raw); + const std::shared_ptr &raw); int subscriber_work_id(const 
std::string &work_id, const std::function &call); void stop_subscriber_work_id(const std::string &work_id); @@ -185,6 +185,14 @@ class llm_channel_obj { } }; +class stackflow_data { + union { + std::string *rawobj; + std::string *object; + }; + std::string *data; +}; + class StackFlow { private: std::atomic_int work_id_num_cout_; @@ -252,7 +260,7 @@ class StackFlow { return llm_task_channel_.at(_work_id_num); } - std::string _rpc_setup(pzmq *_pzmq, const std::string &data); + std::string _rpc_setup(pzmq *_pzmq, const std::shared_ptr &data); void _setup(const std::string &zmq_url, const std::string &data) { // printf("void _setup run \n"); @@ -263,7 +271,7 @@ class StackFlow { virtual int setup(const std::string &zmq_url, const std::string &raw); virtual int setup(const std::string &work_id, const std::string &object, const std::string &data); - std::string _rpc_link(pzmq *_pzmq, const std::string &data); + std::string _rpc_link(pzmq *_pzmq, const std::shared_ptr &data); void _link(const std::string &zmq_url, const std::string &data) { // printf("void _link run \n"); @@ -274,7 +282,7 @@ class StackFlow { virtual void link(const std::string &zmq_url, const std::string &raw); virtual void link(const std::string &work_id, const std::string &object, const std::string &data); - std::string _rpc_unlink(pzmq *_pzmq, const std::string &data); + std::string _rpc_unlink(pzmq *_pzmq, const std::shared_ptr &data); void _unlink(const std::string &zmq_url, const std::string &data) { // printf("void _unlink run \n"); @@ -285,7 +293,7 @@ class StackFlow { virtual void unlink(const std::string &zmq_url, const std::string &raw); virtual void unlink(const std::string &work_id, const std::string &object, const std::string &data); - std::string _rpc_exit(pzmq *_pzmq, const std::string &data); + std::string _rpc_exit(pzmq *_pzmq, const std::shared_ptr &data); void _exit(const std::string &zmq_url, const std::string &data) { request_id_ = sample_json_str_get(data, "request_id"); @@ -295,7 
+303,7 @@ class StackFlow { virtual int exit(const std::string &zmq_url, const std::string &raw); virtual int exit(const std::string &work_id, const std::string &object, const std::string &data); - std::string _rpc_work(pzmq *_pzmq, const std::string &data); + std::string _rpc_work(pzmq *_pzmq, const std::shared_ptr &data); void _work(const std::string &zmq_url, const std::string &data) { request_id_ = sample_json_str_get(data, "request_id"); @@ -305,7 +313,7 @@ class StackFlow { virtual void work(const std::string &zmq_url, const std::string &raw); virtual void work(const std::string &work_id, const std::string &object, const std::string &data); - std::string _rpc_pause(pzmq *_pzmq, const std::string &data); + std::string _rpc_pause(pzmq *_pzmq, const std::shared_ptr &data); void _pause(const std::string &zmq_url, const std::string &data) { request_id_ = sample_json_str_get(data, "request_id"); @@ -315,7 +323,7 @@ class StackFlow { virtual void pause(const std::string &zmq_url, const std::string &raw); virtual void pause(const std::string &work_id, const std::string &object, const std::string &data); - std::string _rpc_taskinfo(pzmq *_pzmq, const std::string &data); + std::string _rpc_taskinfo(pzmq *_pzmq, const std::shared_ptr &data); void _taskinfo(const std::string &zmq_url, const std::string &data) { request_id_ = sample_json_str_get(data, "request_id"); @@ -379,7 +387,7 @@ class StackFlow { return false; } pzmq _call("sys"); - _call.call_rpc_action("release_unit", _work_id, [](pzmq *_pzmq, const std::string &data) {}); + _call.call_rpc_action("release_unit", _work_id, [](pzmq *_pzmq, const std::shared_ptr &data) {}); llm_task_channel_[_work_id_num].reset(); llm_task_channel_.erase(_work_id_num); // SLOGI("release work_id %s success", _work_id.c_str()); diff --git a/ext_components/StackFlow/stackflow/StackFlowUtil.cpp b/ext_components/StackFlow/stackflow/StackFlowUtil.cpp index a5cebea..8225b8b 100644 --- a/ext_components/StackFlow/stackflow/StackFlowUtil.cpp 
+++ b/ext_components/StackFlow/stackflow/StackFlowUtil.cpp @@ -358,7 +358,7 @@ std::string StackFlows::unit_call(const std::string &unit_name, const std::strin { std::string value; pzmq _call(unit_name); - _call.call_rpc_action(unit_action, data, [&value](pzmq *_pzmq, const std::string &raw) { value = raw; }); + _call.call_rpc_action(unit_action, data, [&value](pzmq *_pzmq, const std::shared_ptr &raw) { value = raw->string(); }); return value; } diff --git a/ext_components/StackFlow/stackflow/pzmq.hpp b/ext_components/StackFlow/stackflow/pzmq.hpp index 1a7baf9..c70a692 100644 --- a/ext_components/StackFlow/stackflow/pzmq.hpp +++ b/ext_components/StackFlow/stackflow/pzmq.hpp @@ -18,10 +18,48 @@ #define ZMQ_RPC_CALL (ZMQ_REQ | 0x80) namespace StackFlows { + +class pzmq_data { +private: + zmq_msg_t msg; + +public: + pzmq_data(/* args */) + { + zmq_msg_init(&msg); + } + std::shared_ptr get_string() + { + auto len = zmq_msg_size(&msg); + return std::make_shared((const char *)zmq_msg_data(&msg), zmq_msg_size(&msg)); + } + std::string string() + { + auto len = zmq_msg_size(&msg); + return std::string((const char *)zmq_msg_data(&msg), zmq_msg_size(&msg)); + } + void *data() + { + return zmq_msg_data(&msg); + } + size_t size() + { + return zmq_msg_size(&msg); + } + zmq_msg_t *get() + { + return &msg; + } + ~pzmq_data() + { + zmq_msg_close(&msg); + } +}; + class pzmq { public: - typedef std::function rpc_callback_fun; - typedef std::function msg_callback_fun; + typedef std::function &)> rpc_callback_fun; + typedef std::function &)> msg_callback_fun; private: const int rpc_url_head_length = 6; @@ -85,7 +123,7 @@ class pzmq { } return zmq_url_; } - std::string _rpc_list_action(pzmq *self, const std::string &_None) + std::string _rpc_list_action(pzmq *self, const std::shared_ptr &_None) { std::string action_list; action_list.reserve(128); @@ -131,8 +169,7 @@ class pzmq { int call_rpc_action(const std::string &action, const std::string &data, const msg_callback_fun &raw_call) { 
int ret; - zmq_msg_t msg; - zmq_msg_init(&msg); + std::shared_ptr msg_ptr = std::make_shared(); try { if (NULL == zmq_socket_) { if (rpc_server_.empty()) return -1; @@ -150,13 +187,13 @@ class pzmq { } // action { - zmq_msg_recv(&msg, zmq_socket_, 0); + zmq_msg_recv(msg_ptr->get(), zmq_socket_, 0); } - raw_call(this, std::string((const char *)zmq_msg_data(&msg), zmq_msg_size(&msg))); + raw_call(this, msg_ptr); } catch (int e) { ret = e; } - zmq_msg_close(&msg); + msg_ptr.reset(); close_zmq(); return ret; } @@ -293,8 +330,7 @@ class pzmq { items[0].revents = 0; }; while (!flage_.load()) { - zmq_msg_t msg; - zmq_msg_init(&msg); + std::shared_ptr msg_ptr = std::make_shared(); if (mode_ == ZMQ_PULL) { ret = zmq_poll(items, 1, -1); if (ret == -1) { @@ -305,30 +341,28 @@ class pzmq { continue; } } - ret = zmq_msg_recv(&msg, zmq_socket_, 0); + ret = zmq_msg_recv(msg_ptr->get(), zmq_socket_, 0); if (ret <= 0) { - zmq_msg_close(&msg); + msg_ptr.reset(); continue; } - std::string raw_data((const char *)zmq_msg_data(&msg), zmq_msg_size(&msg)); + if (mode_ == ZMQ_RPC_FUN) { - zmq_msg_t msg1; - zmq_msg_init(&msg1); - zmq_msg_recv(&msg1, zmq_socket_, 0); - std::string _raw_data((const char *)zmq_msg_data(&msg1), zmq_msg_size(&msg1)); + std::shared_ptr msg1_ptr = std::make_shared(); + zmq_msg_recv(msg1_ptr->get(), zmq_socket_, 0); std::string retval; try { std::unique_lock lock(zmq_fun_mtx_); - retval = zmq_fun_.at(raw_data)(this, _raw_data); + retval = zmq_fun_.at(msg_ptr->string())(this, msg1_ptr); } catch (...) 
{ retval = "NotAction"; } zmq_send(zmq_socket_, retval.c_str(), retval.length(), 0); - zmq_msg_close(&msg1); + msg1_ptr.reset(); } else { - raw_call(this, raw_data); + raw_call(this, msg_ptr); } - zmq_msg_close(&msg); + msg_ptr.reset(); } } void close_zmq() diff --git a/projects/llm_framework/main_asr/src/main.cpp b/projects/llm_framework/main_asr/src/main.cpp index f09ade0..ebe503b 100644 --- a/projects/llm_framework/main_asr/src/main.cpp +++ b/projects/llm_framework/main_asr/src/main.cpp @@ -426,8 +426,8 @@ class llm_asr : public StackFlow { llm_task_obj->kws_awake(); if ((!audio_url_.empty()) && (llm_task_obj->audio_flage_ == false)) { std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; } @@ -515,8 +515,8 @@ class llm_asr : public StackFlow { if (input.find("sys") != std::string::npos) { audio_url_ = unit_call("audio", "cap", input); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; } else if (input.find("asr") != std::string::npos) { @@ -563,8 +563,8 @@ class llm_asr : public StackFlow { if (data.find("sys") != std::string::npos) { if (audio_url_.empty()) audio_url_ = unit_call("audio", "cap", data); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, 
[_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; llm_task_obj->inputs_.push_back(data); diff --git a/projects/llm_framework/main_audio/src/main.cpp b/projects/llm_framework/main_audio/src/main.cpp index 32407d3..f12d3af 100644 --- a/projects/llm_framework/main_audio/src/main.cpp +++ b/projects/llm_framework/main_audio/src/main.cpp @@ -389,10 +389,11 @@ class llm_audio : public StackFlow { return LLM_NONE; } - std::string play(pzmq *_pzmq, const std::string &rawdata) + std::string play(pzmq *_pzmq, const std::shared_ptr &rawdata) { - std::string zmq_url = RPC_PARSE_TO_FIRST(rawdata); - std::string audio_json = RPC_PARSE_TO_SECOND(rawdata); + auto _rawdata = rawdata->string(); + std::string zmq_url = RPC_PARSE_TO_FIRST(_rawdata); + std::string audio_json = RPC_PARSE_TO_SECOND(_rawdata); std::string ret_val = parse_data(sample_json_str_get(audio_json, "object"), sample_json_str_get(audio_json, "data")); request_id_ = sample_json_str_get(audio_json, "request_id"); @@ -400,29 +401,31 @@ class llm_audio : public StackFlow { return ret_val; } - std::string play_raw(pzmq *_pzmq, const std::string &rawdata) + std::string play_raw(pzmq *_pzmq, const std::shared_ptr &rawdata) { - if (rawdata.empty()) return std::string("rawdata empty"); - _play(rawdata); + auto _rawdata = rawdata->string(); + if (_rawdata.empty()) return std::string("rawdata empty"); + _play(_rawdata); return LLM_NONE; } - std::string enqueue_play(pzmq *_pzmq, const std::string &rawdata) + std::string enqueue_play(pzmq *_pzmq, const std::shared_ptr &rawdata) { audio_clear_flage_ = false; - event_queue_.enqueue(EVENT_QUEUE_PLAY, rawdata, ""); + event_queue_.enqueue(EVENT_QUEUE_PLAY, rawdata->string(), ""); return LLM_NONE; } - std::string audio_status(pzmq *_pzmq, const std::string &rawdata) + std::string audio_status(pzmq *_pzmq, const std::shared_ptr &rawdata) { - if (rawdata == "play") { + std::string 
_rawdata = rawdata->string(); + if (_rawdata == "play") { if (ax_play_status()) { return std::string("None"); } else { return std::string("Runing"); } - } else if (rawdata == "cap") { + } else if (_rawdata == "cap") { if (ax_cap_status()) { return std::string("None"); } else { @@ -447,19 +450,19 @@ class llm_audio : public StackFlow { } } - std::string play_stop(pzmq *_pzmq, const std::string &rawdata) + std::string play_stop(pzmq *_pzmq, const std::shared_ptr &rawdata) { _play_stop(); return LLM_NONE; } - std::string queue_play_stop(pzmq *_pzmq, const std::string &rawdata) + std::string queue_play_stop(pzmq *_pzmq, const std::shared_ptr &rawdata) { audio_clear_flage_ = true; return LLM_NONE; } - std::string cap(pzmq *_pzmq, const std::string &rawdata) + std::string cap(pzmq *_pzmq, const std::shared_ptr &rawdata) { if (cap_status_ == 0) { _cap(); @@ -468,7 +471,7 @@ class llm_audio : public StackFlow { return sys_pcm_cap_channel; } - std::string cap_stop(pzmq *_pzmq, const std::string &rawdata) + std::string cap_stop(pzmq *_pzmq, const std::shared_ptr &rawdata) { if (cap_status_ > 0) { cap_status_--; @@ -479,7 +482,7 @@ class llm_audio : public StackFlow { return LLM_NONE; } - std::string cap_stop_all(pzmq *_pzmq, const std::string &rawdata) + std::string cap_stop_all(pzmq *_pzmq, const std::shared_ptr &rawdata) { cap_status_ = 0; _cap_stop(); diff --git a/projects/llm_framework/main_camera/src/main.cpp b/projects/llm_framework/main_camera/src/main.cpp index 26bbd77..c2c97f2 100644 --- a/projects/llm_framework/main_camera/src/main.cpp +++ b/projects/llm_framework/main_camera/src/main.cpp @@ -590,11 +590,12 @@ class llm_camera : public StackFlow { "list_camera", std::bind(&llm_camera::list_camera, this, std::placeholders::_1, std::placeholders::_2)); } - std::string list_camera(pzmq *_pzmq, const std::string &rawdata) + std::string list_camera(pzmq *_pzmq, const std::shared_ptr &rawdata) { + auto _rawdata = rawdata->string(); nlohmann::json req_body; - std::string 
zmq_url = RPC_PARSE_TO_FIRST(rawdata); - std::string param_json = RPC_PARSE_TO_SECOND(rawdata); + std::string zmq_url = RPC_PARSE_TO_FIRST(_rawdata); + std::string param_json = RPC_PARSE_TO_SECOND(_rawdata); std::vector devices; glob_t glob_result; glob("/dev/video*", GLOB_TILDE, NULL, &glob_result); diff --git a/projects/llm_framework/main_depth_anything/src/main.cpp b/projects/llm_framework/main_depth_anything/src/main.cpp index b7bb591..7685bc8 100644 --- a/projects/llm_framework/main_depth_anything/src/main.cpp +++ b/projects/llm_framework/main_depth_anything/src/main.cpp @@ -419,8 +419,8 @@ class llm_depth_anything : public StackFlow { std::weak_ptr _llm_task_obj = llm_task_obj; std::weak_ptr _llm_channel = llm_channel; llm_channel->subscriber( - input_url, [this, _llm_task_obj, _llm_channel](pzmq *_pzmq, const std::string &raw) { - this->task_camera_data(_llm_task_obj, _llm_channel, raw); + input_url, [this, _llm_task_obj, _llm_channel](pzmq *_pzmq, const std::shared_ptr &raw) { + this->task_camera_data(_llm_task_obj, _llm_channel, raw->string()); }); } } @@ -466,8 +466,8 @@ class llm_depth_anything : public StackFlow { std::weak_ptr _llm_task_obj = llm_task_obj; std::weak_ptr _llm_channel = llm_channel; llm_channel->subscriber(input_url, - [this, _llm_task_obj, _llm_channel](pzmq *_pzmq, const std::string &raw) { - this->task_camera_data(_llm_task_obj, _llm_channel, raw); + [this, _llm_task_obj, _llm_channel](pzmq *_pzmq, const std::shared_ptr &raw) { + this->task_camera_data(_llm_task_obj, _llm_channel, raw->string()); }); } llm_task_obj->inputs_.push_back(data); diff --git a/projects/llm_framework/main_kws/src/main.cpp b/projects/llm_framework/main_kws/src/main.cpp index fc4f31c..9c05ce8 100644 --- a/projects/llm_framework/main_kws/src/main.cpp +++ b/projects/llm_framework/main_kws/src/main.cpp @@ -336,8 +336,8 @@ class llm_kws : public StackFlow { } if ((!audio_url_.empty()) && (llm_task_obj->audio_flage_ == false)) { std::weak_ptr _llm_task_obj = 
llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; } @@ -456,8 +456,8 @@ class llm_kws : public StackFlow { if (input.find("sys") != std::string::npos) { audio_url_ = unit_call("audio", "cap", "None"); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; } else if (input.find("kws") != std::string::npos) { diff --git a/projects/llm_framework/main_sys/src/remote_action.cpp b/projects/llm_framework/main_sys/src/remote_action.cpp index 2593369..6121f69 100644 --- a/projects/llm_framework/main_sys/src/remote_action.cpp +++ b/projects/llm_framework/main_sys/src/remote_action.cpp @@ -25,7 +25,7 @@ int remote_call(int com_id, const std::string &json_str) std::string com_urls(com_url); RPC_PUSH_PARAM(send_data, com_urls, json_str); pzmq clent(work_unit); - return clent.call_rpc_action(action, send_data, [](pzmq *_pzmq, const std::string &val) {}); + return clent.call_rpc_action(action, send_data, [](pzmq *_pzmq, const std::shared_ptr &val) {}); } void remote_action_work() diff --git a/projects/llm_framework/main_sys/src/remote_server.cpp b/projects/llm_framework/main_sys/src/remote_server.cpp index 5a4567f..48ea242 100644 --- a/projects/llm_framework/main_sys/src/remote_server.cpp +++ b/projects/llm_framework/main_sys/src/remote_server.cpp @@ -171,9 +171,9 @@ int c_sys_release_unit(char const *unit) return sys_release_unit(unit); } -std::string rpc_allocate_unit(pzmq 
*_pzmq, const std::string &raw) +std::string rpc_allocate_unit(pzmq *_pzmq, const std::shared_ptr &raw) { - unit_data *unit_info = sys_allocate_unit(raw); + unit_data *unit_info = sys_allocate_unit(raw->string()); std::string send_data; std::string send_data1; std::string str_port = std::to_string(unit_info->port_); @@ -182,29 +182,29 @@ std::string rpc_allocate_unit(pzmq *_pzmq, const std::string &raw) return send_data; } -std::string rpc_release_unit(pzmq *_pzmq, const std::string &raw) +std::string rpc_release_unit(pzmq *_pzmq, const std::shared_ptr &raw) { - sys_release_unit(raw); + sys_release_unit(raw->string()); return "Success"; } -std::string rpc_sql_select(pzmq *_pzmq, const std::string &raw) +std::string rpc_sql_select(pzmq *_pzmq, const std::shared_ptr &raw) { - return sys_sql_select(raw); + return sys_sql_select(raw->string()); } -std::string rpc_sql_set(pzmq *_pzmq, const std::string &raw) +std::string rpc_sql_set(pzmq *_pzmq, const std::shared_ptr &raw) { - std::string key = sample_json_str_get(raw, "key"); - std::string val = sample_json_str_get(raw, "val"); + std::string key = sample_json_str_get(raw->string(), "key"); + std::string val = sample_json_str_get(raw->string(), "val"); if (key.empty()) return "False"; sys_sql_set(key, val); return "Success"; } -std::string rpc_sql_unset(pzmq *_pzmq, const std::string &raw) +std::string rpc_sql_unset(pzmq *_pzmq, const std::shared_ptr &raw) { - sys_sql_unset(raw); + sys_sql_unset(raw->string()); return "Success"; } diff --git a/projects/llm_framework/main_sys/src/zmq_bus.cpp b/projects/llm_framework/main_sys/src/zmq_bus.cpp index 60693c9..b62196e 100644 --- a/projects/llm_framework/main_sys/src/zmq_bus.cpp +++ b/projects/llm_framework/main_sys/src/zmq_bus.cpp @@ -42,7 +42,7 @@ void zmq_bus_com::work(const std::string &zmq_url_format, int port) _zmq_url = std::string((char *)buff.data()); SAFE_SETTING("serial_zmq_url", _zmq_url); user_chennal_ = std::make_unique(_zmq_url, ZMQ_PULL, - [this](pzmq *_pzmq, 
const std::string &data) { this->send_data(data); }); + [this](pzmq *_pzmq, const std::shared_ptr &data) { this->send_data(data->string()); }); reace_data_event_thread = std::make_unique(std::bind(&zmq_bus_com::reace_data_event, this)); } diff --git a/projects/llm_framework/main_vad/src/main.cpp b/projects/llm_framework/main_vad/src/main.cpp index f0ab519..3c737bc 100644 --- a/projects/llm_framework/main_vad/src/main.cpp +++ b/projects/llm_framework/main_vad/src/main.cpp @@ -335,8 +335,8 @@ class llm_vad : public StackFlow { } if ((!audio_url_.empty()) && (llm_task_obj->audio_flage_ == false)) { std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; } @@ -423,8 +423,8 @@ class llm_vad : public StackFlow { if (input.find("sys") != std::string::npos) { audio_url_ = unit_call("audio", "cap", "None"); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; } else if (input.find("vad") != std::string::npos) { @@ -471,8 +471,8 @@ class llm_vad : public StackFlow { if (data.find("sys") != std::string::npos) { if (audio_url_.empty()) audio_url_ = unit_call("audio", "cap", data); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr 
&raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; llm_task_obj->inputs_.push_back(data); diff --git a/projects/llm_framework/main_whisper/src/main.cpp b/projects/llm_framework/main_whisper/src/main.cpp index 776408b..367ee87 100644 --- a/projects/llm_framework/main_whisper/src/main.cpp +++ b/projects/llm_framework/main_whisper/src/main.cpp @@ -715,8 +715,8 @@ class llm_whisper : public StackFlow { llm_task_obj->kws_awake(); if ((!audio_url_.empty()) && (llm_task_obj->audio_flage_ == false)) { std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; } @@ -818,8 +818,8 @@ class llm_whisper : public StackFlow { if (input.find("sys") != std::string::npos) { audio_url_ = unit_call("audio", "cap", input); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; } else if (input.find("whisper") != std::string::npos) { @@ -874,8 +874,8 @@ class llm_whisper : public StackFlow { if (data.find("sys") != std::string::npos) { if (audio_url_.empty()) audio_url_ = unit_call("audio", "cap", data); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::string &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw); + llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + 
_llm_task_obj.lock()->sys_pcm_on_data(raw->string()); }); llm_task_obj->audio_flage_ = true; llm_task_obj->inputs_.push_back(data); diff --git a/projects/llm_framework/main_yolo/src/main.cpp b/projects/llm_framework/main_yolo/src/main.cpp index 453ad34..208a375 100644 --- a/projects/llm_framework/main_yolo/src/main.cpp +++ b/projects/llm_framework/main_yolo/src/main.cpp @@ -504,8 +504,8 @@ class llm_yolo : public StackFlow { std::weak_ptr _llm_task_obj = llm_task_obj; std::weak_ptr _llm_channel = llm_channel; llm_channel->subscriber( - input_url, [this, _llm_task_obj, _llm_channel](pzmq *_pzmq, const std::string &raw) { - this->task_camera_data(_llm_task_obj, _llm_channel, raw); + input_url, [this, _llm_task_obj, _llm_channel](pzmq *_pzmq, const std::shared_ptr &raw) { + this->task_camera_data(_llm_task_obj, _llm_channel, raw->string()); }); } } @@ -551,8 +551,8 @@ class llm_yolo : public StackFlow { std::weak_ptr _llm_task_obj = llm_task_obj; std::weak_ptr _llm_channel = llm_channel; llm_channel->subscriber(input_url, - [this, _llm_task_obj, _llm_channel](pzmq *_pzmq, const std::string &raw) { - this->task_camera_data(_llm_task_obj, _llm_channel, raw); + [this, _llm_task_obj, _llm_channel](pzmq *_pzmq, const std::shared_ptr &raw) { + this->task_camera_data(_llm_task_obj, _llm_channel, raw->string()); }); } llm_task_obj->inputs_.push_back(data); From 1a90562805f883fa4860ded1fe1946b44c7b1800 Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Wed, 30 Apr 2025 10:30:23 +0800 Subject: [PATCH 44/64] =?UTF-8?q?=E4=BC=98=E5=8C=96g2p=E6=B5=81=E7=A8=8B?= =?UTF-8?q?=EF=BC=8C=E5=8F=AF=E4=BB=A5=E5=A4=84=E7=90=86=E5=A4=9A=E9=9F=B3?= =?UTF-8?q?=E5=AD=97=EF=BC=8C=E4=B8=AD=E8=8B=B1=E6=B7=B7=E5=90=88=E7=9A=84?= =?UTF-8?q?=E6=83=85=E5=86=B5=E7=AD=89=E7=AD=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main_melotts/src/runner/Lexicon.hpp | 312 +++++++++---- .../tools/test_tools/test-melo.py | 242 
++++++++++ .../tools/test_tools/test-tts.py | 412 ++++++++++++++++++ .../llm_framework/tools/test_tools/test.py | 145 ++++++ 4 files changed, 1032 insertions(+), 79 deletions(-) create mode 100644 projects/llm_framework/tools/test_tools/test-melo.py create mode 100644 projects/llm_framework/tools/test_tools/test-tts.py create mode 100644 projects/llm_framework/tools/test_tools/test.py diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp index 8b6255d..2d55a47 100644 --- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp +++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp @@ -4,14 +4,20 @@ #include #include #include -#include +#include +#include +#include +#include // 用于日志输出 -std::vector split (const std::string &s, char delim) { +// 使用引用传参优化split函数,避免不必要的拷贝 +std::vector split(const std::string &s, char delim) { std::vector result; - std::stringstream ss (s); + std::stringstream ss(s); std::string item; - while (getline (ss, item, delim)) { - result.push_back (item); + while (getline(ss, item, delim)) { + if (!item.empty()) { // 避免添加空字符串 + result.push_back(item); + } } return result; } @@ -19,134 +25,282 @@ std::vector split (const std::string &s, char delim) { class Lexicon { private: std::unordered_map, std::vector>> lexicon; + size_t max_phrase_length; // 追踪词典中最长的词组长度 + std::pair, std::vector> unknown_token; // '_'的发音作为未知词的默认值 + std::unordered_map reverse_tokens; // 用于将音素ID转回音素符号,用于日志 public: - Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) { + Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0) { std::unordered_map tokens; + + // 加载tokens std::ifstream ifs(tokens_filename); assert(ifs.is_open()); std::string line; - while ( std::getline(ifs, line) ) { + while (std::getline(ifs, line)) { auto splitted_line = split(line, ' '); - tokens.insert({splitted_line[0], 
std::stoi(splitted_line[1])}); + if (splitted_line.size() >= 2) { + int token_id = std::stoi(splitted_line[1]); + tokens.insert({splitted_line[0], token_id}); + reverse_tokens[token_id] = splitted_line[0]; // 建立反向映射 + } } ifs.close(); + // 加载lexicon ifs.open(lexicon_filename); assert(ifs.is_open()); - while ( std::getline(ifs, line) ) { + while (std::getline(ifs, line)) { auto splitted_line = split(line, ' '); + if (splitted_line.empty()) continue; + std::string word_or_phrase = splitted_line[0]; + + // 更新最长词组长度 + auto chars = splitEachChar(word_or_phrase); + max_phrase_length = std::max(max_phrase_length, chars.size()); + size_t phone_tone_len = splitted_line.size() - 1; size_t half_len = phone_tone_len / 2; std::vector phones, tones; + for (size_t i = 0; i < phone_tone_len; i++) { auto phone_or_tone = splitted_line[i + 1]; if (i < half_len) { - phones.push_back(tokens[phone_or_tone]); + if (tokens.find(phone_or_tone) != tokens.end()) { + phones.push_back(tokens[phone_or_tone]); + } } else { tones.push_back(std::stoi(phone_or_tone)); } } - lexicon.insert({word_or_phrase, std::make_pair(phones, tones)}); + lexicon[word_or_phrase] = std::make_pair(phones, tones); } + // 添加特殊映射 lexicon["呣"] = lexicon["母"]; lexicon["嗯"] = lexicon["恩"]; + // 添加标点符号 const std::vector punctuation{"!", "?", "…", ",", ".", "'", "-"}; - for (auto p : punctuation) { - int i = tokens[p]; - int tone = 0; - lexicon[p] = std::make_pair(std::vector{i}, std::vector{tone}); + for (const auto& p : punctuation) { + if (tokens.find(p) != tokens.end()) { + int i = tokens[p]; + lexicon[p] = std::make_pair(std::vector{i}, std::vector{0}); + } } - lexicon[" "] = std::make_pair(std::vector{tokens["_"]}, std::vector{0}); + + // 设置'_'作为未知词的发音 + assert(tokens.find("_") != tokens.end()); // 确保tokens中包含"_" + unknown_token = std::make_pair(std::vector{tokens["_"]}, std::vector{0}); + + // 空格映射到'_'的发音 + lexicon[" "] = unknown_token; + + // 中文标点转换映射 + lexicon[","] = lexicon[","]; + lexicon["。"] = lexicon["."]; + 
lexicon["!"] = lexicon["!"]; + lexicon["?"] = lexicon["?"]; + + // 输出词典信息 + std::cout << "词典加载完成,包含 " << lexicon.size() << " 个条目,最长词组长度: " << max_phrase_length << std::endl; } - std::vector splitEachChar(const std::string& text) - { + std::vector splitEachChar(const std::string& text) { std::vector words; - std::string input(text); - int len = input.length(); + int len = text.length(); int i = 0; while (i < len) { - int next = 1; - if ((input[i] & 0x80) == 0x00) { - // std::cout << "one character: " << input[i] << std::endl; - } else if ((input[i] & 0xE0) == 0xC0) { - next = 2; - // std::cout << "two character: " << input.substr(i, next) << std::endl; - } else if ((input[i] & 0xF0) == 0xE0) { - next = 3; - // std::cout << "three character: " << input.substr(i, next) << std::endl; - } else if ((input[i] & 0xF8) == 0xF0) { - next = 4; - // std::cout << "four character: " << input.substr(i, next) << std::endl; - } - words.push_back(input.substr(i, next)); - i += next; + int next = 1; + if ((text[i] & 0x80) == 0x00) { + // ASCII + } else if ((text[i] & 0xE0) == 0xC0) { + next = 2; // 2字节UTF-8 + } else if ((text[i] & 0xF0) == 0xE0) { + next = 3; // 3字节UTF-8 + } else if ((text[i] & 0xF8) == 0xF0) { + next = 4; // 4字节UTF-8 + } + words.push_back(text.substr(i, next)); + i += next; } return words; } - bool is_english(std::string s) { - if (s.size() == 1) - return (s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z'); - else - return false; + bool is_english(const std::string& s) { + return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z')); } - std::vector merge_english(const std::vector& splitted_text) { - std::vector words; + // 根据词典中的内容,使用最长匹配算法处理输入文本 + void convert(const std::string& text, std::vector& phones, std::vector& tones) { + std::cout << "\n开始处理文本: \"" << text << "\"" << std::endl; + std::cout << "=======匹配结果=======" << std::endl; + std::cout << "单元\t|\t音素\t|\t声调" << std::endl; + std::cout << "-----------------------------" 
<< std::endl; + + // 在开头添加'_'边界标记 + phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); + tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); + std::cout << "\t|\t" << phonesToString(unknown_token.first) << "\t|\t" + << tonesToString(unknown_token.second) << std::endl; + + auto chars = splitEachChar(text); int i = 0; - while (i < splitted_text.size()) { - std::string s; - if (is_english(splitted_text[i])) { - while (i < splitted_text.size()) { - if (!is_english(splitted_text[i])) { - break; - } - s += splitted_text[i]; - i++; + + while (i < chars.size()) { + // 处理英文单词 + if (is_english(chars[i])) { + std::string eng_word; + int start = i; + while (i < chars.size() && is_english(chars[i])) { + eng_word += chars[i++]; } - // to lowercase - std::transform(s.begin(), s.end(), s.begin(), + + // 英文转小写 + std::string orig_word = eng_word; // 保留原始单词用于日志 + std::transform(eng_word.begin(), eng_word.end(), eng_word.begin(), [](unsigned char c){ return std::tolower(c); }); - words.push_back(s); - if (i >= splitted_text.size()) + + // 如果词典中有这个英文单词,使用它;否则使用'_'的发音 + if (lexicon.find(eng_word) != lexicon.end()) { + auto& [eng_phones, eng_tones] = lexicon[eng_word]; + phones.insert(phones.end(), eng_phones.begin(), eng_phones.end()); + tones.insert(tones.end(), eng_tones.begin(), eng_tones.end()); + + // 打印匹配信息 + std::cout << orig_word << "\t|\t" << phonesToString(eng_phones) << "\t|\t" + << tonesToString(eng_tones) << std::endl; + } else { + // 未找到单词,使用'_'的发音 + phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); + tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); + + // 打印未匹配信息 + std::cout << orig_word << "\t|\t" << phonesToString(unknown_token.first) << " (未匹配)\t|\t" + << tonesToString(unknown_token.second) << std::endl; + } + continue; + } + // 处理非英文字符(如空格、标点) + std::string c = chars[i++]; + if (c == " ") continue; // 跳过空格 + // 回退一步,用于最长匹配 + i--; + 
+ + // 最长匹配算法处理中文/日文 + bool matched = false; + // 尝试从最长的词组开始匹配 + for (size_t len = std::min(max_phrase_length, chars.size() - i); len > 0 && !matched; --len) { + std::string phrase; + for (size_t j = 0; j < len; ++j) { + phrase += chars[i + j]; + } + + if (lexicon.find(phrase) != lexicon.end()) { + auto& [phrase_phones, phrase_tones] = lexicon[phrase]; + phones.insert(phones.end(), phrase_phones.begin(), phrase_phones.end()); + tones.insert(tones.end(), phrase_tones.begin(), phrase_tones.end()); + + // 打印匹配信息 + std::cout << phrase << "\t|\t" << phonesToString(phrase_phones) << "\t|\t" + << tonesToString(phrase_tones) << std::endl; + + i += len; + matched = true; break; + } } - else { - words.push_back(splitted_text[i]); - i++; + + // 如果没有匹配到任何词组,使用'_'的发音 + if (!matched) { + std::string c = chars[i++]; + std::string s = c; + + // 中文标点符号转换 + std::string orig_char = s; // 保留原始字符用于日志 + if (s == ",") s = ","; + else if (s == "。") s = "."; + else if (s == "!") s = "!"; + else if (s == "?") s = "?"; + + // 如果词典中找不到,则使用'_'的发音 + if (lexicon.find(s) != lexicon.end()) { + auto& [char_phones, char_tones] = lexicon[s]; + phones.insert(phones.end(), char_phones.begin(), char_phones.end()); + tones.insert(tones.end(), char_tones.begin(), char_tones.end()); + + // 打印匹配信息 + std::cout << orig_char << "\t|\t" << phonesToString(char_phones) << "\t|\t" + << tonesToString(char_tones) << std::endl; + } else { + phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); + tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); + + // 打印未匹配信息 + std::cout << orig_char << "\t|\t" << phonesToString(unknown_token.first) << " (未匹配)\t|\t" + << tonesToString(unknown_token.second) << std::endl; + } } } - return words; + + // 在末尾添加'_'边界标记 + phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); + tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); + std::cout << "\t|\t" << 
phonesToString(unknown_token.first) << "\t|\t" + << tonesToString(unknown_token.second) << std::endl; + + // 汇总打印最终结果 + std::cout << "\n处理结果汇总:" << std::endl; + std::cout << "原文: " << text << std::endl; + std::cout << "音素: " << phonesToString(phones) << std::endl; + std::cout << "声调: " << tonesToString(tones) << std::endl; + std::cout << "====================" << std::endl; } - void convert(const std::string& text, std::vector& phones, std::vector& tones) { - auto splitted_text = splitEachChar(text); - auto zh_mix_en = merge_english(splitted_text); - for (auto c : zh_mix_en) { - std::string s{c}; - if (s == ",") - s = ","; - else if (s == "。") - s = "."; - else if (s == "!") - s = "!"; - else if (s == "?") - s = "?"; - - auto phones_and_tones = lexicon[" "]; - if (lexicon.find(s) != lexicon.end()) { - phones_and_tones = lexicon[s]; +private: + // 处理单个字符 + void processChar(const std::string& c, std::vector& phones, std::vector& tones) { + std::string s = c; + + // 中文标点符号转换 + if (s == ",") s = ","; + else if (s == "。") s = "."; + else if (s == "!") s = "!"; + else if (s == "?") s = "?"; + + // 如果词典中找不到,则使用'_'的发音 + auto& phones_and_tones = (lexicon.find(s) != lexicon.end()) ? 
lexicon[s] : unknown_token; + + phones.insert(phones.end(), phones_and_tones.first.begin(), phones_and_tones.first.end()); + tones.insert(tones.end(), phones_and_tones.second.begin(), phones_and_tones.second.end()); + } + + // 将音素ID数组转换为字符串用于日志输出 + std::string phonesToString(const std::vector& phones) { + std::string result; + for (auto id : phones) { + if (!result.empty()) result += " "; + if (reverse_tokens.find(id) != reverse_tokens.end()) { + result += reverse_tokens[id]; + } else { + result += "<" + std::to_string(id) + ">"; } - phones.insert(phones.end(), phones_and_tones.first.begin(), phones_and_tones.first.end()); - tones.insert(tones.end(), phones_and_tones.second.begin(), phones_and_tones.second.end()); } + return result; + } + + // 将声调数组转换为字符串用于日志输出 + std::string tonesToString(const std::vector& tones) { + std::string result; + for (auto tone : tones) { + if (!result.empty()) result += " "; + result += std::to_string(tone); + } + return result; } -}; \ No newline at end of file +}; diff --git a/projects/llm_framework/tools/test_tools/test-melo.py b/projects/llm_framework/tools/test_tools/test-melo.py new file mode 100644 index 0000000..b257f2f --- /dev/null +++ b/projects/llm_framework/tools/test_tools/test-melo.py @@ -0,0 +1,242 @@ +import socket +import json +import argparse +import uuid +import time + +def create_tcp_connection(host, port): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((host, port)) + return sock + +def send_json(sock, data): + json_data = json.dumps(data, ensure_ascii=False) + '\n' + print(f"Sending: {json_data}") + sock.sendall(json_data.encode('utf-8')) + +def receive_response(sock, timeout=None): + """接收响应,带可选的超时设置""" + old_timeout = sock.gettimeout() + try: + if timeout is not None: + sock.settimeout(timeout) + response = '' + while True: + part = sock.recv(4096).decode('utf-8') + if not part: # 连接已关闭 + return response.strip() + response += part + if '\n' in response: + break + return response.strip() 
+ except socket.timeout: + return None + finally: + sock.settimeout(old_timeout) + +def close_connection(sock): + if sock: + sock.close() + +def create_melotts_setup_data(request_id="melotts_setup"): + return { + "request_id": request_id, + "work_id": "melotts", + "action": "setup", + "object": "melotts.setup", + "data": { + "model": "melotts_zh-cn", + "response_format": "sys.pcm", + "input": "tts.utf-8", + "enoutput": False + } + } + +def list_available_tasks(sock): + """获取可用的任务列表""" + request_id = str(uuid.uuid4()) + send_json(sock, { + "request_id": request_id, + "work_id": "melotts", + "action": "taskinfo" + }) + + response = receive_response(sock) + if not response: + return {"error": "No response received"} + try: + return json.loads(response) + except: + return {"error": "Failed to parse response"} + +def parse_setup_response(response_data, sent_request_id): + error = response_data.get('error') + request_id = response_data.get('request_id') + + if request_id != sent_request_id: + print(f"Request ID mismatch: sent {sent_request_id}, received {request_id}") + return None + if error and error.get('code') != 0: + print(f"Error Code: {error['code']}, Message: {error['message']}") + return None + return response_data.get('work_id') + +def setup(sock, setup_data): + sent_request_id = setup_data['request_id'] + send_json(sock, setup_data) + response = receive_response(sock) + if not response: + print("No response received during setup") + return None + try: + response_data = json.loads(response) + return parse_setup_response(response_data, sent_request_id) + except json.JSONDecodeError: + print(f"Invalid JSON response: {response}") + return None + +def melotts_tts_inference(sock, melotts_work_id, text, use_stream=False): + request_id = str(uuid.uuid4()) + + # 根据文档,选择流式或非流式请求格式 + if use_stream: + send_json(sock, { + "request_id": request_id, + "work_id": melotts_work_id, + "action": "inference", + "object": "melotts.utf-8.stream", + "data": { + "delta": text, + 
"index": 0, + "finish": True + } + }) + else: + # 非流式请求 + send_json(sock, { + "request_id": request_id, + "work_id": melotts_work_id, + "action": "inference", + "object": "melotts.utf-8", + "data": text + }) + + # 关键更改:不等待响应或设置更长的超时时间 + # 由于使用sys.pcm格式,音频会直接播放,可能不会立即返回响应 + print("语音合成请求已发送,正在播放...") + + # 可选:设置一个较短的超时来检查是否有响应,但不要因为没响应就认为失败 + response = receive_response(sock, timeout=0.5) # 设置短超时,只是尝试看有没有响应 + if response: + try: + response_data = json.loads(response) + error = response_data.get('error') + if error and error.get('code') != 0: + print(f"收到错误响应: Code={error['code']}, Message={error['message']}") + return False + print("收到成功响应") + except: + print(f"收到非JSON响应: {response[:100]}...") + else: + # 不收到响应也视为成功,因为服务器可能正忙于播放音频 + print("未收到响应,但这不一定表示失败(服务器可能正忙于处理音频)") + + # 这里给TTS处理一些时间 + # 根据文本长度估计播放时间 + estimated_time = len(text) * 0.1 # 假设每个字符需要0.1秒 + estimated_time = max(1.0, min(estimated_time, 10.0)) # 至少1秒,最多10秒 + print(f"等待大约 {estimated_time:.1f} 秒让音频播放完...") + time.sleep(estimated_time) + + return True + +def exit_session(sock, melotts_work_id): + send_json(sock, { + "request_id": "melotts_exit", + "work_id": melotts_work_id, + "action": "exit" + }) + response = receive_response(sock, timeout=2.0) + if not response: + print("退出命令已发送,但未收到响应") + return True # 假设成功 + try: + response_data = json.loads(response) + print("Exit Response:", response_data) + return response_data.get('error', {}).get('code', -1) == 0 + except: + print("Failed to parse exit response") + return False + +def get_task_info(sock, work_id): + """获取任务的详细信息""" + request_id = str(uuid.uuid4()) + send_json(sock, { + "request_id": request_id, + "work_id": work_id, + "action": "taskinfo" + }) + + response = receive_response(sock) + if not response: + return {"error": "No response received"} + try: + return json.loads(response) + except: + return {"error": "Failed to parse response"} + +def main(host, port): + sock = create_tcp_connection(host, port) + try: + print("Setting up MeloTTS...") + 
setup_data = create_melotts_setup_data() + melotts_work_id = setup(sock, setup_data) + + if not melotts_work_id: + print("Setup failed. Checking available tasks...") + task_list = list_available_tasks(sock) + print("Available tasks:", task_list) + return + + print(f"MeloTTS SETUP finished, work_id: {melotts_work_id}") + + # 获取并显示任务详细信息 + task_info = get_task_info(sock, melotts_work_id) + print("Task info:", task_info) + + # 选择流式或非流式模式 + use_stream = input("是否使用流式输入? (y/n, 默认n): ").lower() == 'y' + + while True: + text = input("请输入你要合成语音的中文文本(输入exit退出):") + if text.lower() == 'exit': + break + + print("正在合成语音...", flush=True) + success = melotts_tts_inference(sock, melotts_work_id, text, use_stream) + + if success: + print("语音合成处理完成") + else: + print("语音合成处理失败") + + # 每次请求间隔 + time.sleep(1) + + # 退出会话 + if exit_session(sock, melotts_work_id): + print("成功退出会话") + else: + print("退出会话可能有问题") + + except Exception as e: + print(f"程序异常: {e}") + finally: + close_connection(sock) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='TCP Client for MeloTTS Unit.') + parser.add_argument('--host', type=str, default='localhost', help='Server hostname (default: localhost)') + parser.add_argument('--port', type=int, default=10001, help='Server port (default: 10001)') + args = parser.parse_args() + main(args.host, args.port) diff --git a/projects/llm_framework/tools/test_tools/test-tts.py b/projects/llm_framework/tools/test_tools/test-tts.py new file mode 100644 index 0000000..7368140 --- /dev/null +++ b/projects/llm_framework/tools/test_tools/test-tts.py @@ -0,0 +1,412 @@ +import socket +import json +import argparse +import uuid +import time +import sys + +def create_tcp_connection(host, port): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((host, port)) + return sock + +def send_json(sock, data): + json_data = json.dumps(data, ensure_ascii=False) + '\n' + print(f"Sending: {json_data}") + sock.sendall(json_data.encode('utf-8')) + 
+def receive_response(sock, timeout=None): + """接收响应,带可选的超时设置""" + old_timeout = sock.gettimeout() + try: + if timeout is not None: + sock.settimeout(timeout) + response = '' + while True: + part = sock.recv(4096).decode('utf-8') + if not part: # 连接已关闭 + return response.strip() + response += part + if '\n' in response: + break + return response.strip() + except socket.timeout: + return None + finally: + sock.settimeout(old_timeout) + +def close_connection(sock): + if sock: + sock.close() + +def create_tts_setup_data(request_id=None, link_with=None): + if request_id is None: + request_id = str(uuid.uuid4()) + + # 基本设置 + data = { + "model": "single_speaker_fast", + "response_format": "sys.pcm", + "input": "tts.utf-8", + "enoutput": False + } + + # 如果需要链接其他单元 + if link_with: + if isinstance(link_with, list): + inputs = ["tts.utf-8"] + link_with + data["input"] = inputs + else: + inputs = ["tts.utf-8", link_with] + data["input"] = inputs + + return { + "request_id": request_id, + "work_id": "tts", + "action": "setup", + "object": "tts.setup", + "data": data + } + +def list_available_tasks(sock, work_id="tts"): + """获取可用的任务列表""" + request_id = str(uuid.uuid4()) + send_json(sock, { + "request_id": request_id, + "work_id": work_id, + "action": "taskinfo" + }) + + response = receive_response(sock) + if not response: + return {"error": "No response received"} + try: + return json.loads(response) + except: + return {"error": "Failed to parse response"} + +def parse_setup_response(response_data, sent_request_id): + error = response_data.get('error') + request_id = response_data.get('request_id') + + if request_id != sent_request_id: + print(f"Request ID mismatch: sent {sent_request_id}, received {request_id}") + return None + if error and error.get('code') != 0: + print(f"Error Code: {error['code']}, Message: {error['message']}") + return None + return response_data.get('work_id') + +def setup(sock, setup_data): + sent_request_id = setup_data['request_id'] + send_json(sock, 
setup_data) + response = receive_response(sock) + if not response: + print("No response received during setup") + return None + try: + response_data = json.loads(response) + return parse_setup_response(response_data, sent_request_id) + except json.JSONDecodeError: + print(f"Invalid JSON response: {response}") + return None + +def link_units(sock, tts_work_id, target_work_id): + """链接TTS单元与其他单元""" + request_id = str(uuid.uuid4()) + send_json(sock, { + "request_id": request_id, + "work_id": tts_work_id, + "action": "link", + "object": "work_id", + "data": target_work_id + }) + + response = receive_response(sock) + if not response: + print("No response received for link request") + return False + + try: + response_data = json.loads(response) + error = response_data.get('error', {}) + if error.get('code') == 0: + print(f"成功链接 {tts_work_id} 与 {target_work_id}") + return True + else: + print(f"链接失败: {error.get('message', '未知错误')}") + return False + except: + print(f"Failed to parse link response: {response}") + return False + +def unlink_units(sock, tts_work_id, target_work_id): + """取消TTS单元与其他单元的链接""" + request_id = str(uuid.uuid4()) + send_json(sock, { + "request_id": request_id, + "work_id": tts_work_id, + "action": "unlink", + "object": "work_id", + "data": target_work_id + }) + + response = receive_response(sock) + if not response: + print("No response received for unlink request") + return False + + try: + response_data = json.loads(response) + error = response_data.get('error', {}) + if error.get('code') == 0: + print(f"成功取消链接 {tts_work_id} 与 {target_work_id}") + return True + else: + print(f"取消链接失败: {error.get('message', '未知错误')}") + return False + except: + print(f"Failed to parse unlink response: {response}") + return False + +def pause_unit(sock, tts_work_id): + """暂停TTS单元工作""" + request_id = str(uuid.uuid4()) + send_json(sock, { + "request_id": request_id, + "work_id": tts_work_id, + "action": "pause" + }) + + response = receive_response(sock) + if not 
response: + print("No response received for pause request") + return False + + try: + response_data = json.loads(response) + error = response_data.get('error', {}) + if error.get('code') == 0: + print(f"成功暂停 {tts_work_id}") + return True + else: + print(f"暂停失败: {error.get('message', '未知错误')}") + return False + except: + print(f"Failed to parse pause response: {response}") + return False + +def resume_unit(sock, tts_work_id): + """恢复TTS单元工作""" + request_id = str(uuid.uuid4()) + send_json(sock, { + "request_id": request_id, + "work_id": tts_work_id, + "action": "work" + }) + + response = receive_response(sock) + if not response: + print("No response received for resume request") + return False + + try: + response_data = json.loads(response) + error = response_data.get('error', {}) + if error.get('code') == 0: + print(f"成功恢复 {tts_work_id}") + return True + else: + print(f"恢复失败: {error.get('message', '未知错误')}") + return False + except: + print(f"Failed to parse resume response: {response}") + return False + +def tts_inference(sock, tts_work_id, text): + request_id = str(uuid.uuid4()) + + # 非流式请求 + send_json(sock, { + "request_id": request_id, + "work_id": tts_work_id, + "action": "inference", + "object": "tts.utf-8", + "data": text + }) + + print("语音合成请求已发送,正在播放...") + + # 可选:设置一个较短的超时来检查是否有响应,但不要因为没响应就认为失败 + response = receive_response(sock, timeout=0.5) # 设置短超时,只是尝试看有没有响应 + if response: + try: + response_data = json.loads(response) + error = response_data.get('error') + if error and error.get('code') != 0: + print(f"收到错误响应: Code={error['code']}, Message={error['message']}") + return False + print("收到成功响应") + except: + print(f"收到非JSON响应: {response[:100]}...") + else: + # 不收到响应也视为成功,因为服务器可能正忙于播放音频 + print("未收到响应,但这不一定表示失败(服务器可能正忙于处理音频)") + + # 这里给TTS处理一些时间 + # 根据文本长度估计播放时间 + estimated_time = len(text) * 0.1 # 假设每个字符需要0.1秒 + estimated_time = max(1.0, min(estimated_time, 10.0)) # 至少1秒,最多10秒 + print(f"等待大约 {estimated_time:.1f} 秒让音频播放完...") + time.sleep(estimated_time) + + 
return True + +def exit_session(sock, tts_work_id): + request_id = str(uuid.uuid4()) + send_json(sock, { + "request_id": request_id, + "work_id": tts_work_id, + "action": "exit" + }) + response = receive_response(sock, timeout=2.0) + if not response: + print("退出命令已发送,但未收到响应") + return True # 假设成功 + try: + response_data = json.loads(response) + error = response_data.get('error', {}) + if error.get('code') == 0: + print(f"成功退出 {tts_work_id}") + return True + else: + print(f"退出失败: {error.get('message', '未知错误')}") + return False + except: + print("Failed to parse exit response") + return False + +def get_task_info(sock, work_id): + """获取任务的详细信息""" + request_id = str(uuid.uuid4()) + send_json(sock, { + "request_id": request_id, + "work_id": work_id, + "action": "taskinfo" + }) + + response = receive_response(sock) + if not response: + return {"error": "No response received"} + try: + return json.loads(response) + except: + return {"error": "Failed to parse response"} + +def print_menu(): + print("\n===== TTS控制菜单 =====") + print("1. 合成语音") + print("2. 链接到其他单元") + print("3. 取消链接") + print("4. 暂停TTS单元") + print("5. 恢复TTS单元") + print("6. 获取任务信息") + print("7. 退出TTS单元") + print("0. 退出程序") + print("======================") + +def main(host, port): + sock = create_tcp_connection(host, port) + try: + print("Setting up TTS...") + setup_data = create_tts_setup_data() + tts_work_id = setup(sock, setup_data) + + if not tts_work_id: + print("Setup failed. 
Checking available tasks...") + task_list = list_available_tasks(sock) + print("Available tasks:", task_list) + if task_list.get('data') and isinstance(task_list.get('data'), list) and len(task_list.get('data')) > 0: + tts_work_id = task_list.get('data')[0] + print(f"使用已存在的TTS任务: {tts_work_id}") + else: + print("找不到可用的TTS任务,程序退出") + return + + print(f"TTS SETUP finished, work_id: {tts_work_id}") + + # 获取并显示任务详细信息 + task_info = get_task_info(sock, tts_work_id) + print("Task info:", task_info) + + while True: + print_menu() + choice = input("请选择操作 (0-7): ") + + if choice == '0': + print("程序退出") + break + + elif choice == '1': + text = input("请输入要合成语音的文本: ") + if text: + print("正在合成语音...", flush=True) + success = tts_inference(sock, tts_work_id, text) + if success: + print("语音合成处理完成") + else: + print("语音合成处理失败") + else: + print("文本为空,取消合成") + + elif choice == '2': + target_id = input("请输入要链接的单元ID (例如 kws.1000): ") + if target_id: + link_units(sock, tts_work_id, target_id) + else: + print("单元ID为空,取消链接操作") + + elif choice == '3': + target_id = input("请输入要取消链接的单元ID (例如 kws.1000): ") + if target_id: + unlink_units(sock, tts_work_id, target_id) + else: + print("单元ID为空,取消操作") + + elif choice == '4': + pause_unit(sock, tts_work_id) + + elif choice == '5': + resume_unit(sock, tts_work_id) + + elif choice == '6': + task_info = get_task_info(sock, tts_work_id) + print("Task info:", json.dumps(task_info, indent=2, ensure_ascii=False)) + + elif choice == '7': + if exit_session(sock, tts_work_id): + print("TTS单元已退出") + # 重新检查可用任务 + task_list = list_available_tasks(sock) + print("Available tasks:", task_list) + else: + print("TTS单元退出失败") + + else: + print("无效的选择,请重试") + + # 每次操作间隔 + time.sleep(0.5) + + except KeyboardInterrupt: + print("\n程序被用户中断") + except Exception as e: + print(f"程序异常: {e}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='TCP Client for MeloTTS Unit.') + parser.add_argument('--host', type=str, default='localhost', help='Server 
hostname (default: localhost)') + parser.add_argument('--port', type=int, default=10001, help='Server port (default: 10001)') + args = parser.parse_args() + main(args.host, args.port) \ No newline at end of file diff --git a/projects/llm_framework/tools/test_tools/test.py b/projects/llm_framework/tools/test_tools/test.py new file mode 100644 index 0000000..9ea712b --- /dev/null +++ b/projects/llm_framework/tools/test_tools/test.py @@ -0,0 +1,145 @@ +import socket +import json +import argparse + + +def create_tcp_connection(host, port): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((host, port)) + return sock + + +def send_json(sock, data): + json_data = json.dumps(data, ensure_ascii=False) + '\n' + sock.sendall(json_data.encode('utf-8')) + + +def receive_response(sock): + response = '' + while True: + part = sock.recv(4096).decode('utf-8') + response += part + if '\n' in response: + break + return response.strip() + + +def close_connection(sock): + if sock: + sock.close() + + +def create_init_data(): + return { + "request_id": "llm_001", + "work_id": "llm", + "action": "setup", + "object": "llm.setup", + "data": { + "model": "qwen2.5-0.5B-prefill-20e", + "response_format": "llm.utf-8.stream", + "input": "llm.utf-8.stream", + "enoutput": True, + "max_token_len": 1023, + "prompt": "You are a knowledgeable assistant capable of answering various questions and providing information." 
+ } + } + + +def parse_setup_response(response_data, sent_request_id): + error = response_data.get('error') + request_id = response_data.get('request_id') + + if request_id != sent_request_id: + print(f"Request ID mismatch: sent {sent_request_id}, received {request_id}") + return None + + if error and error.get('code') != 0: + print(f"Error Code: {error['code']}, Message: {error['message']}") + return None + + return response_data.get('work_id') + + +def setup(sock, init_data): + sent_request_id = init_data['request_id'] + send_json(sock, init_data) + response = receive_response(sock) + response_data = json.loads(response) + return parse_setup_response(response_data, sent_request_id) + + +def exit_session(sock, deinit_data): + send_json(sock, deinit_data) + response = receive_response(sock) + response_data = json.loads(response) + print("Exit Response:", response_data) + + +def parse_inference_response(response_data): + error = response_data.get('error') + if error and error.get('code') != 0: + print(f"Error Code: {error['code']}, Message: {error['message']}") + return None + + return response_data.get('data') + + +def main(host, port): + sock = create_tcp_connection(host, port) + + try: + print("Setup LLM...") + init_data = create_init_data() + llm_work_id = setup(sock, init_data) + print("Setup LLM finished.") + + while True: + user_input = input("Enter your message (or 'exit' to quit): ") + if user_input.lower() == 'exit': + break + + send_json(sock, { + "request_id": "llm_001", + "work_id": llm_work_id, + "action": "inference", + "object": "llm.utf-8.stream", + "data": { + "delta": user_input, + "index": 0, + "finish": True + } + }) + + while True: + response = receive_response(sock) + response_data = json.loads(response) + + data = parse_inference_response(response_data) + if data is None: + break + + delta = data.get('delta') + finish = data.get('finish') + print(delta, end='', flush=True) + + if finish: + print() + break + + exit_session(sock, { + 
"request_id": "llm_exit", + "work_id": llm_work_id, + "action": "exit" + }) + finally: + close_connection(sock) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='TCP Client to send JSON data.') + parser.add_argument('--host', type=str, default='localhost', help='Server hostname (default: localhost)') + parser.add_argument('--port', type=int, default=10001, help='Server port (default: 10001)') + + args = parser.parse_args() + main(args.host, args.port) From 2e40ae6f10bbed96801f5ede0861076070e6a6a3 Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Wed, 30 Apr 2025 14:38:08 +0800 Subject: [PATCH 45/64] =?UTF-8?q?=E5=8E=BB=E6=8E=89=E4=B8=AD=E6=96=87?= =?UTF-8?q?=E6=B3=A8=E9=87=8A=EF=BC=9B=E4=BD=BF=E7=94=A8log=E8=BE=93?= =?UTF-8?q?=E5=87=BA=E6=97=A5=E5=BF=97=EF=BC=9B=E6=A0=BC=E5=BC=8F=E5=8C=96?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main_melotts/src/runner/Lexicon.hpp | 230 +++++++----------- 1 file changed, 84 insertions(+), 146 deletions(-) diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp index 2d55a47..3b363fd 100644 --- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp +++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp @@ -7,64 +7,55 @@ #include #include #include -#include // 用于日志输出 +#include +#include "../../../../../SDK/components/utilities/include/sample_log.h" -// 使用引用传参优化split函数,避免不必要的拷贝 -std::vector split(const std::string &s, char delim) { +std::vector split(const std::string& s, char delim) +{ std::vector result; std::stringstream ss(s); std::string item; while (getline(ss, item, delim)) { - if (!item.empty()) { // 避免添加空字符串 + if (!item.empty()) { result.push_back(item); } } return result; } - class Lexicon { private: std::unordered_map, std::vector>> lexicon; - size_t max_phrase_length; // 追踪词典中最长的词组长度 - 
std::pair, std::vector> unknown_token; // '_'的发音作为未知词的默认值 - std::unordered_map reverse_tokens; // 用于将音素ID转回音素符号,用于日志 + size_t max_phrase_length; + std::pair, std::vector> unknown_token; + std::unordered_map reverse_tokens; public: - Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0) { + Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0) + { std::unordered_map tokens; - - // 加载tokens std::ifstream ifs(tokens_filename); assert(ifs.is_open()); - std::string line; while (std::getline(ifs, line)) { auto splitted_line = split(line, ' '); if (splitted_line.size() >= 2) { int token_id = std::stoi(splitted_line[1]); tokens.insert({splitted_line[0], token_id}); - reverse_tokens[token_id] = splitted_line[0]; // 建立反向映射 + reverse_tokens[token_id] = splitted_line[0]; } } ifs.close(); - - // 加载lexicon ifs.open(lexicon_filename); assert(ifs.is_open()); while (std::getline(ifs, line)) { auto splitted_line = split(line, ' '); if (splitted_line.empty()) continue; - std::string word_or_phrase = splitted_line[0]; - - // 更新最长词组长度 - auto chars = splitEachChar(word_or_phrase); - max_phrase_length = std::max(max_phrase_length, chars.size()); - - size_t phone_tone_len = splitted_line.size() - 1; - size_t half_len = phone_tone_len / 2; + auto chars = splitEachChar(word_or_phrase); + max_phrase_length = std::max(max_phrase_length, chars.size()); + size_t phone_tone_len = splitted_line.size() - 1; + size_t half_len = phone_tone_len / 2; std::vector phones, tones; - for (size_t i = 0; i < phone_tone_len; i++) { auto phone_or_tone = splitted_line[i + 1]; if (i < half_len) { @@ -75,213 +66,161 @@ class Lexicon { tones.push_back(std::stoi(phone_or_tone)); } } - lexicon[word_or_phrase] = std::make_pair(phones, tones); } - - // 添加特殊映射 - lexicon["呣"] = lexicon["母"]; - lexicon["嗯"] = lexicon["恩"]; - - // 添加标点符号 const std::vector punctuation{"!", "?", "…", ",", ".", "'", "-"}; for (const auto& p : 
punctuation) { if (tokens.find(p) != tokens.end()) { - int i = tokens[p]; + int i = tokens[p]; lexicon[p] = std::make_pair(std::vector{i}, std::vector{0}); } } - - // 设置'_'作为未知词的发音 - assert(tokens.find("_") != tokens.end()); // 确保tokens中包含"_" + assert(tokens.find("_") != tokens.end()); unknown_token = std::make_pair(std::vector{tokens["_"]}, std::vector{0}); - - // 空格映射到'_'的发音 - lexicon[" "] = unknown_token; - - // 中文标点转换映射 + lexicon[" "] = unknown_token; lexicon[","] = lexicon[","]; lexicon["。"] = lexicon["."]; lexicon["!"] = lexicon["!"]; lexicon["?"] = lexicon["?"]; - - // 输出词典信息 - std::cout << "词典加载完成,包含 " << lexicon.size() << " 个条目,最长词组长度: " << max_phrase_length << std::endl; + SLOGI("词典加载完成,包含 %zu 个条目,最长词组长度: %zu", lexicon.size(), max_phrase_length); } - - std::vector splitEachChar(const std::string& text) { + std::vector splitEachChar(const std::string& text) + { std::vector words; int len = text.length(); - int i = 0; - + int i = 0; while (i < len) { int next = 1; if ((text[i] & 0x80) == 0x00) { // ASCII } else if ((text[i] & 0xE0) == 0xC0) { - next = 2; // 2字节UTF-8 + next = 2; // 2字节UTF-8 } else if ((text[i] & 0xF0) == 0xE0) { - next = 3; // 3字节UTF-8 + next = 3; // 3字节UTF-8 } else if ((text[i] & 0xF8) == 0xF0) { - next = 4; // 4字节UTF-8 + next = 4; // 4字节UTF-8 } words.push_back(text.substr(i, next)); i += next; } return words; - } - - bool is_english(const std::string& s) { + } + bool is_english(const std::string& s) + { return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z')); } - - // 根据词典中的内容,使用最长匹配算法处理输入文本 - void convert(const std::string& text, std::vector& phones, std::vector& tones) { - std::cout << "\n开始处理文本: \"" << text << "\"" << std::endl; - std::cout << "=======匹配结果=======" << std::endl; - std::cout << "单元\t|\t音素\t|\t声调" << std::endl; - std::cout << "-----------------------------" << std::endl; - - // 在开头添加'_'边界标记 + void convert(const std::string& text, std::vector& phones, std::vector& tones) + { + SLOGI("\n开始处理文本: 
\"%s\"", text.c_str()); + SLOGI("=======匹配结果======="); + SLOGI("单元\t|\t音素\t|\t声调"); + SLOGI("-----------------------------"); phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - std::cout << "\t|\t" << phonesToString(unknown_token.first) << "\t|\t" - << tonesToString(unknown_token.second) << std::endl; - + + SLOGI("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), + tonesToString(unknown_token.second).c_str()); auto chars = splitEachChar(text); - int i = 0; - + int i = 0; while (i < chars.size()) { - // 处理英文单词 if (is_english(chars[i])) { std::string eng_word; int start = i; while (i < chars.size() && is_english(chars[i])) { eng_word += chars[i++]; } - - // 英文转小写 - std::string orig_word = eng_word; // 保留原始单词用于日志 + std::string orig_word = eng_word; std::transform(eng_word.begin(), eng_word.end(), eng_word.begin(), - [](unsigned char c){ return std::tolower(c); }); - - // 如果词典中有这个英文单词,使用它;否则使用'_'的发音 + [](unsigned char c) { return std::tolower(c); }); if (lexicon.find(eng_word) != lexicon.end()) { auto& [eng_phones, eng_tones] = lexicon[eng_word]; phones.insert(phones.end(), eng_phones.begin(), eng_phones.end()); tones.insert(tones.end(), eng_tones.begin(), eng_tones.end()); - - // 打印匹配信息 - std::cout << orig_word << "\t|\t" << phonesToString(eng_phones) << "\t|\t" - << tonesToString(eng_tones) << std::endl; + SLOGI("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(), + tonesToString(eng_tones).c_str()); } else { - // 未找到单词,使用'_'的发音 phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - - // 打印未匹配信息 - std::cout << orig_word << "\t|\t" << phonesToString(unknown_token.first) << " (未匹配)\t|\t" - << tonesToString(unknown_token.second) << std::endl; + SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_word.c_str(), 
phonesToString(unknown_token.first).c_str(), + tonesToString(unknown_token.second).c_str()); } continue; } - // 处理非英文字符(如空格、标点) std::string c = chars[i++]; - if (c == " ") continue; // 跳过空格 - // 回退一步,用于最长匹配 + if (c == " ") continue; i--; - - - // 最长匹配算法处理中文/日文 bool matched = false; - // 尝试从最长的词组开始匹配 for (size_t len = std::min(max_phrase_length, chars.size() - i); len > 0 && !matched; --len) { std::string phrase; for (size_t j = 0; j < len; ++j) { phrase += chars[i + j]; } - if (lexicon.find(phrase) != lexicon.end()) { auto& [phrase_phones, phrase_tones] = lexicon[phrase]; phones.insert(phones.end(), phrase_phones.begin(), phrase_phones.end()); tones.insert(tones.end(), phrase_tones.begin(), phrase_tones.end()); - - // 打印匹配信息 - std::cout << phrase << "\t|\t" << phonesToString(phrase_phones) << "\t|\t" - << tonesToString(phrase_tones) << std::endl; - + SLOGI("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(), + tonesToString(phrase_tones).c_str()); i += len; matched = true; break; } } - - // 如果没有匹配到任何词组,使用'_'的发音 if (!matched) { - std::string c = chars[i++]; - std::string s = c; - - // 中文标点符号转换 - std::string orig_char = s; // 保留原始字符用于日志 - if (s == ",") s = ","; - else if (s == "。") s = "."; - else if (s == "!") s = "!"; - else if (s == "?") s = "?"; - - // 如果词典中找不到,则使用'_'的发音 + std::string c = chars[i++]; + std::string s = c; + std::string orig_char = s; + if (s == ",") + s = ","; + else if (s == "。") + s = "."; + else if (s == "!") + s = "!"; + else if (s == "?") + s = "?"; if (lexicon.find(s) != lexicon.end()) { auto& [char_phones, char_tones] = lexicon[s]; phones.insert(phones.end(), char_phones.begin(), char_phones.end()); tones.insert(tones.end(), char_tones.begin(), char_tones.end()); - - // 打印匹配信息 - std::cout << orig_char << "\t|\t" << phonesToString(char_phones) << "\t|\t" - << tonesToString(char_tones) << std::endl; + SLOGI("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(), + tonesToString(char_tones).c_str()); } 
else { phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - - // 打印未匹配信息 - std::cout << orig_char << "\t|\t" << phonesToString(unknown_token.first) << " (未匹配)\t|\t" - << tonesToString(unknown_token.second) << std::endl; + SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_char.c_str(), phonesToString(unknown_token.first).c_str(), + tonesToString(unknown_token.second).c_str()); } } } - - // 在末尾添加'_'边界标记 phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - std::cout << "\t|\t" << phonesToString(unknown_token.first) << "\t|\t" - << tonesToString(unknown_token.second) << std::endl; - - // 汇总打印最终结果 - std::cout << "\n处理结果汇总:" << std::endl; - std::cout << "原文: " << text << std::endl; - std::cout << "音素: " << phonesToString(phones) << std::endl; - std::cout << "声调: " << tonesToString(tones) << std::endl; - std::cout << "====================" << std::endl; + SLOGI("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), + tonesToString(unknown_token.second).c_str()); + SLOGI("\n处理结果汇总:"); + SLOGI("原文: %s", text.c_str()); + SLOGI("音素: %s", phonesToString(phones).c_str()); + SLOGI("声调: %s", tonesToString(tones).c_str()); + SLOGI("===================="); } private: - // 处理单个字符 - void processChar(const std::string& c, std::vector& phones, std::vector& tones) { + void processChar(const std::string& c, std::vector& phones, std::vector& tones) + { std::string s = c; - - // 中文标点符号转换 - if (s == ",") s = ","; - else if (s == "。") s = "."; - else if (s == "!") s = "!"; - else if (s == "?") s = "?"; - - // 如果词典中找不到,则使用'_'的发音 + if (s == ",") + s = ","; + else if (s == "。") + s = "."; + else if (s == "!") + s = "!"; + else if (s == "?") + s = "?"; auto& phones_and_tones = (lexicon.find(s) != lexicon.end()) ? 
lexicon[s] : unknown_token; - phones.insert(phones.end(), phones_and_tones.first.begin(), phones_and_tones.first.end()); tones.insert(tones.end(), phones_and_tones.second.begin(), phones_and_tones.second.end()); } - - // 将音素ID数组转换为字符串用于日志输出 - std::string phonesToString(const std::vector& phones) { + std::string phonesToString(const std::vector& phones) + { std::string result; for (auto id : phones) { if (!result.empty()) result += " "; @@ -293,9 +232,8 @@ class Lexicon { } return result; } - - // 将声调数组转换为字符串用于日志输出 - std::string tonesToString(const std::vector& tones) { + std::string tonesToString(const std::vector& tones) + { std::string result; for (auto tone : tones) { if (!result.empty()) result += " "; @@ -303,4 +241,4 @@ class Lexicon { } return result; } -}; +}; \ No newline at end of file From 3ce020583bdcbc9e16ceff3cfc5d7412d6e06300 Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Wed, 30 Apr 2025 14:59:58 +0800 Subject: [PATCH 46/64] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E6=97=A5=E8=AF=AD?= =?UTF-8?q?=E5=92=8C=E8=8B=B1=E8=AF=AD=E6=A8=A1=E5=9E=8B=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main_melotts/mode_melotts-en-default.json | 29 +++++++++++++++++++ .../main_melotts/mode_melotts-ja-jp.json | 29 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 projects/llm_framework/main_melotts/mode_melotts-en-default.json create mode 100644 projects/llm_framework/main_melotts/mode_melotts-ja-jp.json diff --git a/projects/llm_framework/main_melotts/mode_melotts-en-default.json b/projects/llm_framework/main_melotts/mode_melotts-en-default.json new file mode 100644 index 0000000..d886e85 --- /dev/null +++ b/projects/llm_framework/main_melotts/mode_melotts-en-default.json @@ -0,0 +1,29 @@ +{ + "mode": "melotts-en-default", + "type": "tts", + "homepage": "https://github.com/ml-inory/melotts.axera/tree/main/model_convert", + "compile_flage": "pulsar2 build 
--input decoder-en.onnx --config config_decoder_u16.json --output_dir decoder --output_name decoder-en.axmodel --target_hardware AX620E", + "pulsar_version": "3.4-983bb35e", + "capabilities": [ + "tts", + "Japanese" + ], + "input_type": [ + "tts.utf-8" + ], + "output_type": [ + "tts.wav", + "sys.play.0_1" + ], + "mode_param": { + "encoder": "encoder-en-def.ort", + "decoder": "decoder-en-def.axmodel", + "gbin": "g-en-def.bin", + "tokens": "tokens-en.txt", + "lexicon": "lexicon-en.txt", + "spacker_speed": 1.0, + "mode_rate": 44100, + "audio_rate": 16000, + "awake_delay": 1000 + } +} \ No newline at end of file diff --git a/projects/llm_framework/main_melotts/mode_melotts-ja-jp.json b/projects/llm_framework/main_melotts/mode_melotts-ja-jp.json new file mode 100644 index 0000000..97b9f31 --- /dev/null +++ b/projects/llm_framework/main_melotts/mode_melotts-ja-jp.json @@ -0,0 +1,29 @@ +{ + "mode": "melotts-ja-jp", + "type": "tts", + "homepage": "https://github.com/ml-inory/melotts.axera/tree/main/model_convert", + "compile_flage": "pulsar2 build --input decoder-jp.onnx --config config_decoder_u16.json --output_dir decoder --output_name decoder-jp.axmodel --target_hardware AX620E", + "pulsar_version": "3.4-983bb35e", + "capabilities": [ + "tts", + "Japanese" + ], + "input_type": [ + "tts.utf-8" + ], + "output_type": [ + "tts.wav", + "sys.play.0_1" + ], + "mode_param": { + "encoder": "encoder-jp.ort", + "decoder": "decoder-jp.axmodel", + "gbin": "g-jp.bin", + "tokens": "tokens-jp.txt", + "lexicon": "lexicon-jp.txt", + "spacker_speed": 1.0, + "mode_rate": 44100, + "audio_rate": 16000, + "awake_delay": 1000 + } +} \ No newline at end of file From 3897870782a4d54d62825dc2e02e6c0167a4ad1e Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Wed, 30 Apr 2025 15:06:38 +0800 Subject: [PATCH 47/64] =?UTF-8?q?=E7=95=A5=E5=BE=AE=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E8=AF=AD=E9=80=9F=EF=BC=8C=E5=90=AC=E6=84=9F=E6=9B=B4=E5=A5=BD?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main_melotts/mode_melotts-en-default.json | 2 +- .../llm_framework/main_melotts/mode_melotts-zh-cn.json | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/projects/llm_framework/main_melotts/mode_melotts-en-default.json b/projects/llm_framework/main_melotts/mode_melotts-en-default.json index d886e85..8b16116 100644 --- a/projects/llm_framework/main_melotts/mode_melotts-en-default.json +++ b/projects/llm_framework/main_melotts/mode_melotts-en-default.json @@ -21,7 +21,7 @@ "gbin": "g-en-def.bin", "tokens": "tokens-en.txt", "lexicon": "lexicon-en.txt", - "spacker_speed": 1.0, + "spacker_speed": 1.2, "mode_rate": 44100, "audio_rate": 16000, "awake_delay": 1000 diff --git a/projects/llm_framework/main_melotts/mode_melotts-zh-cn.json b/projects/llm_framework/main_melotts/mode_melotts-zh-cn.json index ee9f57c..b5edfe0 100644 --- a/projects/llm_framework/main_melotts/mode_melotts-zh-cn.json +++ b/projects/llm_framework/main_melotts/mode_melotts-zh-cn.json @@ -1,9 +1,9 @@ { "mode": "melotts-zh-cn", "type": "tts", - "homepage":"https://huggingface.co/myshell-ai/MeloTTS-Chinese", - "compile_flage":"pulsar2 build --input decoder.onnx --config config_decoder_u16.json --output_dir decoder --output_name decoder.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0", - "pulsar_version":"3.2-99f14d0a", + "homepage": "https://huggingface.co/myshell-ai/MeloTTS-Chinese", + "compile_flage": "pulsar2 build --input decoder.onnx --config config_decoder_u16.json --output_dir decoder --output_name decoder.axmodel --target_hardware AX620E --npu_mode NPU2 --compiler.check 0", + "pulsar_version": "3.2-99f14d0a", "capabilities": [ "tts", "Chinese" @@ -21,7 +21,7 @@ "gbin": "g-zh_mix_en.bin", "tokens": "tokens.txt", "lexicon": "lexicon.txt", - "spacker_speed": 1.0, + "spacker_speed": 1.1, "mode_rate": 44100, "audio_rate": 16000, "awake_delay": 1000 From 
5782f89df1f232ccc6d2f04f8872f065e77da552 Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Wed, 30 Apr 2025 16:50:12 +0800 Subject: [PATCH 48/64] =?UTF-8?q?=E5=A4=84=E7=90=86=E9=99=8C=E7=94=9F?= =?UTF-8?q?=E8=8B=B1=E8=AF=AD=E5=8D=95=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .clang-format | 3 +- .../main_melotts/mode_melotts-ja-jp.json | 2 +- .../main_melotts/src/runner/Lexicon.hpp | 89 ++++++++++++++++++- 3 files changed, 87 insertions(+), 7 deletions(-) diff --git a/.clang-format b/.clang-format index 87d3960..06eb3d4 100644 --- a/.clang-format +++ b/.clang-format @@ -163,5 +163,4 @@ StatementMacros: - QT_REQUIRE_VERSION TabWidth: 4 UseCRLF: false -UseTab: Never -... \ No newline at end of file +UseTab: Never \ No newline at end of file diff --git a/projects/llm_framework/main_melotts/mode_melotts-ja-jp.json b/projects/llm_framework/main_melotts/mode_melotts-ja-jp.json index 97b9f31..d2df3e1 100644 --- a/projects/llm_framework/main_melotts/mode_melotts-ja-jp.json +++ b/projects/llm_framework/main_melotts/mode_melotts-ja-jp.json @@ -21,7 +21,7 @@ "gbin": "g-jp.bin", "tokens": "tokens-jp.txt", "lexicon": "lexicon-jp.txt", - "spacker_speed": 1.0, + "spacker_speed": 1.1, "mode_rate": 44100, "audio_rate": 16000, "awake_delay": 1000 diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp index 3b363fd..242fb15 100644 --- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp +++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp @@ -109,6 +109,90 @@ class Lexicon { { return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z')); } + + bool is_english_token_char(const std::string& s) + { + if (s.size() != 1) return false; + char c = s[0]; + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_'; + } + + void 
process_unknown_english(const std::string& word, std::vector& phones, std::vector& tones) + { + SLOGI("Processing unknown term: %s", word.c_str()); + + std::string orig_word = word; + std::vector parts; + std::vector phonetic_parts; + + size_t start = 0; + while (start < word.size()) { + bool matched = false; + + for (size_t len = std::min(word.size() - start, (size_t)10); len > 0 && !matched; --len) { + std::string sub_word = word.substr(start, len); + std::string lower_sub_word = sub_word; + std::transform(lower_sub_word.begin(), lower_sub_word.end(), lower_sub_word.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if (lexicon.find(lower_sub_word) != lexicon.end()) { + // Substring found in lexicon + auto& [sub_phones, sub_tones] = lexicon[lower_sub_word]; + phones.insert(phones.end(), sub_phones.begin(), sub_phones.end()); + tones.insert(tones.end(), sub_tones.begin(), sub_tones.end()); + + parts.push_back(sub_word); + phonetic_parts.push_back(phonesToString(sub_phones)); + + SLOGI(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str()); + + start += len; + matched = true; + break; + } + } + + if (!matched) { + std::string single_char = word.substr(start, 1); + std::string lower_char = single_char; + std::transform(lower_char.begin(), lower_char.end(), lower_char.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if (lexicon.find(lower_char) != lexicon.end()) { + auto& [char_phones, char_tones] = lexicon[lower_char]; + phones.insert(phones.end(), char_phones.begin(), char_phones.end()); + tones.insert(tones.end(), char_tones.begin(), char_tones.end()); + + parts.push_back(single_char); + phonetic_parts.push_back(phonesToString(char_phones)); + + SLOGI(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str()); + } else { + phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); + tones.insert(tones.end(), unknown_token.second.begin(), 
unknown_token.second.end()); + + parts.push_back(single_char); + phonetic_parts.push_back("_unknown_"); + + SLOGI(" Unknown: '%s'", single_char.c_str()); + } + + start++; + } + } + + std::string parts_str, phonetic_str; + for (size_t i = 0; i < parts.size(); i++) { + if (i > 0) { + parts_str += " "; + phonetic_str += " "; + } + parts_str += parts[i]; + phonetic_str += phonetic_parts[i]; + } + + SLOGI("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(), phonetic_str.c_str()); + } void convert(const std::string& text, std::vector& phones, std::vector& tones) { SLOGI("\n开始处理文本: \"%s\"", text.c_str()); @@ -139,10 +223,7 @@ class Lexicon { SLOGI("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(), tonesToString(eng_tones).c_str()); } else { - phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); - tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_word.c_str(), phonesToString(unknown_token.first).c_str(), - tonesToString(unknown_token.second).c_str()); + process_unknown_english(orig_word, phones, tones); } continue; } From 04961aea99177ad72239f2bf95c313f13b0f1ee7 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 30 Apr 2025 17:37:04 +0800 Subject: [PATCH 49/64] [update] update mode_melotts-en-default.json --- .../main_melotts/mode_melotts-en-default.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/llm_framework/main_melotts/mode_melotts-en-default.json b/projects/llm_framework/main_melotts/mode_melotts-en-default.json index 8b16116..1894514 100644 --- a/projects/llm_framework/main_melotts/mode_melotts-en-default.json +++ b/projects/llm_framework/main_melotts/mode_melotts-en-default.json @@ -6,7 +6,7 @@ "pulsar_version": "3.4-983bb35e", "capabilities": [ "tts", - "Japanese" + "English" ], "input_type": [ "tts.utf-8" @@ -16,9 +16,9 @@ "sys.play.0_1" ], "mode_param": { - 
"encoder": "encoder-en-def.ort", - "decoder": "decoder-en-def.axmodel", - "gbin": "g-en-def.bin", + "encoder": "encoder-en-default.ort", + "decoder": "decoder-en-default.axmodel", + "gbin": "g-en-default.bin", "tokens": "tokens-en.txt", "lexicon": "lexicon-en.txt", "spacker_speed": 1.2, From 1e85f45413dc72811ff1d980f9769a87b9bf2060 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 30 Apr 2025 19:40:58 +0800 Subject: [PATCH 50/64] [update] update llm-model-melotts-en-default & llm-model-melotts-ja-jp --- projects/llm_framework/tools/llm_pack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 14cab95..64f3b1f 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -378,6 +378,8 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep 'llm-model-single-speaker-fast':[create_data_deb,'llm-model-single-speaker-fast', '0.3', src_folder, revision], 'llm-model-melotts-zh-cn':[create_data_deb,'llm-model-melotts-zh-cn', '0.5', src_folder, revision], 'llm-model-melotts-en-us':[create_data_deb,'llm-model-melotts-en-us', '0.5', src_folder, revision], + 'llm-model-melotts-en-default':[create_data_deb,'llm-model-melotts-en-default', '0.5', src_folder, revision], + 'llm-model-melotts-ja-jp':[create_data_deb,'llm-model-melotts-ja-jp', '0.5', src_folder, revision], 'llm-model-yolo11n':[create_data_deb,'llm-model-yolo11n', data_version, src_folder, revision], 'llm-model-yolo11n-pose':[create_data_deb,'llm-model-yolo11n-pose', '0.3', src_folder, revision], 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], From 4e301f1cbf1d01655552b98fcfba8058c086066f Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 6 May 2025 11:11:31 +0800 Subject: [PATCH 51/64] [update] update whisper-tiny --- projects/llm_framework/tools/llm_pack.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 64f3b1f..1321b34 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -385,7 +385,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep 'llm-model-yolo11n-hand-pose':[create_data_deb,'llm-model-yolo11n-hand-pose', '0.3', src_folder, revision], 'llm-model-yolo11n-seg':[create_data_deb,'llm-model-yolo11n-seg', '0.3', src_folder, revision], 'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.4', src_folder, revision], - 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.3', src_folder, revision], + 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.4', src_folder, revision], 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.3', src_folder, revision], 'llm-model-whisper-small':[create_data_deb,'llm-model-whisper-small', '0.3', src_folder, revision], 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.4', src_folder, revision], From 2a8a744777d6ac13be2ffa920ccd8f1533640095 Mon Sep 17 00:00:00 2001 From: dianjixz <18637716021@163.com> Date: Tue, 6 May 2025 15:25:54 +0800 Subject: [PATCH 52/64] [update] StackFlow add stackflow_data && pzmq add get_param set_param --- .../llm_camera_zh.md | 28 +++++ .../StackFlow/stackflow/StackFlow.cpp | 104 ++++++++---------- .../StackFlow/stackflow/StackFlow.h | 92 ++++++++++++---- .../StackFlow/stackflow/StackFlowUtil.cpp | 7 ++ .../StackFlow/stackflow/StackFlowUtil.h | 4 + ext_components/StackFlow/stackflow/pzmq.hpp | 27 +++++ projects/llm_framework/main_asr/src/main.cpp | 17 +-- .../llm_framework/main_audio/src/main.cpp | 16 +-- .../llm_framework/main_camera/src/main.cpp | 4 +- .../main_sys/src/remote_action.cpp | 8 +- .../main_sys/src/remote_server.cpp | 8 +- projects/llm_framework/main_vad/src/main.cpp | 17 +-- 
.../llm_framework/main_whisper/src/main.cpp | 18 +-- 13 files changed, 221 insertions(+), 129 deletions(-) diff --git a/doc/projects_llm_framework_doc/llm_camera_zh.md b/doc/projects_llm_framework_doc/llm_camera_zh.md index 97cacbe..5675df3 100644 --- a/doc/projects_llm_framework_doc/llm_camera_zh.md +++ b/doc/projects_llm_framework_doc/llm_camera_zh.md @@ -155,4 +155,32 @@ error::code 为 0 表示执行成功。 } ``` +获取本机的摄像头列表。 + +发送 json: + +```json +{ + "request_id": "2", + "work_id": "camera", + "action": "list_camera" +} +``` + +响应 json: + +```json +{ + "created":1746515639, + "data":[], + "error":{ + "code":0, + "message":"" + }, + "object":"camera.devices", + "request_id":"2", + "work_id":"camera" +} +``` + > **注意:work_id 是按照单元的初始化注册顺序增加的,并不是固定的索引值。** \ No newline at end of file diff --git a/ext_components/StackFlow/stackflow/StackFlow.cpp b/ext_components/StackFlow/stackflow/StackFlow.cpp index e06f478..2991162 100644 --- a/ext_components/StackFlow/stackflow/StackFlow.cpp +++ b/ext_components/StackFlow/stackflow/StackFlow.cpp @@ -33,7 +33,7 @@ llm_channel_obj::~llm_channel_obj() void llm_channel_obj::subscriber_event_call(const std::function &call, pzmq *_pzmq, const std::shared_ptr &raw) { - auto _raw = raw->string(); + auto _raw = raw->string(); const char *user_inference_flage_str = "\"action\""; std::size_t pos = _raw.find(user_inference_flage_str); while (true) { @@ -153,26 +153,16 @@ int llm_channel_obj::output_to_uart(const std::string &data) StackFlow::StackFlow::StackFlow(const std::string &unit_name) : work_id_num_cout_(1000), unit_name_(unit_name), rpc_ctx_(std::make_unique(unit_name)) { - event_queue_.appendListener(EVENT_NONE, - std::bind(&StackFlow::_none_event, this, std::placeholders::_1, std::placeholders::_2)); - event_queue_.appendListener(EVENT_PAUSE, - std::bind(&StackFlow::_pause, this, std::placeholders::_1, std::placeholders::_2)); - event_queue_.appendListener(EVENT_WORK, - std::bind(&StackFlow::_work, this, std::placeholders::_1, 
std::placeholders::_2)); - event_queue_.appendListener(EVENT_EXIT, - std::bind(&StackFlow::_exit, this, std::placeholders::_1, std::placeholders::_2)); - event_queue_.appendListener(EVENT_SETUP, - std::bind(&StackFlow::_setup, this, std::placeholders::_1, std::placeholders::_2)); - event_queue_.appendListener(EVENT_LINK, - std::bind(&StackFlow::_link, this, std::placeholders::_1, std::placeholders::_2)); - event_queue_.appendListener(EVENT_UNLINK, - std::bind(&StackFlow::_unlink, this, std::placeholders::_1, std::placeholders::_2)); - event_queue_.appendListener(EVENT_TASKINFO, - std::bind(&StackFlow::_taskinfo, this, std::placeholders::_1, std::placeholders::_2)); - event_queue_.appendListener(EVENT_SYS_INIT, - std::bind(&StackFlow::_sys_init, this, std::placeholders::_1, std::placeholders::_2)); - event_queue_.appendListener( - EVENT_REPEAT_EVENT, std::bind(&StackFlow::_repeat_loop, this, std::placeholders::_1, std::placeholders::_2)); + event_queue_.appendListener(EVENT_NONE, std::bind(&StackFlow::_none_event, this, std::placeholders::_1)); + event_queue_.appendListener(EVENT_PAUSE, std::bind(&StackFlow::_pause, this, std::placeholders::_1)); + event_queue_.appendListener(EVENT_WORK, std::bind(&StackFlow::_work, this, std::placeholders::_1)); + event_queue_.appendListener(EVENT_EXIT, std::bind(&StackFlow::_exit, this, std::placeholders::_1)); + event_queue_.appendListener(EVENT_SETUP, std::bind(&StackFlow::_setup, this, std::placeholders::_1)); + event_queue_.appendListener(EVENT_LINK, std::bind(&StackFlow::_link, this, std::placeholders::_1)); + event_queue_.appendListener(EVENT_UNLINK, std::bind(&StackFlow::_unlink, this, std::placeholders::_1)); + event_queue_.appendListener(EVENT_TASKINFO, std::bind(&StackFlow::_taskinfo, this, std::placeholders::_1)); + event_queue_.appendListener(EVENT_SYS_INIT, std::bind(&StackFlow::_sys_init, this, std::placeholders::_1)); + event_queue_.appendListener(EVENT_REPEAT_EVENT, std::bind(&StackFlow::_repeat_loop, this, 
std::placeholders::_1)); rpc_ctx_->register_rpc_action( "setup", std::bind(&StackFlow::_rpc_setup, this, std::placeholders::_1, std::placeholders::_2)); rpc_ctx_->register_rpc_action( @@ -217,7 +207,7 @@ StackFlow::~StackFlow() llm_task_channel_.erase(iteam->first); } exit_flage_.store(true); - event_queue_.enqueue(EVENT_NONE, "", ""); + event_queue_.enqueue(EVENT_NONE, nullptr); even_loop_thread_->join(); } @@ -229,19 +219,19 @@ void StackFlow::even_loop() } } -void StackFlow::_none_event(const std::string &data1, const std::string &data2) +void StackFlow::_none_event(const std::shared_ptr &arg) { + // std::shared_ptr originalPtr = std::static_pointer_cast(arg); } -void StackFlow::_sys_init(const std::string &zmq_url, const std::string &data) +void StackFlow::_sys_init(const std::shared_ptr &arg) { // todo:... } std::string StackFlow::_rpc_setup(pzmq *_pzmq, const std::shared_ptr &data) { - auto _data = data->string(); - event_queue_.enqueue(EVENT_SETUP, RPC_PARSE_TO_PARAM(_data)); + event_queue_.enqueue(EVENT_SETUP, std::make_shared(data->get_param(0), data->get_param(1))); return std::string("None"); } @@ -275,8 +265,7 @@ int StackFlow::setup(const std::string &work_id, const std::string &object, cons std::string StackFlow::_rpc_link(pzmq *_pzmq, const std::shared_ptr &data) { - auto _data = data->string(); - event_queue_.enqueue(EVENT_LINK, RPC_PARSE_TO_PARAM(_data)); + event_queue_.enqueue(EVENT_LINK, std::make_shared(data->get_param(0), data->get_param(1))); return std::string("None"); } @@ -307,8 +296,7 @@ void StackFlow::link(const std::string &work_id, const std::string &object, cons std::string StackFlow::_rpc_unlink(pzmq *_pzmq, const std::shared_ptr &data) { - auto _data = data->string(); - event_queue_.enqueue(EVENT_UNLINK, RPC_PARSE_TO_PARAM(_data)); + event_queue_.enqueue(EVENT_UNLINK, std::make_shared(data->get_param(0), data->get_param(1))); return std::string("None"); } @@ -339,9 +327,7 @@ void StackFlow::unlink(const std::string &work_id, const 
std::string &object, co std::string StackFlow::_rpc_work(pzmq *_pzmq, const std::shared_ptr &data) { - - auto _data = data->string(); - event_queue_.enqueue(EVENT_WORK, RPC_PARSE_TO_PARAM(_data)); + event_queue_.enqueue(EVENT_WORK, std::make_shared(data->get_param(0), data->get_param(1))); return std::string("None"); } @@ -372,10 +358,7 @@ void StackFlow::work(const std::string &work_id, const std::string &object, cons std::string StackFlow::_rpc_exit(pzmq *_pzmq, const std::shared_ptr &data) { - - - auto _data = data->string(); - event_queue_.enqueue(EVENT_EXIT, RPC_PARSE_TO_PARAM(_data)); + event_queue_.enqueue(EVENT_EXIT, std::make_shared(data->get_param(0), data->get_param(1))); return std::string("None"); } @@ -409,10 +392,7 @@ int StackFlow::exit(const std::string &work_id, const std::string &object, const std::string StackFlow::_rpc_pause(pzmq *_pzmq, const std::shared_ptr &data) { - - - auto _data = data->string(); - event_queue_.enqueue(EVENT_PAUSE, RPC_PARSE_TO_PARAM(_data)); + event_queue_.enqueue(EVENT_PAUSE, std::make_shared(data->get_param(0), data->get_param(1))); return std::string("None"); } @@ -443,16 +423,13 @@ void StackFlow::pause(const std::string &work_id, const std::string &object, con std::string StackFlow::_rpc_taskinfo(pzmq *_pzmq, const std::shared_ptr &data) { - - - auto _data = data->string(); - event_queue_.enqueue(EVENT_TASKINFO, RPC_PARSE_TO_PARAM(_data)); + event_queue_.enqueue(EVENT_TASKINFO, std::make_shared(data->get_param(0), data->get_param(1))); return std::string("None"); } void StackFlow::taskinfo(const std::string &zmq_url, const std::string &raw) { - SLOGI("StackFlow::taskinfo raw"); + // SLOGI("StackFlow::taskinfo raw"); std::string work_id = sample_json_str_get(raw, "work_id"); try { auto task_channel = get_channel(sample_get_work_id_num(work_id)); @@ -464,7 +441,7 @@ void StackFlow::taskinfo(const std::string &zmq_url, const std::string &raw) void StackFlow::taskinfo(const std::string &work_id, const std::string 
&object, const std::string &data) { - SLOGI("StackFlow::taskinfo"); + // SLOGI("StackFlow::taskinfo"); if (_taskinfo_) { _taskinfo_(work_id, object, data); return; @@ -478,13 +455,17 @@ void StackFlow::taskinfo(const std::string &work_id, const std::string &object, int StackFlow::sys_register_unit(const std::string &unit_name) { int work_id_number; - std::string component_msg = unit_call("sys", "register_unit", unit_name); - std::string str_port = RPC_PARSE_TO_FIRST(component_msg); - work_id_number = std::stoi(str_port); - std::string tmp_buf = RPC_PARSE_TO_SECOND(component_msg); - std::string out_port = RPC_PARSE_TO_FIRST(tmp_buf); - std::string inference_port = RPC_PARSE_TO_SECOND(tmp_buf); - + std::string str_port; + std::string out_port; + std::string inference_port; + + unit_call("sys", "register_unit", unit_name, [&](const std::shared_ptr &pzmg_msg) { + str_port = pzmg_msg->get_param(1); + out_port = pzmg_msg->get_param(0, str_port); + inference_port = pzmg_msg->get_param(1, str_port); + str_port = pzmg_msg->get_param(0); + }); + work_id_number = std::stoi(str_port); SLOGI("work_id_number:%d, out_port:%s, inference_port:%s ", work_id_number, out_port.c_str(), inference_port.c_str()); llm_task_channel_[work_id_number] = std::make_shared(out_port, inference_port, unit_name_); @@ -527,20 +508,23 @@ void StackFlow::sys_sql_unset(const std::string &key) unit_call("sys", "sql_unset", key); } -void StackFlow::_repeat_loop(const std::string &action, const std::string &ms) +void StackFlow::_repeat_loop(const std::shared_ptr &arg) { + std::shared_ptr originalPtr = std::static_pointer_cast(arg); + std::string action = originalPtr->string(0); + int ms = originalPtr->integer(0); repeat_callback_fun_mutex_.lock(); const auto call_fun = repeat_callback_fun_[action]; repeat_callback_fun_mutex_.unlock(); if (call_fun()) { - int delayms = std::stoi(ms); + int delayms = ms; if (delayms) std::thread([this, action, delayms, ms]() { 
std::this_thread::sleep_for(std::chrono::milliseconds(delayms)); - this->event_queue_.enqueue(EVENT_REPEAT_EVENT, action, ms); + this->event_queue_.enqueue(EVENT_REPEAT_EVENT, std::make_shared(action, ms)); }).detach(); else { - event_queue_.enqueue(EVENT_REPEAT_EVENT, action, ms); + event_queue_.enqueue(EVENT_REPEAT_EVENT, std::make_shared(action, ms)); } } else { repeat_callback_fun_mutex_.lock(); @@ -558,9 +542,9 @@ void StackFlow::repeat_event(int ms, std::function repeat_fun, bool n if (!now) std::thread([this, action, ms]() { std::this_thread::sleep_for(std::chrono::milliseconds(ms)); - this->event_queue_.enqueue(EVENT_REPEAT_EVENT, action, std::to_string(ms)); + this->event_queue_.enqueue(EVENT_REPEAT_EVENT, std::make_shared(action, ms)); }).detach(); else { - event_queue_.enqueue(EVENT_REPEAT_EVENT, action, std::to_string(ms)); + event_queue_.enqueue(EVENT_REPEAT_EVENT, std::make_shared(action, ms)); } } diff --git a/ext_components/StackFlow/stackflow/StackFlow.h b/ext_components/StackFlow/stackflow/StackFlow.h index 6827753..25ac674 100644 --- a/ext_components/StackFlow/stackflow/StackFlow.h +++ b/ext_components/StackFlow/stackflow/StackFlow.h @@ -186,11 +186,34 @@ class llm_channel_obj { }; class stackflow_data { - union { - std::string *rawobj; - std::string *object; - }; - std::string *data; +public: + stackflow_data() + { + } + stackflow_data(const std::string &_data1) + { + str_data[0] = _data1; + } + stackflow_data(const std::string &_data1, const std::string &_data2) + { + str_data[0] = _data1; + str_data[1] = _data2; + } + stackflow_data(const std::string &_data1, int _data2) + { + str_data[0] = _data1; + int_data[0] = _data2; + } + std::string string(int index = 0) + { + return str_data[index]; + } + int integer(int index = 0) + { + return int_data[index]; + } + std::string str_data[2]; + int int_data[2]; }; class StackFlow { @@ -217,7 +240,7 @@ class StackFlow { EVENT_EXPORT, } local_event_t; - eventpp::EventQueue event_queue_; + 
eventpp::EventQueue &)> event_queue_; std::unique_ptr even_loop_thread_; std::unique_ptr rpc_ctx_; std::atomic status_; @@ -225,7 +248,7 @@ class StackFlow { std::unordered_map> repeat_callback_fun_; std::mutex repeat_callback_fun_mutex_; - void _repeat_loop(const std::string &zmq_url, const std::string &raw); + void _repeat_loop(const std::shared_ptr &arg); public: std::string request_id_; @@ -244,7 +267,7 @@ class StackFlow { StackFlow(const std::string &unit_name); void even_loop(); - void _none_event(const std::string &data1, const std::string &data2); + void _none_event(const std::shared_ptr &arg); template std::shared_ptr get_channel(T workid) @@ -261,8 +284,11 @@ class StackFlow { } std::string _rpc_setup(pzmq *_pzmq, const std::shared_ptr &data); - void _setup(const std::string &zmq_url, const std::string &data) + void _setup(const std::shared_ptr &arg) { + std::shared_ptr originalPtr = std::static_pointer_cast(arg); + std::string zmq_url = originalPtr->string(0); + std::string data = originalPtr->string(1); // printf("void _setup run \n"); request_id_ = sample_json_str_get(data, "request_id"); out_zmq_url_ = zmq_url; @@ -272,8 +298,11 @@ class StackFlow { virtual int setup(const std::string &work_id, const std::string &object, const std::string &data); std::string _rpc_link(pzmq *_pzmq, const std::shared_ptr &data); - void _link(const std::string &zmq_url, const std::string &data) + void _link(const std::shared_ptr &arg) { + std::shared_ptr originalPtr = std::static_pointer_cast(arg); + std::string zmq_url = originalPtr->string(0); + std::string data = originalPtr->string(1); // printf("void _link run \n"); request_id_ = sample_json_str_get(data, "request_id"); out_zmq_url_ = zmq_url; @@ -283,8 +312,11 @@ class StackFlow { virtual void link(const std::string &work_id, const std::string &object, const std::string &data); std::string _rpc_unlink(pzmq *_pzmq, const std::shared_ptr &data); - void _unlink(const std::string &zmq_url, const std::string &data) + 
void _unlink(const std::shared_ptr &arg) { + std::shared_ptr originalPtr = std::static_pointer_cast(arg); + std::string zmq_url = originalPtr->string(0); + std::string data = originalPtr->string(1); // printf("void _unlink run \n"); request_id_ = sample_json_str_get(data, "request_id"); out_zmq_url_ = zmq_url; @@ -294,46 +326,58 @@ class StackFlow { virtual void unlink(const std::string &work_id, const std::string &object, const std::string &data); std::string _rpc_exit(pzmq *_pzmq, const std::shared_ptr &data); - void _exit(const std::string &zmq_url, const std::string &data) + void _exit(const std::shared_ptr &arg) { - request_id_ = sample_json_str_get(data, "request_id"); - out_zmq_url_ = zmq_url; + std::shared_ptr originalPtr = std::static_pointer_cast(arg); + std::string zmq_url = originalPtr->string(0); + std::string data = originalPtr->string(1); + request_id_ = sample_json_str_get(data, "request_id"); + out_zmq_url_ = zmq_url; if (status_.load()) exit(zmq_url, data); } virtual int exit(const std::string &zmq_url, const std::string &raw); virtual int exit(const std::string &work_id, const std::string &object, const std::string &data); std::string _rpc_work(pzmq *_pzmq, const std::shared_ptr &data); - void _work(const std::string &zmq_url, const std::string &data) + void _work(const std::shared_ptr &arg) { - request_id_ = sample_json_str_get(data, "request_id"); - out_zmq_url_ = zmq_url; + std::shared_ptr originalPtr = std::static_pointer_cast(arg); + std::string zmq_url = originalPtr->string(0); + std::string data = originalPtr->string(1); + request_id_ = sample_json_str_get(data, "request_id"); + out_zmq_url_ = zmq_url; if (status_.load()) work(zmq_url, data); } virtual void work(const std::string &zmq_url, const std::string &raw); virtual void work(const std::string &work_id, const std::string &object, const std::string &data); std::string _rpc_pause(pzmq *_pzmq, const std::shared_ptr &data); - void _pause(const std::string &zmq_url, const std::string 
&data) + void _pause(const std::shared_ptr &arg) { - request_id_ = sample_json_str_get(data, "request_id"); - out_zmq_url_ = zmq_url; + std::shared_ptr originalPtr = std::static_pointer_cast(arg); + std::string zmq_url = originalPtr->string(0); + std::string data = originalPtr->string(1); + request_id_ = sample_json_str_get(data, "request_id"); + out_zmq_url_ = zmq_url; if (status_.load()) pause(zmq_url, data); } virtual void pause(const std::string &zmq_url, const std::string &raw); virtual void pause(const std::string &work_id, const std::string &object, const std::string &data); std::string _rpc_taskinfo(pzmq *_pzmq, const std::shared_ptr &data); - void _taskinfo(const std::string &zmq_url, const std::string &data) + void _taskinfo(const std::shared_ptr &arg) { - request_id_ = sample_json_str_get(data, "request_id"); - out_zmq_url_ = zmq_url; + std::shared_ptr originalPtr = std::static_pointer_cast(arg); + std::string zmq_url = originalPtr->string(0); + std::string data = originalPtr->string(1); + request_id_ = sample_json_str_get(data, "request_id"); + out_zmq_url_ = zmq_url; if (status_.load()) taskinfo(zmq_url, data); } virtual void taskinfo(const std::string &zmq_url, const std::string &raw); virtual void taskinfo(const std::string &work_id, const std::string &object, const std::string &data); - void _sys_init(const std::string &zmq_url, const std::string &data); + void _sys_init(const std::shared_ptr &arg); void user_output(const std::string &zmq_url, const std::string &request_id, const std::string &data); template diff --git a/ext_components/StackFlow/stackflow/StackFlowUtil.cpp b/ext_components/StackFlow/stackflow/StackFlowUtil.cpp index 8225b8b..7da3354 100644 --- a/ext_components/StackFlow/stackflow/StackFlowUtil.cpp +++ b/ext_components/StackFlow/stackflow/StackFlowUtil.cpp @@ -362,6 +362,13 @@ std::string StackFlows::unit_call(const std::string &unit_name, const std::strin return value; } +void StackFlows::unit_call(const std::string &unit_name, 
const std::string &unit_action, const std::string &data, std::function &)> callback) +{ + std::string value; + StackFlows::pzmq _call(unit_name); + _call.call_rpc_action(unit_action, data, [callback](StackFlows::pzmq *_pzmq, const std::shared_ptr &raw) { callback(raw); }); +} + std::list StackFlows::get_config_file_paths(std::string &base_model_path, std::string &base_model_config_path, const std::string &mode_name) diff --git a/ext_components/StackFlow/stackflow/StackFlowUtil.h b/ext_components/StackFlow/stackflow/StackFlowUtil.h index 9b3df86..ed86719 100644 --- a/ext_components/StackFlow/stackflow/StackFlowUtil.h +++ b/ext_components/StackFlow/stackflow/StackFlowUtil.h @@ -9,6 +9,9 @@ #include #include #include +#include +#include "pzmq.hpp" +#include #define WORK_ID_NONE -100 #define RPC_PUSH_PARAM(_obj, _data1, _data2) \ @@ -33,6 +36,7 @@ bool decode_stream(const std::string &in, std::string &out, std::unordered_map &)> callback); std::list get_config_file_paths(std::string &base_model_path, std::string &base_model_config_path, const std::string &mode_name); std::vector glob_files(const std::vector &patterns); bool file_exists(const std::string& filePath); diff --git a/ext_components/StackFlow/stackflow/pzmq.hpp b/ext_components/StackFlow/stackflow/pzmq.hpp index c70a692..5d51ac3 100644 --- a/ext_components/StackFlow/stackflow/pzmq.hpp +++ b/ext_components/StackFlow/stackflow/pzmq.hpp @@ -50,6 +50,33 @@ class pzmq_data { { return &msg; } + + std::string get_param(int index, const std::string &idata = "") + { + const char *data = NULL; + int size = 0; + if (idata.length() > 0) { + data = idata.c_str(); + size = idata.length(); + } else { + data = (const char *)zmq_msg_data(&msg); + size = zmq_msg_size(&msg); + } + + if ((index % 2) == 0) { + return std::string((const char *)(data + 1), data[0]); + } else { + return std::string((const char *)(data + data[0] + 1), zmq_msg_size(&msg) - data[0] - 1); + } + } + + static std::string set_param(std::string param0, 
std::string param1) + { + std::string data = " " + param0 + param1; + data[0] = param0.length(); + return data; + } + ~pzmq_data() { zmq_msg_close(&msg); diff --git a/projects/llm_framework/main_asr/src/main.cpp b/projects/llm_framework/main_asr/src/main.cpp index ebe503b..c3bd64f 100644 --- a/projects/llm_framework/main_asr/src/main.cpp +++ b/projects/llm_framework/main_asr/src/main.cpp @@ -278,8 +278,7 @@ class llm_asr : public StackFlow { llm_asr() : StackFlow("asr") { task_count_ = 1; - event_queue_.appendListener( - EVENT_TASK_PAUSE, std::bind(&llm_asr::_task_pause, this, std::placeholders::_1, std::placeholders::_2)); + event_queue_.appendListener(EVENT_TASK_PAUSE, std::bind(&llm_asr::_task_pause, this, std::placeholders::_1)); } void task_output(const std::weak_ptr llm_task_obj_weak, @@ -396,9 +395,10 @@ class llm_asr : public StackFlow { llm_task_obj->sys_pcm_on_data((*next_data)); } - void _task_pause(const std::string &work_id, const std::string &data) + void _task_pause(const std::shared_ptr &arg) { - int work_id_num = sample_get_work_id_num(work_id); + std::shared_ptr work_id = std::static_pointer_cast(arg); + int work_id_num = sample_get_work_id_num(*work_id); if (llm_task_.find(work_id_num) == llm_task_.end()) { return; } @@ -412,7 +412,7 @@ class llm_asr : public StackFlow { void task_pause(const std::string &work_id, const std::string &data) { - event_queue_.enqueue(EVENT_TASK_PAUSE, work_id, ""); + event_queue_.enqueue(EVENT_TASK_PAUSE, std::make_shared(work_id)); } void task_work(const std::weak_ptr llm_task_obj_weak, @@ -515,9 +515,10 @@ class llm_asr : public StackFlow { if (input.find("sys") != std::string::npos) { audio_url_ = unit_call("audio", "cap", input); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); - }); + llm_channel->subscriber(audio_url_, + [_llm_task_obj](pzmq *_pzmq, const 
std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); + }); llm_task_obj->audio_flage_ = true; } else if (input.find("asr") != std::string::npos) { llm_channel->subscriber_work_id( diff --git a/projects/llm_framework/main_audio/src/main.cpp b/projects/llm_framework/main_audio/src/main.cpp index f12d3af..80da841 100644 --- a/projects/llm_framework/main_audio/src/main.cpp +++ b/projects/llm_framework/main_audio/src/main.cpp @@ -48,14 +48,15 @@ class llm_audio : public StackFlow { self->pub_ctx_->send_data((const char *)data, size); } - void hw_queue_play(const std::string &audio_data, const std::string &None) + void hw_queue_play(const std::shared_ptr &arg) { if (audio_clear_flage_) { return; } + std::shared_ptr originalPtr = std::static_pointer_cast(arg); std::lock_guard guard(ax_play_mtx); ax_play(play_config.card, play_config.device, play_config.volume, play_config.channel, play_config.rate, - play_config.bit, audio_data.c_str(), audio_data.length()); + play_config.bit, originalPtr->data(), originalPtr->size()); } void hw_play(const std::string &audio_data) @@ -109,8 +110,8 @@ class llm_audio : public StackFlow { public: llm_audio() : StackFlow("audio") { - event_queue_.appendListener( - EVENT_QUEUE_PLAY, std::bind(&llm_audio::hw_queue_play, this, std::placeholders::_1, std::placeholders::_2)); + event_queue_.appendListener(EVENT_QUEUE_PLAY, + std::bind(&llm_audio::hw_queue_play, this, std::placeholders::_1)); setup("", "audio.play", "{\"None\":\"None\"}"); setup("", "audio.cap", "{\"None\":\"None\"}"); self = this; @@ -391,9 +392,8 @@ class llm_audio : public StackFlow { std::string play(pzmq *_pzmq, const std::shared_ptr &rawdata) { - auto _rawdata = rawdata->string(); - std::string zmq_url = RPC_PARSE_TO_FIRST(_rawdata); - std::string audio_json = RPC_PARSE_TO_SECOND(_rawdata); + std::string zmq_url = rawdata->get_param(0); + std::string audio_json = rawdata->get_param(1); std::string ret_val = 
parse_data(sample_json_str_get(audio_json, "object"), sample_json_str_get(audio_json, "data")); request_id_ = sample_json_str_get(audio_json, "request_id"); @@ -412,7 +412,7 @@ class llm_audio : public StackFlow { std::string enqueue_play(pzmq *_pzmq, const std::shared_ptr &rawdata) { audio_clear_flage_ = false; - event_queue_.enqueue(EVENT_QUEUE_PLAY, rawdata->string(), ""); + event_queue_.enqueue(EVENT_QUEUE_PLAY, rawdata); return LLM_NONE; } diff --git a/projects/llm_framework/main_camera/src/main.cpp b/projects/llm_framework/main_camera/src/main.cpp index c2c97f2..dad2699 100644 --- a/projects/llm_framework/main_camera/src/main.cpp +++ b/projects/llm_framework/main_camera/src/main.cpp @@ -594,8 +594,8 @@ class llm_camera : public StackFlow { { auto _rawdata = rawdata->string(); nlohmann::json req_body; - std::string zmq_url = RPC_PARSE_TO_FIRST(_rawdata); - std::string param_json = RPC_PARSE_TO_SECOND(_rawdata); + std::string zmq_url = rawdata->get_param(0); + std::string param_json = rawdata->get_param(1); std::vector devices; glob_t glob_result; glob("/dev/video*", GLOB_TILDE, NULL, &glob_result); diff --git a/projects/llm_framework/main_sys/src/remote_action.cpp b/projects/llm_framework/main_sys/src/remote_action.cpp index 6121f69..a6818c8 100644 --- a/projects/llm_framework/main_sys/src/remote_action.cpp +++ b/projects/llm_framework/main_sys/src/remote_action.cpp @@ -20,12 +20,10 @@ int remote_call(int com_id, const std::string &json_str) std::string work_unit = work_id.substr(0, work_id.find(".")); std::string action = sample_json_str_get(json_str, "action"); char com_url[256]; - int length = snprintf(com_url, 255, zmq_c_format.c_str(), com_id); - std::string send_data; - std::string com_urls(com_url); - RPC_PUSH_PARAM(send_data, com_urls, json_str); + snprintf(com_url, 255, zmq_c_format.c_str(), com_id); pzmq clent(work_unit); - return clent.call_rpc_action(action, send_data, [](pzmq *_pzmq, const std::shared_ptr &val) {}); + return 
clent.call_rpc_action(action, pzmq_data::set_param(com_url, json_str), + [](pzmq *_pzmq, const std::shared_ptr &val) {}); } void remote_action_work() diff --git a/projects/llm_framework/main_sys/src/remote_server.cpp b/projects/llm_framework/main_sys/src/remote_server.cpp index 48ea242..64d65db 100644 --- a/projects/llm_framework/main_sys/src/remote_server.cpp +++ b/projects/llm_framework/main_sys/src/remote_server.cpp @@ -174,12 +174,8 @@ int c_sys_release_unit(char const *unit) std::string rpc_allocate_unit(pzmq *_pzmq, const std::shared_ptr &raw) { unit_data *unit_info = sys_allocate_unit(raw->string()); - std::string send_data; - std::string send_data1; - std::string str_port = std::to_string(unit_info->port_); - RPC_PUSH_PARAM(send_data1, unit_info->output_url, unit_info->inference_url); - RPC_PUSH_PARAM(send_data, str_port, send_data1); - return send_data; + return pzmq_data::set_param(std::to_string(unit_info->port_), + pzmq_data::set_param(unit_info->output_url, unit_info->inference_url)); } std::string rpc_release_unit(pzmq *_pzmq, const std::shared_ptr &raw) diff --git a/projects/llm_framework/main_vad/src/main.cpp b/projects/llm_framework/main_vad/src/main.cpp index 3c737bc..3bd53f3 100644 --- a/projects/llm_framework/main_vad/src/main.cpp +++ b/projects/llm_framework/main_vad/src/main.cpp @@ -244,8 +244,7 @@ class llm_vad : public StackFlow { llm_vad() : StackFlow("vad") { task_count_ = 1; - event_queue_.appendListener( - EVENT_TASK_PAUSE, std::bind(&llm_vad::_task_pause, this, std::placeholders::_1, std::placeholders::_2)); + event_queue_.appendListener(EVENT_TASK_PAUSE, std::bind(&llm_vad::_task_pause, this, std::placeholders::_1)); } void task_output(const std::weak_ptr llm_task_obj_weak, @@ -306,9 +305,10 @@ class llm_vad : public StackFlow { llm_task_obj->sys_pcm_on_data((*next_data)); } - void _task_pause(const std::string &work_id, const std::string &data) + void _task_pause(const std::shared_ptr &arg) { - int work_id_num = 
sample_get_work_id_num(work_id); + std::shared_ptr work_id = std::static_pointer_cast(arg); + int work_id_num = sample_get_work_id_num(*work_id); if (llm_task_.find(work_id_num) == llm_task_.end()) { return; } @@ -322,7 +322,7 @@ class llm_vad : public StackFlow { void task_pause(const std::string &work_id, const std::string &data) { - event_queue_.enqueue(EVENT_TASK_PAUSE, work_id, ""); + event_queue_.enqueue(EVENT_TASK_PAUSE, std::make_shared(work_id)); } void task_work(const std::weak_ptr llm_task_obj_weak, @@ -423,9 +423,10 @@ class llm_vad : public StackFlow { if (input.find("sys") != std::string::npos) { audio_url_ = unit_call("audio", "cap", "None"); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); - }); + llm_channel->subscriber(audio_url_, + [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); + }); llm_task_obj->audio_flage_ = true; } else if (input.find("vad") != std::string::npos) { llm_channel->subscriber_work_id( diff --git a/projects/llm_framework/main_whisper/src/main.cpp b/projects/llm_framework/main_whisper/src/main.cpp index 69d1799..cc3aaa9 100644 --- a/projects/llm_framework/main_whisper/src/main.cpp +++ b/projects/llm_framework/main_whisper/src/main.cpp @@ -595,8 +595,8 @@ class llm_whisper : public StackFlow { llm_whisper() : StackFlow("whisper") { task_count_ = 1; - event_queue_.appendListener( - EVENT_TASK_PAUSE, std::bind(&llm_whisper::_task_pause, this, std::placeholders::_1, std::placeholders::_2)); + event_queue_.appendListener(EVENT_TASK_PAUSE, + std::bind(&llm_whisper::_task_pause, this, std::placeholders::_1)); } void task_output(const std::weak_ptr llm_task_obj_weak, @@ -713,9 +713,10 @@ class llm_whisper : public StackFlow { llm_task_obj->sys_pcm_on_data((*next_data)); } - void _task_pause(const std::string &work_id, const 
std::string &data) + void _task_pause(const std::shared_ptr &arg) { - int work_id_num = sample_get_work_id_num(work_id); + std::shared_ptr work_id = std::static_pointer_cast(arg); + int work_id_num = sample_get_work_id_num(*work_id); if (llm_task_.find(work_id_num) == llm_task_.end()) { return; } @@ -729,7 +730,7 @@ class llm_whisper : public StackFlow { void task_pause(const std::string &work_id, const std::string &data) { - event_queue_.enqueue(EVENT_TASK_PAUSE, work_id, ""); + event_queue_.enqueue(EVENT_TASK_PAUSE, std::make_shared(work_id)); } void task_work(const std::weak_ptr llm_task_obj_weak, @@ -846,9 +847,10 @@ class llm_whisper : public StackFlow { if (input.find("sys") != std::string::npos) { audio_url_ = unit_call("audio", "cap", input); std::weak_ptr _llm_task_obj = llm_task_obj; - llm_channel->subscriber(audio_url_, [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { - _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); - }); + llm_channel->subscriber(audio_url_, + [_llm_task_obj](pzmq *_pzmq, const std::shared_ptr &raw) { + _llm_task_obj.lock()->sys_pcm_on_data(raw->string()); + }); llm_task_obj->audio_flage_ = true; } else if (input.find("whisper") != std::string::npos) { if (input.find("stream.base64") != std::string::npos) llm_task_obj->delay_audio_frame_ = 0; From e3c70bc1273fb9ac668650ba80e34f27a6cfc9a2 Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Tue, 6 May 2025 16:48:11 +0800 Subject: [PATCH 53/64] Implement Sola algorithm for smoother audio transitions Apply the Synchronized Overlap-Add (SOLA) algorithm to smooth the connection between audio segments output by the decoder, resulting in more natural-sounding transitions between segments. 
--- .../llm_framework/main_melotts/src/main.cpp | 67 +++-- .../main_melotts/src/runner/Lexicon.hpp | 1 + .../main_melotts/src/runner/SolaProcessor.h | 269 ++++++++++++++++++ 3 files changed, 315 insertions(+), 22 deletions(-) create mode 100644 projects/llm_framework/main_melotts/src/runner/SolaProcessor.h diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp index b5a27cb..610362a 100644 --- a/projects/llm_framework/main_melotts/src/main.cpp +++ b/projects/llm_framework/main_melotts/src/main.cpp @@ -9,6 +9,7 @@ #include "Lexicon.hpp" #include #include "AudioFile.h" +#include "SolaProcessor.h" #include "Lexicon.hpp" #include @@ -263,49 +264,71 @@ class llm_task { auto encoder_output = encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w, mode_config_.get_length_scale(), mode_config_.sdp_ratio); - float *zp_data = encoder_output.at(0).GetTensorMutableData(); - int audio_len = encoder_output.at(2).GetTensorMutableData()[0]; - auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo(); - auto zp_shape = zp_info.GetShape(); - int zp_size = decoder_->GetInputSize(0) / sizeof(float); - int dec_len = zp_size / zp_shape[1]; - int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float); - std::vector decoder_output(audio_slice_len); - int dec_slice_num = int(std::ceil(zp_shape[2] * 1.0 / dec_len)); + float *zp_data = encoder_output.at(0).GetTensorMutableData(); + int audio_len = encoder_output.at(2).GetTensorMutableData()[0]; + auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo(); + auto zp_shape = zp_info.GetShape(); + + // Decoder parameters setup + int zp_size = decoder_->GetInputSize(0) / sizeof(float); + int dec_len = zp_size / zp_shape[1]; + int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float); + const int pad_frames = 16; + const int samples_per_frame = 512; + const int effective_frames = dec_len - 2 * pad_frames; + int 
dec_slice_num = + static_cast(std::ceil(static_cast(zp_shape[2]) / static_cast(effective_frames))); + SolaProcessor sola(pad_frames, samples_per_frame); std::vector pcmlist; + for (int i = 0; i < dec_slice_num; i++) { + int input_start = i * effective_frames; + if (i > 0) { + input_start -= pad_frames; + } + input_start = std::max(0, input_start); + int actual_len = std::min(dec_len, static_cast(zp_shape[2] - input_start)); std::vector zp(zp_size, 0); - int actual_size = (i + 1) * dec_len < zp_shape[2] ? dec_len : zp_shape[2] - i * dec_len; + for (int n = 0; n < zp_shape[1]; n++) { - memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + i * dec_len, - sizeof(float) * actual_size); + int copy_size = std::min(actual_len, static_cast(zp_shape[2] - input_start)); + if (copy_size > 0) { + memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + input_start, + sizeof(float) * copy_size); + } } + // Run decoder + std::vector decoder_output(audio_slice_len); decoder_->SetInput(zp.data(), 0); decoder_->SetInput(g_matrix.data(), 1); if (0 != decoder_->Run()) { - printf("Run decoder model failed!\n"); throw std::string("decoder_ RunSync error"); } decoder_->GetOutput(decoder_output.data(), 0); - actual_size = (i + 1) * audio_slice_len < audio_len ? 
audio_slice_len : audio_len - i * audio_slice_len; - if (decoder_output.size() > actual_size) { - pcmlist.reserve(pcmlist.size() + actual_size); - std::copy(decoder_output.begin(), decoder_output.begin() + actual_size, - std::back_inserter(pcmlist)); - } else { - pcmlist.reserve(pcmlist.size() + decoder_output.size()); - std::copy(decoder_output.begin(), decoder_output.end(), std::back_inserter(pcmlist)); - } + std::vector processed_output = sola.ProcessFrame(decoder_output, i, dec_slice_num, actual_len); + + pcmlist.insert(pcmlist.end(), processed_output.begin(), processed_output.end()); } + double src_ratio = (mode_config_.audio_rate * 1.0f) / (mode_config_.mode_rate * 1.0f); std::vector tmp_pcm((pcmlist.size() * src_ratio + 1)); int len; resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio); + + // Convert to 16-bit PCM + wav_pcm_data.reserve(len); std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data), [](const auto val) { return (int16_t)(val * INT16_MAX); }); + + // Call callback function with output if (out_callback_) out_callback_(std::string((char *)wav_pcm_data.data(), wav_pcm_data.size() * sizeof(int16_t)), finish); + + } catch (const std::exception &e) { + SLOGI("TTS processing exception: %s", e.what()); + return true; } catch (...) 
{ + SLOGI("TTS processing encountered unknown exception"); return true; } return false; diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp index 242fb15..242e9e0 100644 --- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp +++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp @@ -32,6 +32,7 @@ class Lexicon { public: Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0) { + SLOGI("词典加载: %zu 发音表加载: %zu", tokens_filename, lexicon_filename); std::unordered_map tokens; std::ifstream ifs(tokens_filename); assert(ifs.is_open()); diff --git a/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h b/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h new file mode 100644 index 0000000..a2286bb --- /dev/null +++ b/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h @@ -0,0 +1,269 @@ +#ifndef SOLA_PROCESSOR_H +#define SOLA_PROCESSOR_H + +#include +#include +#include +#include +#include + +/** + * SolaProcessor - Synchronous Overlap-Add method for audio frame processing + * + * This class provides functionality for smoothly concatenating audio frames + * using the SOLA algorithm, which finds optimal alignment points between + * consecutive frames and applies crossfading for smooth transitions. 
+ */ +class SolaProcessor { +public: + /** + * Constructor + * + * @param padFrames Number of padding frames at the beginning and end + * @param samplesPerFrame Number of audio samples in each frame + */ + SolaProcessor(int padFrames, int samplesPerFrame) + : pad_frames_(padFrames), samples_per_frame_(samplesPerFrame), first_frame_(true) + { + Initialize(); + } + + /** + * Reset the processor to its initial state + */ + void Reset() + { + first_frame_ = true; + std::fill(sola_buffer_.begin(), sola_buffer_.end(), 0.0f); + } + + /** + * Process a single audio frame + * + * @param decoder_output Raw audio data from decoder + * @param frameIndex Current frame index + * @param totalFrames Total number of frames + * @param actualFrameLen Actual length of the frame + * @return Processed audio samples + */ + std::vector ProcessFrame(const std::vector& decoder_output, int frameIndex, int totalFrames, + int actualFrameLen) + { + std::vector processed_output; + + if (first_frame_) { + // Special handling for the first frame + ProcessFirstFrame(decoder_output, processed_output, actualFrameLen); + first_frame_ = false; + } else { + // Process subsequent frames with SOLA algorithm + ProcessSubsequentFrame(decoder_output, processed_output, frameIndex, totalFrames, actualFrameLen); + } + + return processed_output; + } + +private: + /** + * Initialize the SOLA processor parameters and buffers + */ + void Initialize() + { + // Calculate SOLA parameters + sola_buffer_frame_ = pad_frames_ * samples_per_frame_; + sola_search_frame_ = pad_frames_ * samples_per_frame_; + effective_frames_ = 0; // Will be set during frame processing + + // Create fade-in and fade-out windows + fade_in_window_.resize(sola_buffer_frame_); + fade_out_window_.resize(sola_buffer_frame_); + + for (int i = 0; i < sola_buffer_frame_; i++) { + fade_in_window_[i] = static_cast(i) / sola_buffer_frame_; + fade_out_window_[i] = 1.0f - fade_in_window_[i]; + } + + // Initialize SOLA buffer + 
sola_buffer_.resize(sola_buffer_frame_, 0.0f); + } + + /** + * Process the first audio frame + * + * @param decoder_output Raw audio data from decoder + * @param processed_output Output buffer for processed audio + * @param actualFrameLen Actual length of the frame + */ + void ProcessFirstFrame(const std::vector& decoder_output, std::vector& processed_output, + int actualFrameLen) + { + int audio_start = pad_frames_ * samples_per_frame_; + int audio_len = (actualFrameLen - 2 * pad_frames_) * samples_per_frame_; + + // Boundary check + audio_len = std::min(audio_len, static_cast(decoder_output.size() - audio_start)); + + // Add first frame data to output + processed_output.insert(processed_output.end(), decoder_output.begin() + audio_start, + decoder_output.begin() + audio_start + audio_len); + + // Save the end part to SOLA buffer for next frame alignment + int buffer_start = audio_start + audio_len; + if (buffer_start + sola_buffer_frame_ <= decoder_output.size()) { + std::copy(decoder_output.begin() + buffer_start, decoder_output.begin() + buffer_start + sola_buffer_frame_, + sola_buffer_.begin()); + } + } + + /** + * Process subsequent audio frames using SOLA algorithm + * + * @param decoder_output Raw audio data from decoder + * @param processed_output Output buffer for processed audio + * @param frameIndex Current frame index + * @param totalFrames Total number of frames + * @param actualFrameLen Actual length of the frame + */ + void ProcessSubsequentFrame(const std::vector& decoder_output, std::vector& processed_output, + int frameIndex, int totalFrames, int actualFrameLen) + { + int audio_start = pad_frames_ * samples_per_frame_; + + // 1. Prepare search window + std::vector search_window(sola_buffer_frame_ + sola_search_frame_); + std::copy(decoder_output.begin() + audio_start, decoder_output.begin() + audio_start + search_window.size(), + search_window.begin()); + + // 2. 
Find best alignment point (compute cross-correlation) + int best_offset = FindBestOffset(search_window); + + // 3. Apply alignment offset + int aligned_start = audio_start + best_offset; + + // 4. Create smooth transition + std::vector crossfade_region = CreateCrossfade(decoder_output, aligned_start); + + // 5. Add crossfade region to output + processed_output.insert(processed_output.end(), crossfade_region.begin(), crossfade_region.end()); + + // 6. Add remaining valid audio data + AddRemainingAudio(decoder_output, processed_output, aligned_start, frameIndex, totalFrames, actualFrameLen); + } + + /** + * Find the best alignment offset using normalized cross-correlation + * + * @param search_window Window of audio samples to search in + * @return Optimal offset for alignment + */ + int FindBestOffset(const std::vector& search_window) + { + int best_offset = 0; + float best_correlation = -1.0f; + + for (int offset = 0; offset <= sola_search_frame_; offset++) { + float correlation = 0.0f; + float energy = 0.0f; + + for (int j = 0; j < sola_buffer_frame_; j++) { + correlation += sola_buffer_[j] * search_window[j + offset]; + energy += search_window[j + offset] * search_window[j + offset]; + } + + // Normalize correlation + float normalized_correlation = (energy > 1e-8) ? 
correlation / std::sqrt(energy) : 0.0f; + + if (normalized_correlation > best_correlation) { + best_correlation = normalized_correlation; + best_offset = offset; + } + } + + return best_offset; + } + + /** + * Create crossfade transition region + * + * @param decoder_output Raw audio data from decoder + * @param aligned_start Starting point after alignment + * @return Crossfaded audio samples + */ + std::vector CreateCrossfade(const std::vector& decoder_output, int aligned_start) + { + std::vector crossfade_region(sola_buffer_frame_); + + for (int j = 0; j < sola_buffer_frame_; j++) { + // Apply fade-in and fade-out window functions + crossfade_region[j] = + decoder_output[aligned_start + j] * fade_in_window_[j] + sola_buffer_[j] * fade_out_window_[j]; + } + + return crossfade_region; + } + + /** + * Add remaining audio data and update buffer + * + * @param decoder_output Raw audio data from decoder + * @param processed_output Output buffer for processed audio + * @param aligned_start Starting point after alignment + * @param frameIndex Current frame index + * @param totalFrames Total number of frames + * @param actualFrameLen Actual length of the frame + */ + void AddRemainingAudio(const std::vector& decoder_output, std::vector& processed_output, + int aligned_start, int frameIndex, int totalFrames, int actualFrameLen) + { + int remaining_start = aligned_start + sola_buffer_frame_; + int remaining_len = (actualFrameLen - 2 * pad_frames_) * samples_per_frame_ - sola_buffer_frame_; + + // Boundary check + remaining_len = std::min(remaining_len, static_cast(decoder_output.size() - remaining_start)); + + if (remaining_len > 0) { + processed_output.insert(processed_output.end(), decoder_output.begin() + remaining_start, + decoder_output.begin() + remaining_start + remaining_len); + } + + // Update SOLA buffer + UpdateSolaBuffer(decoder_output, remaining_start + remaining_len); + } + + /** + * Update SOLA buffer with new audio data + * + * @param decoder_output Raw 
audio data from decoder + * @param buffer_start Starting point for the new buffer data + */ + void UpdateSolaBuffer(const std::vector& decoder_output, int buffer_start) + { + // Check if there's enough data for the next buffer + if (buffer_start + sola_buffer_frame_ <= decoder_output.size()) { + std::copy(decoder_output.begin() + buffer_start, decoder_output.begin() + buffer_start + sola_buffer_frame_, + sola_buffer_.begin()); + } else { + // Fill with zeros if not enough data + int avail = static_cast(decoder_output.size() - buffer_start); + if (avail > 0) { + std::copy(decoder_output.begin() + buffer_start, decoder_output.end(), sola_buffer_.begin()); + } + std::fill(sola_buffer_.begin() + avail, sola_buffer_.end(), 0.0f); + } + } + +private: + int pad_frames_; // Number of padding frames + int samples_per_frame_; // Number of samples per frame + int effective_frames_; // Number of effective frames + int sola_buffer_frame_; // SOLA buffer length + int sola_search_frame_; // SOLA search window length + + std::vector fade_in_window_; // Fade-in window + std::vector fade_out_window_; // Fade-out window + std::vector sola_buffer_; // SOLA buffer + + bool first_frame_; // Flag for first frame processing +}; + +#endif // SOLA_PROCESSOR_H From a151affa1d2891da0e8f65ce4f176a249df97b3f Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Tue, 6 May 2025 16:50:35 +0800 Subject: [PATCH 54/64] Translate logs in Lexicon.hpp to English and add debug switch - Convert all Chinese log messages in Lexicon.hpp to English for better international compatibility\n- Add a debug flag to control whether to display g2p process logs\n- Improve code readability and debugging experience --- .../main_melotts/src/runner/Lexicon.hpp | 104 +++++++++--------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp index 242e9e0..d1bcbe9 100644 --- 
a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp +++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp @@ -1,5 +1,4 @@ #pragma once - #include #include #include @@ -9,7 +8,15 @@ #include #include #include "../../../../../SDK/components/utilities/include/sample_log.h" - +// Debug logging switch - set to true to enable debug logs +static bool DEBUG_LOGGING = false; +// Macro for debug logging +#define DEBUG_LOG(fmt, ...) \ + do { \ + if (DEBUG_LOGGING) { \ + SLOGI(fmt, ##__VA_ARGS__); \ + } \ + } while (0) std::vector split(const std::string& s, char delim) { std::vector result; @@ -30,9 +37,16 @@ class Lexicon { std::unordered_map reverse_tokens; public: + // Setter for debug logging + static void setDebugLogging(bool enable) + { + DEBUG_LOGGING = enable; + } Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0) { - SLOGI("词典加载: %zu 发音表加载: %zu", tokens_filename, lexicon_filename); + DEBUG_LOG("Dictionary loading: %s Pronunciation table loading: %s", tokens_filename.c_str(), + lexicon_filename.c_str()); + std::unordered_map tokens; std::ifstream ifs(tokens_filename); assert(ifs.is_open()); @@ -83,8 +97,10 @@ class Lexicon { lexicon["。"] = lexicon["."]; lexicon["!"] = lexicon["!"]; lexicon["?"] = lexicon["?"]; - SLOGI("词典加载完成,包含 %zu 个条目,最长词组长度: %zu", lexicon.size(), max_phrase_length); + DEBUG_LOG("Dictionary loading complete, containing %zu entries, longest phrase length: %zu", lexicon.size(), + max_phrase_length); } + std::vector splitEachChar(const std::string& text) { std::vector words; @@ -95,93 +111,77 @@ class Lexicon { if ((text[i] & 0x80) == 0x00) { // ASCII } else if ((text[i] & 0xE0) == 0xC0) { - next = 2; // 2字节UTF-8 + next = 2; // 2-byte UTF-8 } else if ((text[i] & 0xF0) == 0xE0) { - next = 3; // 3字节UTF-8 + next = 3; // 3-byte UTF-8 } else if ((text[i] & 0xF8) == 0xF0) { - next = 4; // 4字节UTF-8 + next = 4; // 4-byte UTF-8 } words.push_back(text.substr(i, next)); i += next; } return 
words; } + bool is_english(const std::string& s) { return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z')); } - bool is_english_token_char(const std::string& s) { if (s.size() != 1) return false; char c = s[0]; return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_'; } - void process_unknown_english(const std::string& word, std::vector& phones, std::vector& tones) { - SLOGI("Processing unknown term: %s", word.c_str()); - + DEBUG_LOG("Processing unknown term: %s", word.c_str()); std::string orig_word = word; std::vector parts; std::vector phonetic_parts; - size_t start = 0; while (start < word.size()) { bool matched = false; - for (size_t len = std::min(word.size() - start, (size_t)10); len > 0 && !matched; --len) { std::string sub_word = word.substr(start, len); std::string lower_sub_word = sub_word; std::transform(lower_sub_word.begin(), lower_sub_word.end(), lower_sub_word.begin(), [](unsigned char c) { return std::tolower(c); }); - if (lexicon.find(lower_sub_word) != lexicon.end()) { // Substring found in lexicon auto& [sub_phones, sub_tones] = lexicon[lower_sub_word]; phones.insert(phones.end(), sub_phones.begin(), sub_phones.end()); tones.insert(tones.end(), sub_tones.begin(), sub_tones.end()); - parts.push_back(sub_word); phonetic_parts.push_back(phonesToString(sub_phones)); - - SLOGI(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str()); - + DEBUG_LOG(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str()); start += len; matched = true; break; } } - if (!matched) { std::string single_char = word.substr(start, 1); std::string lower_char = single_char; std::transform(lower_char.begin(), lower_char.end(), lower_char.begin(), [](unsigned char c) { return std::tolower(c); }); - if (lexicon.find(lower_char) != lexicon.end()) { auto& [char_phones, char_tones] = lexicon[lower_char]; phones.insert(phones.end(), char_phones.begin(), 
char_phones.end()); tones.insert(tones.end(), char_tones.begin(), char_tones.end()); - parts.push_back(single_char); phonetic_parts.push_back(phonesToString(char_phones)); - - SLOGI(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str()); + DEBUG_LOG(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str()); } else { phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - parts.push_back(single_char); phonetic_parts.push_back("_unknown_"); - - SLOGI(" Unknown: '%s'", single_char.c_str()); + DEBUG_LOG(" Unknown: '%s'", single_char.c_str()); } - start++; } } - std::string parts_str, phonetic_str; for (size_t i = 0; i < parts.size(); i++) { if (i > 0) { @@ -191,20 +191,20 @@ class Lexicon { parts_str += parts[i]; phonetic_str += phonetic_parts[i]; } - - SLOGI("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(), phonetic_str.c_str()); + DEBUG_LOG("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(), + phonetic_str.c_str()); } + void convert(const std::string& text, std::vector& phones, std::vector& tones) { - SLOGI("\n开始处理文本: \"%s\"", text.c_str()); - SLOGI("=======匹配结果======="); - SLOGI("单元\t|\t音素\t|\t声调"); - SLOGI("-----------------------------"); + DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str()); + DEBUG_LOG("=======Matching Results======="); + DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones"); + DEBUG_LOG("-----------------------------"); phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - - SLOGI("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), - tonesToString(unknown_token.second).c_str()); + DEBUG_LOG("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), + 
tonesToString(unknown_token.second).c_str()); auto chars = splitEachChar(text); int i = 0; while (i < chars.size()) { @@ -221,8 +221,8 @@ class Lexicon { auto& [eng_phones, eng_tones] = lexicon[eng_word]; phones.insert(phones.end(), eng_phones.begin(), eng_phones.end()); tones.insert(tones.end(), eng_tones.begin(), eng_tones.end()); - SLOGI("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(), - tonesToString(eng_tones).c_str()); + DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(), + tonesToString(eng_tones).c_str()); } else { process_unknown_english(orig_word, phones, tones); } @@ -241,8 +241,8 @@ class Lexicon { auto& [phrase_phones, phrase_tones] = lexicon[phrase]; phones.insert(phones.end(), phrase_phones.begin(), phrase_phones.end()); tones.insert(tones.end(), phrase_tones.begin(), phrase_tones.end()); - SLOGI("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(), - tonesToString(phrase_tones).c_str()); + DEBUG_LOG("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(), + tonesToString(phrase_tones).c_str()); i += len; matched = true; break; @@ -264,25 +264,25 @@ class Lexicon { auto& [char_phones, char_tones] = lexicon[s]; phones.insert(phones.end(), char_phones.begin(), char_phones.end()); tones.insert(tones.end(), char_tones.begin(), char_tones.end()); - SLOGI("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(), - tonesToString(char_tones).c_str()); + DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(), + tonesToString(char_tones).c_str()); } else { phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_char.c_str(), phonesToString(unknown_token.first).c_str(), - tonesToString(unknown_token.second).c_str()); + DEBUG_LOG("%s\t|\t%s (Not matched)\t|\t%s", 
orig_char.c_str(), + phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str()); } } } phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - SLOGI("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), - tonesToString(unknown_token.second).c_str()); - SLOGI("\n处理结果汇总:"); - SLOGI("原文: %s", text.c_str()); - SLOGI("音素: %s", phonesToString(phones).c_str()); - SLOGI("声调: %s", tonesToString(tones).c_str()); - SLOGI("===================="); + DEBUG_LOG("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), + tonesToString(unknown_token.second).c_str()); + DEBUG_LOG("\nProcessing Summary:"); + DEBUG_LOG("Original text: %s", text.c_str()); + DEBUG_LOG("Phonemes: %s", phonesToString(phones).c_str()); + DEBUG_LOG("Tones: %s", tonesToString(tones).c_str()); + DEBUG_LOG("===================="); } private: From 840f739c70fb42f72d11b4e3c7c2c2d5eb79dfc6 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Tue, 6 May 2025 17:18:39 +0800 Subject: [PATCH 55/64] [update] update whisper-base --- projects/llm_framework/tools/llm_pack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 1321b34..8c48415 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -386,7 +386,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep 'llm-model-yolo11n-seg':[create_data_deb,'llm-model-yolo11n-seg', '0.3', src_folder, revision], 'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.4', src_folder, revision], 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.4', src_folder, revision], - 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.3', src_folder, revision], + 
'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.4', src_folder, revision], 'llm-model-whisper-small':[create_data_deb,'llm-model-whisper-small', '0.3', src_folder, revision], 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.4', src_folder, revision], 'llm-model-qwen2.5-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2.5-0.5B-prefill-20e', data_version, src_folder, revision], From 40bfe39c85820ebfdac70cd661bfb47de0873753 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Wed, 7 May 2025 10:07:36 +0800 Subject: [PATCH 56/64] [update] update whisper-small --- projects/llm_framework/tools/llm_pack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 8c48415..2e1902e 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -387,7 +387,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep 'llm-model-depth-anything-ax630c':[create_data_deb,'llm-model-depth-anything-ax630c', '0.4', src_folder, revision], 'llm-model-whisper-tiny':[create_data_deb,'llm-model-whisper-tiny', '0.4', src_folder, revision], 'llm-model-whisper-base':[create_data_deb,'llm-model-whisper-base', '0.4', src_folder, revision], - 'llm-model-whisper-small':[create_data_deb,'llm-model-whisper-small', '0.3', src_folder, revision], + 'llm-model-whisper-small':[create_data_deb,'llm-model-whisper-small', '0.4', src_folder, revision], 'llm-model-silero-vad':[create_data_deb,'llm-model-silero-vad', '0.4', src_folder, revision], 'llm-model-qwen2.5-0.5B-prefill-20e':[create_data_deb,'llm-model-qwen2.5-0.5B-prefill-20e', data_version, src_folder, revision], 'llm-model-qwen2.5-0.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-0.5B-p256-ax630c', '0.4', src_folder, revision], From 6b285ed98dbe51d369518cbe80889e23d07f64c2 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Thu, 8 May 2025 
17:15:50 +0800 Subject: [PATCH 57/64] [update] update benchmark --- benchmark/benchmodulellm.py | 2 +- benchmark/utils/llm.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmark/benchmodulellm.py b/benchmark/benchmodulellm.py index 7934b58..d534e7e 100644 --- a/benchmark/benchmodulellm.py +++ b/benchmark/benchmodulellm.py @@ -109,7 +109,7 @@ def main(opt): for model_name in models: logging.info(f"Testing model: {model_name}") - input_text = "This is a test input for the LLM." + input_text = "Tell me an adventure story." try: result = llm_client.test(model_name, input_text) logging.info(f"Test result for model '{model_name}': {result}") diff --git a/benchmark/utils/llm.py b/benchmark/utils/llm.py index bf77670..6c62d85 100644 --- a/benchmark/utils/llm.py +++ b/benchmark/utils/llm.py @@ -3,7 +3,7 @@ import time import logging import uuid -from .token_calc import calculate_token_length +# from .token_calc import calculate_token_length logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -24,6 +24,7 @@ def send_request_stream(self, request): response = b"" parsed_responses = [] output_text = "" + token_count = 0 start_time = time.time() first_packet_time = None @@ -42,13 +43,14 @@ def send_request_stream(self, request): if first_packet_time is None: first_packet_time = time.time() output_text += parsed_response["data"]["delta"] + token_count += 3 if "data" in parsed_response and parsed_response["data"].get("finish", False): end_time = time.time() total_time = end_time - start_time first_packet_latency = first_packet_time - start_time if first_packet_time else None - token_count = calculate_token_length(output_text) + # token_count = calculate_token_length(output_text) token_speed = token_count / total_time if total_time > 0 else 0 logging.info("Stream reception completed.") From b43e73e949f66d279857ac35dfa230588c731c1a Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Thu, 8 May 2025 17:19:33 +0800 
Subject: [PATCH 58/64] [add] add benchmark.yml --- .github/workflows/benchmark.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .github/workflows/benchmark.yml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..5396ec8 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,18 @@ +name: Benchmark Test +on: + workflow_dispatch: + push: + branches: + - dev +jobs: + build: + runs-on: [self-hosted, linux, arm64] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Start Benchmark Test + run: | + echo "This job runs on a self-hosted runner!" + echo "Running benchmark test..." + python3 benchmark/benchmodulellm.py \ No newline at end of file From b1df925af957552d8028365ea57407c32fc84625 Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Fri, 9 May 2025 17:01:55 +0800 Subject: [PATCH 59/64] Fix SOLA detail issue causing first frame problems Resolved an issue in the SOLA (Synchronized Overlap-Add) implementation where specific details were causing problems with the first frame of audio processing. 
--- .../llm_framework/main_melotts/src/main.cpp | 248 ++++++++++++++++-- .../main_melotts/src/runner/Lexicon.hpp | 2 +- 2 files changed, 228 insertions(+), 22 deletions(-) diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp index 610362a..b9a6c6f 100644 --- a/projects/llm_framework/main_melotts/src/main.cpp +++ b/projects/llm_framework/main_melotts/src/main.cpp @@ -253,14 +253,19 @@ class llm_task { } return false; } + SLOGI("开始处理文本: %s", msg_str.c_str()); + + // 文本转音素处理部分保持不变 std::vector phones_bef, tones_bef; lexicon_->convert(msg_str, phones_bef, tones_bef); - // Add blank between words - auto phones = intersperse(phones_bef, 0); - auto tones = intersperse(tones_bef, 0); - int phone_len = phones.size(); - int MELOTTS_LANG_IDS = MELOTTS_LANG_IDS_MAP[mode_config_.mode]; - std::vector langids(phone_len, MELOTTS_LANG_IDS); + auto phones = intersperse(phones_bef, 0); + auto tones = intersperse(tones_bef, 0); + int phone_len = phones.size(); + std::vector langids(phone_len, 3); + + SLOGI("音素转换完成,长度: %d", phone_len); + + // 运行encoder获取latent representation auto encoder_output = encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w, mode_config_.get_length_scale(), mode_config_.sdp_ratio); @@ -269,27 +274,88 @@ class llm_task { auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo(); auto zp_shape = zp_info.GetShape(); - // Decoder parameters setup - int zp_size = decoder_->GetInputSize(0) / sizeof(float); - int dec_len = zp_size / zp_shape[1]; - int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float); - const int pad_frames = 16; + SLOGI("Encoder输出完成,形状: [%ld, %ld, %ld],预期音频长度: %d", zp_shape[0], zp_shape[1], zp_shape[2], + audio_len); + + // 解码器参数设置 + int zp_size = decoder_->GetInputSize(0) / sizeof(float); + int dec_len = zp_size / zp_shape[1]; + int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float); + + // 定义pad长度(每侧填充帧数) + const int 
pad_frames = 16; + // 每个音频帧的采样点数量 const int samples_per_frame = 512; - const int effective_frames = dec_len - 2 * pad_frames; + + SLOGI("解码器配置:帧长度=%d, 音频切片长度=%d, pad长度=%d, 每帧采样点=%d", dec_len, audio_slice_len, + pad_frames, samples_per_frame); + + // 每次有效处理的帧数量 + const int effective_frames = dec_len - 2 * pad_frames; + + // 计算需要的解码次数 - 确保所有类型一致 int dec_slice_num = static_cast(std::ceil(static_cast(zp_shape[2]) / static_cast(effective_frames))); - SolaProcessor sola(pad_frames, samples_per_frame); + + SLOGI("将进行 %d 次推理,每次有效帧数: %d", dec_slice_num, effective_frames); + + // === SOLA算法参数设置 === + const int sola_buffer_frame = pad_frames * samples_per_frame; // 重叠缓冲区长度 + const int sola_search_frame = pad_frames * samples_per_frame; // 搜索窗口长度 + const int block_frame = (dec_len - 2 * pad_frames) * samples_per_frame; // 有效块长度 + + // 创建淡入淡出窗口 + std::vector fade_in_window(sola_buffer_frame); + std::vector fade_out_window(sola_buffer_frame); + + for (int i = 0; i < sola_buffer_frame; i++) { + fade_in_window[i] = static_cast(i) / sola_buffer_frame; + fade_out_window[i] = 1.0f - fade_in_window[i]; + } + + // 初始化SOLA缓冲区 + std::vector sola_buffer(sola_buffer_frame, 0.0f); + bool first_frame = true; + std::vector pcmlist; for (int i = 0; i < dec_slice_num; i++) { + // 计算当前批次的输入起始位置 int input_start = i * effective_frames; + // 考虑前向pad,但确保不为负 if (i > 0) { input_start -= pad_frames; } - input_start = std::max(0, input_start); + input_start = std::max(0, input_start); + + // 实际输入长度 int actual_len = std::min(dec_len, static_cast(zp_shape[2] - input_start)); + + // 计算输出的有效范围(帧级别) + int output_start_frame, output_end_frame; + + if (i == 0) { + // 第一帧:跳过前面的pad部分 + output_start_frame = 0; + output_end_frame = effective_frames - 1; + } else if (i == dec_slice_num - 1) { + // 最后一帧:从当前段起始计算 + output_start_frame = i * effective_frames; + // 最后到编码器输出的最大长度 + output_end_frame = static_cast(zp_shape[2]) - 1; + } else { + // 中间帧:标准计算 + output_start_frame = i * effective_frames; + output_end_frame = 
(i + 1) * effective_frames - 1; + } + + SLOGI("第 %d 次推理: 输入帧范围=[%d-%d],实际长度=%d,输出帧范围=[%d-%d]", i + 1, input_start, + input_start + actual_len - 1, actual_len, output_start_frame, output_end_frame); + + // 准备decoder输入,全部初始化为0 std::vector zp(zp_size, 0); + // 复制数据到decoder输入 for (int n = 0; n < zp_shape[1]; n++) { int copy_size = std::min(actual_len, static_cast(zp_shape[2] - input_start)); if (copy_size > 0) { @@ -297,38 +363,178 @@ class llm_task { sizeof(float) * copy_size); } } - // Run decoder + + // 运行decoder std::vector decoder_output(audio_slice_len); decoder_->SetInput(zp.data(), 0); decoder_->SetInput(g_matrix.data(), 1); + + SLOGI("第 %d 次推理:开始解码...", i + 1); + if (0 != decoder_->Run()) { + SLOGI("第 %d 次推理:解码失败", i + 1); throw std::string("decoder_ RunSync error"); } + decoder_->GetOutput(decoder_output.data(), 0); - std::vector processed_output = sola.ProcessFrame(decoder_output, i, dec_slice_num, actual_len); - pcmlist.insert(pcmlist.end(), processed_output.begin(), processed_output.end()); + // === SOLA处理流程 === + if (first_frame) { + // 首帧特殊处理 - 不应跳过前面的内容 + // 首帧直接从解码器输出开始,不跳过任何内容 + int audio_start = 0; // 从头开始,不跳过pad_frames + + // 计算首帧应该添加的数据长度 + // 首帧应该保留完整解码输出,只留出末尾的sola_buffer_frame用于下一帧衔接 + int audio_len = decoder_output.size() - sola_buffer_frame; + + // 边界检查 + audio_len = std::max(0, audio_len); // 确保不为负 + + // 添加首帧数据 + if (audio_len > 0) { + pcmlist.insert(pcmlist.end(), decoder_output.begin() + audio_start, + decoder_output.begin() + audio_start + audio_len); + } + + // 保存末尾的sola_buffer_frame长度到SOLA缓冲区,用于下一帧对齐 + int buffer_start = audio_len; + + // 确保有足够数据可供复制 + if (buffer_start + sola_buffer_frame <= decoder_output.size()) { + std::copy(decoder_output.begin() + buffer_start, + decoder_output.begin() + buffer_start + sola_buffer_frame, sola_buffer.begin()); + } else { + // 可能的情况:首帧数据总长度不足sola_buffer_frame + int available = static_cast(decoder_output.size() - buffer_start); + if (available > 0) { + std::copy(decoder_output.begin() + buffer_start, 
decoder_output.end(), sola_buffer.begin()); + // 填充零 + std::fill(sola_buffer.begin() + available, sola_buffer.end(), 0.0f); + } else { + // 完全没有足够数据,全部填零 + std::fill(sola_buffer.begin(), sola_buffer.end(), 0.0f); + } + } + + first_frame = false; + + SLOGI("第 %d 次推理: 首帧处理,从位置%d开始添加%d采样点到输出,保存%d样本到SOLA缓冲区", i + 1, + audio_start, audio_len, sola_buffer_frame); + } else { + // 非首帧:需要执行SOLA对齐 + int audio_start = pad_frames * samples_per_frame; + + // 1. 准备搜索窗口 - 当前帧的开头部分 + std::vector search_window(sola_buffer_frame + sola_search_frame); + std::copy(decoder_output.begin() + audio_start, + decoder_output.begin() + audio_start + search_window.size(), search_window.begin()); + + // 2. 寻找最佳对齐点(计算互相关) + int best_offset = 0; + float best_correlation = -1.0; + + for (int offset = 0; offset <= sola_search_frame; offset++) { + float correlation = 0.0; + float energy = 0.0; + + for (int j = 0; j < sola_buffer_frame; j++) { + correlation += sola_buffer[j] * search_window[j + offset]; + energy += search_window[j + offset] * search_window[j + offset]; + } + + // 归一化相关性(避免除零) + float normalized_correlation = (energy > 1e-8) ? correlation / std::sqrt(energy) : 0.0f; + + if (normalized_correlation > best_correlation) { + best_correlation = normalized_correlation; + best_offset = offset; + } + } + + SLOGI("第 %d 次推理: SOLA找到最佳对齐偏移量 %d,相关系数 %f", i + 1, best_offset, best_correlation); + + // 3. 应用对齐偏移 + int aligned_start = audio_start + best_offset; + + // 4. 平滑过渡处理(对齐区域的crossfade) + std::vector crossfade_region(sola_buffer_frame); + + for (int j = 0; j < sola_buffer_frame; j++) { + // 应用淡入淡出窗口函数 + crossfade_region[j] = + decoder_output[aligned_start + j] * fade_in_window[j] + sola_buffer[j] * fade_out_window[j]; + } + + // 5. 添加crossfade区域到输出 + pcmlist.insert(pcmlist.end(), crossfade_region.begin(), crossfade_region.end()); + + // 6. 添加剩余有效音频数据 + int remaining_start = aligned_start + sola_buffer_frame; + int remaining_len = (i == dec_slice_num - 1) + ? 
(actual_len - 2 * pad_frames) * samples_per_frame - sola_buffer_frame + : (dec_len - 2 * pad_frames) * samples_per_frame - sola_buffer_frame; + + // 边界检查 + remaining_len = std::min(remaining_len, static_cast(decoder_output.size() - remaining_start)); + + if (remaining_len > 0) { + pcmlist.insert(pcmlist.end(), decoder_output.begin() + remaining_start, + decoder_output.begin() + remaining_start + remaining_len); + } + + // 7. 更新SOLA缓冲区,为下一帧准备 + int buffer_start = remaining_start + remaining_len; + + // 检查是否还有足够的数据用于下一个缓冲区 + if (buffer_start + sola_buffer_frame <= decoder_output.size()) { + std::copy(decoder_output.begin() + buffer_start, + decoder_output.begin() + buffer_start + sola_buffer_frame, sola_buffer.begin()); + } else { + // 如果不足,就用零填充 + int avail = static_cast(decoder_output.size() - buffer_start); + if (avail > 0) { + std::copy(decoder_output.begin() + buffer_start, decoder_output.end(), sola_buffer.begin()); + } + std::fill(sola_buffer.begin() + avail, sola_buffer.end(), 0.0f); + } + + SLOGI("第 %d 次推理: 添加 %d + %d 采样点到输出,累计长度: %zu", i + 1, sola_buffer_frame, + remaining_len, pcmlist.size()); + } } + SLOGI("所有推理完成,生成PCM长度: %zu", pcmlist.size()); + + // 后续处理:重采样和转换为int16 double src_ratio = (mode_config_.audio_rate * 1.0f) / (mode_config_.mode_rate * 1.0f); std::vector tmp_pcm((pcmlist.size() * src_ratio + 1)); int len; + + SLOGI("开始音频重采样,源采样率: %f,目标采样率: %f,比率: %f", mode_config_.mode_rate * 1.0f, + mode_config_.audio_rate * 1.0f, src_ratio); + resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio); - // Convert to 16-bit PCM + SLOGI("重采样完成,重采样后长度: %d", len); + + // 转换为16位PCM wav_pcm_data.reserve(len); std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data), [](const auto val) { return (int16_t)(val * INT16_MAX); }); - // Call callback function with output + SLOGI("最终生成音频长度: %zu 个采样点", wav_pcm_data.size()); + + // 调用回调函数输出结果 if (out_callback_) out_callback_(std::string((char *)wav_pcm_data.data(), 
wav_pcm_data.size() * sizeof(int16_t)), finish); + SLOGI("TTS处理完成,输出回调已调用"); } catch (const std::exception &e) { - SLOGI("TTS processing exception: %s", e.what()); + SLOGI("TTS处理异常: %s", e.what()); return true; } catch (...) { - SLOGI("TTS processing encountered unknown exception"); + SLOGI("TTS处理发生未知异常"); return true; } return false; diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp index d1bcbe9..29c3181 100644 --- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp +++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp @@ -9,7 +9,7 @@ #include #include "../../../../../SDK/components/utilities/include/sample_log.h" // Debug logging switch - set to true to enable debug logs -static bool DEBUG_LOGGING = false; +static bool DEBUG_LOGGING = true; // Macro for debug logging #define DEBUG_LOG(fmt, ...) \ do { \ From cbb0afaf65e162fc5946b31c727cf563c8346fbe Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Fri, 9 May 2025 17:24:57 +0800 Subject: [PATCH 60/64] Optimize G2P process to skip inference for short audio clips Improved the Grapheme-to-Phoneme (G2P) process by eliminating separate inference operations for audio segments that are too short, enhancing processing efficiency. 
--- .../main_melotts/src/runner/Lexicon.hpp | 130 ++++++++++++++++-- 1 file changed, 121 insertions(+), 9 deletions(-) diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp index 29c3181..cee7a2e 100644 --- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp +++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp @@ -195,16 +195,99 @@ class Lexicon { phonetic_str.c_str()); } - void convert(const std::string& text, std::vector& phones, std::vector& tones) + std::vector splitTextByPunctuation(const std::string& text) + { + std::vector segments; + auto chars = splitEachChar(text); + std::string current_segment; + + for (size_t i = 0; i < chars.size(); ++i) { + std::string c = chars[i]; + current_segment += c; + + bool is_segment_punct = false; + std::string punct_key = c; + + if (c == ",") + punct_key = ","; + else if (c == "。") + punct_key = "."; + else if (c == "!") + punct_key = "!"; + else if (c == "?") + punct_key = "?"; + + if (lexicon.find(punct_key) != lexicon.end() && + (punct_key == "." || punct_key == "!" || punct_key == "?" 
|| punct_key == "," || punct_key == "…")) { + is_segment_punct = true; + } + + if (is_segment_punct && i < chars.size() - 1) { + segments.push_back(current_segment); + current_segment.clear(); + } + } + + if (!current_segment.empty()) { + segments.push_back(current_segment); + } + + return segments; + } + std::vector mergeShortSegments(const std::vector& segments, int min_length = 4) + { + std::vector merged_segments; + std::string current_segment; + + for (size_t i = 0; i < segments.size(); ++i) { + auto chars = splitEachChar(segments[i]); + int actual_chars = 0; + for (const auto& c : chars) { + if (c != " " && lexicon.find(c) != lexicon.end()) { + std::string punct_key = c; + if (c == ",") + punct_key = ","; + else if (c == "。") + punct_key = "."; + else if (c == "!") + punct_key = "!"; + else if (c == "?") + punct_key = "?"; + + if (punct_key != "," && punct_key != "." && punct_key != "!" && punct_key != "?" && + punct_key != "…" && punct_key != "'" && punct_key != "-") { + actual_chars++; + } + } else if (is_english(c)) { + actual_chars++; + } + } + if (actual_chars < min_length && i < segments.size() - 1) { + if (current_segment.empty()) { + current_segment = segments[i]; + } else { + current_segment += segments[i]; + } + } else { + if (!current_segment.empty()) { + current_segment += segments[i]; + merged_segments.push_back(current_segment); + current_segment.clear(); + } else { + merged_segments.push_back(segments[i]); + } + } + } + + if (!current_segment.empty()) { + merged_segments.push_back(current_segment); + } + + return merged_segments; + } + + void processSegment(const std::string& text, std::vector& phones, std::vector& tones) { - DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str()); - DEBUG_LOG("=======Matching Results======="); - DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones"); - DEBUG_LOG("-----------------------------"); - phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); - tones.insert(tones.end(), 
unknown_token.second.begin(), unknown_token.second.end()); - DEBUG_LOG("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), - tonesToString(unknown_token.second).c_str()); auto chars = splitEachChar(text); int i = 0; while (i < chars.size()) { @@ -274,10 +357,39 @@ class Lexicon { } } } + } + + void convert(const std::string& text, std::vector& phones, std::vector& tones) + { + DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str()); + + std::vector segments = splitTextByPunctuation(text); + + std::vector merged_segments = mergeShortSegments(segments); + + DEBUG_LOG("Text divided into %zu segments after merging short segments", merged_segments.size()); + for (size_t i = 0; i < merged_segments.size(); ++i) { + DEBUG_LOG("Segment %zu: \"%s\"", i + 1, merged_segments[i].c_str()); + } + + DEBUG_LOG("=======Matching Results======="); + DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones"); + DEBUG_LOG("-----------------------------"); + + phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); + tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); + DEBUG_LOG("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), + tonesToString(unknown_token.second).c_str()); + + for (const auto& segment : merged_segments) { + processSegment(segment, phones, tones); + } + phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); DEBUG_LOG("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str()); + DEBUG_LOG("\nProcessing Summary:"); DEBUG_LOG("Original text: %s", text.c_str()); DEBUG_LOG("Phonemes: %s", phonesToString(phones).c_str()); From 74603beede65a50fd368d9056ec80659e4354b45 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Fri, 9 May 2025 17:36:19 +0800 Subject: [PATCH 61/64] [update] update qwen3-0.6B model --- 
.../models/mode_qwen3-0.6B-ax630c.json | 35 +++++ .../scripts/tokenizer_qwen3-0.6B-ax630c.py | 131 ++++++++++++++++++ projects/llm_framework/tools/llm_pack.py | 1 + 3 files changed, 167 insertions(+) create mode 100644 projects/llm_framework/main_llm/models/mode_qwen3-0.6B-ax630c.json create mode 100644 projects/llm_framework/main_llm/scripts/tokenizer_qwen3-0.6B-ax630c.py diff --git a/projects/llm_framework/main_llm/models/mode_qwen3-0.6B-ax630c.json b/projects/llm_framework/main_llm/models/mode_qwen3-0.6B-ax630c.json new file mode 100644 index 0000000..6b349d3 --- /dev/null +++ b/projects/llm_framework/main_llm/models/mode_qwen3-0.6B-ax630c.json @@ -0,0 +1,35 @@ +{ + "mode":"qwen3-0.6B-ax630c", + "type":"llm", + "homepage":"https://huggingface.co/Qwen/Qwen3-0.6B", + "capabilities":[ + "text_generation", + "chat" + ], + "input_type":[ + "llm.utf-8", + "llm.utf-8.stream", + "llm.chat_completion", + "llm.chat_completion.stream" + ], + "output_type":[ + "llm.utf-8", + "llm.utf-8.stream" + ], + "mode_param":{ + "tokenizer_type":2, + "filename_tokenizer_model":"http://localhost:8080", + "filename_tokens_embed":"model.embed_tokens.weight.bfloat16.bin", + "filename_post_axmodel":"qwen3_post.axmodel", + "template_filename_axmodel":"qwen3_p128_l%d_together.axmodel", + "b_use_topk":false, + "b_bos":false, + "b_eos":false, + "axmodel_num":28, + "tokens_embed_num":151936, + "tokens_embed_size":1024, + "b_use_mmap_load_embed":true, + "b_dynamic_load_axmodel_layer":false, + "ext_scripts":["tokenizer_qwen3-0.6B-ax630c.py"] + } +} \ No newline at end of file diff --git a/projects/llm_framework/main_llm/scripts/tokenizer_qwen3-0.6B-ax630c.py b/projects/llm_framework/main_llm/scripts/tokenizer_qwen3-0.6B-ax630c.py new file mode 100644 index 0000000..652335a --- /dev/null +++ b/projects/llm_framework/main_llm/scripts/tokenizer_qwen3-0.6B-ax630c.py @@ -0,0 +1,131 @@ +from transformers import AutoTokenizer, PreTrainedTokenizerFast +from http.server import HTTPServer, 
BaseHTTPRequestHandler +import json +import argparse + +class Tokenizer_Http(): + + def __init__(self, model_id): + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + + def encode(self, prompt, content): + messages = [ + {"role": "system", "content": content}, + {"role": "user", "content": prompt} + ] + text = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + print(text) + token_ids = self.tokenizer.encode(text) + return token_ids + + def decode(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def bos_id(self): + return self.tokenizer.bos_token_id + + @property + def eos_id(self): + return self.tokenizer.eos_token_id + + @property + def bos_token(self): + return self.tokenizer.bos_token + + @property + def eos_token(self): + return self.tokenizer.eos_token + +class Request(BaseHTTPRequestHandler): + #通过类继承,新定义类 + timeout = 5 + server_version = 'Apache' + + def do_GET(self): + print(self.path) + #在新类中定义get的内容(当客户端向该服务端使用get请求时,本服务端将如下运行) + self.send_response(200) + self.send_header("type", "get") #设置响应头,可省略或设置多个 + self.end_headers() + + if self.path == '/bos_id': + bos_id = tokenizer.bos_id + # print(bos_id) + # to json + if bos_id is None: + msg = json.dumps({'bos_id': -1}) + else: + msg = json.dumps({'bos_id': bos_id}) + elif self.path == '/eos_id': + eos_id = tokenizer.eos_id + if eos_id is None: + msg = json.dumps({'eos_id': -1}) + else: + msg = json.dumps({'eos_id': eos_id}) + else: + msg = 'error' + + print(msg) + msg = str(msg).encode() #转为str再转为byte格式 + + self.wfile.write(msg) #将byte格式的信息返回给客户端 + + def do_POST(self): + #在新类中定义post的内容(当客户端向该服务端使用post请求时,本服务端将如下运行) + data = self.rfile.read(int( + self.headers['content-length'])) #获取从客户端传入的参数(byte格式) + data = data.decode() #将byte格式转为str格式 + + self.send_response(200) + self.send_header("type", "post") #设置响应头,可省略或设置多个 + self.end_headers() + + if self.path == '/encode': + req = json.loads(data) + prompt = req['text'] + + 
token_ids = tokenizer.encode(prompt, args.content) + if token_ids is None: + msg = json.dumps({'token_ids': -1}) + else: + msg = json.dumps({'token_ids': token_ids}) + + elif self.path == '/decode': + req = json.loads(data) + token_ids = req['token_ids'] + text = tokenizer.decode(token_ids) + if text is None: + msg = json.dumps({'text': ""}) + else: + msg = json.dumps({'text': text}) + else: + msg = 'error' + print(msg) + msg = str(msg).encode() #转为str再转为byte格式 + + self.wfile.write(msg) #将byte格式的信息返回给客户端 + + +if __name__ == "__main__": + + args = argparse.ArgumentParser() + args.add_argument('--host', type=str, default='localhost') + args.add_argument('--port', type=int, default=8080) + args.add_argument('--model_id', type=str, default='qwen3_0.6B_tokenizer') + args.add_argument('--content', type=str, default='You are Qwen, created by Alibaba Cloud. You are a helpful assistant.') + args = args.parse_args() + + tokenizer = Tokenizer_Http(args.model_id) + + # print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token) + # print(tokenizer.encode("hello world", args.content)) + + host = (args.host, args.port) #设定地址与端口号,'localhost'等价于'127.0.0.1' + print('http://%s:%s' % host) + server = HTTPServer(host, Request) #根据地址端口号和新定义的类,创建服务器实例 + server.serve_forever() #开启服务 diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py index 2e1902e..9fd6c58 100755 --- a/projects/llm_framework/tools/llm_pack.py +++ b/projects/llm_framework/tools/llm_pack.py @@ -396,6 +396,7 @@ def create_bin_deb(package_name, version, src_folder, revision = 'm5stack1', dep 'llm-model-qwen2.5-1.5B-p256-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-p256-ax630c', '0.4', src_folder, revision], 'llm-model-qwen2.5-1.5B-Int4-ax630c':[create_data_deb,'llm-model-qwen2.5-1.5B-Int4-ax630c', '0.4', src_folder, revision], 'llm-model-qwen2.5-coder-0.5B-ax630c':[create_data_deb,'llm-model-qwen2.5-coder-0.5B-ax630c', data_version, src_folder, 
revision], + 'llm-model-qwen3-0.6B-ax630c':[create_data_deb,'llm-model-qwen3-0.6B-ax630c', '0.4', src_folder, revision], 'llm-model-llama3.2-1B-prefill-ax630c':[create_data_deb,'llm-model-llama3.2-1B-prefill-ax630c', data_version, src_folder, revision], 'llm-model-llama3.2-1B-p256-ax630c':[create_data_deb,'llm-model-llama3.2-1B-p256-ax630c', '0.4', src_folder, revision], 'llm-model-openbuddy-llama3.2-1B-ax630c':[create_data_deb,'llm-model-openbuddy-llama3.2-1B-ax630c', data_version, src_folder, revision], From 9e7342f3b172a5b2e7dd08f98ef8225a9876be1d Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Fri, 9 May 2025 17:43:17 +0800 Subject: [PATCH 62/64] Fix code formatting and integrate SOLA algorithm into main.cpp Corrected code formatting issues and moved the SOLA (Synchronized Overlap-Add) algorithm implementation into main.cpp for better code organization. --- .../llm_framework/main_melotts/src/main.cpp | 172 +++++------ .../main_melotts/src/runner/Lexicon.hpp | 130 +-------- .../main_melotts/src/runner/SolaProcessor.h | 269 ------------------ 3 files changed, 97 insertions(+), 474 deletions(-) delete mode 100644 projects/llm_framework/main_melotts/src/runner/SolaProcessor.h diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp index b9a6c6f..0ad2e71 100644 --- a/projects/llm_framework/main_melotts/src/main.cpp +++ b/projects/llm_framework/main_melotts/src/main.cpp @@ -9,7 +9,6 @@ #include "Lexicon.hpp" #include #include "AudioFile.h" -#include "SolaProcessor.h" #include "Lexicon.hpp" #include @@ -253,9 +252,9 @@ class llm_task { } return false; } - SLOGI("开始处理文本: %s", msg_str.c_str()); + SLOGI("Processing text: %s", msg_str.c_str()); - // 文本转音素处理部分保持不变 + // Convert text to phonemes and tones std::vector phones_bef, tones_bef; lexicon_->convert(msg_str, phones_bef, tones_bef); auto phones = intersperse(phones_bef, 0); @@ -263,9 +262,9 @@ class llm_task { int phone_len =
phones.size(); std::vector langids(phone_len, 3); - SLOGI("音素转换完成,长度: %d", phone_len); + SLOGI("Phoneme conversion completed, length: %d", phone_len); - // 运行encoder获取latent representation + // Run the encoder to generate hidden representations auto encoder_output = encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w, mode_config_.get_length_scale(), mode_config_.sdp_ratio); @@ -274,37 +273,33 @@ class llm_task { auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo(); auto zp_shape = zp_info.GetShape(); - SLOGI("Encoder输出完成,形状: [%ld, %ld, %ld],预期音频长度: %d", zp_shape[0], zp_shape[1], zp_shape[2], - audio_len); + SLOGI("Encoder output completed, shape: [%ld, %ld, %ld], expected audio length: %d", zp_shape[0], + zp_shape[1], zp_shape[2], audio_len); - // 解码器参数设置 + // Calculate decoder parameters int zp_size = decoder_->GetInputSize(0) / sizeof(float); int dec_len = zp_size / zp_shape[1]; int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float); - // 定义pad长度(每侧填充帧数) - const int pad_frames = 16; - // 每个音频帧的采样点数量 + const int pad_frames = 16; const int samples_per_frame = 512; - SLOGI("解码器配置:帧长度=%d, 音频切片长度=%d, pad长度=%d, 每帧采样点=%d", dec_len, audio_slice_len, - pad_frames, samples_per_frame); + SLOGI("Decoder configuration: frame length=%d, audio slice length=%d, pad length=%d, samples per frame=%d", + dec_len, audio_slice_len, pad_frames, samples_per_frame); - // 每次有效处理的帧数量 const int effective_frames = dec_len - 2 * pad_frames; - // 计算需要的解码次数 - 确保所有类型一致 int dec_slice_num = static_cast(std::ceil(static_cast(zp_shape[2]) / static_cast(effective_frames))); - SLOGI("将进行 %d 次推理,每次有效帧数: %d", dec_slice_num, effective_frames); + SLOGI("Will perform %d inferences, each with effective frames: %d", dec_slice_num, effective_frames); - // === SOLA算法参数设置 === - const int sola_buffer_frame = pad_frames * samples_per_frame; // 重叠缓冲区长度 - const int sola_search_frame = pad_frames * samples_per_frame; // 搜索窗口长度 - const int 
block_frame = (dec_len - 2 * pad_frames) * samples_per_frame; // 有效块长度 + // SOLA parameters setup + const int sola_buffer_frame = pad_frames * samples_per_frame; // Overlap buffer length + const int sola_search_frame = pad_frames * samples_per_frame; // Search window length + const int block_frame = (dec_len - 2 * pad_frames) * samples_per_frame; // Effective block length - // 创建淡入淡出窗口 + // Create fade-in/fade-out windows for smooth transitions std::vector fade_in_window(sola_buffer_frame); std::vector fade_out_window(sola_buffer_frame); @@ -313,49 +308,50 @@ class llm_task { fade_out_window[i] = 1.0f - fade_in_window[i]; } - // 初始化SOLA缓冲区 + // Initialize SOLA buffer std::vector sola_buffer(sola_buffer_frame, 0.0f); bool first_frame = true; std::vector pcmlist; + // Main decoding loop - process each slice for (int i = 0; i < dec_slice_num; i++) { - // 计算当前批次的输入起始位置 + // Calculate start position for current batch input int input_start = i * effective_frames; - // 考虑前向pad,但确保不为负 + // Consider forward padding, but ensure non-negative if (i > 0) { input_start -= pad_frames; } input_start = std::max(0, input_start); - // 实际输入长度 + // Actual input length int actual_len = std::min(dec_len, static_cast(zp_shape[2] - input_start)); - // 计算输出的有效范围(帧级别) + // Calculate effective output range (frame level) int output_start_frame, output_end_frame; if (i == 0) { - // 第一帧:跳过前面的pad部分 + // First frame: skip padding at beginning output_start_frame = 0; output_end_frame = effective_frames - 1; } else if (i == dec_slice_num - 1) { - // 最后一帧:从当前段起始计算 + // Last frame: calculate from current segment start output_start_frame = i * effective_frames; - // 最后到编码器输出的最大长度 + // Last frame extends to encoder's maximum output length output_end_frame = static_cast(zp_shape[2]) - 1; } else { - // 中间帧:标准计算 + // Middle frames: standard calculation output_start_frame = i * effective_frames; output_end_frame = (i + 1) * effective_frames - 1; } - SLOGI("第 %d 次推理: 输入帧范围=[%d-%d],实际长度=%d,输出帧范围=[%d-%d]", i + 
1, input_start, - input_start + actual_len - 1, actual_len, output_start_frame, output_end_frame); + SLOGI("Inference #%d: input frame range=[%d-%d], actual length=%d, output frame range=[%d-%d]", i + 1, + input_start, input_start + actual_len - 1, actual_len, output_start_frame, output_end_frame); - // 准备decoder输入,全部初始化为0 + // Prepare decoder input, initialize all to zero std::vector zp(zp_size, 0); - // 复制数据到decoder输入 + // Copy data to decoder input for (int n = 0; n < zp_shape[1]; n++) { int copy_size = std::min(actual_len, static_cast(zp_shape[2] - input_start)); if (copy_size > 0) { @@ -364,73 +360,76 @@ class llm_task { } } - // 运行decoder + // Run decoder std::vector decoder_output(audio_slice_len); decoder_->SetInput(zp.data(), 0); decoder_->SetInput(g_matrix.data(), 1); - SLOGI("第 %d 次推理:开始解码...", i + 1); + SLOGI("Inference #%d: starting decoding...", i + 1); if (0 != decoder_->Run()) { - SLOGI("第 %d 次推理:解码失败", i + 1); + SLOGI("Inference #%d: decoding failed", i + 1); throw std::string("decoder_ RunSync error"); } decoder_->GetOutput(decoder_output.data(), 0); - // === SOLA处理流程 === + // === SOLA Processing Logic === if (first_frame) { - // 首帧特殊处理 - 不应跳过前面的内容 - // 首帧直接从解码器输出开始,不跳过任何内容 - int audio_start = 0; // 从头开始,不跳过pad_frames + // Special handling for first frame - should not skip initial content + // First frame starts directly from decoder output without skipping + int audio_start = 0; // Start from beginning, don't skip pad_frames - // 计算首帧应该添加的数据长度 - // 首帧应该保留完整解码输出,只留出末尾的sola_buffer_frame用于下一帧衔接 + // Calculate data length for first frame + // First frame should preserve complete decoder output, only reserving sola_buffer_frame at the end + // for next frame alignment int audio_len = decoder_output.size() - sola_buffer_frame; - // 边界检查 - audio_len = std::max(0, audio_len); // 确保不为负 + // Boundary check + audio_len = std::max(0, audio_len); // Ensure non-negative - // 添加首帧数据 + // Add first frame data if (audio_len > 0) { pcmlist.insert(pcmlist.end(), 
decoder_output.begin() + audio_start, decoder_output.begin() + audio_start + audio_len); } - // 保存末尾的sola_buffer_frame长度到SOLA缓冲区,用于下一帧对齐 + // Save sola_buffer_frame length from the end to SOLA buffer for next frame alignment int buffer_start = audio_len; - // 确保有足够数据可供复制 + // Ensure sufficient data is available for copying if (buffer_start + sola_buffer_frame <= decoder_output.size()) { std::copy(decoder_output.begin() + buffer_start, decoder_output.begin() + buffer_start + sola_buffer_frame, sola_buffer.begin()); } else { - // 可能的情况:首帧数据总长度不足sola_buffer_frame + // Possible case: first frame data is shorter than sola_buffer_frame int available = static_cast(decoder_output.size() - buffer_start); if (available > 0) { std::copy(decoder_output.begin() + buffer_start, decoder_output.end(), sola_buffer.begin()); - // 填充零 + // Fill with zeros std::fill(sola_buffer.begin() + available, sola_buffer.end(), 0.0f); } else { - // 完全没有足够数据,全部填零 + // Completely insufficient data, fill all with zeros std::fill(sola_buffer.begin(), sola_buffer.end(), 0.0f); } } first_frame = false; - SLOGI("第 %d 次推理: 首帧处理,从位置%d开始添加%d采样点到输出,保存%d样本到SOLA缓冲区", i + 1, - audio_start, audio_len, sola_buffer_frame); + SLOGI( + "Inference #%d: First frame processing, added %d samples from position %d to output, saved %d " + "samples to SOLA buffer", + i + 1, audio_len, audio_start, sola_buffer_frame); } else { - // 非首帧:需要执行SOLA对齐 + // Non-first frame: SOLA alignment required int audio_start = pad_frames * samples_per_frame; - // 1. 准备搜索窗口 - 当前帧的开头部分 + // 1. Prepare search window - beginning portion of current frame std::vector search_window(sola_buffer_frame + sola_search_frame); std::copy(decoder_output.begin() + audio_start, decoder_output.begin() + audio_start + search_window.size(), search_window.begin()); - // 2. 寻找最佳对齐点(计算互相关) + // 2. 
Find best alignment point (calculate cross-correlation) int best_offset = 0; float best_correlation = -1.0; @@ -443,7 +442,7 @@ class llm_task { energy += search_window[j + offset] * search_window[j + offset]; } - // 归一化相关性(避免除零) + // Normalize correlation (avoid division by zero) float normalized_correlation = (energy > 1e-8) ? correlation / std::sqrt(energy) : 0.0f; if (normalized_correlation > best_correlation) { @@ -452,30 +451,31 @@ class llm_task { } } - SLOGI("第 %d 次推理: SOLA找到最佳对齐偏移量 %d,相关系数 %f", i + 1, best_offset, best_correlation); + SLOGI("Inference #%d: SOLA found best alignment offset %d with correlation coefficient %f", i + 1, + best_offset, best_correlation); - // 3. 应用对齐偏移 + // 3. Apply alignment offset int aligned_start = audio_start + best_offset; - // 4. 平滑过渡处理(对齐区域的crossfade) + // 4. Smooth transition processing (crossfade in alignment region) std::vector crossfade_region(sola_buffer_frame); for (int j = 0; j < sola_buffer_frame; j++) { - // 应用淡入淡出窗口函数 + // Apply fade-in/fade-out window functions crossfade_region[j] = decoder_output[aligned_start + j] * fade_in_window[j] + sola_buffer[j] * fade_out_window[j]; } - // 5. 添加crossfade区域到输出 + // 5. Add crossfade region to output pcmlist.insert(pcmlist.end(), crossfade_region.begin(), crossfade_region.end()); - // 6. 添加剩余有效音频数据 + // 6. Add remaining valid audio data int remaining_start = aligned_start + sola_buffer_frame; int remaining_len = (i == dec_slice_num - 1) ? (actual_len - 2 * pad_frames) * samples_per_frame - sola_buffer_frame : (dec_len - 2 * pad_frames) * samples_per_frame - sola_buffer_frame; - // 边界检查 + // Boundary check remaining_len = std::min(remaining_len, static_cast(decoder_output.size() - remaining_start)); if (remaining_len > 0) { @@ -483,15 +483,15 @@ class llm_task { decoder_output.begin() + remaining_start + remaining_len); } - // 7. 更新SOLA缓冲区,为下一帧准备 + // 7. 
Update SOLA buffer for next frame int buffer_start = remaining_start + remaining_len; - // 检查是否还有足够的数据用于下一个缓冲区 + // Check if there's enough data for the next buffer if (buffer_start + sola_buffer_frame <= decoder_output.size()) { std::copy(decoder_output.begin() + buffer_start, decoder_output.begin() + buffer_start + sola_buffer_frame, sola_buffer.begin()); } else { - // 如果不足,就用零填充 + // If insufficient, fill with zeros int avail = static_cast(decoder_output.size() - buffer_start); if (avail > 0) { std::copy(decoder_output.begin() + buffer_start, decoder_output.end(), sola_buffer.begin()); @@ -499,42 +499,46 @@ class llm_task { std::fill(sola_buffer.begin() + avail, sola_buffer.end(), 0.0f); } - SLOGI("第 %d 次推理: 添加 %d + %d 采样点到输出,累计长度: %zu", i + 1, sola_buffer_frame, - remaining_len, pcmlist.size()); + SLOGI("Inference #%d: Added %d + %d samples to output, cumulative length: %zu", i + 1, + sola_buffer_frame, remaining_len, pcmlist.size()); } } - SLOGI("所有推理完成,生成PCM长度: %zu", pcmlist.size()); + SLOGI("All inference completed, generated PCM length: %zu", pcmlist.size()); - // 后续处理:重采样和转换为int16 - double src_ratio = (mode_config_.audio_rate * 1.0f) / (mode_config_.mode_rate * 1.0f); + // Post-processing: resample and convert to int16 + double src_ratio = + static_cast(mode_config_.audio_rate) / static_cast(mode_config_.mode_rate); std::vector tmp_pcm((pcmlist.size() * src_ratio + 1)); int len; - SLOGI("开始音频重采样,源采样率: %f,目标采样率: %f,比率: %f", mode_config_.mode_rate * 1.0f, - mode_config_.audio_rate * 1.0f, src_ratio); + SLOGI("Starting audio resampling, source rate: %f, target rate: %f, ratio: %f", + static_cast(mode_config_.mode_rate), static_cast(mode_config_.audio_rate), src_ratio); resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio); - SLOGI("重采样完成,重采样后长度: %d", len); + SLOGI("Resampling completed, length after resampling: %d", len); - // 转换为16位PCM + // Convert to 16-bit PCM wav_pcm_data.reserve(len); std::transform(tmp_pcm.begin(), 
tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data), - [](const auto val) { return (int16_t)(val * INT16_MAX); }); + [](const auto val) { return static_cast(val * INT16_MAX); }); - SLOGI("最终生成音频长度: %zu 个采样点", wav_pcm_data.size()); + SLOGI("Final audio length: %zu samples", wav_pcm_data.size()); - // 调用回调函数输出结果 - if (out_callback_) - out_callback_(std::string((char *)wav_pcm_data.data(), wav_pcm_data.size() * sizeof(int16_t)), finish); + // Call the output callback function with the result + if (out_callback_) { + out_callback_( + std::string(reinterpret_cast(wav_pcm_data.data()), wav_pcm_data.size() * sizeof(int16_t)), + finish); + } - SLOGI("TTS处理完成,输出回调已调用"); + SLOGI("TTS processing completed, output callback invoked"); } catch (const std::exception &e) { - SLOGI("TTS处理异常: %s", e.what()); + SLOGI("TTS processing exception: %s", e.what()); return true; } catch (...) { - SLOGI("TTS处理发生未知异常"); + SLOGI("TTS processing encountered an unknown exception"); return true; } return false; diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp index cee7a2e..29c3181 100644 --- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp +++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp @@ -195,99 +195,16 @@ class Lexicon { phonetic_str.c_str()); } - std::vector splitTextByPunctuation(const std::string& text) - { - std::vector segments; - auto chars = splitEachChar(text); - std::string current_segment; - - for (size_t i = 0; i < chars.size(); ++i) { - std::string c = chars[i]; - current_segment += c; - - bool is_segment_punct = false; - std::string punct_key = c; - - if (c == ",") - punct_key = ","; - else if (c == "。") - punct_key = "."; - else if (c == "!") - punct_key = "!"; - else if (c == "?") - punct_key = "?"; - - if (lexicon.find(punct_key) != lexicon.end() && - (punct_key == "." || punct_key == "!" || punct_key == "?" 
|| punct_key == "," || punct_key == "…")) { - is_segment_punct = true; - } - - if (is_segment_punct && i < chars.size() - 1) { - segments.push_back(current_segment); - current_segment.clear(); - } - } - - if (!current_segment.empty()) { - segments.push_back(current_segment); - } - - return segments; - } - std::vector mergeShortSegments(const std::vector& segments, int min_length = 4) - { - std::vector merged_segments; - std::string current_segment; - - for (size_t i = 0; i < segments.size(); ++i) { - auto chars = splitEachChar(segments[i]); - int actual_chars = 0; - for (const auto& c : chars) { - if (c != " " && lexicon.find(c) != lexicon.end()) { - std::string punct_key = c; - if (c == ",") - punct_key = ","; - else if (c == "。") - punct_key = "."; - else if (c == "!") - punct_key = "!"; - else if (c == "?") - punct_key = "?"; - - if (punct_key != "," && punct_key != "." && punct_key != "!" && punct_key != "?" && - punct_key != "…" && punct_key != "'" && punct_key != "-") { - actual_chars++; - } - } else if (is_english(c)) { - actual_chars++; - } - } - if (actual_chars < min_length && i < segments.size() - 1) { - if (current_segment.empty()) { - current_segment = segments[i]; - } else { - current_segment += segments[i]; - } - } else { - if (!current_segment.empty()) { - current_segment += segments[i]; - merged_segments.push_back(current_segment); - current_segment.clear(); - } else { - merged_segments.push_back(segments[i]); - } - } - } - - if (!current_segment.empty()) { - merged_segments.push_back(current_segment); - } - - return merged_segments; - } - - void processSegment(const std::string& text, std::vector& phones, std::vector& tones) + void convert(const std::string& text, std::vector& phones, std::vector& tones) { + DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str()); + DEBUG_LOG("=======Matching Results======="); + DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones"); + DEBUG_LOG("-----------------------------"); + phones.insert(phones.end(), 
unknown_token.first.begin(), unknown_token.first.end()); + tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); + DEBUG_LOG("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), + tonesToString(unknown_token.second).c_str()); auto chars = splitEachChar(text); int i = 0; while (i < chars.size()) { @@ -357,39 +274,10 @@ class Lexicon { } } } - } - - void convert(const std::string& text, std::vector& phones, std::vector& tones) - { - DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str()); - - std::vector segments = splitTextByPunctuation(text); - - std::vector merged_segments = mergeShortSegments(segments); - - DEBUG_LOG("Text divided into %zu segments after merging short segments", merged_segments.size()); - for (size_t i = 0; i < merged_segments.size(); ++i) { - DEBUG_LOG("Segment %zu: \"%s\"", i + 1, merged_segments[i].c_str()); - } - - DEBUG_LOG("=======Matching Results======="); - DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones"); - DEBUG_LOG("-----------------------------"); - - phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); - tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); - DEBUG_LOG("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), - tonesToString(unknown_token.second).c_str()); - - for (const auto& segment : merged_segments) { - processSegment(segment, phones, tones); - } - phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end()); tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end()); DEBUG_LOG("\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str()); - DEBUG_LOG("\nProcessing Summary:"); DEBUG_LOG("Original text: %s", text.c_str()); DEBUG_LOG("Phonemes: %s", phonesToString(phones).c_str()); diff --git a/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h 
b/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h deleted file mode 100644 index a2286bb..0000000 --- a/projects/llm_framework/main_melotts/src/runner/SolaProcessor.h +++ /dev/null @@ -1,269 +0,0 @@ -#ifndef SOLA_PROCESSOR_H -#define SOLA_PROCESSOR_H - -#include -#include -#include -#include -#include - -/** - * SolaProcessor - Synchronous Overlap-Add method for audio frame processing - * - * This class provides functionality for smoothly concatenating audio frames - * using the SOLA algorithm, which finds optimal alignment points between - * consecutive frames and applies crossfading for smooth transitions. - */ -class SolaProcessor { -public: - /** - * Constructor - * - * @param padFrames Number of padding frames at the beginning and end - * @param samplesPerFrame Number of audio samples in each frame - */ - SolaProcessor(int padFrames, int samplesPerFrame) - : pad_frames_(padFrames), samples_per_frame_(samplesPerFrame), first_frame_(true) - { - Initialize(); - } - - /** - * Reset the processor to its initial state - */ - void Reset() - { - first_frame_ = true; - std::fill(sola_buffer_.begin(), sola_buffer_.end(), 0.0f); - } - - /** - * Process a single audio frame - * - * @param decoder_output Raw audio data from decoder - * @param frameIndex Current frame index - * @param totalFrames Total number of frames - * @param actualFrameLen Actual length of the frame - * @return Processed audio samples - */ - std::vector ProcessFrame(const std::vector& decoder_output, int frameIndex, int totalFrames, - int actualFrameLen) - { - std::vector processed_output; - - if (first_frame_) { - // Special handling for the first frame - ProcessFirstFrame(decoder_output, processed_output, actualFrameLen); - first_frame_ = false; - } else { - // Process subsequent frames with SOLA algorithm - ProcessSubsequentFrame(decoder_output, processed_output, frameIndex, totalFrames, actualFrameLen); - } - - return processed_output; - } - -private: - /** - * Initialize the SOLA 
processor parameters and buffers - */ - void Initialize() - { - // Calculate SOLA parameters - sola_buffer_frame_ = pad_frames_ * samples_per_frame_; - sola_search_frame_ = pad_frames_ * samples_per_frame_; - effective_frames_ = 0; // Will be set during frame processing - - // Create fade-in and fade-out windows - fade_in_window_.resize(sola_buffer_frame_); - fade_out_window_.resize(sola_buffer_frame_); - - for (int i = 0; i < sola_buffer_frame_; i++) { - fade_in_window_[i] = static_cast(i) / sola_buffer_frame_; - fade_out_window_[i] = 1.0f - fade_in_window_[i]; - } - - // Initialize SOLA buffer - sola_buffer_.resize(sola_buffer_frame_, 0.0f); - } - - /** - * Process the first audio frame - * - * @param decoder_output Raw audio data from decoder - * @param processed_output Output buffer for processed audio - * @param actualFrameLen Actual length of the frame - */ - void ProcessFirstFrame(const std::vector& decoder_output, std::vector& processed_output, - int actualFrameLen) - { - int audio_start = pad_frames_ * samples_per_frame_; - int audio_len = (actualFrameLen - 2 * pad_frames_) * samples_per_frame_; - - // Boundary check - audio_len = std::min(audio_len, static_cast(decoder_output.size() - audio_start)); - - // Add first frame data to output - processed_output.insert(processed_output.end(), decoder_output.begin() + audio_start, - decoder_output.begin() + audio_start + audio_len); - - // Save the end part to SOLA buffer for next frame alignment - int buffer_start = audio_start + audio_len; - if (buffer_start + sola_buffer_frame_ <= decoder_output.size()) { - std::copy(decoder_output.begin() + buffer_start, decoder_output.begin() + buffer_start + sola_buffer_frame_, - sola_buffer_.begin()); - } - } - - /** - * Process subsequent audio frames using SOLA algorithm - * - * @param decoder_output Raw audio data from decoder - * @param processed_output Output buffer for processed audio - * @param frameIndex Current frame index - * @param totalFrames Total number of 
frames - * @param actualFrameLen Actual length of the frame - */ - void ProcessSubsequentFrame(const std::vector& decoder_output, std::vector& processed_output, - int frameIndex, int totalFrames, int actualFrameLen) - { - int audio_start = pad_frames_ * samples_per_frame_; - - // 1. Prepare search window - std::vector search_window(sola_buffer_frame_ + sola_search_frame_); - std::copy(decoder_output.begin() + audio_start, decoder_output.begin() + audio_start + search_window.size(), - search_window.begin()); - - // 2. Find best alignment point (compute cross-correlation) - int best_offset = FindBestOffset(search_window); - - // 3. Apply alignment offset - int aligned_start = audio_start + best_offset; - - // 4. Create smooth transition - std::vector crossfade_region = CreateCrossfade(decoder_output, aligned_start); - - // 5. Add crossfade region to output - processed_output.insert(processed_output.end(), crossfade_region.begin(), crossfade_region.end()); - - // 6. Add remaining valid audio data - AddRemainingAudio(decoder_output, processed_output, aligned_start, frameIndex, totalFrames, actualFrameLen); - } - - /** - * Find the best alignment offset using normalized cross-correlation - * - * @param search_window Window of audio samples to search in - * @return Optimal offset for alignment - */ - int FindBestOffset(const std::vector& search_window) - { - int best_offset = 0; - float best_correlation = -1.0f; - - for (int offset = 0; offset <= sola_search_frame_; offset++) { - float correlation = 0.0f; - float energy = 0.0f; - - for (int j = 0; j < sola_buffer_frame_; j++) { - correlation += sola_buffer_[j] * search_window[j + offset]; - energy += search_window[j + offset] * search_window[j + offset]; - } - - // Normalize correlation - float normalized_correlation = (energy > 1e-8) ? 
correlation / std::sqrt(energy) : 0.0f; - - if (normalized_correlation > best_correlation) { - best_correlation = normalized_correlation; - best_offset = offset; - } - } - - return best_offset; - } - - /** - * Create crossfade transition region - * - * @param decoder_output Raw audio data from decoder - * @param aligned_start Starting point after alignment - * @return Crossfaded audio samples - */ - std::vector CreateCrossfade(const std::vector& decoder_output, int aligned_start) - { - std::vector crossfade_region(sola_buffer_frame_); - - for (int j = 0; j < sola_buffer_frame_; j++) { - // Apply fade-in and fade-out window functions - crossfade_region[j] = - decoder_output[aligned_start + j] * fade_in_window_[j] + sola_buffer_[j] * fade_out_window_[j]; - } - - return crossfade_region; - } - - /** - * Add remaining audio data and update buffer - * - * @param decoder_output Raw audio data from decoder - * @param processed_output Output buffer for processed audio - * @param aligned_start Starting point after alignment - * @param frameIndex Current frame index - * @param totalFrames Total number of frames - * @param actualFrameLen Actual length of the frame - */ - void AddRemainingAudio(const std::vector& decoder_output, std::vector& processed_output, - int aligned_start, int frameIndex, int totalFrames, int actualFrameLen) - { - int remaining_start = aligned_start + sola_buffer_frame_; - int remaining_len = (actualFrameLen - 2 * pad_frames_) * samples_per_frame_ - sola_buffer_frame_; - - // Boundary check - remaining_len = std::min(remaining_len, static_cast(decoder_output.size() - remaining_start)); - - if (remaining_len > 0) { - processed_output.insert(processed_output.end(), decoder_output.begin() + remaining_start, - decoder_output.begin() + remaining_start + remaining_len); - } - - // Update SOLA buffer - UpdateSolaBuffer(decoder_output, remaining_start + remaining_len); - } - - /** - * Update SOLA buffer with new audio data - * - * @param decoder_output Raw 
audio data from decoder - * @param buffer_start Starting point for the new buffer data - */ - void UpdateSolaBuffer(const std::vector& decoder_output, int buffer_start) - { - // Check if there's enough data for the next buffer - if (buffer_start + sola_buffer_frame_ <= decoder_output.size()) { - std::copy(decoder_output.begin() + buffer_start, decoder_output.begin() + buffer_start + sola_buffer_frame_, - sola_buffer_.begin()); - } else { - // Fill with zeros if not enough data - int avail = static_cast(decoder_output.size() - buffer_start); - if (avail > 0) { - std::copy(decoder_output.begin() + buffer_start, decoder_output.end(), sola_buffer_.begin()); - } - std::fill(sola_buffer_.begin() + avail, sola_buffer_.end(), 0.0f); - } - } - -private: - int pad_frames_; // Number of padding frames - int samples_per_frame_; // Number of samples per frame - int effective_frames_; // Number of effective frames - int sola_buffer_frame_; // SOLA buffer length - int sola_search_frame_; // SOLA search window length - - std::vector fade_in_window_; // Fade-in window - std::vector fade_out_window_; // Fade-out window - std::vector sola_buffer_; // SOLA buffer - - bool first_frame_; // Flag for first frame processing -}; - -#endif // SOLA_PROCESSOR_H From 835daf192c7199cdb7455489ade113cb0ae5a0ff Mon Sep 17 00:00:00 2001 From: yuyun2000 <15515722313yxw@gmail.com> Date: Fri, 9 May 2025 18:25:49 +0800 Subject: [PATCH 63/64] Fix SOLA algorithm implementation Resolved issues in the Synchronized Overlap-Add (SOLA) algorithm to ensure proper audio processing and alignment. --- .../llm_framework/main_melotts/src/main.cpp | 74 ++++++++++++------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp index 0ad2e71..4fab699 100644 --- a/projects/llm_framework/main_melotts/src/main.cpp +++ b/projects/llm_framework/main_melotts/src/main.cpp @@ -469,42 +469,66 @@ class llm_task { // 5. 
Add crossfade region to output pcmlist.insert(pcmlist.end(), crossfade_region.begin(), crossfade_region.end()); - // 6. Add remaining valid audio data int remaining_start = aligned_start + sola_buffer_frame; - int remaining_len = (i == dec_slice_num - 1) - ? (actual_len - 2 * pad_frames) * samples_per_frame - sola_buffer_frame - : (dec_len - 2 * pad_frames) * samples_per_frame - sola_buffer_frame; - // Boundary check - remaining_len = std::min(remaining_len, static_cast(decoder_output.size() - remaining_start)); + if (i == dec_slice_num - 1) { + int total_expected_samples = audio_len * samples_per_frame / 512; - if (remaining_len > 0) { - pcmlist.insert(pcmlist.end(), decoder_output.begin() + remaining_start, - decoder_output.begin() + remaining_start + remaining_len); - } + int processed_samples = static_cast(pcmlist.size()); - // 7. Update SOLA buffer for next frame - int buffer_start = remaining_start + remaining_len; + int remaining_needed = total_expected_samples - processed_samples; + remaining_needed = std::max(0, remaining_needed); + + int remaining_len = + std::min(remaining_needed, static_cast(decoder_output.size() - remaining_start)); + + SLOGI("Inference #%d (final): Expected total=%d, processed=%d, needed=%d, available=%d", i + 1, + total_expected_samples, processed_samples, remaining_needed, remaining_len); + + if (remaining_len > 0) { + pcmlist.insert(pcmlist.end(), decoder_output.begin() + remaining_start, + decoder_output.begin() + remaining_start + remaining_len); + } - // Check if there's enough data for the next buffer - if (buffer_start + sola_buffer_frame <= decoder_output.size()) { - std::copy(decoder_output.begin() + buffer_start, - decoder_output.begin() + buffer_start + sola_buffer_frame, sola_buffer.begin()); } else { - // If insufficient, fill with zeros - int avail = static_cast(decoder_output.size() - buffer_start); - if (avail > 0) { - std::copy(decoder_output.begin() + buffer_start, decoder_output.end(), sola_buffer.begin()); + int 
remaining_len = (dec_len - 2 * pad_frames) * samples_per_frame - sola_buffer_frame; + + remaining_len = + std::min(remaining_len, static_cast(decoder_output.size() - remaining_start)); + + if (remaining_len > 0) { + pcmlist.insert(pcmlist.end(), decoder_output.begin() + remaining_start, + decoder_output.begin() + remaining_start + remaining_len); + } + + int buffer_start = remaining_start + remaining_len; + + if (buffer_start + sola_buffer_frame <= decoder_output.size()) { + std::copy(decoder_output.begin() + buffer_start, + decoder_output.begin() + buffer_start + sola_buffer_frame, sola_buffer.begin()); + } else { + int avail = static_cast(decoder_output.size() - buffer_start); + if (avail > 0) { + std::copy(decoder_output.begin() + buffer_start, decoder_output.end(), + sola_buffer.begin()); + } + std::fill(sola_buffer.begin() + avail, sola_buffer.end(), 0.0f); } - std::fill(sola_buffer.begin() + avail, sola_buffer.end(), 0.0f); - } - SLOGI("Inference #%d: Added %d + %d samples to output, cumulative length: %zu", i + 1, - sola_buffer_frame, remaining_len, pcmlist.size()); + SLOGI("Inference #%d: Added %d + %d samples to output, cumulative length: %zu", i + 1, + sola_buffer_frame, remaining_len, pcmlist.size()); + } } } - SLOGI("All inference completed, generated PCM length: %zu", pcmlist.size()); + SLOGI("All inference completed, raw generated PCM length: %zu", pcmlist.size()); + + if (pcmlist.size() > audio_len) { + SLOGI("Truncating output from %zu to %d samples as per encoder prediction", pcmlist.size(), audio_len); + pcmlist.resize(audio_len); + } + + SLOGI("Final PCM length after truncation: %zu", pcmlist.size()); // Post-processing: resample and convert to int16 double src_ratio = From 6e503a130c1b017a8ddeb5ef4ba180846caa04a1 Mon Sep 17 00:00:00 2001 From: LittleMouse Date: Fri, 9 May 2025 18:42:20 +0800 Subject: [PATCH 64/64] [update] delete debug log --- .../llm_framework/main_melotts/src/main.cpp | 45 +------------------ 
.../main_melotts/src/runner/Lexicon.hpp | 2 +- 2 files changed, 2 insertions(+), 45 deletions(-) diff --git a/projects/llm_framework/main_melotts/src/main.cpp b/projects/llm_framework/main_melotts/src/main.cpp index 4fab699..4c25df8 100644 --- a/projects/llm_framework/main_melotts/src/main.cpp +++ b/projects/llm_framework/main_melotts/src/main.cpp @@ -252,7 +252,6 @@ class llm_task { } return false; } - SLOGI("Processing text: %s", msg_str.c_str()); // Convert text to phonemes and tones std::vector phones_bef, tones_bef; @@ -262,8 +261,6 @@ class llm_task { int phone_len = phones.size(); std::vector langids(phone_len, 3); - SLOGI("Phoneme conversion completed, length: %d", phone_len); - // Run the encoder to generate hidden representations auto encoder_output = encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w, @@ -273,27 +270,19 @@ class llm_task { auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo(); auto zp_shape = zp_info.GetShape(); - SLOGI("Encoder output completed, shape: [%ld, %ld, %ld], expected audio length: %d", zp_shape[0], - zp_shape[1], zp_shape[2], audio_len); - // Calculate decoder parameters int zp_size = decoder_->GetInputSize(0) / sizeof(float); int dec_len = zp_size / zp_shape[1]; int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float); - const int pad_frames = 16; + const int pad_frames = 24; const int samples_per_frame = 512; - SLOGI("Decoder configuration: frame length=%d, audio slice length=%d, pad length=%d, samples per frame=%d", - dec_len, audio_slice_len, pad_frames, samples_per_frame); - const int effective_frames = dec_len - 2 * pad_frames; int dec_slice_num = static_cast(std::ceil(static_cast(zp_shape[2]) / static_cast(effective_frames))); - SLOGI("Will perform %d inferences, each with effective frames: %d", dec_slice_num, effective_frames); - // SOLA parameters setup const int sola_buffer_frame = pad_frames * samples_per_frame; // Overlap buffer length const int 
sola_search_frame = pad_frames * samples_per_frame; // Search window length @@ -344,10 +333,6 @@ class llm_task { output_start_frame = i * effective_frames; output_end_frame = (i + 1) * effective_frames - 1; } - - SLOGI("Inference #%d: input frame range=[%d-%d], actual length=%d, output frame range=[%d-%d]", i + 1, - input_start, input_start + actual_len - 1, actual_len, output_start_frame, output_end_frame); - // Prepare decoder input, initialize all to zero std::vector zp(zp_size, 0); @@ -365,8 +350,6 @@ class llm_task { decoder_->SetInput(zp.data(), 0); decoder_->SetInput(g_matrix.data(), 1); - SLOGI("Inference #%d: starting decoding...", i + 1); - if (0 != decoder_->Run()) { SLOGI("Inference #%d: decoding failed", i + 1); throw std::string("decoder_ RunSync error"); @@ -416,10 +399,6 @@ class llm_task { first_frame = false; - SLOGI( - "Inference #%d: First frame processing, added %d samples from position %d to output, saved %d " - "samples to SOLA buffer", - i + 1, audio_len, audio_start, sola_buffer_frame); } else { // Non-first frame: SOLA alignment required int audio_start = pad_frames * samples_per_frame; @@ -451,9 +430,6 @@ class llm_task { } } - SLOGI("Inference #%d: SOLA found best alignment offset %d with correlation coefficient %f", i + 1, - best_offset, best_correlation); - // 3. 
Apply alignment offset int aligned_start = audio_start + best_offset; @@ -482,9 +458,6 @@ class llm_task { int remaining_len = std::min(remaining_needed, static_cast(decoder_output.size() - remaining_start)); - SLOGI("Inference #%d (final): Expected total=%d, processed=%d, needed=%d, available=%d", i + 1, - total_expected_samples, processed_samples, remaining_needed, remaining_len); - if (remaining_len > 0) { pcmlist.insert(pcmlist.end(), decoder_output.begin() + remaining_start, decoder_output.begin() + remaining_start + remaining_len); @@ -514,42 +487,27 @@ class llm_task { } std::fill(sola_buffer.begin() + avail, sola_buffer.end(), 0.0f); } - - SLOGI("Inference #%d: Added %d + %d samples to output, cumulative length: %zu", i + 1, - sola_buffer_frame, remaining_len, pcmlist.size()); } } } - SLOGI("All inference completed, raw generated PCM length: %zu", pcmlist.size()); - if (pcmlist.size() > audio_len) { - SLOGI("Truncating output from %zu to %d samples as per encoder prediction", pcmlist.size(), audio_len); pcmlist.resize(audio_len); } - SLOGI("Final PCM length after truncation: %zu", pcmlist.size()); - // Post-processing: resample and convert to int16 double src_ratio = static_cast(mode_config_.audio_rate) / static_cast(mode_config_.mode_rate); std::vector tmp_pcm((pcmlist.size() * src_ratio + 1)); int len; - SLOGI("Starting audio resampling, source rate: %f, target rate: %f, ratio: %f", - static_cast(mode_config_.mode_rate), static_cast(mode_config_.audio_rate), src_ratio); - resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio); - SLOGI("Resampling completed, length after resampling: %d", len); - // Convert to 16-bit PCM wav_pcm_data.reserve(len); std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data), [](const auto val) { return static_cast(val * INT16_MAX); }); - SLOGI("Final audio length: %zu samples", wav_pcm_data.size()); - // Call the output callback function with the result if (out_callback_) 
{ out_callback_( @@ -557,7 +515,6 @@ class llm_task { finish); } - SLOGI("TTS processing completed, output callback invoked"); } catch (const std::exception &e) { SLOGI("TTS processing exception: %s", e.what()); return true; diff --git a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp index 29c3181..d1bcbe9 100644 --- a/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp +++ b/projects/llm_framework/main_melotts/src/runner/Lexicon.hpp @@ -9,7 +9,7 @@ #include #include "../../../../../SDK/components/utilities/include/sample_log.h" // Debug logging switch - set to true to enable debug logs -static bool DEBUG_LOGGING = true; +static bool DEBUG_LOGGING = false; // Macro for debug logging #define DEBUG_LOG(fmt, ...) \ do { \