xipingyan
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_audio_image_video_tts.yaml‎
Lines changed: 266 additions & 0 deletions b/‎samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_audio_image_video_tts.yaml‎
Lines changed: 266 additions & 0 deletions
diff --git a/‎samples/cpp/module_genai/md_omni.cpp‎
Lines changed: 17 additions & 1 deletion b/‎samples/cpp/module_genai/md_omni.cpp‎
Lines changed: 17 additions & 1 deletion
diff --git a/‎samples/cpp/module_genai/utils/audio_utils.cpp‎
Lines changed: 36 additions & 0 deletions b/‎samples/cpp/module_genai/utils/audio_utils.cpp‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎samples/cpp/module_genai/utils/audio_utils.hpp‎
Lines changed: 2 additions & 0 deletions b/‎samples/cpp/module_genai/utils/audio_utils.hpp‎
Lines changed: 2 additions & 0 deletions
@@ -53,3 +53,6 @@ samples/python/module_genai/*.sh
 tests/module_genai/cpp/*.sh
 tests/module_genai/cpp/*.yaml
 tests/module_genai/cpp/test_data/*.json
+modelprint_*_before_la_fusion.cpp
+output_audio_*.wav
+profile_*.json
@@ -0,0 +1,266 @@
+global_context:  # Top-level settings that sit outside any single module entry.
+  model_type: "qwen3_omni"  # presumably selects the Qwen3-Omni code paths in the modules below; confirm against the loader
+
+pipeline_modules:
+  pipeline_params:  # Entry module: declares the five external inputs that other modules reference as "pipeline_params.<name>".
+    type: "ParameterModule"
+    outputs:
+      - name: "videos"  # consumed by video_preprocessor
+        type: "VecOVTensor"
+      - name: "images"  # consumed by image_preprocessor
+        type: "VecOVTensor"
+      - name: "prompts"  # consumed by prompt_encoder
+        type: "VecString"
+      - name: "audios"  # consumed by audio_preprocessor
+        type: "VecOVTensor"
+      - name: "use_audio_in_video"  # consumed by prompt_encoder
+        type: "VecInt"
+
+  image_preprocessor:  # Turns the raw "images" input into the five tensors listed under outputs.
+    type: "ImagePreprocessModule"
+    device: "CPU"
+    description: "Image or Video preprocessing."  # NOTE(review): mentions video, but only "images" is wired in here; videos go through video_preprocessor
+    inputs:
+      - name: "images"
+        type: "VecOVTensor"
+        source: "pipeline_params.images"
+    outputs:
+      - name: "pixel_values"  # consumed by vision_encoder as "preprocessed_image"
+        type: "VecOVTensor"
+      - name: "grid_thw"  # consumed by prompt_encoder and vision_encoder as "image_grid_thw"
+        type: "VecOVTensor"
+      - name: "pos_embeds"
+        type: "VecOVTensor"
+      - name: "rotary_cos"
+        type: "VecOVTensor"
+      - name: "rotary_sin"
+        type: "VecOVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"  # relative path; presumably resolved against the working directory — confirm
+
+  video_preprocessor:  # Turns the raw "videos" input into the tensors listed under outputs.
+    type: "VideoPreprocessModule"
+    device: "CPU"
+    description: "Video preprocessing."
+    inputs:
+      - name: "videos"
+        type: "VecOVTensor"
+        source: "pipeline_params.videos"
+    outputs:
+      - name: "pixel_values"  # consumed by vision_encoder as "preprocessed_video"
+        type: "VecOVTensor"
+      - name: "grid_thw"  # consumed by prompt_encoder, vision_encoder and llm
+        type: "VecOVTensor"
+      - name: "pos_embeds"
+        type: "VecOVTensor"
+      - name: "rotary_cos"
+        type: "VecOVTensor"
+      - name: "rotary_sin"
+        type: "VecOVTensor"
+      - name: "video_second_per_grid"  # consumed by prompt_encoder; quoted like every other name in this file
+        type: "VecInt"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
+
+  audio_preprocessor:  # Turns the raw "audios" input into the two tensors that audio_encoder consumes.
+    type: "AudioPreprocessModule"
+    device: "CPU"
+    description: "Audio preprocessing."
+    inputs:
+      - name: "audios"
+        type: "VecOVTensor"
+        source: "pipeline_params.audios"
+    outputs:
+      - name: "input_features"
+        type: "VecOVTensor"
+      - name: "feature_attention_mask"
+        type: "VecOVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
+
+  audio_encoder:  # Encodes both audio_preprocessor outputs; its results feed prompt_encoder and vision_encoder.
+    type: "AudioEncoderModule"
+    device: "GPU"
+    description: "Audio encoder for Qwen 3-Omni."
+    inputs:
+      - name: "input_features"
+        type: "VecOVTensor"
+        source: "audio_preprocessor.input_features"
+      - name: "feature_attention_mask"
+        type: "VecOVTensor"
+        source: "audio_preprocessor.feature_attention_mask"
+    outputs:
+      - name: "audio_features"  # consumed by prompt_encoder and vision_encoder
+        type: "VecOVTensor"
+      - name: "audio_feature_lengths"  # consumed by vision_encoder; a single OVTensor, unlike audio_features
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_audio_encoder.xml"
+
+  prompt_encoder:  # Produces "input_ids" and "mask" from the prompts plus the image/video/audio metadata wired in below.
+    type: "TextEncoderModule"
+    device: "GPU"
+    inputs:
+      - name: "prompts"
+        type: "VecString"
+        source: "pipeline_params.prompts"
+      - name: "image_grid_thw"
+        type: "VecOVTensor"
+        source: "image_preprocessor.grid_thw"
+      - name: "video_grid_thw"
+        type: "VecOVTensor"
+        source: "video_preprocessor.grid_thw"
+      - name: "audio_features"
+        type: "VecOVTensor"
+        source: "audio_encoder.audio_features"
+      - name: "video_second_per_grid"
+        type: "VecInt"
+        source: "video_preprocessor.video_second_per_grid"
+      - name: "use_audio_in_video"
+        type: "VecInt"
+        source: "pipeline_params.use_audio_in_video"
+    outputs:
+      - name: "input_ids"  # consumed by vision_encoder and llm
+        type: "OVTensor"
+      - name: "mask"  # consumed by vision_encoder under the name "attention_mask"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
+
+  vision_encoder:  # Despite the name, this module also takes the audio encoder outputs and emits the audio embedding and mask that llm consumes.
+    type: "VisionEncoderModule"
+    device: "GPU"
+    inputs:
+      - name: "preprocessed_image"
+        type: "VecOVTensor"
+        source: "image_preprocessor.pixel_values"
+      - name: "image_grid_thw"
+        type: "VecOVTensor"
+        source: "image_preprocessor.grid_thw"
+      - name: "image_pos_embeds"
+        type: "VecOVTensor"
+        source: "image_preprocessor.pos_embeds"
+      - name: "image_rotary_cos"
+        type: "VecOVTensor"
+        source: "image_preprocessor.rotary_cos"
+      - name: "image_rotary_sin"
+        type: "VecOVTensor"
+        source: "image_preprocessor.rotary_sin"
+      - name: "preprocessed_video"
+        type: "VecOVTensor"
+        source: "video_preprocessor.pixel_values"
+      - name: "video_grid_thw"
+        type: "VecOVTensor"
+        source: "video_preprocessor.grid_thw"
+      - name: "video_pos_embeds"
+        type: "VecOVTensor"
+        source: "video_preprocessor.pos_embeds"
+      - name: "video_rotary_cos"
+        type: "VecOVTensor"
+        source: "video_preprocessor.rotary_cos"
+      - name: "video_rotary_sin"
+        type: "VecOVTensor"
+        source: "video_preprocessor.rotary_sin"
+      - name: "input_ids"
+        type: "OVTensor"
+        source: "prompt_encoder.input_ids"
+      - name: "attention_mask"
+        type: "OVTensor"
+        source: "prompt_encoder.mask"
+      - name: "audio_features"
+        type: "VecOVTensor"
+        source: "audio_encoder.audio_features"
+      - name: "audio_feature_lengths"
+        type: "OVTensor"
+        source: "audio_encoder.audio_feature_lengths"
+    outputs:
+      - name: "image_embedding"  # consumed by llm as "visual_embeds"
+        type: "OVTensor"
+      - name: "visual_pos_mask"
+        type: "OVTensor"
+      - name: "position_ids"
+        type: "OVTensor"
+      - name: "rope_delta"
+        type: "OVTensor"
+      - name: "deepstack_embeds"
+        type: "VecOVTensor"
+      - name: "audio_embedding"  # consumed by llm as "audio_embeds"
+        type: "OVTensor"
+      - name: "audio_pos_mask"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_vision_model.xml"
+      vision_start_token_id: 248053  # NOTE(review): hard-coded token id; confirm it matches the tokenizer shipped with this model
+
+  llm:  # Text generation stage: consumes the prompt ids plus the vision_encoder outputs and emits "generated_text".
+    type: "LLMInferenceSDPAModule"
+    device: "GPU"
+    inputs:
+      - name: "input_ids"
+        type: "OVTensor"
+        source: "prompt_encoder.input_ids"
+      - name: "visual_embeds"
+        type: "OVTensor"
+        source: "vision_encoder.image_embedding"
+      - name: "visual_pos_mask"
+        type: "OVTensor"
+        source: "vision_encoder.visual_pos_mask"
+      - name: "grid_thw"
+        type: "VecOVTensor"
+        source: "video_preprocessor.grid_thw"  # NOTE(review): only the video grid is wired in; image_preprocessor.grid_thw is not passed — confirm this is intended
+      - name: "position_ids"
+        type: "OVTensor"
+        source: "vision_encoder.position_ids"
+      - name: "rope_delta"
+        type: "OVTensor"
+        source: "vision_encoder.rope_delta"
+      - name: "deepstack_embeds"
+        type: "VecOVTensor"
+        source: "vision_encoder.deepstack_embeds"
+      - name: "audio_embeds"
+        type: "OVTensor"
+        source: "vision_encoder.audio_embedding"
+      - name: "audio_pos_mask"
+        type: "OVTensor"
+        source: "vision_encoder.audio_pos_mask"
+    outputs:
+      - name: "generated_text"  # consumed by text_to_speech as "text"
+        type: "String"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_text_model.xml"
+      max_new_tokens: 512
+
+  text_to_speech:  # Converts llm.generated_text into audio tensors plus one sample rate per tensor.
+    type: "TextToSpeechModule"
+    device: "GPU"
+    inputs:
+      - name: "text"
+        type: "String"
+        source: "llm.generated_text"
+    outputs:
+      - name: "audios"
+        type: "VecOVTensor"
+      - name: "sample_rates"
+        type: "VecInt"
+    params:
+      config_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/config.json"
+      tokenizer_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual"
+      embedding_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_talker_embedding_model.xml"
+      prefill_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_talker_prefill_model.xml"
+      decode_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_talker_decode_model.xml"
+      codec_embedding_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_talker_codec_embedding_model.xml"
+      code_predictor_ar_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual"  # NOTE(review): points at the model directory, unlike the sibling *_model_path entries that name an .xml file — confirm this is intended
+      code_predictor_single_codec_embed_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual"  # NOTE(review): also a directory, and the key differs from the next one only by "embed" vs "embedding" — confirm both keys are read
+      code_predictor_single_codec_embedding_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_code_predictor_codec_embed_model.xml"
+      speech_decoder_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_speech_decoder_model.xml"
+
+  pipeline_result:  # Collects only the TTS outputs; llm.generated_text is not routed here, presumably why md_omni.cpp reads "audios"/"sample_rates" instead of "generated_text" when run with -tts 1.
+    type: "ResultModule"
+    description: "Collects final results and formats the output structure."
+    inputs:
+      - name: "audios"
+        type: "VecOVTensor"
+        source: "text_to_speech.audios"
+      - name: "sample_rates"
+        type: "VecInt"
+        source: "text_to_speech.sample_rates"
@@ -72,6 +72,7 @@ int main(int argc, char* argv[]) {
                                      "  -video: [Optional] video path\n"
                                      "  -audio: [Optional] audio path\n"
                                      "  -use_audio_in_video: [Optional] set to 1 if the video contains audio and you want to use the audio, default 0\n"
+                                     "  -tts: [Optional] set to 1 to use tts, default 0\n"
                                      "  -warmup: [Optional] number of warmup runs, default 0\n"
                                      "  -perf: [Optional] set to 1 to print performance metrics, default 0\n");
         }
@@ -80,6 +81,7 @@ int main(int argc, char* argv[]) {
         std::string cache_dir = utils::get_input_arg(argc, argv, "-cache_dir", std::string{});
         int warmup = std::stoi(utils::get_input_arg(argc, argv, "-warmup", std::string("0")));
         bool perf = std::stoi(utils::get_input_arg(argc, argv, "-perf", std::string("0")));
+        bool use_tts = std::stoi(utils::get_input_arg(argc, argv, "-tts", std::string("0"))) != 0;
 
         utils::OmniInputParams input_params = utils::parse_omni_input_params(argc, argv);
         ov::AnyMap inputs = parse_inputs_for_omni(input_params);
@@ -126,7 +128,21 @@ int main(int argc, char* argv[]) {
             std::cout << "[Generation] Duration: " << diff << " ms" << std::endl;
         }
 
-        std::cout << "Generation Result: " << pipe.get_output("generated_text").as<std::string>() << std::endl;
+        if (!use_tts) {
+            std::cout << "Generation Result: " << pipe.get_output("generated_text").as<std::string>() << std::endl;
+        } else {
+            std::vector<ov::Tensor> audios = pipe.get_output("audios").as<std::vector<ov::Tensor>>();
+            std::vector<int> sample_rates = pipe.get_output("sample_rates").as<std::vector<int>>();
+            for (size_t i = 0; i < audios.size(); ++i) {
+                std::string output_path = "output_audio_" + std::to_string(i) + ".wav";
+                auto audio_data = audios[i].data<const float>();
+                const size_t sample_count = audios[i].get_size();
+                std::cout << "sample_rate: " << sample_rates[i] << ", sample_count: " << sample_count << std::endl;
+                audio_utils::write_wav(output_path, audio_data, sample_count, sample_rates[i]);
+                std::cout << "Saved generated audio to: " << output_path << std::endl;
+            }
+        }
+        
     } catch (const std::exception& ex) {
         std::cerr << "[ERROR] " << ex.what() << std::endl;
         return EXIT_FAILURE;
 
@@ -204,4 +204,40 @@ ov::Tensor load_audio(const std::filesystem::path& audio_path) {
     return tensor;
 }
 
+void write_wav(const std::string& filename, const float* samples, size_t num_samples, int sample_rate) {  // Writes `num_samples` floats from `samples` to `filename` as a mono 16-bit WAV file; throws std::runtime_error if the file cannot be opened.
+    std::ofstream file(filename, std::ios::binary);  // binary mode: header fields and samples are emitted as raw bytes
+    if (!file) {
+        throw std::runtime_error("Failed to create WAV file: " + filename);
+    }
+
+    int32_t data_size = static_cast<int32_t>(num_samples * sizeof(int16_t));  // payload bytes, 2 per sample; NOTE(review): the cast silently wraps once num_samples * 2 exceeds INT32_MAX — confirm callers stay below that
+    int32_t file_size = 36 + data_size;  // value of the size field after "RIFF": the 36 header bytes that follow it plus the payload
+    int16_t audio_format = 1;  // format tag stored in the fmt chunk (1 is integer PCM in the RIFF/WAVE format)
+    int16_t num_channels = 1;  // output is always single-channel
+    int32_t byte_rate = sample_rate * num_channels * 2;  // bytes per second: rate * channels * 2 bytes per sample
+    int16_t block_align = num_channels * 2;  // bytes per sample frame
+    int16_t bits_per_sample = 16;
+
+    file.write("RIFF", 4);
+    file.write(reinterpret_cast<char*>(&file_size), 4);  // NOTE(review): this and every numeric field below are written in host byte order, so the file is only well-formed on little-endian hosts — confirm target platforms
+    file.write("WAVE", 4);
+    file.write("fmt ", 4);
+    int32_t fmt_size = 16;  // size of the fmt chunk body written next: 2 + 2 + 4 + 4 + 2 + 2 bytes
+    file.write(reinterpret_cast<char*>(&fmt_size), 4);
+    file.write(reinterpret_cast<char*>(&audio_format), 2);
+    file.write(reinterpret_cast<char*>(&num_channels), 2);
+    file.write(reinterpret_cast<char*>(&sample_rate), 4);  // writes the by-value parameter's bytes; NOTE(review): assumes int is 4 bytes wide
+    file.write(reinterpret_cast<char*>(&byte_rate), 4);
+    file.write(reinterpret_cast<char*>(&block_align), 2);
+    file.write(reinterpret_cast<char*>(&bits_per_sample), 2);
+    file.write("data", 4);
+    file.write(reinterpret_cast<char*>(&data_size), 4);
+
+    for (size_t i = 0; i < num_samples; ++i) {  // convert and emit one sample at a time
+        float v = std::max(-1.0f, std::min(1.0f, samples[i]));  // clamp to [-1.0, 1.0] before scaling
+        int16_t s = static_cast<int16_t>(v * 32767.0f);  // scale to the int16 range; the cast truncates toward zero
+        file.write(reinterpret_cast<char*>(&s), 2);  // NOTE(review): stream state is never checked after writing, so a failed write (e.g. disk full) goes unreported
+    }
+}
+
 } // namespace audio_utils
@@ -9,4 +9,6 @@ namespace audio_utils {
 
 ov::Tensor load_audio(const std::filesystem::path& audio_path);
 
+void write_wav(const std::string& filename, const float* samples, size_t num_samples, int sample_rate);  // Writes `num_samples` float samples to `filename` as a mono 16-bit WAV file at `sample_rate`; throws std::runtime_error if the file cannot be created.
+
 }
Original file line number	Diff line number	Diff line change
`@@ -9,4 +9,6 @@ namespace audio_utils {`
`9`	`9`
`10`	`10`	`ov::Tensor load_audio(const std::filesystem::path& audio_path);`
`11`	`11`
	`12`	`+void write_wav(const std::string& filename, const float* samples, size_t num_samples, int sample_rate);`
	`13`	`+`
`12`	`14`	`}`