xipingyan
diff --git a/‎samples/cpp/module_genai/CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion b/‎samples/cpp/module_genai/CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_audio_image.yaml‎
Lines changed: 187 additions & 0 deletions b/‎samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_audio_image.yaml‎
Lines changed: 187 additions & 0 deletions
diff --git a/samples/cpp/module_genai/config_yaml/Qwen3-Omni/config.yaml b/samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_image.yaml (renamed from samples/cpp/module_genai/config_yaml/Qwen3-Omni/config.yaml to samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_image.yaml)
diff --git a/‎samples/cpp/module_genai/md_omni.cpp‎
Lines changed: 28 additions & 3 deletions b/‎samples/cpp/module_genai/md_omni.cpp‎
Lines changed: 28 additions & 3 deletions
@@ -22,6 +22,7 @@ function(add_sample_executable target_name)
         ${target_name}.cpp
         utils/utils.cpp
         utils/vision_utils.cpp
+        utils/audio_utils.cpp
     )
     target_link_libraries(${target_name} PRIVATE openvino::genai ${OpenCV_LIBS} ${YAML_CPP_TARGET})
     set_target_properties(${target_name} PROPERTIES
@@ -36,7 +37,8 @@ endfunction()
 set (SAMPLE_LIST
     md_image_generation
     md_video_generation
-    md_visual_language_chat)
+    md_visual_language_chat
+    md_omni)
 
 foreach(sample IN LISTS SAMPLE_LIST)
     add_sample_executable(${sample})
 
@@ -0,0 +1,187 @@
+global_context:  # Top-level settings, declared outside any individual module.
+  model_type: "qwen3_omni"
+
+pipeline_modules:  # Modules are wired together through "source: <module>.<output>" references.
+  pipeline_params:  # Has no inputs of its own; its outputs are the sources for the modules below.
+    type: "ParameterModule"
+    outputs:
+      - name: "image"  # Read by image_preprocessor.
+        type: "OVTensor"
+      - name: "prompt"  # Read by prompt_encoder.
+        type: "String"
+      - name: "audio"  # Read by audio_preprocessor.
+        type: "OVTensor"
+
+  image_preprocessor:  # Turns the image tensor into the tensors that vision_encoder takes as inputs.
+    type: "ImagePreprocessModule"
+    device: "CPU"
+    description: "Image or Video preprocessing."
+    inputs:
+      - name: "image"
+        type: "OVTensor"
+        source: "pipeline_params.image"
+    outputs:
+      - name: "pixel_values"  # Wired to vision_encoder's "preprocessed_image" input.
+        type: "OVTensor"
+      - name: "grid_thw"  # Also read by prompt_encoder and llm, not only by vision_encoder.
+        type: "OVTensor"
+      - name: "pos_embeds"
+        type: "OVTensor"
+      - name: "rotary_cos"
+        type: "OVTensor"
+      - name: "rotary_sin"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"  # Model directory; the path is relative, so it depends on the working directory.
+
+  audio_preprocessor:  # Produces the two inputs that audio_encoder reads.
+    type: "AudioPreprocessModule"
+    device: "CPU"
+    description: "Audio preprocessing."
+    inputs:
+      - name: "audio"
+        type: "OVTensor"
+        source: "pipeline_params.audio"
+    outputs:
+      - name: "input_features"  # Declared as VecOVTensor, whereas the input above is a single OVTensor.
+        type: "VecOVTensor"
+      - name: "feature_attention_mask"
+        type: "VecOVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"  # Same model directory as image_preprocessor and prompt_encoder.
+
+  audio_encoder:  # Consumes both audio_preprocessor outputs; its results feed prompt_encoder and vision_encoder.
+    type: "AudioEncoderModule"
+    device: "GPU"
+    description: "Audio encoder for Qwen 3-Omni."
+    inputs:
+      - name: "input_features"
+        type: "VecOVTensor"
+        source: "audio_preprocessor.input_features"
+      - name: "feature_attention_mask"
+        type: "VecOVTensor"
+        source: "audio_preprocessor.feature_attention_mask"
+    outputs:
+      - name: "audio_features"  # Read by both prompt_encoder and vision_encoder.
+        type: "OVTensor"
+      - name: "audio_feature_lengths"  # Read by vision_encoder only.
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_audio_encoder.xml"  # Points at a specific .xml file rather than the model directory.
+
+  prompt_encoder:  # Encodes the text prompt; unlike the other processing modules it has no "description" field.
+    type: "TextEncoderModule"
+    device: "GPU"
+    inputs:
+      - name: "prompt"
+        type: "String"
+        source: "pipeline_params.prompt"
+      - name: "grid_thw"  # NOTE(review): presumably used to size image placeholder tokens in the prompt - confirm.
+        type: "OVTensor"
+        source: "image_preprocessor.grid_thw"
+      - name: "audio_features"  # NOTE(review): presumably used to size audio placeholder tokens in the prompt - confirm.
+        type: "OVTensor"
+        source: "audio_encoder.audio_features"
+    outputs:
+      - name: "input_ids"  # Read by vision_encoder and llm.
+        type: "OVTensor"
+      - name: "mask"  # Wired to vision_encoder's "attention_mask" input.
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"  # Model directory, not a single .xml file.
+
+  vision_encoder:  # Despite the name, this module also takes audio features and emits audio outputs for llm.
+    type: "VisionEncoderModule"
+    device: "GPU"
+    inputs:
+      - name: "preprocessed_image"  # Input name differs from its source output name (pixel_values).
+        type: "OVTensor"
+        source: "image_preprocessor.pixel_values"
+      - name: "grid_thw"
+        type: "OVTensor"
+        source: "image_preprocessor.grid_thw"
+      - name: "pos_embeds"
+        type: "OVTensor"
+        source: "image_preprocessor.pos_embeds"
+      - name: "rotary_cos"
+        type: "OVTensor"
+        source: "image_preprocessor.rotary_cos"
+      - name: "rotary_sin"
+        type: "OVTensor"
+        source: "image_preprocessor.rotary_sin"
+      - name: "input_ids"
+        type: "OVTensor"
+        source: "prompt_encoder.input_ids"
+      - name: "attention_mask"  # Input name differs from its source output name (mask).
+        type: "OVTensor"
+        source: "prompt_encoder.mask"
+      - name: "audio_features"
+        type: "OVTensor"
+        source: "audio_encoder.audio_features"
+      - name: "audio_feature_lengths"
+        type: "OVTensor"
+        source: "audio_encoder.audio_feature_lengths"
+    outputs:
+      - name: "image_embedding"  # Wired to llm's "visual_embeds" input.
+        type: "OVTensor"
+      - name: "visual_pos_mask"
+        type: "OVTensor"
+      - name: "position_ids"
+        type: "OVTensor"
+      - name: "rope_delta"
+        type: "OVTensor"
+      - name: "deepstack_embeds"  # The only VecOVTensor output of this module.
+        type: "VecOVTensor"
+      - name: "audio_embedding"  # Wired to llm's "audio_embeds" input.
+        type: "OVTensor"
+      - name: "audio_pos_mask"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_vision_model.xml"  # Points at a specific .xml file rather than the model directory.
+      vision_start_token_id: 248053  # NOTE(review): model-specific token id - confirm it matches the tokenizer of the model above.
+
+  llm:  # Final generation stage; every input except input_ids and grid_thw comes from vision_encoder.
+    type: "LLMInferenceSDPAModule"
+    device: "GPU"
+    inputs:
+      - name: "input_ids"
+        type: "OVTensor"
+        source: "prompt_encoder.input_ids"
+      - name: "visual_embeds"  # Input name differs from its source output name (image_embedding).
+        type: "OVTensor"
+        source: "vision_encoder.image_embedding"
+      - name: "visual_pos_mask"
+        type: "OVTensor"
+        source: "vision_encoder.visual_pos_mask"
+      - name: "grid_thw"
+        type: "OVTensor"
+        source: "image_preprocessor.grid_thw"
+      - name: "position_ids"
+        type: "OVTensor"
+        source: "vision_encoder.position_ids"
+      - name: "rope_delta"
+        type: "OVTensor"
+        source: "vision_encoder.rope_delta"
+      - name: "deepstack_embeds"
+        type: "VecOVTensor"
+        source: "vision_encoder.deepstack_embeds"
+      - name: "audio_embeds"  # Input name differs from its source output name (audio_embedding).
+        type: "OVTensor"
+        source: "vision_encoder.audio_embedding"
+      - name: "audio_pos_mask"
+        type: "OVTensor"
+        source: "vision_encoder.audio_pos_mask"
+    outputs:
+      - name: "generated_text"  # Read by pipeline_result.
+        type: "String"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_text_model.xml"  # Points at a specific .xml file rather than the model directory.
+      max_new_tokens: 512  # NOTE(review): presumably caps the number of generated tokens - confirm.
+
+  pipeline_result:  # Terminal module: declares inputs only, no outputs.
+    type: "ResultModule"
+    description: "Collects final results and formats the output structure."
+    inputs:
+      - name: "generated_text"  # The md_omni sample reads this name back via pipe.get_output("generated_text").
+        type: "String"
+        source: "llm.generated_text"
@@ -10,6 +10,7 @@
 #include "utils/vision_utils.hpp"
 #include "yaml-cpp/yaml.h"
 #include "utils/utils.hpp"
+#include "utils/audio_utils.hpp"
 
 inline ov::AnyMap parse_inputs_from_yaml_cfg_for_vlm(const std::filesystem::path& cfg_yaml_path,
                                                      const std::string& prompt = std::string{},
@@ -52,11 +53,11 @@ inline ov::AnyMap parse_inputs_from_yaml_cfg_for_vlm(const std::filesystem::path
             continue;
         }
 
-        if (param_type == "String" && utils::contains_key(param_name, {"audio"})) {
+        if (param_type == "OVTensor" && utils::contains_key(param_name, {"audio"})) {
             if (audio_path.empty()) {
                 throw std::runtime_error("Audio path is empty.");
             }
-            inputs[param_name] = audio_utils::read_wav(audio_path);
+            inputs[param_name] = audio_utils::load_audio(audio_path);
             continue;
         }
     }
@@ -73,7 +74,9 @@ int main(int argc, char* argv[]) {
                                      "  -prompt: input prompt\n"
                                      "  -img: [Optional] image path\n"
                                      "  -video: [Optional] video path\n"
-                                     "  -audio: [Optional] audio path\n");
+                                     "  -audio: [Optional] audio path\n"
+                                     "  -warmup: [Optional] number of warmup runs, default 0\n"
+                                     "  -perf: [Optional] set to 1 to print performance metrics, default 0\n");
         }
 
         std::filesystem::path config_path = utils::get_input_arg(argc, argv, "-cfg", std::string{});
@@ -82,6 +85,8 @@ int main(int argc, char* argv[]) {
         std::string img_path = utils::get_input_arg(argc, argv, "-img", std::string{});
         std::string video_path = utils::get_input_arg(argc, argv, "-video", std::string{});
         std::string audio_path = utils::get_input_arg(argc, argv, "-audio", std::string{});
+        int warmup = std::stoi(utils::get_input_arg(argc, argv, "-warmup", std::string("0")));
+        bool perf = std::stoi(utils::get_input_arg(argc, argv, "-perf", std::string("0")));
 
         ov::AnyMap inputs = parse_inputs_from_yaml_cfg_for_vlm(config_path, prompt, img_path, video_path, audio_path);
 
@@ -105,8 +110,28 @@ int main(int argc, char* argv[]) {
 
         ov::genai::module::ModulePipeline pipe(config_path, properties);
 
+        for (int i = 0; i < warmup; ++i) {
+            std::cout << "[Warmup] Run " << (i + 1) << "/" << warmup << std::endl;
+            auto t1 = std::chrono::high_resolution_clock::now();
+            pipe.generate(inputs);
+            auto t2 = std::chrono::high_resolution_clock::now();
+            if (perf) {
+                auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
+                std::cout << "[Warmup] Duration: " << diff << " ms" << std::endl;
+            }
+        }
+
+        std::cout << "[Generation] Running main generation..." << std::endl;
+        auto t1 = std::chrono::high_resolution_clock::now();
+
         pipe.generate(inputs);
 
+        auto t2 = std::chrono::high_resolution_clock::now();
+        if (perf) {
+            auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
+            std::cout << "[Generation] Duration: " << diff << " ms" << std::endl;
+        }
+
         std::cout << "Generation Result: " << pipe.get_output("generated_text").as<std::string>() << std::endl;
     } catch (const std::exception& ex) {
         std::cerr << "[ERROR] " << ex.what() << std::endl;