xipingyan
diff --git a/samples/cpp/module_genai/CMakeLists.txt b/samples/cpp/module_genai/CMakeLists.txt
Lines changed: 27 additions & 0 deletions
diff --git a/samples/cpp/module_genai/comfyui/CMakeLists.txt b/samples/cpp/module_genai/comfyui/CMakeLists.txt
Lines changed: 22 additions & 0 deletions
diff --git a/samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_audio_image.yaml b/samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_audio_image.yaml
Lines changed: 34 additions & 34 deletions
@@ -11,6 +11,25 @@ find_package(OpenCV REQUIRED)
 # yaml-cpp dependency (use centralized cmake module or fetch if standalone)
 include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/yaml-cpp.cmake)
 
+# FFmpeg: optional, needed for audio extraction from video files.
+# It is probed via pkg-config; without pkg-config FFmpeg counts as not found.
+find_package(PkgConfig QUIET)
+if(PkgConfig_FOUND)
+    pkg_check_modules(FFMPEG IMPORTED_TARGET QUIET libavformat libavcodec libswresample libavutil)
+endif()
+if(PkgConfig_FOUND AND FFMPEG_FOUND)
+    message(STATUS "FFmpeg found — audio-in-video support enabled")
+    add_compile_definitions(HAVE_FFMPEG)
+    # libavformat pulls in libgdal/libhdf5 which both depend on libcurl;
+    # link it explicitly to satisfy the @CURL_OPENSSL_4 versioned symbols.
+    pkg_check_modules(LIBCURL IMPORTED_TARGET QUIET libcurl)
+    if(NOT LIBCURL_FOUND)
+        find_package(CURL QUIET)
+    endif()
+else()
+    message(STATUS "FFmpeg not found — audio-in-video will throw at runtime")
+endif()
+
 if(OpenCV_FOUND)
     message(STATUS "OpenCV found. Version: ${OpenCV_VERSION}")
 else()
@@ -25,6 +44,14 @@ function(add_sample_executable target_name)
         utils/audio_utils.cpp
     )
     target_link_libraries(${target_name} PRIVATE openvino::genai ${OpenCV_LIBS} ${YAML_CPP_TARGET})
+    if(TARGET PkgConfig::FFMPEG)  # test the imported targets, not *_FOUND flags
+        target_link_libraries(${target_name} PRIVATE PkgConfig::FFMPEG)
+        if(TARGET PkgConfig::LIBCURL)  # libcurl located through pkg-config
+            target_link_libraries(${target_name} PRIVATE PkgConfig::LIBCURL)
+        elseif(TARGET CURL::libcurl)  # fallback from find_package(CURL)
+            target_link_libraries(${target_name} PRIVATE CURL::libcurl)
+        endif()
+    endif()
     set_target_properties(${target_name} PROPERTIES
         # Ensure out-of-box LC_RPATH on macOS with SIP
         INSTALL_RPATH_USE_LINK_PATH ON)
 
@@ -8,6 +8,20 @@ find_package(OpenVINO REQUIRED)
 find_package(OpenVINOGenAI REQUIRED)
 find_package(yaml-cpp REQUIRED)
 
+# Optional FFmpeg probe (vision_utils.cpp may use FFmpeg); defines HAVE_FFMPEG when found
+find_package(PkgConfig QUIET)
+if(PkgConfig_FOUND)
+    pkg_check_modules(FFMPEG IMPORTED_TARGET QUIET libavformat libavcodec libswresample libavutil)
+endif()
+if(PkgConfig_FOUND AND FFMPEG_FOUND)
+    add_compile_definitions(HAVE_FFMPEG)
+    # Look for libcurl through pkg-config first, then through find_package
+    pkg_check_modules(LIBCURL IMPORTED_TARGET QUIET libcurl)
+    if(NOT LIBCURL_FOUND)
+        find_package(CURL QUIET)
+    endif()
+endif()
+
 add_executable(${TARGET_NAME}
   main.cpp
   ../utils/vision_utils.cpp
@@ -22,6 +36,14 @@ target_link_libraries(${TARGET_NAME} PRIVATE
   openvino::genai
   ${OpenCV_LIBS}
 )
+if(TARGET PkgConfig::FFMPEG)  # test the imported targets, not *_FOUND flags
+    target_link_libraries(${TARGET_NAME} PRIVATE PkgConfig::FFMPEG)
+    if(TARGET PkgConfig::LIBCURL)  # libcurl located through pkg-config
+        target_link_libraries(${TARGET_NAME} PRIVATE PkgConfig::LIBCURL)
+    elseif(TARGET CURL::libcurl)  # fallback from find_package(CURL)
+        target_link_libraries(${TARGET_NAME} PRIVATE CURL::libcurl)
+    endif()
+endif()
 
 install(TARGETS ${TARGET_NAME}
         RUNTIME DESTINATION bin/
 
@@ -5,32 +5,32 @@ pipeline_modules:
   pipeline_params:
     type: "ParameterModule"
     outputs:
-      - name: "image"
-        type: "OVTensor"
-      - name: "prompt"
-        type: "String"
-      - name: "audio"
-        type: "OVTensor"
+      - name: "images"
+        type: "VecOVTensor"
+      - name: "prompts"
+        type: "VecString"
+      - name: "audios"
+        type: "VecOVTensor"
 
   image_preprocessor:
     type: "ImagePreprocessModule"
     device: "CPU"
     description: "Image or Video preprocessing."
     inputs:
-      - name: "image"
-        type: "OVTensor"
-        source: "pipeline_params.image"
+      - name: "images"
+        type: "VecOVTensor"
+        source: "pipeline_params.images"
     outputs:
       - name: "pixel_values"
-        type: "OVTensor"
+        type: "VecOVTensor"
       - name: "grid_thw"
-        type: "OVTensor"
+        type: "VecOVTensor"
       - name: "pos_embeds"
-        type: "OVTensor"
+        type: "VecOVTensor"
       - name: "rotary_cos"
-        type: "OVTensor"
+        type: "VecOVTensor"
       - name: "rotary_sin"
-        type: "OVTensor"
+        type: "VecOVTensor"
     params:
       model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
 
@@ -39,9 +39,9 @@ pipeline_modules:
     device: "CPU"
     description: "Audio preprocessing."
     inputs:
-      - name: "audio"
-        type: "OVTensor"
-        source: "pipeline_params.audio"
+      - name: "audios"
+        type: "VecOVTensor"
+        source: "pipeline_params.audios"
     outputs:
       - name: "input_features"
         type: "VecOVTensor"
@@ -63,7 +63,7 @@ pipeline_modules:
         source: "audio_preprocessor.feature_attention_mask"
     outputs:
       - name: "audio_features"
-        type: "OVTensor"
+        type: "VecOVTensor"
       - name: "audio_feature_lengths"
         type: "OVTensor"
     params:
@@ -73,14 +73,14 @@ pipeline_modules:
     type: "TextEncoderModule"
     device: "GPU"
     inputs:
-      - name: "prompt"
-        type: "String"
-        source: "pipeline_params.prompt"
-      - name: "grid_thw"
-        type: "OVTensor"
+      - name: "prompts"
+        type: "VecString"
+        source: "pipeline_params.prompts"
+      - name: "image_grid_thw"
+        type: "VecOVTensor"
         source: "image_preprocessor.grid_thw"
       - name: "audio_features"
-        type: "OVTensor"
+        type: "VecOVTensor"
         source: "audio_encoder.audio_features"
     outputs:
       - name: "input_ids"
@@ -95,19 +95,19 @@ pipeline_modules:
     device: "GPU"
     inputs:
       - name: "preprocessed_image"
-        type: "OVTensor"
+        type: "VecOVTensor"
         source: "image_preprocessor.pixel_values"
-      - name: "grid_thw"
-        type: "OVTensor"
+      - name: "image_grid_thw"
+        type: "VecOVTensor"
         source: "image_preprocessor.grid_thw"
-      - name: "pos_embeds"
-        type: "OVTensor"
+      - name: "image_pos_embeds"
+        type: "VecOVTensor"
         source: "image_preprocessor.pos_embeds"
-      - name: "rotary_cos"
-        type: "OVTensor"
+      - name: "image_rotary_cos"
+        type: "VecOVTensor"
         source: "image_preprocessor.rotary_cos"
-      - name: "rotary_sin"
-        type: "OVTensor"
+      - name: "image_rotary_sin"
+        type: "VecOVTensor"
         source: "image_preprocessor.rotary_sin"
       - name: "input_ids"
         type: "OVTensor"
@@ -116,7 +116,7 @@ pipeline_modules:
         type: "OVTensor"
         source: "prompt_encoder.mask"
       - name: "audio_features"
-        type: "OVTensor"
+        type: "VecOVTensor"
         source: "audio_encoder.audio_features"
       - name: "audio_feature_lengths"
         type: "OVTensor"