Skip to content

Commit 16538b3

Browse files
committed
Enable video input for Qwen3-Omni
Enable video input for Qwen3-Omni. Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
1 parent 623e9e8 commit 16538b3

34 files changed

+1523
-576
lines changed

samples/cpp/module_genai/CMakeLists.txt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,25 @@ find_package(OpenCV REQUIRED)
1111
# yaml-cpp dependency (use centralized cmake module or fetch if standalone)
1212
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/yaml-cpp.cmake)
1313

14+
# FFmpeg: optional, needed for audio extraction from video files.
#
# Detection goes through pkg-config. On success HAVE_FFMPEG is defined for the
# targets of this directory and the PkgConfig::FFMPEG imported target is
# available for linking. The outcome is always reported, including the case
# where pkg-config itself is missing, so a build without FFmpeg is never silent.
find_package(PkgConfig QUIET)
if(PkgConfig_FOUND)
    pkg_check_modules(FFMPEG IMPORTED_TARGET QUIET
        libavformat libavcodec libswresample libavutil)
endif()

# PkgConfig_FOUND is part of the condition so that pkg_check_modules() below is
# only called when the FindPkgConfig module has actually been loaded.
if(PkgConfig_FOUND AND FFMPEG_FOUND)
    message(STATUS "FFmpeg found — audio-in-video support enabled")
    add_compile_definitions(HAVE_FFMPEG)
    # libavformat pulls in libgdal/libhdf5 which both depend on libcurl;
    # link it explicitly to satisfy the @CURL_OPENSSL_4 versioned symbols.
    pkg_check_modules(LIBCURL IMPORTED_TARGET QUIET libcurl)
    if(NOT LIBCURL_FOUND)
        # Fall back to CMake's own CURL find module when pkg-config has no libcurl entry.
        find_package(CURL QUIET)
    endif()
else()
    message(STATUS "FFmpeg not found — audio-in-video will throw at runtime")
endif()
32+
1433
if(OpenCV_FOUND)
1534
message(STATUS "OpenCV found. Version: ${OpenCV_VERSION}")
1635
else()
@@ -25,6 +44,14 @@ function(add_sample_executable target_name)
2544
utils/audio_utils.cpp
2645
)
2746
target_link_libraries(${target_name} PRIVATE openvino::genai ${OpenCV_LIBS} ${YAML_CPP_TARGET})
47+
if(FFMPEG_FOUND)
48+
target_link_libraries(${target_name} PRIVATE PkgConfig::FFMPEG)
49+
if(LIBCURL_FOUND)
50+
target_link_libraries(${target_name} PRIVATE PkgConfig::LIBCURL)
51+
elseif(CURL_FOUND)
52+
target_link_libraries(${target_name} PRIVATE CURL::libcurl)
53+
endif()
54+
endif()
2855
set_target_properties(${target_name} PROPERTIES
2956
# Ensure out-of-box LC_RPATH on macOS with SIP
3057
INSTALL_RPATH_USE_LINK_PATH ON)

samples/cpp/module_genai/comfyui/CMakeLists.txt

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,20 @@ find_package(OpenVINO REQUIRED)
88
find_package(OpenVINOGenAI REQUIRED)
99
find_package(yaml-cpp REQUIRED)
1010

11+
# FFmpeg is optional here: ../utils/vision_utils.cpp is compiled into this
# target and may use it. When pkg-config locates the FFmpeg libraries,
# HAVE_FFMPEG is defined for this directory and libcurl is looked up as well
# (pkg-config first, CMake's CURL find module second).
find_package(PkgConfig QUIET)
if(PkgConfig_FOUND)
    pkg_check_modules(FFMPEG IMPORTED_TARGET QUIET
        libavformat libavcodec libswresample libavutil)
endif()

# Guarded on PkgConfig_FOUND so pkg_check_modules() is only called when the
# FindPkgConfig module has been loaded.
if(PkgConfig_FOUND AND FFMPEG_FOUND)
    add_compile_definitions(HAVE_FFMPEG)
    pkg_check_modules(LIBCURL IMPORTED_TARGET QUIET libcurl)
    if(NOT LIBCURL_FOUND)
        find_package(CURL QUIET)
    endif()
endif()
24+
1125
add_executable(${TARGET_NAME}
1226
main.cpp
1327
../utils/vision_utils.cpp
@@ -22,6 +36,14 @@ target_link_libraries(${TARGET_NAME} PRIVATE
2236
openvino::genai
2337
${OpenCV_LIBS}
2438
)
39+
# Link FFmpeg when it was detected, together with whichever libcurl target the
# detection step produced: the pkg-config one is preferred, then CURL::libcurl,
# and FFmpeg alone if neither lookup succeeded.
if(NOT FFMPEG_FOUND)
    # FFmpeg was not detected: nothing extra to link.
elseif(LIBCURL_FOUND)
    target_link_libraries(${TARGET_NAME} PRIVATE PkgConfig::FFMPEG PkgConfig::LIBCURL)
elseif(CURL_FOUND)
    target_link_libraries(${TARGET_NAME} PRIVATE PkgConfig::FFMPEG CURL::libcurl)
else()
    target_link_libraries(${TARGET_NAME} PRIVATE PkgConfig::FFMPEG)
endif()
2547

2648
install(TARGETS ${TARGET_NAME}
2749
RUNTIME DESTINATION bin/

samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_audio_image.yaml

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -5,32 +5,32 @@ pipeline_modules:
55
pipeline_params:
66
type: "ParameterModule"
77
outputs:
8-
- name: "image"
9-
type: "OVTensor"
10-
- name: "prompt"
11-
type: "String"
12-
- name: "audio"
13-
type: "OVTensor"
8+
- name: "images"
9+
type: "VecOVTensor"
10+
- name: "prompts"
11+
type: "VecString"
12+
- name: "audios"
13+
type: "VecOVTensor"
1414

1515
image_preprocessor:
1616
type: "ImagePreprocessModule"
1717
device: "CPU"
1818
description: "Image or Video preprocessing."
1919
inputs:
20-
- name: "image"
21-
type: "OVTensor"
22-
source: "pipeline_params.image"
20+
- name: "images"
21+
type: "VecOVTensor"
22+
source: "pipeline_params.images"
2323
outputs:
2424
- name: "pixel_values"
25-
type: "OVTensor"
25+
type: "VecOVTensor"
2626
- name: "grid_thw"
27-
type: "OVTensor"
27+
type: "VecOVTensor"
2828
- name: "pos_embeds"
29-
type: "OVTensor"
29+
type: "VecOVTensor"
3030
- name: "rotary_cos"
31-
type: "OVTensor"
31+
type: "VecOVTensor"
3232
- name: "rotary_sin"
33-
type: "OVTensor"
33+
type: "VecOVTensor"
3434
params:
3535
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
3636

@@ -39,9 +39,9 @@ pipeline_modules:
3939
device: "CPU"
4040
description: "Audio preprocessing."
4141
inputs:
42-
- name: "audio"
43-
type: "OVTensor"
44-
source: "pipeline_params.audio"
42+
- name: "audios"
43+
type: "VecOVTensor"
44+
source: "pipeline_params.audios"
4545
outputs:
4646
- name: "input_features"
4747
type: "VecOVTensor"
@@ -63,7 +63,7 @@ pipeline_modules:
6363
source: "audio_preprocessor.feature_attention_mask"
6464
outputs:
6565
- name: "audio_features"
66-
type: "OVTensor"
66+
type: "VecOVTensor"
6767
- name: "audio_feature_lengths"
6868
type: "OVTensor"
6969
params:
@@ -73,14 +73,14 @@ pipeline_modules:
7373
type: "TextEncoderModule"
7474
device: "GPU"
7575
inputs:
76-
- name: "prompt"
77-
type: "String"
78-
source: "pipeline_params.prompt"
79-
- name: "grid_thw"
80-
type: "OVTensor"
76+
- name: "prompts"
77+
type: "VecString"
78+
source: "pipeline_params.prompts"
79+
- name: "image_grid_thw"
80+
type: "VecOVTensor"
8181
source: "image_preprocessor.grid_thw"
8282
- name: "audio_features"
83-
type: "OVTensor"
83+
type: "VecOVTensor"
8484
source: "audio_encoder.audio_features"
8585
outputs:
8686
- name: "input_ids"
@@ -95,19 +95,19 @@ pipeline_modules:
9595
device: "GPU"
9696
inputs:
9797
- name: "preprocessed_image"
98-
type: "OVTensor"
98+
type: "VecOVTensor"
9999
source: "image_preprocessor.pixel_values"
100-
- name: "grid_thw"
101-
type: "OVTensor"
100+
- name: "image_grid_thw"
101+
type: "VecOVTensor"
102102
source: "image_preprocessor.grid_thw"
103-
- name: "pos_embeds"
104-
type: "OVTensor"
103+
- name: "image_pos_embeds"
104+
type: "VecOVTensor"
105105
source: "image_preprocessor.pos_embeds"
106-
- name: "rotary_cos"
107-
type: "OVTensor"
106+
- name: "image_rotary_cos"
107+
type: "VecOVTensor"
108108
source: "image_preprocessor.rotary_cos"
109-
- name: "rotary_sin"
110-
type: "OVTensor"
109+
- name: "image_rotary_sin"
110+
type: "VecOVTensor"
111111
source: "image_preprocessor.rotary_sin"
112112
- name: "input_ids"
113113
type: "OVTensor"
@@ -116,7 +116,7 @@ pipeline_modules:
116116
type: "OVTensor"
117117
source: "prompt_encoder.mask"
118118
- name: "audio_features"
119-
type: "OVTensor"
119+
type: "VecOVTensor"
120120
source: "audio_encoder.audio_features"
121121
- name: "audio_feature_lengths"
122122
type: "OVTensor"

0 commit comments

Comments
 (0)