Skip to content

Commit b7b5982

Browse files
authored
Enable audio input for Qwen 3-Omni (#120)
Enable audio input for Qwen 3-Omni. Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
1 parent: f1b9b8e; commit: b7b5982

23 files changed: 2,521 lines added and 75 lines removed.

samples/cpp/module_genai/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ function(add_sample_executable target_name)
2222
${target_name}.cpp
2323
utils/utils.cpp
2424
utils/vision_utils.cpp
25+
utils/audio_utils.cpp
2526
)
2627
target_link_libraries(${target_name} PRIVATE openvino::genai ${OpenCV_LIBS} ${YAML_CPP_TARGET})
2728
set_target_properties(${target_name} PROPERTIES
@@ -36,7 +37,8 @@ endfunction()
3637
set (SAMPLE_LIST
3738
md_image_generation
3839
md_video_generation
39-
md_visual_language_chat)
40+
md_visual_language_chat
41+
md_omni)
4042

4143
foreach(sample IN LISTS SAMPLE_LIST)
4244
add_sample_executable(${sample})
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
global_context:
2+
model_type: "qwen3_omni"
3+
4+
pipeline_modules:
5+
pipeline_params:
6+
type: "ParameterModule"
7+
outputs:
8+
- name: "image"
9+
type: "OVTensor"
10+
- name: "prompt"
11+
type: "String"
12+
- name: "audio"
13+
type: "OVTensor"
14+
15+
image_preprocessor:
16+
type: "ImagePreprocessModule"
17+
device: "CPU"
18+
description: "Image or Video preprocessing."
19+
inputs:
20+
- name: "image"
21+
type: "OVTensor"
22+
source: "pipeline_params.image"
23+
outputs:
24+
- name: "pixel_values"
25+
type: "OVTensor"
26+
- name: "grid_thw"
27+
type: "OVTensor"
28+
- name: "pos_embeds"
29+
type: "OVTensor"
30+
- name: "rotary_cos"
31+
type: "OVTensor"
32+
- name: "rotary_sin"
33+
type: "OVTensor"
34+
params:
35+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
36+
37+
audio_preprocessor:
38+
type: "AudioPreprocessModule"
39+
device: "CPU"
40+
description: "Audio preprocessing."
41+
inputs:
42+
- name: "audio"
43+
type: "OVTensor"
44+
source: "pipeline_params.audio"
45+
outputs:
46+
- name: "input_features"
47+
type: "VecOVTensor"
48+
- name: "feature_attention_mask"
49+
type: "VecOVTensor"
50+
params:
51+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
52+
53+
audio_encoder:
54+
type: "AudioEncoderModule"
55+
device: "GPU"
56+
description: "Audio encoder for Qwen 3-Omni."
57+
inputs:
58+
- name: "input_features"
59+
type: "VecOVTensor"
60+
source: "audio_preprocessor.input_features"
61+
- name: "feature_attention_mask"
62+
type: "VecOVTensor"
63+
source: "audio_preprocessor.feature_attention_mask"
64+
outputs:
65+
- name: "audio_features"
66+
type: "OVTensor"
67+
- name: "audio_feature_lengths"
68+
type: "OVTensor"
69+
params:
70+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_audio_encoder.xml"
71+
72+
prompt_encoder:
73+
type: "TextEncoderModule"
74+
device: "GPU"
75+
inputs:
76+
- name: "prompt"
77+
type: "String"
78+
source: "pipeline_params.prompt"
79+
- name: "grid_thw"
80+
type: "OVTensor"
81+
source: "image_preprocessor.grid_thw"
82+
- name: "audio_features"
83+
type: "OVTensor"
84+
source: "audio_encoder.audio_features"
85+
outputs:
86+
- name: "input_ids"
87+
type: "OVTensor"
88+
- name: "mask"
89+
type: "OVTensor"
90+
params:
91+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
92+
93+
vision_encoder:
94+
type: "VisionEncoderModule"
95+
device: "GPU"
96+
inputs:
97+
- name: "preprocessed_image"
98+
type: "OVTensor"
99+
source: "image_preprocessor.pixel_values"
100+
- name: "grid_thw"
101+
type: "OVTensor"
102+
source: "image_preprocessor.grid_thw"
103+
- name: "pos_embeds"
104+
type: "OVTensor"
105+
source: "image_preprocessor.pos_embeds"
106+
- name: "rotary_cos"
107+
type: "OVTensor"
108+
source: "image_preprocessor.rotary_cos"
109+
- name: "rotary_sin"
110+
type: "OVTensor"
111+
source: "image_preprocessor.rotary_sin"
112+
- name: "input_ids"
113+
type: "OVTensor"
114+
source: "prompt_encoder.input_ids"
115+
- name: "attention_mask"
116+
type: "OVTensor"
117+
source: "prompt_encoder.mask"
118+
- name: "audio_features"
119+
type: "OVTensor"
120+
source: "audio_encoder.audio_features"
121+
- name: "audio_feature_lengths"
122+
type: "OVTensor"
123+
source: "audio_encoder.audio_feature_lengths"
124+
outputs:
125+
- name: "image_embedding"
126+
type: "OVTensor"
127+
- name: "visual_pos_mask"
128+
type: "OVTensor"
129+
- name: "position_ids"
130+
type: "OVTensor"
131+
- name: "rope_delta"
132+
type: "OVTensor"
133+
- name: "deepstack_embeds"
134+
type: "VecOVTensor"
135+
- name: "audio_embedding"
136+
type: "OVTensor"
137+
- name: "audio_pos_mask"
138+
type: "OVTensor"
139+
params:
140+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_vision_model.xml"
141+
vision_start_token_id: 248053
142+
143+
llm:
144+
type: "LLMInferenceSDPAModule"
145+
device: "GPU"
146+
inputs:
147+
- name: "input_ids"
148+
type: "OVTensor"
149+
source: "prompt_encoder.input_ids"
150+
- name: "visual_embeds"
151+
type: "OVTensor"
152+
source: "vision_encoder.image_embedding"
153+
- name: "visual_pos_mask"
154+
type: "OVTensor"
155+
source: "vision_encoder.visual_pos_mask"
156+
- name: "grid_thw"
157+
type: "OVTensor"
158+
source: "image_preprocessor.grid_thw"
159+
- name: "position_ids"
160+
type: "OVTensor"
161+
source: "vision_encoder.position_ids"
162+
- name: "rope_delta"
163+
type: "OVTensor"
164+
source: "vision_encoder.rope_delta"
165+
- name: "deepstack_embeds"
166+
type: "VecOVTensor"
167+
source: "vision_encoder.deepstack_embeds"
168+
- name: "audio_embeds"
169+
type: "OVTensor"
170+
source: "vision_encoder.audio_embedding"
171+
- name: "audio_pos_mask"
172+
type: "OVTensor"
173+
source: "vision_encoder.audio_pos_mask"
174+
outputs:
175+
- name: "generated_text"
176+
type: "String"
177+
params:
178+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_text_model.xml"
179+
max_new_tokens: 512
180+
181+
pipeline_result:
182+
type: "ResultModule"
183+
description: "Collects final results and formats the output structure."
184+
inputs:
185+
- name: "generated_text"
186+
type: "String"
187+
source: "llm.generated_text"

samples/cpp/module_genai/config_yaml/Qwen3-Omni/config.yaml renamed to samples/cpp/module_genai/config_yaml/Qwen3-Omni/config_prompt_image.yaml

File renamed without changes.

samples/cpp/module_genai/md_omni.cpp

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "utils/vision_utils.hpp"
1111
#include "yaml-cpp/yaml.h"
1212
#include "utils/utils.hpp"
13+
#include "utils/audio_utils.hpp"
1314

1415
inline ov::AnyMap parse_inputs_from_yaml_cfg_for_vlm(const std::filesystem::path& cfg_yaml_path,
1516
const std::string& prompt = std::string{},
@@ -52,11 +53,11 @@ inline ov::AnyMap parse_inputs_from_yaml_cfg_for_vlm(const std::filesystem::path
5253
continue;
5354
}
5455

55-
if (param_type == "String" && utils::contains_key(param_name, {"audio"})) {
56+
if (param_type == "OVTensor" && utils::contains_key(param_name, {"audio"})) {
5657
if (audio_path.empty()) {
5758
throw std::runtime_error("Audio path is empty.");
5859
}
59-
inputs[param_name] = audio_utils::read_wav(audio_path);
60+
inputs[param_name] = audio_utils::load_audio(audio_path);
6061
continue;
6162
}
6263
}
@@ -73,7 +74,9 @@ int main(int argc, char* argv[]) {
7374
" -prompt: input prompt\n"
7475
" -img: [Optional] image path\n"
7576
" -video: [Optional] video path\n"
76-
" -audio: [Optional] audio path\n");
77+
" -audio: [Optional] audio path\n"
78+
" -warmup: [Optional] number of warmup runs, default 0\n"
79+
" -perf: [Optional] set to 1 to print performance metrics, default 0\n");
7780
}
7881

7982
std::filesystem::path config_path = utils::get_input_arg(argc, argv, "-cfg", std::string{});
@@ -82,6 +85,8 @@ int main(int argc, char* argv[]) {
8285
std::string img_path = utils::get_input_arg(argc, argv, "-img", std::string{});
8386
std::string video_path = utils::get_input_arg(argc, argv, "-video", std::string{});
8487
std::string audio_path = utils::get_input_arg(argc, argv, "-audio", std::string{});
88+
int warmup = std::stoi(utils::get_input_arg(argc, argv, "-warmup", std::string("0")));
89+
bool perf = std::stoi(utils::get_input_arg(argc, argv, "-perf", std::string("0")));
8590

8691
ov::AnyMap inputs = parse_inputs_from_yaml_cfg_for_vlm(config_path, prompt, img_path, video_path, audio_path);
8792

@@ -105,8 +110,28 @@ int main(int argc, char* argv[]) {
105110

106111
ov::genai::module::ModulePipeline pipe(config_path, properties);
107112

113+
for (int i = 0; i < warmup; ++i) {
114+
std::cout << "[Warmup] Run " << (i + 1) << "/" << warmup << std::endl;
115+
auto t1 = std::chrono::high_resolution_clock::now();
116+
pipe.generate(inputs);
117+
auto t2 = std::chrono::high_resolution_clock::now();
118+
if (perf) {
119+
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
120+
std::cout << "[Warmup] Duration: " << diff << " ms" << std::endl;
121+
}
122+
}
123+
124+
std::cout << "[Generation] Running main generation..." << std::endl;
125+
auto t1 = std::chrono::high_resolution_clock::now();
126+
108127
pipe.generate(inputs);
109128

129+
auto t2 = std::chrono::high_resolution_clock::now();
130+
if (perf) {
131+
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
132+
std::cout << "[Generation] Duration: " << diff << " ms" << std::endl;
133+
}
134+
110135
std::cout << "Generation Result: " << pipe.get_output("generated_text").as<std::string>() << std::endl;
111136
} catch (const std::exception& ex) {
112137
std::cerr << "[ERROR] " << ex.what() << std::endl;

0 commit comments

Comments
 (0)