Skip to content

Commit 508fe2e

Browse files
committed
Enable TTS for Qwen3-Omni
Enable TTS for Qwen3-Omni. Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
1 parent c804e02 commit 508fe2e

18 files changed

+4562
-1
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,6 @@ samples/python/module_genai/*.sh
5353
tests/module_genai/cpp/*.sh
5454
tests/module_genai/cpp/*.yaml
5555
tests/module_genai/cpp/test_data/*.json
56+
modelprint_*_before_la_fusion.cpp
57+
output_audio_*.wav
58+
profile_*.json
Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
global_context:
2+
model_type: "qwen3_omni"
3+
4+
pipeline_modules:
5+
pipeline_params:
6+
type: "ParameterModule"
7+
outputs:
8+
- name: "videos"
9+
type: "VecOVTensor"
10+
- name: "images"
11+
type: "VecOVTensor"
12+
- name: "prompts"
13+
type: "VecString"
14+
- name: "audios"
15+
type: "VecOVTensor"
16+
- name: "use_audio_in_video"
17+
type: "VecInt"
18+
19+
image_preprocessor:
20+
type: "ImagePreprocessModule"
21+
device: "CPU"
22+
description: "Image preprocessing."
23+
inputs:
24+
- name: "images"
25+
type: "VecOVTensor"
26+
source: "pipeline_params.images"
27+
outputs:
28+
- name: "pixel_values"
29+
type: "VecOVTensor"
30+
- name: "grid_thw"
31+
type: "VecOVTensor"
32+
- name: "pos_embeds"
33+
type: "VecOVTensor"
34+
- name: "rotary_cos"
35+
type: "VecOVTensor"
36+
- name: "rotary_sin"
37+
type: "VecOVTensor"
38+
params:
39+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
40+
41+
video_preprocessor:
42+
type: "VideoPreprocessModule"
43+
device: "CPU"
44+
description: "Video preprocessing."
45+
inputs:
46+
- name: "videos"
47+
type: "VecOVTensor"
48+
source: "pipeline_params.videos"
49+
outputs:
50+
- name: "pixel_values"
51+
type: "VecOVTensor"
52+
- name: "grid_thw"
53+
type: "VecOVTensor"
54+
- name: "pos_embeds"
55+
type: "VecOVTensor"
56+
- name: "rotary_cos"
57+
type: "VecOVTensor"
58+
- name: "rotary_sin"
59+
type: "VecOVTensor"
60+
- name: "video_second_per_grid"
61+
type: "VecInt"
62+
params:
63+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
64+
65+
audio_preprocessor:
66+
type: "AudioPreprocessModule"
67+
device: "CPU"
68+
description: "Audio preprocessing."
69+
inputs:
70+
- name: "audios"
71+
type: "VecOVTensor"
72+
source: "pipeline_params.audios"
73+
outputs:
74+
- name: "input_features"
75+
type: "VecOVTensor"
76+
- name: "feature_attention_mask"
77+
type: "VecOVTensor"
78+
params:
79+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
80+
81+
audio_encoder:
82+
type: "AudioEncoderModule"
83+
device: "GPU"
84+
description: "Audio encoder for Qwen3-Omni."
85+
inputs:
86+
- name: "input_features"
87+
type: "VecOVTensor"
88+
source: "audio_preprocessor.input_features"
89+
- name: "feature_attention_mask"
90+
type: "VecOVTensor"
91+
source: "audio_preprocessor.feature_attention_mask"
92+
outputs:
93+
- name: "audio_features"
94+
type: "VecOVTensor"
95+
- name: "audio_feature_lengths"
96+
type: "OVTensor"
97+
params:
98+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_audio_encoder.xml"
99+
100+
prompt_encoder:
101+
type: "TextEncoderModule"
102+
device: "GPU"
103+
inputs:
104+
- name: "prompts"
105+
type: "VecString"
106+
source: "pipeline_params.prompts"
107+
- name: "image_grid_thw"
108+
type: "VecOVTensor"
109+
source: "image_preprocessor.grid_thw"
110+
- name: "video_grid_thw"
111+
type: "VecOVTensor"
112+
source: "video_preprocessor.grid_thw"
113+
- name: "audio_features"
114+
type: "VecOVTensor"
115+
source: "audio_encoder.audio_features"
116+
- name: "video_second_per_grid"
117+
type: "VecInt"
118+
source: "video_preprocessor.video_second_per_grid"
119+
- name: "use_audio_in_video"
120+
type: "VecInt"
121+
source: "pipeline_params.use_audio_in_video"
122+
outputs:
123+
- name: "input_ids"
124+
type: "OVTensor"
125+
- name: "mask"
126+
type: "OVTensor"
127+
params:
128+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
129+
130+
vision_encoder:
131+
type: "VisionEncoderModule"
132+
device: "GPU"
133+
inputs:
134+
- name: "preprocessed_image"
135+
type: "VecOVTensor"
136+
source: "image_preprocessor.pixel_values"
137+
- name: "image_grid_thw"
138+
type: "VecOVTensor"
139+
source: "image_preprocessor.grid_thw"
140+
- name: "image_pos_embeds"
141+
type: "VecOVTensor"
142+
source: "image_preprocessor.pos_embeds"
143+
- name: "image_rotary_cos"
144+
type: "VecOVTensor"
145+
source: "image_preprocessor.rotary_cos"
146+
- name: "image_rotary_sin"
147+
type: "VecOVTensor"
148+
source: "image_preprocessor.rotary_sin"
149+
- name: "preprocessed_video"
150+
type: "VecOVTensor"
151+
source: "video_preprocessor.pixel_values"
152+
- name: "video_grid_thw"
153+
type: "VecOVTensor"
154+
source: "video_preprocessor.grid_thw"
155+
- name: "video_pos_embeds"
156+
type: "VecOVTensor"
157+
source: "video_preprocessor.pos_embeds"
158+
- name: "video_rotary_cos"
159+
type: "VecOVTensor"
160+
source: "video_preprocessor.rotary_cos"
161+
- name: "video_rotary_sin"
162+
type: "VecOVTensor"
163+
source: "video_preprocessor.rotary_sin"
164+
- name: "input_ids"
165+
type: "OVTensor"
166+
source: "prompt_encoder.input_ids"
167+
- name: "attention_mask"
168+
type: "OVTensor"
169+
source: "prompt_encoder.mask"
170+
- name: "audio_features"
171+
type: "VecOVTensor"
172+
source: "audio_encoder.audio_features"
173+
- name: "audio_feature_lengths"
174+
type: "OVTensor"
175+
source: "audio_encoder.audio_feature_lengths"
176+
outputs:
177+
- name: "image_embedding"
178+
type: "OVTensor"
179+
- name: "visual_pos_mask"
180+
type: "OVTensor"
181+
- name: "position_ids"
182+
type: "OVTensor"
183+
- name: "rope_delta"
184+
type: "OVTensor"
185+
- name: "deepstack_embeds"
186+
type: "VecOVTensor"
187+
- name: "audio_embedding"
188+
type: "OVTensor"
189+
- name: "audio_pos_mask"
190+
type: "OVTensor"
191+
params:
192+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_vision_model.xml"
193+
vision_start_token_id: 248053
194+
195+
llm:
196+
type: "LLMInferenceSDPAModule"
197+
device: "GPU"
198+
inputs:
199+
- name: "input_ids"
200+
type: "OVTensor"
201+
source: "prompt_encoder.input_ids"
202+
- name: "visual_embeds"
203+
type: "OVTensor"
204+
source: "vision_encoder.image_embedding"
205+
- name: "visual_pos_mask"
206+
type: "OVTensor"
207+
source: "vision_encoder.visual_pos_mask"
208+
- name: "grid_thw"
209+
type: "VecOVTensor"
210+
source: "video_preprocessor.grid_thw"
211+
- name: "position_ids"
212+
type: "OVTensor"
213+
source: "vision_encoder.position_ids"
214+
- name: "rope_delta"
215+
type: "OVTensor"
216+
source: "vision_encoder.rope_delta"
217+
- name: "deepstack_embeds"
218+
type: "VecOVTensor"
219+
source: "vision_encoder.deepstack_embeds"
220+
- name: "audio_embeds"
221+
type: "OVTensor"
222+
source: "vision_encoder.audio_embedding"
223+
- name: "audio_pos_mask"
224+
type: "OVTensor"
225+
source: "vision_encoder.audio_pos_mask"
226+
outputs:
227+
- name: "generated_text"
228+
type: "String"
229+
params:
230+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_text_model.xml"
231+
max_new_tokens: 512
232+
233+
text_to_speech:
234+
type: "TextToSpeechModule"
235+
device: "GPU"
236+
inputs:
237+
- name: "text"
238+
type: "String"
239+
source: "llm.generated_text"
240+
outputs:
241+
- name: "audios"
242+
type: "VecOVTensor"
243+
- name: "sample_rates"
244+
type: "VecInt"
245+
params:
246+
config_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/config.json"
247+
tokenizer_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual"
248+
embedding_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_talker_embedding_model.xml"
249+
prefill_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_talker_prefill_model.xml"
250+
decode_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_talker_decode_model.xml"
251+
codec_embedding_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_talker_codec_embedding_model.xml"
252+
code_predictor_ar_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual"
253+
code_predictor_single_codec_embed_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual"
254+
code_predictor_single_codec_embedding_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_code_predictor_codec_embed_model.xml"
255+
speech_decoder_model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_speech_decoder_model.xml"
256+
257+
pipeline_result:
258+
type: "ResultModule"
259+
description: "Collects final results and formats the output structure."
260+
inputs:
261+
- name: "audios"
262+
type: "VecOVTensor"
263+
source: "text_to_speech.audios"
264+
- name: "sample_rates"
265+
type: "VecInt"
266+
source: "text_to_speech.sample_rates"

samples/cpp/module_genai/md_omni.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ int main(int argc, char* argv[]) {
7272
" -video: [Optional] video path\n"
7373
" -audio: [Optional] audio path\n"
7474
" -use_audio_in_video: [Optional] set to 1 if the video contains audio and you want to use the audio, default 0\n"
75+
" -tts: [Optional] set to 1 to use tts, default 0\n"
7576
" -warmup: [Optional] number of warmup runs, default 0\n"
7677
" -perf: [Optional] set to 1 to print performance metrics, default 0\n");
7778
}
@@ -80,6 +81,7 @@ int main(int argc, char* argv[]) {
8081
std::string cache_dir = utils::get_input_arg(argc, argv, "-cache_dir", std::string{});
8182
int warmup = std::stoi(utils::get_input_arg(argc, argv, "-warmup", std::string("0")));
8283
bool perf = std::stoi(utils::get_input_arg(argc, argv, "-perf", std::string("0")));
84+
bool use_tts = std::stoi(utils::get_input_arg(argc, argv, "-tts", std::string("0"))) != 0;
8385

8486
utils::OmniInputParams input_params = utils::parse_omni_input_params(argc, argv);
8587
ov::AnyMap inputs = parse_inputs_for_omni(input_params);
@@ -126,7 +128,21 @@ int main(int argc, char* argv[]) {
126128
std::cout << "[Generation] Duration: " << diff << " ms" << std::endl;
127129
}
128130

129-
std::cout << "Generation Result: " << pipe.get_output("generated_text").as<std::string>() << std::endl;
131+
if (!use_tts) {
132+
std::cout << "Generation Result: " << pipe.get_output("generated_text").as<std::string>() << std::endl;
133+
} else {
134+
std::vector<ov::Tensor> audios = pipe.get_output("audios").as<std::vector<ov::Tensor>>();
135+
std::vector<int> sample_rates = pipe.get_output("sample_rates").as<std::vector<int>>();
136+
for (size_t i = 0; i < audios.size(); ++i) {
137+
std::string output_path = "output_audio_" + std::to_string(i) + ".wav";
138+
auto audio_data = audios[i].data<const float>();
139+
const size_t sample_count = audios[i].get_size();
140+
std::cout << "sample_rate: " << sample_rates[i] << ", sample_count: " << sample_count << std::endl;
141+
audio_utils::write_wav(output_path, audio_data, sample_count, sample_rates[i]);
142+
std::cout << "Saved generated audio to: " << output_path << std::endl;
143+
}
144+
}
145+
130146
} catch (const std::exception& ex) {
131147
std::cerr << "[ERROR] " << ex.what() << std::endl;
132148
return EXIT_FAILURE;

samples/cpp/module_genai/utils/audio_utils.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,4 +204,40 @@ ov::Tensor load_audio(const std::filesystem::path& audio_path) {
204204
return tensor;
205205
}
206206

207+
/// Writes mono float samples to `filename` as a 16-bit PCM RIFF/WAVE file.
///
/// Each sample is limited to [-1.0, 1.0], scaled by 32767 and truncated to a
/// signed 16-bit integer. The header and the samples are serialized byte by
/// byte in little-endian order, as the WAVE format requires, so the output
/// does not depend on the host byte order.
///
/// @param filename     Path of the file to create (an existing file is overwritten).
/// @param samples      Pointer to `num_samples` floats; may be null only when
///                     `num_samples` is 0.
/// @param num_samples  Number of samples to write.
/// @param sample_rate  Sampling rate in Hz; must be positive.
/// @throws std::invalid_argument if `samples` is null while `num_samples` is
///         non-zero, or if `sample_rate` is not positive.
/// @throws std::runtime_error if the data does not fit the 32-bit RIFF size
///         fields, the file cannot be created, or writing fails.
void write_wav(const std::string& filename, const float* samples, size_t num_samples, int sample_rate) {
    constexpr uint16_t kPcmFormat = 1;
    constexpr uint16_t kChannels = 1;
    constexpr uint16_t kBitsPerSample = 16;
    constexpr uint16_t kBytesPerSample = kBitsPerSample / 8;
    constexpr uint16_t kBlockAlign = kChannels * kBytesPerSample;
    constexpr uint32_t kFmtChunkSize = 16;
    // Bytes of the RIFF chunk that follow its size field, excluding the sample data.
    constexpr uint32_t kRiffOverhead = 36;
    constexpr uint32_t kMaxRiffSize = 0xFFFFFFFFu;
    constexpr size_t kChunkSamples = 4096;

    // Validate before touching the filesystem so bad arguments leave no stray file.
    if (samples == nullptr && num_samples != 0) {
        throw std::invalid_argument("write_wav: samples is null but num_samples is non-zero");
    }
    if (sample_rate <= 0) {
        throw std::invalid_argument("write_wav: sample_rate must be positive");
    }
    // RIFF stores sizes in 32 bits; refuse input that would overflow them
    // instead of emitting a header with a wrapped size.
    if (num_samples > (kMaxRiffSize - kRiffOverhead) / kBytesPerSample) {
        throw std::runtime_error("Audio is too large for a WAV file: " + filename);
    }

    std::ofstream file(filename, std::ios::binary);
    if (!file) {
        throw std::runtime_error("Failed to create WAV file: " + filename);
    }

    const auto put_u16 = [&file](uint16_t value) {
        const char bytes[2] = {static_cast<char>(value & 0xFF), static_cast<char>((value >> 8) & 0xFF)};
        file.write(bytes, sizeof(bytes));
    };
    const auto put_u32 = [&file](uint32_t value) {
        const char bytes[4] = {static_cast<char>(value & 0xFF),
                               static_cast<char>((value >> 8) & 0xFF),
                               static_cast<char>((value >> 16) & 0xFF),
                               static_cast<char>((value >> 24) & 0xFF)};
        file.write(bytes, sizeof(bytes));
    };

    const uint32_t data_size = static_cast<uint32_t>(num_samples * kBytesPerSample);
    const uint32_t rate = static_cast<uint32_t>(sample_rate);

    // RIFF header.
    file.write("RIFF", 4);
    put_u32(kRiffOverhead + data_size);
    file.write("WAVE", 4);

    // "fmt " chunk describing uncompressed mono 16-bit PCM.
    file.write("fmt ", 4);
    put_u32(kFmtChunkSize);
    put_u16(kPcmFormat);
    put_u16(kChannels);
    put_u32(rate);
    put_u32(rate * kBlockAlign);  // byte rate
    put_u16(kBlockAlign);
    put_u16(kBitsPerSample);

    // "data" chunk.
    file.write("data", 4);
    put_u32(data_size);

    // Convert and write in fixed-size batches instead of one stream call per sample.
    char chunk[kChunkSamples * kBytesPerSample];
    for (size_t offset = 0; offset < num_samples; offset += kChunkSamples) {
        const size_t count = std::min(kChunkSamples, num_samples - offset);
        for (size_t i = 0; i < count; ++i) {
            // std::min/std::max in this argument order (rather than std::clamp)
            // also pin a NaN sample to a finite value before the integer cast.
            const float limited = std::max(-1.0f, std::min(1.0f, samples[offset + i]));
            const uint16_t pcm = static_cast<uint16_t>(static_cast<int16_t>(limited * 32767.0f));
            chunk[i * kBytesPerSample] = static_cast<char>(pcm & 0xFF);
            chunk[i * kBytesPerSample + 1] = static_cast<char>((pcm >> 8) & 0xFF);
        }
        file.write(chunk, static_cast<std::streamsize>(count * kBytesPerSample));
    }

    // Surface I/O errors (e.g. disk full) that would otherwise be dropped silently.
    file.flush();
    if (!file) {
        throw std::runtime_error("Failed to write WAV file: " + filename);
    }
}
242+
207243
} // namespace audio_utils

samples/cpp/module_genai/utils/audio_utils.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,6 @@ namespace audio_utils {
99

1010
ov::Tensor load_audio(const std::filesystem::path& audio_path);
1111

12+
/// Writes `num_samples` float samples to `filename` as a mono, 16-bit PCM WAV file.
///
/// Samples are limited to the range [-1.0, 1.0] and scaled by 32767 before
/// being stored as signed 16-bit integers.
///
/// @param filename     Path of the WAV file to create.
/// @param samples      Pointer to the first of `num_samples` float samples.
/// @param num_samples  Number of samples to write.
/// @param sample_rate  Sampling rate, in Hz, recorded in the WAV header.
/// @throws std::runtime_error if the file cannot be created.
void write_wav(const std::string& filename, const float* samples, size_t num_samples, int sample_rate);
13+
1214
}

0 commit comments

Comments
 (0)