
Commit ec5037b

Fix inference issue when only text is provided for Qwen3-Omni (#131)

ZiniuLin authored and xipingyan committed

Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
1 parent 4d0c70e commit ec5037b

File tree: 3 files changed, +119 −22 lines

Lines changed: 71 additions & 0 deletions

@@ -0,0 +1,71 @@
+global_context:
+  model_type: "qwen3_omni"
+
+pipeline_modules:
+  pipeline_params:
+    type: "ParameterModule"
+    outputs:
+      - name: "prompts"
+        type: "VecString"
+
+  prompt_encoder:
+    type: "TextEncoderModule"
+    device: "GPU"
+    inputs:
+      - name: "prompts"
+        type: "VecString"
+        source: "pipeline_params.prompts"
+    outputs:
+      - name: "input_ids"
+        type: "OVTensor"
+      - name: "mask"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
+
+  vision_encoder:
+    type: "VisionEncoderModule"
+    device: "GPU"
+    inputs:
+      - name: "input_ids"
+        type: "OVTensor"
+        source: "prompt_encoder.input_ids"
+      - name: "attention_mask"
+        type: "OVTensor"
+        source: "prompt_encoder.mask"
+    outputs:
+      - name: "position_ids"
+        type: "OVTensor"
+      - name: "rope_delta"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_vision_model.xml"
+      vision_start_token_id: 248053
+
+  llm:
+    type: "LLMInferenceSDPAModule"
+    device: "GPU"
+    inputs:
+      - name: "input_ids"
+        type: "OVTensor"
+        source: "prompt_encoder.input_ids"
+      - name: "position_ids"
+        type: "OVTensor"
+        source: "vision_encoder.position_ids"
+      - name: "rope_delta"
+        type: "OVTensor"
+        source: "vision_encoder.rope_delta"
+    outputs:
+      - name: "generated_text"
+        type: "String"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_text_model.xml"
+      max_new_tokens: 512
+
+  pipeline_result:
+    type: "ResultModule"
+    description: "Collects final results and formats the output structure."
+    inputs:
+      - name: "generated_text"
+        type: "String"
+        source: "llm.generated_text"
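Note that even in this text-only configuration the graph still routes input_ids through vision_encoder so that position_ids and rope_delta reach the llm module; the C++ changes below supply zeroed visual and audio inputs along that path. Each module input names its producer with a module.output reference in its source field; a minimal sketch of parsing such a reference (an illustrative helper under that assumption, not the project's actual loader):

#include <string>
#include <utility>

// Split a "module.output" reference such as "prompt_encoder.input_ids"
// into {module, output}. Illustrative only; assumes exactly one '.'.
std::pair<std::string, std::string> split_source(const std::string& ref) {
    const auto dot = ref.find('.');
    return {ref.substr(0, dot), ref.substr(dot + 1)};
}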

src/cpp/src/module_genai/modules/md_llm_inference_sdpa.cpp

Lines changed: 46 additions & 21 deletions

@@ -188,6 +188,7 @@ int64_t LLMInferenceSDPAModule::argmax_last(const ov::Tensor& logits) {
 
 bool LLMInferenceSDPAModule::initialize() {
     const auto& params = module_desc->params;
+    VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
 
     // Resolve model directory
     std::filesystem::path models_path = get_optional_param("model_path");
@@ -224,9 +225,16 @@ bool LLMInferenceSDPAModule::initialize() {
 
     // Load model config
     try {
-        m_model_config = ov::genai::modeling::models::Qwen3_5Config::from_json_file(models_path);
+        if (model_type == VLMModelType::QWEN3_5) {
+            m_model_config = ov::genai::modeling::models::Qwen3_5Config::from_json_file(models_path);
+        } else if (model_type == VLMModelType::QWEN3_OMNI) {
+            m_model_config = ov::genai::modeling::models::Qwen3OmniConfig::from_json_file(models_path);
+        } else {
+            GENAI_ERR("Unsupported model type: " + module_desc->model_type);
+            return false;
+        }
     } catch (const std::exception& e) {
-        GENAI_ERR("Failed to load Qwen3.5 config from " + models_path.string() + ": " + e.what());
+        GENAI_ERR("Failed to load model config from " + models_path.string() + ": " + e.what());
         return false;
     }
 
@@ -311,8 +319,11 @@ bool LLMInferenceSDPAModule::initialize() {
             if (eid >= 0) m_stop_ids.insert(eid);
         } catch (...) {}
     }
-    if (m_model_config.text.eos_token_id > 0) {
-        m_stop_ids.insert(m_model_config.text.eos_token_id);
+    if (model_type == VLMModelType::QWEN3_5) {
+        auto& cfg = std::get<ov::genai::modeling::models::Qwen3_5Config>(m_model_config);
+        if (cfg.text.eos_token_id > 0) {
+            m_stop_ids.insert(cfg.text.eos_token_id);
+        }
     }
     if (m_stop_ids.empty()) {
         GENAI_INFO("LLMInferenceSDPAModule: no stop token ids found — "
@@ -334,6 +345,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
                                                     const ov::Tensor& position_ids,
                                                     const ov::Tensor& rope_deltas) {
     using TIO = ov::genai::modeling::models::Qwen3_5TextIO;
+    const auto &model_config = std::get<modeling::models::Qwen3_5Config>(m_model_config);
 
     const size_t batch = input_ids.get_shape()[0];
     const int64_t prompt_len = static_cast<int64_t>(input_ids.get_shape()[1]);
@@ -352,7 +364,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
         // Feed zero visual inputs for text-only usage of VL IR
         text_req.set_tensor(TIO::kVisualEmbeds,
                             make_zeros(ov::element::f32, {batch, static_cast<size_t>(prompt_len),
-                                                          static_cast<size_t>(m_model_config.text.hidden_size)}));
+                                                          static_cast<size_t>(model_config.text.hidden_size)}));
         text_req.set_tensor(TIO::kVisualPosMask,
                             make_zeros(ov::element::boolean, {batch, static_cast<size_t>(prompt_len)}));
     }
@@ -373,7 +385,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
 
     ov::Tensor dec_vis, dec_vis_mask;
     if (m_text_uses_vl_ir) {
-        dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+        dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)});
         dec_vis_mask = make_zeros(ov::element::boolean, {batch, 1});
     }
 
@@ -458,6 +470,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
                                                   const ov::Tensor& visual_pos_mask,
                                                   const std::optional<std::vector<ov::Tensor>>& deepstack_embeds) {
     using TIO = ov::genai::modeling::models::Qwen3_5TextIO;
+    const auto &model_config = std::get<modeling::models::Qwen3_5Config>(m_model_config);
 
     const size_t batch = input_ids.get_shape()[0];
     const int64_t prompt_len = static_cast<int64_t>(input_ids.get_shape()[1]);
@@ -480,7 +493,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
                 std::to_string(i);
             text_req.set_tensor(name, deepstack_embeds.value()[i]);
         }
-        ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(m_model_config.text.hidden_size)});
+        ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(model_config.text.hidden_size)});
         std::memset(prefill_audio_features.data(), 0, prefill_audio_features.get_byte_size());
         text_req.set_tensor("audio_features", prefill_audio_features);
         ov::Tensor prefill_audio_pos_mask(ov::element::boolean, {batch, input_ids.get_shape()[1]});
@@ -502,17 +515,17 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
     ov::Tensor step_mask = make_zeros(ov::element::i64, {batch, 1});
     for (size_t b = 0; b < batch; ++b) step_mask.data<int64_t>()[b] = 1;
 
-    ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+    ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)});
     ov::Tensor dec_vis_mask = make_zeros(ov::element::boolean, {batch, 1});
     ov::Tensor decode_audio_features =
-        make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+        make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)});
     ov::Tensor decode_audio_pos_mask = make_zeros(ov::element::boolean, {batch, 1});
     std::vector<ov::Tensor> decode_deepstack;
     if (deepstack_embeds.has_value()) {
         decode_deepstack.reserve(deepstack_embeds.value().size());
         for (size_t i = 0; i < deepstack_embeds.value().size(); ++i) {
             decode_deepstack.push_back(
-                make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)}));
+                make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)}));
         }
     }
 
@@ -605,6 +618,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
                                                   const std::optional<ov::Tensor>& audio_embeds,
                                                   const std::optional<ov::Tensor>& audio_pos_mask) {
     using TIO = ov::genai::modeling::models::Qwen3OmniTextIO;
+    const auto &model_config = std::get<modeling::models::Qwen3OmniConfig>(m_model_config);
 
     const size_t batch = input_ids.get_shape()[0];
     const int64_t prompt_len = static_cast<int64_t>(input_ids.get_shape()[1]);
@@ -621,20 +635,33 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
     if (visual_embeds.has_value() && visual_pos_mask.has_value()) {
         text_req.set_tensor(TIO::kVisualEmbeds, visual_embeds.value());
         text_req.set_tensor(TIO::kVisualPosMask, visual_pos_mask.value());
+    } else {
+        text_req.set_tensor(TIO::kVisualEmbeds,
+                            make_zeros(ov::element::f32, {batch, static_cast<size_t>(prompt_len),
+                                                          static_cast<size_t>(model_config.thinker.text.hidden_size)}));
+        text_req.set_tensor(TIO::kVisualPosMask,
+                            make_zeros(ov::element::boolean, {batch, static_cast<size_t>(prompt_len)}));
     }
     if (deepstack_embeds.has_value()) {
         for (size_t i = 0; i < deepstack_embeds->size(); i++) {
             const std::string name =
-                std::string(ov::genai::modeling::models::Qwen3VLTextIO::kDeepstackEmbedsPrefix) + "." +
+                std::string(ov::genai::modeling::models::Qwen3OmniVisionIO::kDeepstackEmbedsPrefix) + "." +
                 std::to_string(i);
             text_req.set_tensor(name, deepstack_embeds.value()[i]);
         }
+    } else {
+        for (size_t i = 0; i < model_config.thinker.vision.deepstack_visual_indexes.size(); i++) {
+            const std::string name =
+                std::string(ov::genai::modeling::models::Qwen3OmniVisionIO::kDeepstackEmbedsPrefix) + "." +
+                std::to_string(i);
+            text_req.set_tensor(name, make_zeros(ov::element::f32, {batch, static_cast<size_t>(prompt_len), static_cast<size_t>(model_config.thinker.text.hidden_size)}));
+        }
     }
     if (audio_embeds.has_value() && audio_pos_mask.has_value()) {
         text_req.set_tensor(TIO::kAudioFeatures, audio_embeds.value());
         text_req.set_tensor(TIO::kAudioPosMask, audio_pos_mask.value());
     } else {
-        ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(m_model_config.text.hidden_size)});
+        ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(model_config.thinker.text.hidden_size)});
         std::memset(prefill_audio_features.data(), 0, prefill_audio_features.get_byte_size());
         text_req.set_tensor(TIO::kAudioFeatures, prefill_audio_features);
         ov::Tensor prefill_audio_pos_mask(ov::element::boolean, {batch, input_ids.get_shape()[1]});
@@ -656,18 +683,16 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
     ov::Tensor step_mask = make_zeros(ov::element::i64, {batch, 1});
     for (size_t b = 0; b < batch; ++b) step_mask.data<int64_t>()[b] = 1;
 
-    ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+    ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)});
    ov::Tensor dec_vis_mask = make_zeros(ov::element::boolean, {batch, 1});
     ov::Tensor decode_audio_features =
-        make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+        make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)});
     ov::Tensor decode_audio_pos_mask = make_zeros(ov::element::boolean, {batch, 1});
     std::vector<ov::Tensor> decode_deepstack;
-    if (deepstack_embeds.has_value()) {
-        decode_deepstack.reserve(deepstack_embeds.value().size());
-        for (size_t i = 0; i < deepstack_embeds.value().size(); ++i) {
-            decode_deepstack.push_back(
-                make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)}));
-        }
+    decode_deepstack.reserve(model_config.thinker.vision.deepstack_visual_indexes.size());
+    for (size_t i = 0; i < model_config.thinker.vision.deepstack_visual_indexes.size(); ++i) {
+        decode_deepstack.push_back(
+            make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)}));
     }
 
     int64_t past_len = prompt_len;
@@ -831,7 +856,7 @@ void LLMInferenceSDPAModule::run() {
            this->inputs.find("position_ids") != this->inputs.end() &&
            this->inputs.find("rope_delta") != this->inputs.end());
 
-    ov::genai::modeling::models::Qwen3_5InputPlanner planner(m_model_config);
+    ov::genai::modeling::models::Qwen3_5InputPlanner planner(std::get<modeling::models::Qwen3_5Config>(m_model_config));
 
     if (is_vl) {
         // ---- VL mode ----
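The substance of the fix above: when no visual or audio embeddings arrive, the Qwen3-Omni decode path now feeds zero-filled placeholder tensors for every multimodal input the text IR expects, instead of leaving those inputs unset. A standalone sketch of that pattern, where make_zeros is a local stand-in written to match how the diff uses it (the real helper lives elsewhere in the module):

#include <cstring>
#include <openvino/runtime/tensor.hpp>

// Local stand-in matching how the diff uses make_zeros: allocate a tensor
// of the requested element type and shape, then zero its bytes.
static ov::Tensor make_zeros(ov::element::Type type, const ov::Shape& shape) {
    ov::Tensor t(type, shape);
    std::memset(t.data(), 0, t.get_byte_size());
    return t;
}

// Text-only prefill placeholders: all-zero features plus an all-false
// position mask, shaped {batch, prompt_len, hidden_size} and
// {batch, prompt_len}, as in the new else branches above.
ov::Tensor zero_embeds(size_t batch, size_t prompt_len, size_t hidden_size) {
    return make_zeros(ov::element::f32, {batch, prompt_len, hidden_size});
}
ov::Tensor zero_pos_mask(size_t batch, size_t prompt_len) {
    return make_zeros(ov::element::boolean, {batch, prompt_len});
}

Because the position masks are all false, the zeroed features are never selected into the sequence; the IR's input contract is satisfied without perturbing the text-only computation.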

src/cpp/src/module_genai/modules/md_llm_inference_sdpa.hpp

Lines changed: 2 additions & 1 deletion

@@ -13,6 +13,7 @@
 #include "modeling/models/qwen3_5/modeling_qwen3_5_text.hpp"
 #include "modeling/models/qwen3_5/processing_qwen3_5.hpp"
 #include "modeling/weights/quantization_config.hpp"
+#include "modeling/models/qwen3_omni/processing_qwen3_omni.hpp"
 
 namespace ov {
 namespace genai {
@@ -82,7 +83,7 @@ class LLMInferenceSDPAModule : public IBaseModule {
     std::set<int64_t> m_stop_ids;
 
     // Model config
-    ov::genai::modeling::models::Qwen3_5Config m_model_config;
+    std::variant<ov::genai::modeling::models::Qwen3_5Config, modeling::models::Qwen3OmniConfig> m_model_config;
 
     // Tokenizer (for text mode and decoding)
     std::unique_ptr<ov::genai::Tokenizer> m_tokenizer;
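With m_model_config now a std::variant, call sites must select the concrete config before reading fields such as hidden_size; the .cpp changes above do this with std::get on each model-specific code path. A self-contained sketch of the same dispatch via std::visit, using simplified stand-in structs rather than the real config types (field names mirror the diff, the values are placeholders):

#include <iostream>
#include <variant>

// Simplified stand-ins for the Qwen3_5Config / Qwen3OmniConfig field layouts.
struct Qwen3_5Config   { struct { int hidden_size = 4096; } text; };
struct Qwen3OmniConfig { struct { struct { int hidden_size = 2048; } text; } thinker; };

using ModelConfig = std::variant<Qwen3_5Config, Qwen3OmniConfig>;

// One accessor per alternative; the overload set resolves at compile time.
int hidden_size_of(const Qwen3_5Config& c)   { return c.text.hidden_size; }
int hidden_size_of(const Qwen3OmniConfig& c) { return c.thinker.text.hidden_size; }

int main() {
    ModelConfig cfg = Qwen3OmniConfig{};
    // std::visit dispatches on the active alternative, avoiding the
    // throwing std::get path when the model type is not known statically.
    std::cout << std::visit([](const auto& c) { return hidden_size_of(c); }, cfg) << "\n";
}

The commit itself uses std::get, which is reasonable here because each run_*_decode overload already knows which model family it serves; std::visit would only be needed at call sites that are generic over the model type.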
