Skip to content

Commit b6e489e

Browse files
committed
Fix inference issue when only text is provided for Qwen3-Omni
Fix inference issue when only text is provided for Qwen3-Omni. Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
1 parent fcf68d5 commit b6e489e

File tree

3 files changed

+113
-20
lines changed

3 files changed

+113
-20
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
global_context:
2+
model_type: "qwen3_omni"
3+
4+
pipeline_modules:
5+
pipeline_params:
6+
type: "ParameterModule"
7+
outputs:
8+
- name: "prompts"
9+
type: "VecString"
10+
11+
prompt_encoder:
12+
type: "TextEncoderModule"
13+
device: "GPU"
14+
inputs:
15+
- name: "prompts"
16+
type: "VecString"
17+
source: "pipeline_params.prompts"
18+
outputs:
19+
- name: "input_ids"
20+
type: "OVTensor"
21+
- name: "mask"
22+
type: "OVTensor"
23+
params:
24+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"
25+
26+
vision_encoder:
27+
type: "VisionEncoderModule"
28+
device: "GPU"
29+
inputs:
30+
- name: "input_ids"
31+
type: "OVTensor"
32+
source: "prompt_encoder.input_ids"
33+
- name: "attention_mask"
34+
type: "OVTensor"
35+
source: "prompt_encoder.mask"
36+
outputs:
37+
- name: "position_ids"
38+
type: "OVTensor"
39+
- name: "rope_delta"
40+
type: "OVTensor"
41+
params:
42+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_vision_model.xml"
43+
vision_start_token_id: 248053
44+
45+
llm:
46+
type: "LLMInferenceSDPAModule"
47+
device: "GPU"
48+
inputs:
49+
- name: "input_ids"
50+
type: "OVTensor"
51+
source: "prompt_encoder.input_ids"
52+
- name: "position_ids"
53+
type: "OVTensor"
54+
source: "vision_encoder.position_ids"
55+
- name: "rope_delta"
56+
type: "OVTensor"
57+
source: "vision_encoder.rope_delta"
58+
outputs:
59+
- name: "generated_text"
60+
type: "String"
61+
params:
62+
model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_text_model.xml"
63+
max_new_tokens: 512
64+
65+
pipeline_result:
66+
type: "ResultModule"
67+
description: "Collects final results and formats the output structure."
68+
inputs:
69+
- name: "generated_text"
70+
type: "String"
71+
source: "llm.generated_text"

src/cpp/src/module_genai/modules/md_llm_inference_sdpa.cpp

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ int64_t LLMInferenceSDPAModule::argmax_last(const ov::Tensor& logits) {
188188

189189
bool LLMInferenceSDPAModule::initialize() {
190190
const auto& params = module_desc->params;
191+
VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
191192

192193
// Resolve model directory
193194
std::filesystem::path models_path = get_optional_param("model_path");
@@ -224,7 +225,10 @@ bool LLMInferenceSDPAModule::initialize() {
224225

225226
// Load model config
226227
try {
227-
m_model_config = ov::genai::modeling::models::Qwen3_5Config::from_json_file(models_path);
228+
if (model_type == VLMModelType::QWEN3_5)
229+
m_model_config = ov::genai::modeling::models::Qwen3_5Config::from_json_file(models_path);
230+
else if (model_type == VLMModelType::QWEN3_OMNI)
231+
m_model_config = ov::genai::modeling::models::Qwen3OmniConfig::from_json_file(models_path);
228232
} catch (const std::exception& e) {
229233
GENAI_ERR("Failed to load Qwen3.5 config from " + models_path.string() + ": " + e.what());
230234
return false;
@@ -311,8 +315,11 @@ bool LLMInferenceSDPAModule::initialize() {
311315
if (eid >= 0) m_stop_ids.insert(eid);
312316
} catch (...) {}
313317
}
314-
if (m_model_config.text.eos_token_id > 0) {
315-
m_stop_ids.insert(m_model_config.text.eos_token_id);
318+
if (model_type == VLMModelType::QWEN3_5) {
319+
auto& cfg = std::get<ov::genai::modeling::models::Qwen3_5Config>(m_model_config);
320+
if (cfg.text.eos_token_id > 0) {
321+
m_stop_ids.insert(cfg.text.eos_token_id);
322+
}
316323
}
317324
if (m_stop_ids.empty()) {
318325
GENAI_INFO("LLMInferenceSDPAModule: no stop token ids found — "
@@ -334,6 +341,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
334341
const ov::Tensor& position_ids,
335342
const ov::Tensor& rope_deltas) {
336343
using TIO = ov::genai::modeling::models::Qwen3_5TextIO;
344+
auto model_config = std::get<modeling::models::Qwen3_5Config>(m_model_config);
337345

338346
const size_t batch = input_ids.get_shape()[0];
339347
const int64_t prompt_len = static_cast<int64_t>(input_ids.get_shape()[1]);
@@ -352,7 +360,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
352360
// Feed zero visual inputs for text-only usage of VL IR
353361
text_req.set_tensor(TIO::kVisualEmbeds,
354362
make_zeros(ov::element::f32, {batch, static_cast<size_t>(prompt_len),
355-
static_cast<size_t>(m_model_config.text.hidden_size)}));
363+
static_cast<size_t>(model_config.text.hidden_size)}));
356364
text_req.set_tensor(TIO::kVisualPosMask,
357365
make_zeros(ov::element::boolean, {batch, static_cast<size_t>(prompt_len)}));
358366
}
@@ -373,7 +381,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
373381

374382
ov::Tensor dec_vis, dec_vis_mask;
375383
if (m_text_uses_vl_ir) {
376-
dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
384+
dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)});
377385
dec_vis_mask = make_zeros(ov::element::boolean, {batch, 1});
378386
}
379387

@@ -458,6 +466,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
458466
const ov::Tensor& visual_pos_mask,
459467
const std::optional<std::vector<ov::Tensor>>& deepstack_embeds) {
460468
using TIO = ov::genai::modeling::models::Qwen3_5TextIO;
469+
auto model_config = std::get<modeling::models::Qwen3_5Config>(m_model_config);
461470

462471
const size_t batch = input_ids.get_shape()[0];
463472
const int64_t prompt_len = static_cast<int64_t>(input_ids.get_shape()[1]);
@@ -480,7 +489,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
480489
std::to_string(i);
481490
text_req.set_tensor(name, deepstack_embeds.value()[i]);
482491
}
483-
ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(m_model_config.text.hidden_size)});
492+
ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(model_config.text.hidden_size)});
484493
std::memset(prefill_audio_features.data(), 0, prefill_audio_features.get_byte_size());
485494
text_req.set_tensor("audio_features", prefill_audio_features);
486495
ov::Tensor prefill_audio_pos_mask(ov::element::boolean, {batch, input_ids.get_shape()[1]});
@@ -502,17 +511,17 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
502511
ov::Tensor step_mask = make_zeros(ov::element::i64, {batch, 1});
503512
for (size_t b = 0; b < batch; ++b) step_mask.data<int64_t>()[b] = 1;
504513

505-
ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
514+
ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)});
506515
ov::Tensor dec_vis_mask = make_zeros(ov::element::boolean, {batch, 1});
507516
ov::Tensor decode_audio_features =
508-
make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
517+
make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)});
509518
ov::Tensor decode_audio_pos_mask = make_zeros(ov::element::boolean, {batch, 1});
510519
std::vector<ov::Tensor> decode_deepstack;
511520
if (deepstack_embeds.has_value()) {
512521
decode_deepstack.reserve(deepstack_embeds.value().size());
513522
for (size_t i = 0; i < deepstack_embeds.value().size(); ++i) {
514523
decode_deepstack.push_back(
515-
make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)}));
524+
make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)}));
516525
}
517526
}
518527

@@ -605,6 +614,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
605614
const std::optional<ov::Tensor>& audio_embeds,
606615
const std::optional<ov::Tensor>& audio_pos_mask) {
607616
using TIO = ov::genai::modeling::models::Qwen3OmniTextIO;
617+
auto model_config = std::get<modeling::models::Qwen3OmniConfig>(m_model_config);
608618

609619
const size_t batch = input_ids.get_shape()[0];
610620
const int64_t prompt_len = static_cast<int64_t>(input_ids.get_shape()[1]);
@@ -621,6 +631,12 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
621631
if (visual_embeds.has_value() && visual_pos_mask.has_value()) {
622632
text_req.set_tensor(TIO::kVisualEmbeds, visual_embeds.value());
623633
text_req.set_tensor(TIO::kVisualPosMask, visual_pos_mask.value());
634+
} else{
635+
text_req.set_tensor(TIO::kVisualEmbeds,
636+
make_zeros(ov::element::f32, {batch, static_cast<size_t>(prompt_len),
637+
static_cast<size_t>(model_config.thinker.text.hidden_size)}));
638+
text_req.set_tensor(TIO::kVisualPosMask,
639+
make_zeros(ov::element::boolean, {batch, static_cast<size_t>(prompt_len)}));
624640
}
625641
if (deepstack_embeds.has_value()) {
626642
for (size_t i = 0; i < deepstack_embeds->size(); i++) {
@@ -629,12 +645,19 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
629645
std::to_string(i);
630646
text_req.set_tensor(name, deepstack_embeds.value()[i]);
631647
}
648+
} else {
649+
for (size_t i = 0; i < model_config.thinker.vision.deepstack_visual_indexes.size(); i++) {
650+
const std::string name =
651+
std::string(ov::genai::modeling::models::Qwen3VLTextIO::kDeepstackEmbedsPrefix) + "." +
652+
std::to_string(i);
653+
text_req.set_tensor(name, make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)}));
654+
}
632655
}
633656
if (audio_embeds.has_value() && audio_pos_mask.has_value()) {
634657
text_req.set_tensor(TIO::kAudioFeatures, audio_embeds.value());
635658
text_req.set_tensor(TIO::kAudioPosMask, audio_pos_mask.value());
636659
} else {
637-
ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(m_model_config.text.hidden_size)});
660+
ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(model_config.thinker.text.hidden_size)});
638661
std::memset(prefill_audio_features.data(), 0, prefill_audio_features.get_byte_size());
639662
text_req.set_tensor(TIO::kAudioFeatures, prefill_audio_features);
640663
ov::Tensor prefill_audio_pos_mask(ov::element::boolean, {batch, input_ids.get_shape()[1]});
@@ -656,18 +679,16 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
656679
ov::Tensor step_mask = make_zeros(ov::element::i64, {batch, 1});
657680
for (size_t b = 0; b < batch; ++b) step_mask.data<int64_t>()[b] = 1;
658681

659-
ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
682+
ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)});
660683
ov::Tensor dec_vis_mask = make_zeros(ov::element::boolean, {batch, 1});
661684
ov::Tensor decode_audio_features =
662-
make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
685+
make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)});
663686
ov::Tensor decode_audio_pos_mask = make_zeros(ov::element::boolean, {batch, 1});
664687
std::vector<ov::Tensor> decode_deepstack;
665-
if (deepstack_embeds.has_value()) {
666-
decode_deepstack.reserve(deepstack_embeds.value().size());
667-
for (size_t i = 0; i < deepstack_embeds.value().size(); ++i) {
668-
decode_deepstack.push_back(
669-
make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)}));
670-
}
688+
decode_deepstack.reserve(model_config.thinker.vision.deepstack_visual_indexes.size());
689+
for (size_t i = 0; i < model_config.thinker.vision.deepstack_visual_indexes.size(); ++i) {
690+
decode_deepstack.push_back(
691+
make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)}));
671692
}
672693

673694
int64_t past_len = prompt_len;
@@ -831,7 +852,7 @@ void LLMInferenceSDPAModule::run() {
831852
this->inputs.find("position_ids") != this->inputs.end() &&
832853
this->inputs.find("rope_delta") != this->inputs.end());
833854

834-
ov::genai::modeling::models::Qwen3_5InputPlanner planner(m_model_config);
855+
ov::genai::modeling::models::Qwen3_5InputPlanner planner(std::get<modeling::models::Qwen3_5Config>(m_model_config));
835856

836857
if (is_vl) {
837858
// ---- VL mode ----

src/cpp/src/module_genai/modules/md_llm_inference_sdpa.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "modeling/models/qwen3_5/modeling_qwen3_5_text.hpp"
1414
#include "modeling/models/qwen3_5/processing_qwen3_5.hpp"
1515
#include "modeling/weights/quantization_config.hpp"
16+
#include "modeling/models/qwen3_omni/processing_qwen3_omni.hpp"
1617

1718
namespace ov {
1819
namespace genai {
@@ -82,7 +83,7 @@ class LLMInferenceSDPAModule : public IBaseModule {
8283
std::set<int64_t> m_stop_ids;
8384

8485
// Model config
85-
ov::genai::modeling::models::Qwen3_5Config m_model_config;
86+
std::variant<ov::genai::modeling::models::Qwen3_5Config, modeling::models::Qwen3OmniConfig> m_model_config;
8687

8788
// Tokenizer (for text mode and decoding)
8889
std::unique_ptr<ov::genai::Tokenizer> m_tokenizer;

0 commit comments

Comments
 (0)