@@ -188,6 +188,7 @@ int64_t LLMInferenceSDPAModule::argmax_last(const ov::Tensor& logits) {
188188
189189bool LLMInferenceSDPAModule::initialize () {
190190 const auto & params = module_desc->params ;
191+ VLMModelType model_type = to_vlm_model_type (module_desc->model_type );
191192
192193 // Resolve model directory
193194 std::filesystem::path models_path = get_optional_param (" model_path" );
@@ -224,7 +225,10 @@ bool LLMInferenceSDPAModule::initialize() {
224225
225226 // Load model config
226227 try {
227- m_model_config = ov::genai::modeling::models::Qwen3_5Config::from_json_file (models_path);
228+ if (model_type == VLMModelType::QWEN3_5)
229+ m_model_config = ov::genai::modeling::models::Qwen3_5Config::from_json_file (models_path);
230+ else if (model_type == VLMModelType::QWEN3_OMNI)
231+ m_model_config = ov::genai::modeling::models::Qwen3OmniConfig::from_json_file (models_path);
228232 } catch (const std::exception& e) {
229233 GENAI_ERR (" Failed to load Qwen3.5 config from " + models_path.string () + " : " + e.what ());
230234 return false ;
@@ -311,8 +315,11 @@ bool LLMInferenceSDPAModule::initialize() {
311315 if (eid >= 0 ) m_stop_ids.insert (eid);
312316 } catch (...) {}
313317 }
314- if (m_model_config.text .eos_token_id > 0 ) {
315- m_stop_ids.insert (m_model_config.text .eos_token_id );
318+ if (model_type == VLMModelType::QWEN3_5) {
319+ auto & cfg = std::get<ov::genai::modeling::models::Qwen3_5Config>(m_model_config);
320+ if (cfg.text .eos_token_id > 0 ) {
321+ m_stop_ids.insert (cfg.text .eos_token_id );
322+ }
316323 }
317324 if (m_stop_ids.empty ()) {
318325 GENAI_INFO (" LLMInferenceSDPAModule: no stop token ids found — "
@@ -334,6 +341,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
334341 const ov::Tensor& position_ids,
335342 const ov::Tensor& rope_deltas) {
336343 using TIO = ov::genai::modeling::models::Qwen3_5TextIO;
344+ auto model_config = std::get<modeling::models::Qwen3_5Config>(m_model_config);
337345
338346 const size_t batch = input_ids.get_shape ()[0 ];
339347 const int64_t prompt_len = static_cast <int64_t >(input_ids.get_shape ()[1 ]);
@@ -352,7 +360,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
352360 // Feed zero visual inputs for text-only usage of VL IR
353361 text_req.set_tensor (TIO::kVisualEmbeds ,
354362 make_zeros (ov::element::f32 , {batch, static_cast <size_t >(prompt_len),
355- static_cast <size_t >(m_model_config .text .hidden_size )}));
363+ static_cast <size_t >(model_config .text .hidden_size )}));
356364 text_req.set_tensor (TIO::kVisualPosMask ,
357365 make_zeros (ov::element::boolean, {batch, static_cast <size_t >(prompt_len)}));
358366 }
@@ -373,7 +381,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
373381
374382 ov::Tensor dec_vis, dec_vis_mask;
375383 if (m_text_uses_vl_ir) {
376- dec_vis = make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(m_model_config .text .hidden_size )});
384+ dec_vis = make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(model_config .text .hidden_size )});
377385 dec_vis_mask = make_zeros (ov::element::boolean, {batch, 1 });
378386 }
379387
@@ -458,6 +466,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
458466 const ov::Tensor& visual_pos_mask,
459467 const std::optional<std::vector<ov::Tensor>>& deepstack_embeds) {
460468 using TIO = ov::genai::modeling::models::Qwen3_5TextIO;
469+ auto model_config = std::get<modeling::models::Qwen3_5Config>(m_model_config);
461470
462471 const size_t batch = input_ids.get_shape ()[0 ];
463472 const int64_t prompt_len = static_cast <int64_t >(input_ids.get_shape ()[1 ]);
@@ -480,7 +489,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
480489 std::to_string (i);
481490 text_req.set_tensor (name, deepstack_embeds.value ()[i]);
482491 }
483- ov::Tensor prefill_audio_features (ov::element::f32 , {batch, input_ids.get_shape ()[1 ], static_cast <size_t >(m_model_config .text .hidden_size )});
492+ ov::Tensor prefill_audio_features (ov::element::f32 , {batch, input_ids.get_shape ()[1 ], static_cast <size_t >(model_config .text .hidden_size )});
484493 std::memset (prefill_audio_features.data (), 0 , prefill_audio_features.get_byte_size ());
485494 text_req.set_tensor (" audio_features" , prefill_audio_features);
486495 ov::Tensor prefill_audio_pos_mask (ov::element::boolean, {batch, input_ids.get_shape ()[1 ]});
@@ -502,17 +511,17 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
502511 ov::Tensor step_mask = make_zeros (ov::element::i64 , {batch, 1 });
503512 for (size_t b = 0 ; b < batch; ++b) step_mask.data <int64_t >()[b] = 1 ;
504513
505- ov::Tensor dec_vis = make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(m_model_config .text .hidden_size )});
514+ ov::Tensor dec_vis = make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(model_config .text .hidden_size )});
506515 ov::Tensor dec_vis_mask = make_zeros (ov::element::boolean, {batch, 1 });
507516 ov::Tensor decode_audio_features =
508- make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(m_model_config .text .hidden_size )});
517+ make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(model_config .text .hidden_size )});
509518 ov::Tensor decode_audio_pos_mask = make_zeros (ov::element::boolean, {batch, 1 });
510519 std::vector<ov::Tensor> decode_deepstack;
511520 if (deepstack_embeds.has_value ()) {
512521 decode_deepstack.reserve (deepstack_embeds.value ().size ());
513522 for (size_t i = 0 ; i < deepstack_embeds.value ().size (); ++i) {
514523 decode_deepstack.push_back (
515- make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(m_model_config .text .hidden_size )}));
524+ make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(model_config .text .hidden_size )}));
516525 }
517526 }
518527
@@ -605,6 +614,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
605614 const std::optional<ov::Tensor>& audio_embeds,
606615 const std::optional<ov::Tensor>& audio_pos_mask) {
607616 using TIO = ov::genai::modeling::models::Qwen3OmniTextIO;
617+ auto model_config = std::get<modeling::models::Qwen3OmniConfig>(m_model_config);
608618
609619 const size_t batch = input_ids.get_shape ()[0 ];
610620 const int64_t prompt_len = static_cast <int64_t >(input_ids.get_shape ()[1 ]);
@@ -621,6 +631,12 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
621631 if (visual_embeds.has_value () && visual_pos_mask.has_value ()) {
622632 text_req.set_tensor (TIO::kVisualEmbeds , visual_embeds.value ());
623633 text_req.set_tensor (TIO::kVisualPosMask , visual_pos_mask.value ());
634+ } else {
635+ text_req.set_tensor (TIO::kVisualEmbeds ,
636+ make_zeros (ov::element::f32 , {batch, static_cast <size_t >(prompt_len),
637+ static_cast <size_t >(model_config.thinker .text .hidden_size )}));
638+ text_req.set_tensor (TIO::kVisualPosMask ,
639+ make_zeros (ov::element::boolean, {batch, static_cast <size_t >(prompt_len)}));
624640 }
625641 if (deepstack_embeds.has_value ()) {
626642 for (size_t i = 0 ; i < deepstack_embeds->size (); i++) {
@@ -629,12 +645,19 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
629645 std::to_string (i);
630646 text_req.set_tensor (name, deepstack_embeds.value ()[i]);
631647 }
648+ } else {
649+ for (size_t i = 0 ; i < model_config.thinker .vision .deepstack_visual_indexes .size (); i++) {
650+ const std::string name =
651+ std::string (ov::genai::modeling::models::Qwen3VLTextIO::kDeepstackEmbedsPrefix ) + " ." +
652+ std::to_string (i);
653+ text_req.set_tensor (name, make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(model_config.thinker .text .hidden_size )}));
654+ }
632655 }
633656 if (audio_embeds.has_value () && audio_pos_mask.has_value ()) {
634657 text_req.set_tensor (TIO::kAudioFeatures , audio_embeds.value ());
635658 text_req.set_tensor (TIO::kAudioPosMask , audio_pos_mask.value ());
636659 } else {
637- ov::Tensor prefill_audio_features (ov::element::f32 , {batch, input_ids.get_shape ()[1 ], static_cast <size_t >(m_model_config .text .hidden_size )});
660+ ov::Tensor prefill_audio_features (ov::element::f32 , {batch, input_ids.get_shape ()[1 ], static_cast <size_t >(model_config. thinker .text .hidden_size )});
638661 std::memset (prefill_audio_features.data (), 0 , prefill_audio_features.get_byte_size ());
639662 text_req.set_tensor (TIO::kAudioFeatures , prefill_audio_features);
640663 ov::Tensor prefill_audio_pos_mask (ov::element::boolean, {batch, input_ids.get_shape ()[1 ]});
@@ -656,18 +679,16 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
656679 ov::Tensor step_mask = make_zeros (ov::element::i64 , {batch, 1 });
657680 for (size_t b = 0 ; b < batch; ++b) step_mask.data <int64_t >()[b] = 1 ;
658681
659- ov::Tensor dec_vis = make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(m_model_config .text .hidden_size )});
682+ ov::Tensor dec_vis = make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(model_config. thinker .text .hidden_size )});
660683 ov::Tensor dec_vis_mask = make_zeros (ov::element::boolean, {batch, 1 });
661684 ov::Tensor decode_audio_features =
662- make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(m_model_config .text .hidden_size )});
685+ make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(model_config. thinker .text .hidden_size )});
663686 ov::Tensor decode_audio_pos_mask = make_zeros (ov::element::boolean, {batch, 1 });
664687 std::vector<ov::Tensor> decode_deepstack;
665- if (deepstack_embeds.has_value ()) {
666- decode_deepstack.reserve (deepstack_embeds.value ().size ());
667- for (size_t i = 0 ; i < deepstack_embeds.value ().size (); ++i) {
668- decode_deepstack.push_back (
669- make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(m_model_config.text .hidden_size )}));
670- }
688+ decode_deepstack.reserve (model_config.thinker .vision .deepstack_visual_indexes .size ());
689+ for (size_t i = 0 ; i < model_config.thinker .vision .deepstack_visual_indexes .size (); ++i) {
690+ decode_deepstack.push_back (
691+ make_zeros (ov::element::f32 , {batch, 1 , static_cast <size_t >(model_config.thinker .text .hidden_size )}));
671692 }
672693
673694 int64_t past_len = prompt_len;
@@ -831,7 +852,7 @@ void LLMInferenceSDPAModule::run() {
831852 this ->inputs .find (" position_ids" ) != this ->inputs .end () &&
832853 this ->inputs .find (" rope_delta" ) != this ->inputs .end ());
833854
834- ov::genai::modeling::models::Qwen3_5InputPlanner planner (m_model_config);
855+ ov::genai::modeling::models::Qwen3_5InputPlanner planner (std::get<modeling::models::Qwen3_5Config>( m_model_config) );
835856
836857 if (is_vl) {
837858 // ---- VL mode ----
0 commit comments