@@ -188,6 +188,7 @@ int64_t LLMInferenceSDPAModule::argmax_last(const ov::Tensor& logits) {
 
 bool LLMInferenceSDPAModule::initialize() {
     const auto& params = module_desc->params;
+    VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
 
     // Resolve model directory
     std::filesystem::path models_path = get_optional_param("model_path");
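Note: to_vlm_model_type is defined elsewhere in the module and not shown in this diff. A minimal sketch of the mapping it performs, where the enum values come from this patch but the exact string spellings are assumptions:

// Hypothetical sketch of the helper used above; the real spellings may differ.
enum class VLMModelType { QWEN3_5, QWEN3_OMNI, UNKNOWN };

static VLMModelType to_vlm_model_type(const std::string& s) {
    if (s == "qwen3.5") return VLMModelType::QWEN3_5;
    if (s == "qwen3_omni") return VLMModelType::QWEN3_OMNI;
    return VLMModelType::UNKNOWN;
}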
@@ -224,9 +225,16 @@ bool LLMInferenceSDPAModule::initialize() {
 
     // Load model config
     try {
-        m_model_config = ov::genai::modeling::models::Qwen3_5Config::from_json_file(models_path);
+        if (model_type == VLMModelType::QWEN3_5) {
+            m_model_config = ov::genai::modeling::models::Qwen3_5Config::from_json_file(models_path);
+        } else if (model_type == VLMModelType::QWEN3_OMNI) {
+            m_model_config = ov::genai::modeling::models::Qwen3OmniConfig::from_json_file(models_path);
+        } else {
+            GENAI_ERR("Unsupported model type: " + module_desc->model_type);
+            return false;
+        }
     } catch (const std::exception& e) {
-        GENAI_ERR("Failed to load Qwen3.5 config from " + models_path.string() + ": " + e.what());
+        GENAI_ERR("Failed to load model config from " + models_path.string() + ": " + e.what());
         return false;
     }
 
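Storing both config types in a single member implies m_model_config is now a variant; later hunks read it back with std::get. A self-contained sketch of the pattern, with simplified stand-in config structs (whether the Omni config exposes eos_token_id under thinker.text is an assumption):

#include <cstdint>
#include <variant>

// Stand-ins only; the real Qwen3_5Config / Qwen3OmniConfig carry far more fields.
struct Qwen3_5Config   { struct { int64_t eos_token_id; } text; };
struct Qwen3OmniConfig { struct { struct { int64_t eos_token_id; } text; } thinker; };

using ModelConfig = std::variant<Qwen3_5Config, Qwen3OmniConfig>;

int64_t eos_of(const ModelConfig& cfg) {
    // std::get throws std::bad_variant_access on the wrong alternative,
    // so check with std::holds_alternative first.
    if (std::holds_alternative<Qwen3_5Config>(cfg))
        return std::get<Qwen3_5Config>(cfg).text.eos_token_id;
    return std::get<Qwen3OmniConfig>(cfg).thinker.text.eos_token_id;
}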
@@ -311,8 +319,11 @@ bool LLMInferenceSDPAModule::initialize() {
             if (eid >= 0) m_stop_ids.insert(eid);
         } catch (...) {}
     }
-    if (m_model_config.text.eos_token_id > 0) {
-        m_stop_ids.insert(m_model_config.text.eos_token_id);
+    if (model_type == VLMModelType::QWEN3_5) {
+        auto& cfg = std::get<ov::genai::modeling::models::Qwen3_5Config>(m_model_config);
+        if (cfg.text.eos_token_id > 0) {
+            m_stop_ids.insert(cfg.text.eos_token_id);
+        }
     }
     if (m_stop_ids.empty()) {
         GENAI_INFO("LLMInferenceSDPAModule: no stop token ids found — "
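As written, the config-provided EOS id is only honored for QWEN3_5. If the Omni config carries the same field under thinker.text (an assumption, mirroring the hidden_size path used later in this patch), a std::visit dispatch would cover both alternatives without consulting model_type:

// Hypothetical generalization, not part of this patch.
std::visit([&](const auto& cfg) {
    using T = std::decay_t<decltype(cfg)>;
    if constexpr (std::is_same_v<T, ov::genai::modeling::models::Qwen3_5Config>) {
        if (cfg.text.eos_token_id > 0) m_stop_ids.insert(cfg.text.eos_token_id);
    } else {  // Qwen3OmniConfig; assumes thinker.text mirrors the text config.
        if (cfg.thinker.text.eos_token_id > 0) m_stop_ids.insert(cfg.thinker.text.eos_token_id);
    }
}, m_model_config);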
@@ -334,6 +345,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
                                                     const ov::Tensor& position_ids,
                                                     const ov::Tensor& rope_deltas) {
     using TIO = ov::genai::modeling::models::Qwen3_5TextIO;
+    const auto& model_config = std::get<modeling::models::Qwen3_5Config>(m_model_config);
 
     const size_t batch = input_ids.get_shape()[0];
     const int64_t prompt_len = static_cast<int64_t>(input_ids.get_shape()[1]);
@@ -352,7 +364,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
         // Feed zero visual inputs for text-only usage of VL IR
         text_req.set_tensor(TIO::kVisualEmbeds,
                             make_zeros(ov::element::f32, {batch, static_cast<size_t>(prompt_len),
-                                                          static_cast<size_t>(m_model_config.text.hidden_size)}));
+                                                          static_cast<size_t>(model_config.text.hidden_size)}));
         text_req.set_tensor(TIO::kVisualPosMask,
                             make_zeros(ov::element::boolean, {batch, static_cast<size_t>(prompt_len)}));
     }
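make_zeros is used throughout this file for zero-filled placeholder inputs, but its definition is not part of this diff. A plausible implementation in terms of the public ov::Tensor API, assuming the obvious semantics:

// Allocate a tensor of the requested element type/shape, then zero its buffer.
static ov::Tensor make_zeros(ov::element::Type type, const ov::Shape& shape) {
    ov::Tensor t(type, shape);
    std::memset(t.data(), 0, t.get_byte_size());
    return t;
}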
@@ -373,7 +385,7 @@ std::string LLMInferenceSDPAModule::run_text_decode(const ov::Tensor& input_ids,
 
     ov::Tensor dec_vis, dec_vis_mask;
     if (m_text_uses_vl_ir) {
-        dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+        dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)});
         dec_vis_mask = make_zeros(ov::element::boolean, {batch, 1});
     }
 
@@ -458,6 +470,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
                                                   const ov::Tensor& visual_pos_mask,
                                                   const std::optional<std::vector<ov::Tensor>>& deepstack_embeds) {
     using TIO = ov::genai::modeling::models::Qwen3_5TextIO;
+    const auto& model_config = std::get<modeling::models::Qwen3_5Config>(m_model_config);
 
     const size_t batch = input_ids.get_shape()[0];
     const int64_t prompt_len = static_cast<int64_t>(input_ids.get_shape()[1]);
@@ -480,7 +493,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
                 std::to_string(i);
             text_req.set_tensor(name, deepstack_embeds.value()[i]);
         }
-        ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(m_model_config.text.hidden_size)});
+        ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(model_config.text.hidden_size)});
         std::memset(prefill_audio_features.data(), 0, prefill_audio_features.get_byte_size());
         text_req.set_tensor("audio_features", prefill_audio_features);
         ov::Tensor prefill_audio_pos_mask(ov::element::boolean, {batch, input_ids.get_shape()[1]});
@@ -502,17 +515,17 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
     ov::Tensor step_mask = make_zeros(ov::element::i64, {batch, 1});
     for (size_t b = 0; b < batch; ++b) step_mask.data<int64_t>()[b] = 1;
 
-    ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+    ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)});
     ov::Tensor dec_vis_mask = make_zeros(ov::element::boolean, {batch, 1});
     ov::Tensor decode_audio_features =
-        make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+        make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)});
     ov::Tensor decode_audio_pos_mask = make_zeros(ov::element::boolean, {batch, 1});
     std::vector<ov::Tensor> decode_deepstack;
     if (deepstack_embeds.has_value()) {
         decode_deepstack.reserve(deepstack_embeds.value().size());
         for (size_t i = 0; i < deepstack_embeds.value().size(); ++i) {
             decode_deepstack.push_back(
-                make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)}));
+                make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.text.hidden_size)}));
         }
     }
 
@@ -605,6 +618,7 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
                                                   const std::optional<ov::Tensor>& audio_embeds,
                                                   const std::optional<ov::Tensor>& audio_pos_mask) {
     using TIO = ov::genai::modeling::models::Qwen3OmniTextIO;
+    const auto& model_config = std::get<modeling::models::Qwen3OmniConfig>(m_model_config);
 
     const size_t batch = input_ids.get_shape()[0];
     const int64_t prompt_len = static_cast<int64_t>(input_ids.get_shape()[1]);
@@ -621,20 +635,33 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
     if (visual_embeds.has_value() && visual_pos_mask.has_value()) {
         text_req.set_tensor(TIO::kVisualEmbeds, visual_embeds.value());
         text_req.set_tensor(TIO::kVisualPosMask, visual_pos_mask.value());
+    } else {
+        text_req.set_tensor(TIO::kVisualEmbeds,
+                            make_zeros(ov::element::f32, {batch, static_cast<size_t>(prompt_len),
+                                                          static_cast<size_t>(model_config.thinker.text.hidden_size)}));
+        text_req.set_tensor(TIO::kVisualPosMask,
+                            make_zeros(ov::element::boolean, {batch, static_cast<size_t>(prompt_len)}));
     }
     if (deepstack_embeds.has_value()) {
         for (size_t i = 0; i < deepstack_embeds->size(); i++) {
             const std::string name =
-                std::string(ov::genai::modeling::models::Qwen3VLTextIO::kDeepstackEmbedsPrefix) + "." +
+                std::string(ov::genai::modeling::models::Qwen3OmniVisionIO::kDeepstackEmbedsPrefix) + "." +
                 std::to_string(i);
             text_req.set_tensor(name, deepstack_embeds.value()[i]);
         }
+    } else {
+        for (size_t i = 0; i < model_config.thinker.vision.deepstack_visual_indexes.size(); i++) {
+            const std::string name =
+                std::string(ov::genai::modeling::models::Qwen3OmniVisionIO::kDeepstackEmbedsPrefix) + "." +
+                std::to_string(i);
+            text_req.set_tensor(name, make_zeros(ov::element::f32, {batch, static_cast<size_t>(prompt_len), static_cast<size_t>(model_config.thinker.text.hidden_size)}));
+        }
     }
     if (audio_embeds.has_value() && audio_pos_mask.has_value()) {
         text_req.set_tensor(TIO::kAudioFeatures, audio_embeds.value());
         text_req.set_tensor(TIO::kAudioPosMask, audio_pos_mask.value());
     } else {
-        ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(m_model_config.text.hidden_size)});
+        ov::Tensor prefill_audio_features(ov::element::f32, {batch, input_ids.get_shape()[1], static_cast<size_t>(model_config.thinker.text.hidden_size)});
         std::memset(prefill_audio_features.data(), 0, prefill_audio_features.get_byte_size());
         text_req.set_tensor(TIO::kAudioFeatures, prefill_audio_features);
         ov::Tensor prefill_audio_pos_mask(ov::element::boolean, {batch, input_ids.get_shape()[1]});
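The deepstack else branch above repeats one pattern per level: build the indexed input name from kDeepstackEmbedsPrefix and feed zeros. A hypothetical helper (not in this patch) that captures it; seq_len would be prompt_len during prefill:

static void set_zero_deepstack(ov::InferRequest& req, size_t levels,
                               size_t batch, size_t seq_len, size_t hidden_size) {
    for (size_t i = 0; i < levels; ++i) {
        const std::string name =
            std::string(ov::genai::modeling::models::Qwen3OmniVisionIO::kDeepstackEmbedsPrefix) +
            "." + std::to_string(i);
        // Same zero-filled placeholder the inline code builds per level.
        req.set_tensor(name, make_zeros(ov::element::f32, {batch, seq_len, hidden_size}));
    }
}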
@@ -656,18 +683,16 @@ std::string LLMInferenceSDPAModule::run_vl_decode(const ov::Tensor& input_ids,
     ov::Tensor step_mask = make_zeros(ov::element::i64, {batch, 1});
     for (size_t b = 0; b < batch; ++b) step_mask.data<int64_t>()[b] = 1;
 
-    ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+    ov::Tensor dec_vis = make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)});
     ov::Tensor dec_vis_mask = make_zeros(ov::element::boolean, {batch, 1});
     ov::Tensor decode_audio_features =
-        make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)});
+        make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)});
     ov::Tensor decode_audio_pos_mask = make_zeros(ov::element::boolean, {batch, 1});
     std::vector<ov::Tensor> decode_deepstack;
-    if (deepstack_embeds.has_value()) {
-        decode_deepstack.reserve(deepstack_embeds.value().size());
-        for (size_t i = 0; i < deepstack_embeds.value().size(); ++i) {
-            decode_deepstack.push_back(
-                make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(m_model_config.text.hidden_size)}));
-        }
+    decode_deepstack.reserve(model_config.thinker.vision.deepstack_visual_indexes.size());
+    for (size_t i = 0; i < model_config.thinker.vision.deepstack_visual_indexes.size(); ++i) {
+        decode_deepstack.push_back(
+            make_zeros(ov::element::f32, {batch, 1, static_cast<size_t>(model_config.thinker.text.hidden_size)}));
     }
 
     int64_t past_len = prompt_len;
@@ -831,7 +856,7 @@ void LLMInferenceSDPAModule::run() {
             this->inputs.find("position_ids") != this->inputs.end() &&
             this->inputs.find("rope_delta") != this->inputs.end());
 
-    ov::genai::modeling::models::Qwen3_5InputPlanner planner(m_model_config);
+    ov::genai::modeling::models::Qwen3_5InputPlanner planner(std::get<modeling::models::Qwen3_5Config>(m_model_config));
 
     if (is_vl) {
         // ---- VL mode ----
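One caveat: std::get throws std::bad_variant_access if m_model_config holds a Qwen3OmniConfig by the time run() reaches this line. A defensive guard, sketched here as a hypothetical addition rather than something this patch does:

if (!std::holds_alternative<modeling::models::Qwen3_5Config>(m_model_config)) {
    GENAI_ERR("run(): Qwen3_5InputPlanner requires a Qwen3.5 config");
    return;  // run() returns void, so bail out before planning.
}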