@@ -311,6 +311,7 @@ ov::Output<ov::Node> get_max_token_ids(const ov::Output<ov::Node>& logits) {
// Model AR output: logits[b, token_num, vocab_size] (from which we can get the predicted token id)
// Model SCE input: codec_input[b, token_num] (the predicted token ids from AR)
// Model SCE output: codec_embed[b, token_num, feature_dim] (the embedding of the predicted token)
+// Merged model return: codec_embed[batch, token_num, feature_dim], all_layer_token_id[batch, 15] (the predicted token ids at all steps)
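+// Conceptual dataflow of one merged step (a sketch, not the exact graph): logits = AR(inputs_embeds);
+// token_id = get_max_token_ids(logits); codec_embed = SCE(token_id); inputs_embeds = concat(inputs_embeds, codec_embed) on axis 1.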
std::shared_ptr<ov::Model> merge_ar_sce_model(std::shared_ptr<ov::Model>& ar_model, std::shared_ptr<ov::Model>& sce_model, const int& step) {
    auto inputs_embeds = ar_model->get_parameters().at(0);
    const ov::PartialShape& inputs_embeds_shape = inputs_embeds->get_partial_shape();
@@ -355,14 +356,27 @@ std::shared_ptr<ov::Model> merge_neighbor_models(std::shared_ptr<ov::Model>& mod

    auto model_1_inputs = model_1->get_parameters();

-    auto inputs_embeds = model_1_inputs.at(0);
-    auto input_current_layer_tokens = model_1_inputs.at(1);
+    auto model_1_inputs_embeds = model_1_inputs.at(0);
+    auto model_1_input_current_layer_tokens = model_1_inputs.at(1);
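+    // model_1's two results are the predicted-token embeddings and the predicted token ids; grab both so they can be wired into model_2.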
+    auto model_1_output_embeddings = model_1->get_results()[0]->input_value(0);
+    auto model_1_output_layer_tokens = model_1->get_results()[1]->input_value(0);

-    model_2->inputs()[0].replace(inputs_embeds);
-    model_2->inputs()[1].replace(input_current_layer_tokens);
+    // Append model_1's output embeddings to model_1's input inputs_embeds, and feed the result as model_2's inputs_embeds.
+    auto merged_inputs_embeds = std::make_shared<ov::op::v0::Concat>(
+        ov::OutputVector{model_1_inputs_embeds->output(0), model_1_output_embeddings},
+        1);
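+    // The concat axis is 1, i.e. the token axis of [batch, token_num, feature_dim], so model_2 sees the full embedding history plus model_1's new prediction.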

-    return std::make_shared<ov::Model>(ov::ResultVector{model_2->get_results()[0], model_2->get_results()[1]},
-                                       ov::ParameterVector{inputs_embeds, input_current_layer_tokens},
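+    // Rewire model_2: its inputs_embeds parameter is replaced by the concatenated embeddings, and its current-layer tokens come from model_1's predicted tokens.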
+    model_2->inputs()[0].replace(merged_inputs_embeds);
+    model_2->inputs()[1].replace(model_1_output_layer_tokens);
+
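+    // Concatenate model_2's output embeddings onto the merged sequence; this becomes result 0 of the merged model, so a subsequent merge_neighbor_models call can keep chaining.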
+    auto model_2_output_embeddings = model_2->get_results()[0]->input_value(0);
+    auto merged_2_outputs_embeddings =
+        std::make_shared<ov::op::v0::Concat>(ov::OutputVector{merged_inputs_embeds, model_2_output_embeddings}, 1);
+
+    auto merged_2_outputs_embeddings_result = std::make_shared<ov::op::v0::Result>(merged_2_outputs_embeddings);
+
+    return std::make_shared<ov::Model>(ov::ResultVector{merged_2_outputs_embeddings_result, model_2->get_results()[1]},
+                                       ov::ParameterVector{model_1_inputs_embeds, model_1_input_current_layer_tokens},
                                       "merged_model");
};

@@ -384,6 +398,10 @@ void TextToSpeechImpl_Qwen3Omni::merge_code_predictor_ov_models(std::vector<std:

    m_merged_infer_request = std::make_unique<ov::InferRequest>(
        ::ov::genai::utils::singleton_core().compile_model(merged_model, m_device).create_infer_request());
+    m_enable_merge_ov_models = true;
+    m_cp_steps = ar_models.size();
+    GENAI_INFO("Finished merging code predictor AR and SCE models into one OV model with " +
+               std::to_string(m_cp_steps) + " steps. The merged model will be used for inference.");
}

void TextToSpeechImpl_Qwen3Omni::run() {
@@ -435,6 +453,60 @@ void TextToSpeechImpl_Qwen3Omni::calc_tts_pad_embed() {
}

std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers_merged_ov(
+    int cp_steps,
+    std::vector<float>& autoregressive_sequence,
+    size_t batch,
+    size_t hidden_size,
+    size_t cp_vocab_size,
+    std::vector<std::vector<int64_t>>& all_layer_tokens,
+    int num_layers_total) {
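+    // Single-shot path: one inference of the merged model replaces the per-step AR/SCE inferences of the fallback C++ implementation below.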
+    std::vector<int64_t> current_layer_tokens(batch * cp_steps, 0);  // shape=[batch*cp_steps]
+    ov::Tensor current_layer_tokens_tensor(ov::element::i64,
+                                           {batch, static_cast<size_t>(cp_steps)},
+                                           current_layer_tokens.data());
+
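+    // The AR input is the flat embedding history viewed as [batch, current_length, hidden_size]; both input tensors wrap existing host buffers without copying.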
+    const size_t current_length = autoregressive_sequence.size() / hidden_size;
+    ov::Tensor ar_input(ov::element::f32, {batch, current_length, hidden_size}, autoregressive_sequence.data());
+
+    m_merged_infer_request->set_input_tensor(0, ar_input);
+    m_merged_infer_request->set_input_tensor(1, current_layer_tokens_tensor);
+    {
+        PROFILE(pm, "m_merged_infer_request infer");
+        m_merged_infer_request->infer();
+    }
+
+    auto merged_outputs = m_merged_infer_request->get_output_tensor(0);       // shape=[batch, cp_steps, hidden_size]
+    auto layer_tokens_output = m_merged_infer_request->get_output_tensor(1);  // shape=[batch, cp_steps]
+
+    // Copy layer_tokens_output back into current_layer_tokens.
+    OPENVINO_ASSERT(layer_tokens_output.get_shape()[0] == batch &&
+                        layer_tokens_output.get_shape()[1] == static_cast<size_t>(cp_steps),
+                    "Merged model output shape mismatch. Expected [batch, cp_steps], got " +
+                        std::to_string(layer_tokens_output.get_shape()[0]) + ", " +
+                        std::to_string(layer_tokens_output.get_shape()[1]));
+    const int64_t* layer_tokens_ptr = layer_tokens_output.data<int64_t>();
+    std::copy(layer_tokens_ptr, layer_tokens_ptr + batch * cp_steps, current_layer_tokens.begin());
+
+    // Scatter the per-step tokens into all_layer_tokens (the token predicted at step s fills layer s + 1).
+    for (size_t b = 0; b < batch; ++b) {
+        for (int step = 0; step < cp_steps; ++step) {
+            int64_t layer_token = current_layer_tokens[b * cp_steps + step];
+            if (step + 1 < num_layers_total) {
+                all_layer_tokens[step + 1].push_back(layer_token);
+            }
+        }
+    }
+
+    // Append the predicted embeddings to the autoregressive sequence.
+    const float* merged_emb_ptr = merged_outputs.data<float>();
+    autoregressive_sequence.insert(autoregressive_sequence.end(),
+                                   merged_emb_ptr,
+                                   merged_emb_ptr + batch * cp_steps * hidden_size);
+
+    return current_layer_tokens;
+}
+
+std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers(
    int cp_steps,
    std::vector<float>& autoregressive_sequence,
    size_t batch,
@@ -446,20 +518,30 @@ std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers_merged
    std::mt19937& rng,
    std::vector<std::vector<int64_t>>& all_layer_tokens,
    int num_layers_total) {
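+    // Prefer the single-call merged OV model when it has been built; note that temperature/top_k/top_p/rng are not used on that path.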
-    return code_predictor_ar_infers(cp_steps,
-                                    autoregressive_sequence,
-                                    batch,
-                                    hidden_size,
-                                    cp_vocab_size,
-                                    temperature,
-                                    top_k,
-                                    top_p,
-                                    rng,
-                                    all_layer_tokens,
-                                    num_layers_total);
+    if (m_enable_merge_ov_models) {
+        return code_predictor_ar_infers_merged_ov(cp_steps,
+                                                  autoregressive_sequence,
+                                                  batch,
+                                                  hidden_size,
+                                                  cp_vocab_size,
+                                                  all_layer_tokens,
+                                                  num_layers_total);
+    }
+    // Fall back to the original C++ implementation if the merged OV model is not available for inference.
+    return code_predictor_ar_infers_cpp(cp_steps,
+                                        autoregressive_sequence,
+                                        batch,
+                                        hidden_size,
+                                        cp_vocab_size,
+                                        temperature,
+                                        top_k,
+                                        top_p,
+                                        rng,
+                                        all_layer_tokens,
+                                        num_layers_total);
}

-std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers(
+std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers_cpp(
    int cp_steps,
    std::vector<float>& autoregressive_sequence,
    size_t batch,
@@ -528,8 +610,6 @@ std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers(
}

std::pair<ov::Tensor, int> TextToSpeechImpl_Qwen3Omni::qwen3_omni_text_to_speech(const std::string& text) {
-
-
    // --- Tokenize text ---
    auto tok_result = m_tokenizer->encode(text, ov::genai::add_special_tokens(false));
    auto tok_ids_tensor = tok_result.input_ids;