Skip to content

Commit 9f47613

Browse files
committed
ov path, test pass. but audio is strange.
Signed-off-by: xipingya <xiping.yan@intel.com>
1 parent 408fa98 commit 9f47613

File tree

2 files changed

+113
-26
lines changed

2 files changed

+113
-26
lines changed

src/cpp/src/module_genai/modules/md_text_to_speech/models/qwen3_omni.cpp

Lines changed: 100 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,7 @@ ov::Output<ov::Node> get_max_token_ids(const ov::Output<ov::Node>& logits) {
311311
// Model AR output: logits[b, token_num, vocab_size] (from where we can get the predicted token id)
312312
// Model SCE input: codec_input[b, token_num] (the predicted token ids from AR)
313313
// Model SCE output: codec_embed[b, token_num, feature_dim] (the embedding of predicted token)
314+
// Merged model return: codec_embed[batch, token_num, feature_dim], all_layer_token_id[batch, 15] (the predicted token ids at all steps)
314315
std::shared_ptr<ov::Model> merge_ar_sce_model(std::shared_ptr<ov::Model>& ar_model, std::shared_ptr<ov::Model>& sce_model, const int& step) {
315316
auto inputs_embeds = ar_model->get_parameters().at(0);
316317
const ov::PartialShape& inputs_embeds_shape = inputs_embeds->get_partial_shape();
@@ -355,14 +356,27 @@ std::shared_ptr<ov::Model> merge_neighbor_models(std::shared_ptr<ov::Model>& mod
355356

356357
auto model_1_inputs = model_1->get_parameters();
357358

358-
auto inputs_embeds = model_1_inputs.at(0);
359-
auto input_current_layer_tokens = model_1_inputs.at(1);
359+
auto model_1_inputs_embeds = model_1_inputs.at(0);
360+
auto model_1_input_current_layer_tokens = model_1_inputs.at(1);
361+
auto model_1_output_embeddings = model_1->get_results()[0]->input_value(0);
362+
auto model_1_output_layer_tokens = model_1->get_results()[1]->input_value(0);
360363

361-
model_2->inputs()[0].replace(inputs_embeds);
362-
model_2->inputs()[1].replace(input_current_layer_tokens);
364+
// Append model_1's output Embeddings to model_1's input inputs_embeds, and then take it as model_2's inputs_embeds.
365+
auto merged_inputs_embeds = std::make_shared<ov::op::v0::Concat>(
366+
ov::OutputVector{model_1_inputs_embeds->output(0), model_1_output_embeddings},
367+
1);
363368

364-
return std::make_shared<ov::Model>(ov::ResultVector{model_2->get_results()[0], model_2->get_results()[1]},
365-
ov::ParameterVector{inputs_embeds, input_current_layer_tokens},
369+
model_2->inputs()[0].replace(merged_inputs_embeds);
370+
model_2->inputs()[1].replace(model_1_output_layer_tokens);
371+
372+
auto model_2_output_embeddings = model_2->get_results()[0]->input_value(0);
373+
auto merged_2_outputs_embeddings =
374+
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{merged_inputs_embeds, model_2_output_embeddings}, 1);
375+
376+
auto merged_2_outputs_embeddings_result = std::make_shared<ov::op::v0::Result>(merged_2_outputs_embeddings);
377+
378+
return std::make_shared<ov::Model>(ov::ResultVector{merged_2_outputs_embeddings_result, model_2->get_results()[1]},
379+
ov::ParameterVector{model_1_inputs_embeds, model_1_input_current_layer_tokens},
366380
"merged_model");
367381
};
368382

@@ -384,6 +398,10 @@ void TextToSpeechImpl_Qwen3Omni::merge_code_predictor_ov_models(std::vector<std:
384398

385399
m_merged_infer_request = std::make_unique<ov::InferRequest>(
386400
::ov::genai::utils::singleton_core().compile_model(merged_model, m_device).create_infer_request());
401+
m_enable_merge_ov_models = true;
402+
m_cp_steps = ar_models.size();
403+
GENAI_INFO("Finished merging code predictor AR and SCE models into one OV model with " +
404+
std::to_string(m_cp_steps) + " steps. Will use merged model for inference.");
387405
}
388406

389407
void TextToSpeechImpl_Qwen3Omni::run() {
@@ -435,6 +453,60 @@ void TextToSpeechImpl_Qwen3Omni::calc_tts_pad_embed() {
435453
}
436454

437455
std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers_merged_ov(
456+
int cp_steps,
457+
std::vector<float>& autoregressive_sequence,
458+
size_t batch,
459+
size_t hidden_size,
460+
size_t cp_vocab_size,
461+
std::vector<std::vector<int64_t>>& all_layer_tokens,
462+
int num_layers_total) {
463+
std::vector<int64_t> current_layer_tokens(batch * cp_steps, 0); // shape=[batch*cp_steps]
464+
ov::Tensor current_layer_tokens_tensor(ov::element::i64,
465+
{batch, static_cast<size_t>(cp_steps)},
466+
current_layer_tokens.data());
467+
468+
const size_t current_length = autoregressive_sequence.size() / hidden_size;
469+
ov::Tensor ar_input(ov::element::f32, {batch, current_length, hidden_size}, autoregressive_sequence.data());
470+
471+
m_merged_infer_request->set_input_tensor(0, ar_input);
472+
m_merged_infer_request->set_input_tensor(1, current_layer_tokens_tensor);
473+
{
474+
PROFILE(pm, "m_merged_infer_request infer");
475+
m_merged_infer_request->infer();
476+
}
477+
478+
auto merged_outputs = m_merged_infer_request->get_output_tensor(0); // shape=[batch, cp_steps, hidden_size]
479+
auto layer_tokens_output = m_merged_infer_request->get_output_tensor(1); // shape=[batch, cp_steps]
480+
481+
// Return layer_tokens_output to current_layer_tokens
482+
OPENVINO_ASSERT(layer_tokens_output.get_shape()[0] == batch &&
483+
layer_tokens_output.get_shape()[1] == static_cast<size_t>(cp_steps),
484+
"Merged model output shape mismatch. Expected [batch, cp_steps], got " +
485+
std::to_string(layer_tokens_output.get_shape()[0]) + ", " +
486+
std::to_string(layer_tokens_output.get_shape()[1]));
487+
const int64_t* layer_tokens_ptr = layer_tokens_output.data<int64_t>();
488+
std::copy(layer_tokens_ptr, layer_tokens_ptr + batch * cp_steps, current_layer_tokens.begin());
489+
490+
// Return to all_layer_tokens
491+
for (size_t b = 0; b < batch; ++b) {
492+
for (int step = 0; step < cp_steps; ++step) {
493+
int64_t layer_token = current_layer_tokens[b * cp_steps + step];
494+
if (step + 1 < num_layers_total) {
495+
all_layer_tokens[step + 1].push_back(layer_token);
496+
}
497+
}
498+
}
499+
500+
// Return to autoregressive_sequence
501+
const float* merged_emb_ptr = merged_outputs.data<float>();
502+
autoregressive_sequence.insert(autoregressive_sequence.end(),
503+
merged_emb_ptr,
504+
merged_emb_ptr + batch * cp_steps * hidden_size);
505+
506+
return current_layer_tokens;
507+
}
508+
509+
std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers(
438510
int cp_steps,
439511
std::vector<float>& autoregressive_sequence,
440512
size_t batch,
@@ -446,20 +518,30 @@ std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers_merged
446518
std::mt19937& rng,
447519
std::vector<std::vector<int64_t>>& all_layer_tokens,
448520
int num_layers_total) {
449-
return code_predictor_ar_infers(cp_steps,
450-
autoregressive_sequence,
451-
batch,
452-
hidden_size,
453-
cp_vocab_size,
454-
temperature,
455-
top_k,
456-
top_p,
457-
rng,
458-
all_layer_tokens,
459-
num_layers_total);
521+
if (m_enable_merge_ov_models) {
522+
return code_predictor_ar_infers_merged_ov(cp_steps,
523+
autoregressive_sequence,
524+
batch,
525+
hidden_size,
526+
cp_vocab_size,
527+
all_layer_tokens,
528+
num_layers_total);
529+
}
530+
// Fallback to the original cpp implementation if the merged OV model is not available for inference.
531+
return code_predictor_ar_infers_cpp(cp_steps,
532+
autoregressive_sequence,
533+
batch,
534+
hidden_size,
535+
cp_vocab_size,
536+
temperature,
537+
top_k,
538+
top_p,
539+
rng,
540+
all_layer_tokens,
541+
num_layers_total);
460542
}
461543

462-
std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers(
544+
std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers_cpp(
463545
int cp_steps,
464546
std::vector<float>& autoregressive_sequence,
465547
size_t batch,
@@ -528,8 +610,6 @@ std::vector<int64_t> TextToSpeechImpl_Qwen3Omni::code_predictor_ar_infers(
528610
}
529611

530612
std::pair<ov::Tensor, int> TextToSpeechImpl_Qwen3Omni::qwen3_omni_text_to_speech(const std::string& text) {
531-
532-
533613
// --- Tokenize text ---
534614
auto tok_result = m_tokenizer->encode(text, ov::genai::add_special_tokens(false));
535615
auto tok_ids_tensor = tok_result.input_ids;

src/cpp/src/module_genai/modules/md_text_to_speech/models/qwen3_omni.hpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,17 @@ class TextToSpeechImpl_Qwen3Omni : public TextToSpeechModule {
4343
std::mt19937& rng,
4444
std::vector<std::vector<int64_t>>& all_layer_tokens,
4545
int num_layers_total);
46+
std::vector<int64_t> code_predictor_ar_infers_cpp(int cp_steps,
47+
std::vector<float>& autoregressive_sequence,
48+
size_t batch,
49+
size_t hidden_size,
50+
size_t cp_vocab_size,
51+
float temperature,
52+
size_t top_k,
53+
float top_p,
54+
std::mt19937& rng,
55+
std::vector<std::vector<int64_t>>& all_layer_tokens,
56+
int num_layers_total);
4657

4758
std::pair<ov::Tensor, int> qwen3_omni_text_to_speech(const std::string& text);
4859

@@ -53,8 +64,8 @@ class TextToSpeechImpl_Qwen3Omni : public TextToSpeechModule {
5364
modeling::models::Qwen3TTSCodePredictorConfig m_cp_cfg;
5465
int m_cp_steps = 15;
5566

56-
bool m_merge_ov_models = false;
57-
std::unique_ptr<ov::InferRequest> m_merged_infer_request = nullptr; // Only used when m_merge_ov_models is true
67+
bool m_enable_merge_ov_models = false;
68+
std::unique_ptr<ov::InferRequest> m_merged_infer_request = nullptr; // Only used when m_enable_merge_ov_models is true
5869

5970
void load_code_predictor_models(const ov::AnyMap& tts_props);
6071
void merge_code_predictor_ov_models(std::vector<std::shared_ptr<ov::Model>>& ar_models,
@@ -64,10 +75,6 @@ class TextToSpeechImpl_Qwen3Omni : public TextToSpeechModule {
6475
size_t batch,
6576
size_t hidden_size,
6677
size_t cp_vocab_size,
67-
float temperature,
68-
size_t top_k,
69-
float top_p,
70-
std::mt19937& rng,
7178
std::vector<std::vector<int64_t>>& all_layer_tokens,
7279
int num_layers_total);
7380
};

0 commit comments

Comments
 (0)