Add Qwen 3.5 cpp pipeline sample

ZiniuLin · ZiniuLin · commit 863ae6fab641 · 2026-03-04T10:54:42.000+08:00
Add Qwen 3.5 cpp pipeline sample.

Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
diff --git a/samples/cpp/module_genai/config_yaml/Qwen3.5-0.8B/config.yaml b/samples/cpp/module_genai/config_yaml/Qwen3.5-0.8B/config.yaml
@@ -0,0 +1,126 @@
+global_context:
+  model_type: "qwen3_5"
+
+pipeline_modules:
+  pipeline_params:
+    type: "ParameterModule"
+    outputs:
+      - name: "image"
+        type: "OVTensor"
+      - name: "prompt"
+        type: "String"
+
+  image_preprocessor:
+    type: "ImagePreprocessModule"
+    device: "GPU"
+    description: "Image or Video preprocessing."
+    inputs:
+      - name: "image"
+        type: "OVTensor"
+        source: "pipeline_params.image"
+    outputs:
+      - name: "pixel_values"
+        type: "OVTensor"
+      - name: "grid_thw"
+        type: "OVTensor"
+      - name: "pos_embeds"
+        type: "OVTensor"
+      - name: "rotary_cos"
+        type: "OVTensor"
+      - name: "rotary_sin"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
+
+  prompt_encoder:
+    type: "TextEncoderModule"
+    device: "GPU"
+    inputs:
+      - name: "prompt"
+        type: "String"
+        source: "pipeline_params.prompt"
+      - name: "grid_thw"
+        type: "OVTensor"
+        source: "image_preprocessor.grid_thw"
+    outputs:
+      - name: "input_ids"
+        type: "OVTensor"
+      - name: "mask"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
+
+  vision_encoder:
+    type: "VisionEncoderModule"
+    device: "GPU"
+    inputs:
+      - name: "preprocessed_image"
+        type: "OVTensor"
+        source: "image_preprocessor.pixel_values"
+      - name: "grid_thw"
+        type: "OVTensor"
+        source: "image_preprocessor.grid_thw"
+      - name: "pos_embeds"
+        type: "OVTensor"
+        source: "image_preprocessor.pos_embeds"
+      - name: "rotary_cos"
+        type: "OVTensor"
+        source: "image_preprocessor.rotary_cos"
+      - name: "rotary_sin"
+        type: "OVTensor"
+        source: "image_preprocessor.rotary_sin"
+      - name: "input_ids"
+        type: "OVTensor"
+        source: "prompt_encoder.input_ids"
+      - name: "attention_mask"
+        type: "OVTensor"
+        source: "prompt_encoder.mask"
+    outputs:
+      - name: "image_embedding"
+        type: "OVTensor"
+      - name: "visual_pos_mask"
+        type: "OVTensor"
+      - name: "position_ids"
+        type: "OVTensor"
+      - name: "rope_delta"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
+      vision_start_token_id: 248053
+
+  llm:
+    type: "LLMInferenceSDPAModule"
+    device: "GPU"
+    inputs:
+      - name: "input_ids"
+        type: "OVTensor"
+        source: "prompt_encoder.input_ids"
+      - name: "visual_embeds"
+        type: "OVTensor"
+        source: "vision_encoder.image_embedding"
+      - name: "visual_pos_mask"
+        type: "OVTensor"
+        source: "vision_encoder.visual_pos_mask"
+      - name: "grid_thw"
+        type: "OVTensor"
+        source: "image_preprocessor.grid_thw"
+      - name: "position_ids"
+        type: "OVTensor"
+        source: "vision_encoder.position_ids"
+      - name: "rope_delta"
+        type: "OVTensor"
+        source: "vision_encoder.rope_delta"
+    outputs:
+      - name: "generated_text"
+        type: "String"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
+      max_new_tokens: 512
+
+  pipeline_result:
+    type: "ResultModule"
+    description: "Collects final results and formats the output structure."
+    inputs:
+      - name: "generated_text"
+        type: "String"
+        source: "llm.generated_text"
diff --git a/samples/cpp/module_genai/config_yaml/Qwen3.5-0.8B/config_text.yaml b/samples/cpp/module_genai/config_yaml/Qwen3.5-0.8B/config_text.yaml
@@ -0,0 +1,46 @@
+global_context:
+  model_type: "qwen3_5"
+
+pipeline_modules:
+  pipeline_params:
+    type: "ParameterModule"
+    outputs:
+      - name: "prompt"
+        type: "String"
+
+  prompt_encoder:
+    type: "TextEncoderModule"
+    device: "GPU"
+    inputs:
+      - name: "prompt"
+        type: "String"
+        source: "pipeline_params.prompt"
+    outputs:
+      - name: "input_ids"
+        type: "OVTensor"
+      - name: "mask"
+        type: "OVTensor"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
+
+  llm:
+    type: "LLMInferenceSDPAModule"
+    device: "GPU"
+    inputs:
+      - name: "input_ids"
+        type: "OVTensor"
+        source: "prompt_encoder.input_ids"
+    outputs:
+      - name: "generated_text"
+        type: "String"
+    params:
+      model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
+      max_new_tokens: 512
+
+  pipeline_result:
+    type: "ResultModule"
+    description: "Collects final results and formats the output structure."
+    inputs:
+      - name: "generated_text"
+        type: "String"
+        source: "llm.generated_text"
diff --git a/src/cpp/src/module_genai/modules/md_llm_inference_sdpa.cpp b/src/cpp/src/module_genai/modules/md_llm_inference_sdpa.cpp
@@ -461,7 +461,9 @@ void LLMInferenceSDPAModule::run() {
-    // Determine VL mode: all three additional inputs must be present
+    // Determine VL mode: all five additional inputs must be present
     const bool is_vl = (this->inputs.find("visual_embeds") != this->inputs.end() &&
                         this->inputs.find("visual_pos_mask") != this->inputs.end() &&
-                        this->inputs.find("grid_thw") != this->inputs.end());
+                        this->inputs.find("grid_thw") != this->inputs.end() &&
+                        this->inputs.find("position_ids") != this->inputs.end() &&
+                        this->inputs.find("rope_delta") != this->inputs.end());
 
     ov::genai::modeling::models::Qwen3_5InputPlanner planner(m_model_config);
 
@@ -470,18 +472,13 @@ void LLMInferenceSDPAModule::run() {
         ov::Tensor visual_embeds   = inputs["visual_embeds"].data.as<ov::Tensor>();
         ov::Tensor visual_pos_mask = inputs["visual_pos_mask"].data.as<ov::Tensor>();
         ov::Tensor grid_thw        = inputs["grid_thw"].data.as<ov::Tensor>();
+        ov::Tensor position_ids    = inputs["position_ids"].data.as<ov::Tensor>();
+        ov::Tensor rope_delta      = inputs["rope_delta"].data.as<ov::Tensor>();
 
-        // Compute 3D MRoPE position_ids and rope_deltas from grid_thw
-        auto plan = planner.build_plan(input_ids, &attention_mask, &grid_thw);
-
-        // Scatter raw visual embeddings to full sequence length
-        auto visual_padded =
-            ov::genai::modeling::models::Qwen3_5InputPlanner::scatter_visual_embeds(
-                visual_embeds, plan.visual_pos_mask);
 
         std::string generated_text = run_vl_decode(input_ids, attention_mask,
-                                                    plan.position_ids, plan.rope_deltas,
-                                                    visual_padded, plan.visual_pos_mask);
+                                                    position_ids, rope_delta,
+                                                    visual_embeds, visual_pos_mask);
         GENAI_INFO("LLM output: " + generated_text);
         this->outputs["generated_text"].data = generated_text;
     } else {
diff --git a/src/cpp/src/module_genai/modules/md_vision_encoder.cpp b/src/cpp/src/module_genai/modules/md_vision_encoder.cpp
@@ -96,6 +96,7 @@ VisionEncoderModule::~VisionEncoderModule() {}
 
 bool VisionEncoderModule::initialize() {
     const auto &params = module_desc->params;
+    VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
     auto it_path = params.find("model_path");
     if (it_path == params.end()) {
         GENAI_ERR("VisionEncoderModule[" + module_desc->name + "]: 'model_path' not found in params");
@@ -121,16 +122,18 @@ bool VisionEncoderModule::initialize() {
         model = utils::singleton_core().read_model(model_path);
         model_path = model_path.parent_path();
     } else {
-        if (!std::filesystem::exists(model_path / "openvino_vision_embeddings_merger_model.xml")) {
+        auto model_file_path = model_path / "openvino_vision_embeddings_merger_model.xml";
+        if (model_type == VLMModelType::QWEN3_5) {
+            model_file_path = model_path / "qwen3_5_vision.xml";
+        }
+        if (!std::filesystem::exists(model_file_path)) {
             GENAI_ERR("VisionEncoderModule[" + module_desc->name + "]: model file not found at " + 
-                (model_path / "openvino_vision_embeddings_merger_model.xml").string());
+                model_file_path.string());
             return false;
         }
-        model = utils::singleton_core().read_model(
-            model_path / "openvino_vision_embeddings_merger_model.xml");
+        model = utils::singleton_core().read_model(model_file_path);
     }
 
-    auto model_type = to_vlm_model_type(module_desc->model_type);
     if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
         utils::request_vl_sdpa_transformations(model);
     }
diff --git a/tests/module_genai/cpp/modules/ImagePreprocesModule.cpp b/tests/module_genai/cpp/modules/ImagePreprocesModule.cpp
@@ -233,7 +233,7 @@ class Qwen3_5ImagePreprocessModuleTest : public ModuleTestBase, public ::testing
 
         image_preprocessor["outputs"] = outputs;
         YAML::Node model_path;
-        model_path["model_path"] = TEST_MODEL::Qwen3_5();
+        model_path["model_path"] = TEST_MODEL::Qwen3_5_0_8B();
         image_preprocessor["params"] = model_path;
         pipeline_modules["image_preprocessor"] = image_preprocessor;
 
diff --git a/tests/module_genai/cpp/modules/TextEncoderModule.cpp b/tests/module_genai/cpp/modules/TextEncoderModule.cpp
@@ -189,7 +189,7 @@ class Qwen3_5TextEncoderModuleTest : public ModuleTestBase, public ::testing::Te
             cur_node["outputs"].push_back(output_node("rotary_cos", to_string(DataType::OVTensor)));
             cur_node["outputs"].push_back(output_node("rotary_sin", to_string(DataType::OVTensor)));
             cur_node["params"] = YAML::Node();
-            cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5();
+            cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5_0_8B();
             pipeline_modules[image_preprocessor_name] = cur_node;
         }
 
diff --git a/tests/module_genai/cpp/modules/VisionEncoderModule.cpp b/tests/module_genai/cpp/modules/VisionEncoderModule.cpp
@@ -279,7 +279,7 @@ class Qwen3_5VisionEncoderModuleTest
             cur_node["outputs"].push_back(output_node("position_ids", to_string(DataType::OVTensor)));
             cur_node["outputs"].push_back(output_node("rope_delta", to_string(DataType::OVTensor)));
             cur_node["params"] = YAML::Node();
-            cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5() + "qwen3_5_vision.xml";
+            cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5_0_8B() + "qwen3_5_vision.xml";
             cur_node["params"]["vision_start_token_id"] = 248053;
             pipeline_modules[vision_encoder_name] = cur_node;
         }
diff --git a/tests/module_genai/cpp/utils/model_yaml.cpp b/tests/module_genai/cpp/utils/model_yaml.cpp
@@ -30,7 +30,7 @@ std::string Wan_2_1() {
 }
 
 std::string Qwen3_5() {
-    return get_model_path() + "/Qwen3.5-0.8B/";
+    return get_model_path() + "/Qwen3.5-35B-A3B-Base_VL_OV_IR/";
 }
 
 std::string Qwen3_5_0_8B() {

Original file line number	Diff line number	Diff line change
`@@ -189,7 +189,7 @@ class Qwen3_5TextEncoderModuleTest : public ModuleTestBase, public ::testing::Te`
`189`	`189`	`cur_node["outputs"].push_back(output_node("rotary_cos", to_string(DataType::OVTensor)));`
`190`	`190`	`cur_node["outputs"].push_back(output_node("rotary_sin", to_string(DataType::OVTensor)));`
`191`	`191`	`cur_node["params"] = YAML::Node();`
`192`		`- cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5();`
	`192`	`+ cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5_0_8B();`
`193`	`193`	`pipeline_modules[image_preprocessor_name] = cur_node;`
`194`	`194`	`}`
`195`	`195`
Original file line number	Diff line number	Diff line change
`@@ -279,7 +279,7 @@ class Qwen3_5VisionEncoderModuleTest`
`279`	`279`	`cur_node["outputs"].push_back(output_node("position_ids", to_string(DataType::OVTensor)));`
`280`	`280`	`cur_node["outputs"].push_back(output_node("rope_delta", to_string(DataType::OVTensor)));`
`281`	`281`	`cur_node["params"] = YAML::Node();`
`282`		`- cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5() + "qwen3_5_vision.xml";`
	`282`	`+ cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5_0_8B() + "qwen3_5_vision.xml";`
`283`	`283`	`cur_node["params"]["vision_start_token_id"] = 248053;`
`284`	`284`	`pipeline_modules[vision_encoder_name] = cur_node;`
`285`	`285`	`}`
Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ std::string Wan_2_1() {`
`30`	`30`	`}`
`31`	`31`
`32`	`32`	`std::string Qwen3_5() {`
`33`		`- return get_model_path() + "/Qwen3.5-0.8B/";`
	`33`	`+ return get_model_path() + "/Qwen3.5-35B-A3B-Base_VL_OV_IR/";`
`34`	`34`	`}`
`35`	`35`
`36`	`36`	`std::string Qwen3_5_0_8B() {`