Skip to content

Commit c2f52f1

Browse files
committed
Enabled Qwen3.5 text and VL modes with the GenAI modular pipeline.
Qwen3.5 VL mode does not work on DG2; it has only been verified on an ARL iGPU device. Signed-off-by: Zhang, Xiaolin <xiaolin.zhang@intel.com>
1 parent 58a4d0f commit c2f52f1

File tree

8 files changed

+300
-37
lines changed

8 files changed

+300
-37
lines changed

samples/cpp/module_genai/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ endfunction()
3636
set (SAMPLE_LIST
3737
md_image_generation
3838
md_video_generation
39-
md_visual_language_chat)
39+
md_visual_language_chat
40+
md_llm_chat)
4041

4142
foreach(sample IN LISTS SAMPLE_LIST)
4243
add_sample_executable(${sample})
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# ----------------------------------------------------------------------
2+
# GLOBAL CONFIGURATION — Text-only LLM pipeline (no vision)
3+
# ----------------------------------------------------------------------
4+
global_context:
5+
model_type: "qwen3_5"
6+
7+
# ----------------------------------------------------------------------
8+
# MODULES DEFINITION
9+
# ----------------------------------------------------------------------
10+
pipeline_modules:
11+
12+
# --- 0. Parameter Module ---
13+
pipeline_params:
14+
type: "ParameterModule"
15+
outputs:
16+
- name: "prompt"
17+
type: "String"
18+
19+
# --- 1. Prompt Encode Module ---
20+
prompt_encoder:
21+
type: "TextEncoderModule"
22+
device: "GPU"
23+
inputs:
24+
- name: "prompt"
25+
type: "String"
26+
source: "pipeline_params.prompt"
27+
outputs:
28+
- name: "input_ids"
29+
type: "OVTensor"
30+
- name: "mask"
31+
type: "OVTensor"
32+
params:
33+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
34+
35+
# --- 2. LLM Inference Module ---
36+
llm_inference:
37+
type: "LLMInferenceSDPAModule"
38+
description: "LLM module for SDPA (stateful) pipeline — text only"
39+
device: "GPU"
40+
inputs:
41+
- name: "input_ids"
42+
type: "OVTensor"
43+
source: "prompt_encoder.input_ids"
44+
outputs:
45+
- name: "generated_text"
46+
type: "String"
47+
params:
48+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
49+
max_new_tokens: "16"
50+
51+
# --- 3. Result Module ---
52+
pipeline_result:
53+
type: "ResultModule"
54+
description: "Collects final results and formats the output structure."
55+
inputs:
56+
- name: "generated_text"
57+
type: "String"
58+
source: "llm_inference.generated_text"
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# ----------------------------------------------------------------------
2+
# GLOBAL CONFIGURATION — Visual LLM pipeline
3+
# ----------------------------------------------------------------------
4+
global_context:
5+
model_type: "qwen3_5"
6+
7+
# ----------------------------------------------------------------------
8+
# MODULES DEFINITION
9+
# ----------------------------------------------------------------------
10+
pipeline_modules:
11+
12+
# --- 0. Parameter Module ---
13+
pipeline_params:
14+
type: "ParameterModule"
15+
outputs:
16+
- name: "image"
17+
type: "OVTensor"
18+
- name: "prompt"
19+
type: "String"
20+
21+
# --- 1. Image Preprocessing Module ---
22+
image_preprocessor:
23+
type: "ImagePreprocessModule"
24+
device: "GPU"
25+
description: "Image or Video preprocessing for Qwen3.5."
26+
inputs:
27+
- name: "image"
28+
type: "OVTensor"
29+
source: "pipeline_params.image"
30+
outputs:
31+
- name: "pixel_values"
32+
type: "OVTensor"
33+
- name: "grid_thw"
34+
type: "OVTensor"
35+
- name: "pos_embeds"
36+
type: "OVTensor"
37+
- name: "rotary_cos"
38+
type: "OVTensor"
39+
- name: "rotary_sin"
40+
type: "OVTensor"
41+
params:
42+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
43+
44+
# --- 2. Prompt Encode Module ---
45+
prompt_encoder:
46+
type: "TextEncoderModule"
47+
device: "GPU"
48+
inputs:
49+
- name: "prompt"
50+
type: "String"
51+
source: "pipeline_params.prompt"
52+
- name: "grid_thw"
53+
type: "OVTensor"
54+
source: "image_preprocessor.grid_thw"
55+
outputs:
56+
- name: "input_ids"
57+
type: "OVTensor"
58+
- name: "mask"
59+
type: "OVTensor"
60+
params:
61+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
62+
63+
# --- 3. Vision Encoder Module ---
64+
vision_encoder:
65+
type: "VisionEncoderModule"
66+
device: "GPU"
67+
inputs:
68+
- name: "preprocessed_image"
69+
type: "OVTensor"
70+
source: "image_preprocessor.pixel_values"
71+
- name: "grid_thw"
72+
type: "OVTensor"
73+
source: "image_preprocessor.grid_thw"
74+
- name: "pos_embeds"
75+
type: "OVTensor"
76+
source: "image_preprocessor.pos_embeds"
77+
- name: "rotary_cos"
78+
type: "OVTensor"
79+
source: "image_preprocessor.rotary_cos"
80+
- name: "rotary_sin"
81+
type: "OVTensor"
82+
source: "image_preprocessor.rotary_sin"
83+
- name: "input_ids"
84+
type: "OVTensor"
85+
source: "prompt_encoder.input_ids"
86+
- name: "attention_mask"
87+
type: "OVTensor"
88+
source: "prompt_encoder.mask"
89+
outputs:
90+
- name: "image_embedding"
91+
type: "OVTensor"
92+
- name: "visual_pos_mask"
93+
type: "OVTensor"
94+
- name: "position_ids"
95+
type: "OVTensor"
96+
- name: "rope_delta"
97+
type: "OVTensor"
98+
params:
99+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
100+
vision_start_token_id: 248053
101+
102+
# --- 4. LLM Inference Module ---
103+
llm_inference:
104+
type: "LLMInferenceSDPAModule"
105+
description: "LLM module for SDPA (stateful) pipeline — text & VL"
106+
device: "GPU"
107+
inputs:
108+
# ---- Text mode inputs (required) ----
109+
- name: "input_ids"
110+
type: "OVTensor"
111+
source: "prompt_encoder.input_ids"
112+
# ---- VL mode inputs (additional, optional) ----
113+
- name: "visual_embeds"
114+
type: "OVTensor"
115+
source: "vision_encoder.image_embedding"
116+
- name: "visual_pos_mask"
117+
type: "OVTensor"
118+
source: "vision_encoder.visual_pos_mask"
119+
- name: "grid_thw"
120+
type: "OVTensor"
121+
source: "image_preprocessor.grid_thw"
122+
outputs:
123+
- name: "generated_text"
124+
type: "String"
125+
params:
126+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
127+
max_new_tokens: "16"
128+
129+
# --- 5. Result Module ---
130+
pipeline_result:
131+
type: "ResultModule"
132+
description: "Collects final results and formats the output structure."
133+
inputs:
134+
- name: "generated_text"
135+
type: "String"
136+
source: "llm_inference.generated_text"
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
// Copyright (C) 2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#include <iostream>
5+
#include <openvino/genai/module_genai/pipeline.hpp>
6+
7+
#include <stdexcept>
8+
9+
#include "yaml-cpp/yaml.h"
10+
#include "utils/utils.hpp"
11+
12+
inline ov::AnyMap parse_inputs_from_yaml_cfg(const std::filesystem::path& cfg_yaml_path,
13+
const std::string& prompt) {
14+
ov::AnyMap inputs;
15+
YAML::Node input_params = utils::find_param_module_in_yaml(cfg_yaml_path);
16+
17+
for (const auto& entry : input_params) {
18+
if (!entry["name"] || !entry["type"]) {
19+
continue;
20+
}
21+
22+
const std::string param_name = entry["name"].as<std::string>();
23+
const std::string param_type = entry["type"].as<std::string>();
24+
25+
if (param_type == "String" && utils::contains_key(param_name, {"prompt"})) {
26+
if (prompt.empty()) {
27+
throw std::runtime_error("Prompt string is empty.");
28+
}
29+
inputs[param_name] = prompt;
30+
}
31+
}
32+
return inputs;
33+
}
34+
35+
int main(int argc, char* argv[]) {
36+
try {
37+
if (argc <= 1) {
38+
throw std::runtime_error(std::string{"Usage: "} + argv[0] +
39+
"\n"
40+
" -cfg config.yaml\n"
41+
" -prompt: input prompt\n");
42+
}
43+
44+
std::filesystem::path config_path = utils::get_input_arg(argc, argv, "-cfg", std::string{});
45+
std::string prompt = utils::get_input_arg(argc, argv, "-prompt", std::string{});
46+
47+
ov::AnyMap inputs = parse_inputs_from_yaml_cfg(config_path, prompt);
48+
49+
for (const auto& [key, value] : inputs) {
50+
std::cout << "[Input] " << key << ": " << value.as<std::string>() << std::endl;
51+
}
52+
53+
ov::genai::module::ModulePipeline pipe(config_path);
54+
55+
pipe.generate(inputs);
56+
57+
std::cout << "Generation Result: " << pipe.get_output("generated_text").as<std::string>() << std::endl;
58+
} catch (const std::exception& ex) {
59+
std::cerr << "[ERROR] " << ex.what() << std::endl;
60+
return EXIT_FAILURE;
61+
}
62+
return EXIT_SUCCESS;
63+
}

src/cpp/src/module_genai/modules/md_img_preprocess.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,16 @@ void ImagePreprocessModule::run() {
8888
prepare_inputs();
8989
VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
9090

91-
if (exists_input("images")) {
91+
// When running in text-only mode, neither image nor images data is provided.
92+
// Skip preprocessing entirely - downstream modules will detect the absence.
93+
const bool has_images = exists_input("images") && !inputs["images"].data.empty();
94+
const bool has_image = exists_input("image") && !inputs["image"].data.empty();
95+
if (!has_images && !has_image) {
96+
GENAI_INFO("ImagePreprocessModule[" + module_desc->name + "]: no image input - skipping (text-only mode)");
97+
return;
98+
}
99+
100+
if (has_images) {
92101
auto images_data = get_input("images").as<std::vector<ov::Tensor>>();
93102
if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
94103
std::vector<ov::Tensor> output_tensors;
@@ -113,7 +122,7 @@ void ImagePreprocessModule::run() {
113122
this->outputs["rotary_cos"].data = output.rotary_cos;
114123
this->outputs["rotary_sin"].data = output.rotary_sin;
115124
}
116-
} else {
125+
} else if (has_image) {
117126
auto image1_data = get_input("image").as<ov::Tensor>();
118127
if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
119128
auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode(image1_data, ov::AnyMap{});

src/cpp/src/module_genai/modules/md_llm_inference_sdpa.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -458,10 +458,16 @@ void LLMInferenceSDPAModule::run() {
458458
ov::Tensor attention_mask(ov::element::i64, {batch, seq_len});
459459
std::fill_n(attention_mask.data<int64_t>(), batch * seq_len, int64_t{1});
460460

461-
// Determine VL mode: all three additional inputs must be present
462-
const bool is_vl = (this->inputs.find("visual_embeds") != this->inputs.end() &&
463-
this->inputs.find("visual_pos_mask") != this->inputs.end() &&
464-
this->inputs.find("grid_thw") != this->inputs.end());
461+
// Determine VL mode: all three additional inputs must be present AND have valid data.
462+
// In text-only mode, the keys exist in the inputs map (from YAML) but data is empty
463+
// because ImagePreprocessModule and VisionEncoderModule skipped execution.
464+
auto has_valid_input = [this](const std::string& name) {
465+
auto it = this->inputs.find(name);
466+
return it != this->inputs.end() && !it->second.data.empty();
467+
};
468+
const bool is_vl = has_valid_input("visual_embeds") &&
469+
has_valid_input("visual_pos_mask") &&
470+
has_valid_input("grid_thw");
465471

466472
ov::genai::modeling::models::Qwen3_5InputPlanner planner(m_model_config);
467473

src/cpp/src/module_genai/modules/md_text_encoder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ void TextEncoderModule::run() {
150150
}
151151
} else if (model_type == VLMModelType::QWEN3_5) {
152152
std::optional<ov::Tensor> grid_thw = std::nullopt;
153-
if (exists_input("grid_thw")) {
153+
if (exists_input("grid_thw") && !inputs["grid_thw"].data.empty()) {
154154
grid_thw = get_input("grid_thw").as<ov::Tensor>();
155155
}
156156

0 commit comments

Comments
 (0)