Skip to content

Commit c2f52f1

Browse files
committed
Enabled Qwen3.5 text and VL modes with the GenAI modular pipeline.
Qwen3.5 VL mode does not work on DG2; it has only been verified on an ARL iGPU device. Signed-off-by: Zhang, Xiaolin <xiaolin.zhang@intel.com>
1 parent 58a4d0f commit c2f52f1

File tree

8 files changed

+300
-37
lines changed

8 files changed

+300
-37
lines changed

samples/cpp/module_genai/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ endfunction()
3636
set (SAMPLE_LIST
3737
md_image_generation
3838
md_video_generation
39-
md_visual_language_chat)
39+
md_visual_language_chat
40+
md_llm_chat)
4041

4142
foreach(sample IN LISTS SAMPLE_LIST)
4243
add_sample_executable(${sample})
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# ----------------------------------------------------------------------
2+
# GLOBAL CONFIGURATION — Text-only LLM pipeline (no vision)
3+
# ----------------------------------------------------------------------
4+
global_context:
5+
model_type: "qwen3_5"
6+
7+
# ----------------------------------------------------------------------
8+
# MODULES DEFINITION
9+
# ----------------------------------------------------------------------
10+
pipeline_modules:
11+
12+
# --- 0. Parameter Module ---
13+
pipeline_params:
14+
type: "ParameterModule"
15+
outputs:
16+
- name: "prompt"
17+
type: "String"
18+
19+
# --- 1. Prompt Encode Module ---
20+
prompt_encoder:
21+
type: "TextEncoderModule"
22+
device: "GPU"
23+
inputs:
24+
- name: "prompt"
25+
type: "String"
26+
source: "pipeline_params.prompt"
27+
outputs:
28+
- name: "input_ids"
29+
type: "OVTensor"
30+
- name: "mask"
31+
type: "OVTensor"
32+
params:
33+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
34+
35+
# --- 2. LLM Inference Module ---
36+
llm_inference:
37+
type: "LLMInferenceSDPAModule"
38+
description: "LLM module for SDPA (stateful) pipeline — text only"
39+
device: "GPU"
40+
inputs:
41+
- name: "input_ids"
42+
type: "OVTensor"
43+
source: "prompt_encoder.input_ids"
44+
outputs:
45+
- name: "generated_text"
46+
type: "String"
47+
params:
48+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
49+
max_new_tokens: "16"
50+
51+
# --- 3. Result Module ---
52+
pipeline_result:
53+
type: "ResultModule"
54+
description: "Collects final results and formats the output structure."
55+
inputs:
56+
- name: "generated_text"
57+
type: "String"
58+
source: "llm_inference.generated_text"
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# ----------------------------------------------------------------------
2+
# GLOBAL CONFIGURATION — Visual LLM pipeline
3+
# ----------------------------------------------------------------------
4+
global_context:
5+
model_type: "qwen3_5"
6+
7+
# ----------------------------------------------------------------------
8+
# MODULES DEFINITION
9+
# ----------------------------------------------------------------------
10+
pipeline_modules:
11+
12+
# --- 0. Parameter Module ---
13+
pipeline_params:
14+
type: "ParameterModule"
15+
outputs:
16+
- name: "image"
17+
type: "OVTensor"
18+
- name: "prompt"
19+
type: "String"
20+
21+
# --- 1. Image Preprocessing Module ---
22+
image_preprocessor:
23+
type: "ImagePreprocessModule"
24+
device: "GPU"
25+
description: "Image or Video preprocessing for Qwen3.5."
26+
inputs:
27+
- name: "image"
28+
type: "OVTensor"
29+
source: "pipeline_params.image"
30+
outputs:
31+
- name: "pixel_values"
32+
type: "OVTensor"
33+
- name: "grid_thw"
34+
type: "OVTensor"
35+
- name: "pos_embeds"
36+
type: "OVTensor"
37+
- name: "rotary_cos"
38+
type: "OVTensor"
39+
- name: "rotary_sin"
40+
type: "OVTensor"
41+
params:
42+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
43+
44+
# --- 2. Prompt Encode Module ---
45+
prompt_encoder:
46+
type: "TextEncoderModule"
47+
device: "GPU"
48+
inputs:
49+
- name: "prompt"
50+
type: "String"
51+
source: "pipeline_params.prompt"
52+
- name: "grid_thw"
53+
type: "OVTensor"
54+
source: "image_preprocessor.grid_thw"
55+
outputs:
56+
- name: "input_ids"
57+
type: "OVTensor"
58+
- name: "mask"
59+
type: "OVTensor"
60+
params:
61+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
62+
63+
# --- 3. Vision Encoder Module ---
64+
vision_encoder:
65+
type: "VisionEncoderModule"
66+
device: "GPU"
67+
inputs:
68+
- name: "preprocessed_image"
69+
type: "OVTensor"
70+
source: "image_preprocessor.pixel_values"
71+
- name: "grid_thw"
72+
type: "OVTensor"
73+
source: "image_preprocessor.grid_thw"
74+
- name: "pos_embeds"
75+
type: "OVTensor"
76+
source: "image_preprocessor.pos_embeds"
77+
- name: "rotary_cos"
78+
type: "OVTensor"
79+
source: "image_preprocessor.rotary_cos"
80+
- name: "rotary_sin"
81+
type: "OVTensor"
82+
source: "image_preprocessor.rotary_sin"
83+
- name: "input_ids"
84+
type: "OVTensor"
85+
source: "prompt_encoder.input_ids"
86+
- name: "attention_mask"
87+
type: "OVTensor"
88+
source: "prompt_encoder.mask"
89+
outputs:
90+
- name: "image_embedding"
91+
type: "OVTensor"
92+
- name: "visual_pos_mask"
93+
type: "OVTensor"
94+
- name: "position_ids"
95+
type: "OVTensor"
96+
- name: "rope_delta"
97+
type: "OVTensor"
98+
params:
99+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
100+
vision_start_token_id: 248053
101+
102+
# --- 4. LLM Inference Module ---
103+
llm_inference:
104+
type: "LLMInferenceSDPAModule"
105+
description: "LLM module for SDPA (stateful) pipeline — text & VL"
106+
device: "GPU"
107+
inputs:
108+
# ---- Text mode inputs (required) ----
109+
- name: "input_ids"
110+
type: "OVTensor"
111+
source: "prompt_encoder.input_ids"
112+
# ---- VL mode inputs (additional, optional) ----
113+
- name: "visual_embeds"
114+
type: "OVTensor"
115+
source: "vision_encoder.image_embedding"
116+
- name: "visual_pos_mask"
117+
type: "OVTensor"
118+
source: "vision_encoder.visual_pos_mask"
119+
- name: "grid_thw"
120+
type: "OVTensor"
121+
source: "image_preprocessor.grid_thw"
122+
outputs:
123+
- name: "generated_text"
124+
type: "String"
125+
params:
126+
model_path: "./tests/module_genai/cpp/test_models/Qwen3.5-0.8B/"
127+
max_new_tokens: "16"
128+
129+
# --- 5. Result Module ---
130+
pipeline_result:
131+
type: "ResultModule"
132+
description: "Collects final results and formats the output structure."
133+
inputs:
134+
- name: "generated_text"
135+
type: "String"
136+
source: "llm_inference.generated_text"
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
// Copyright (C) 2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#include <iostream>
5+
#include <openvino/genai/module_genai/pipeline.hpp>
6+
7+
#include <stdexcept>
8+
9+
#include "yaml-cpp/yaml.h"
10+
#include "utils/utils.hpp"
11+
12+
inline ov::AnyMap parse_inputs_from_yaml_cfg(const std::filesystem::path& cfg_yaml_path,
13+
const std::string& prompt) {
14+
ov::AnyMap inputs;
15+
YAML::Node input_params = utils::find_param_module_in_yaml(cfg_yaml_path);
16+
17+
for (const auto& entry : input_params) {
18+
if (!entry["name"] || !entry["type"]) {
19+
continue;
20+
}
21+
22+
const std::string param_name = entry["name"].as<std::string>();
23+
const std::string param_type = entry["type"].as<std::string>();
24+
25+
if (param_type == "String" && utils::contains_key(param_name, {"prompt"})) {
26+
if (prompt.empty()) {
27+
throw std::runtime_error("Prompt string is empty.");
28+
}
29+
inputs[param_name] = prompt;
30+
}
31+
}
32+
return inputs;
33+
}
34+
35+
int main(int argc, char* argv[]) {
36+
try {
37+
if (argc <= 1) {
38+
throw std::runtime_error(std::string{"Usage: "} + argv[0] +
39+
"\n"
40+
" -cfg config.yaml\n"
41+
" -prompt: input prompt\n");
42+
}
43+
44+
std::filesystem::path config_path = utils::get_input_arg(argc, argv, "-cfg", std::string{});
45+
std::string prompt = utils::get_input_arg(argc, argv, "-prompt", std::string{});
46+
47+
ov::AnyMap inputs = parse_inputs_from_yaml_cfg(config_path, prompt);
48+
49+
for (const auto& [key, value] : inputs) {
50+
std::cout << "[Input] " << key << ": " << value.as<std::string>() << std::endl;
51+
}
52+
53+
ov::genai::module::ModulePipeline pipe(config_path);
54+
55+
pipe.generate(inputs);
56+
57+
std::cout << "Generation Result: " << pipe.get_output("generated_text").as<std::string>() << std::endl;
58+
} catch (const std::exception& ex) {
59+
std::cerr << "[ERROR] " << ex.what() << std::endl;
60+
return EXIT_FAILURE;
61+
}
62+
return EXIT_SUCCESS;
63+
}

src/cpp/src/module_genai/modules/md_img_preprocess.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,16 @@ void ImagePreprocessModule::run() {
8888
prepare_inputs();
8989
VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
9090

91-
if (exists_input("images")) {
91+
// When running in text-only mode, neither image nor images data is provided.
92+
// Skip preprocessing entirely - downstream modules will detect the absence.
93+
const bool has_images = exists_input("images") && !inputs["images"].data.empty();
94+
const bool has_image = exists_input("image") && !inputs["image"].data.empty();
95+
if (!has_images && !has_image) {
96+
GENAI_INFO("ImagePreprocessModule[" + module_desc->name + "]: no image input - skipping (text-only mode)");
97+
return;
98+
}
99+
100+
if (has_images) {
92101
auto images_data = get_input("images").as<std::vector<ov::Tensor>>();
93102
if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
94103
std::vector<ov::Tensor> output_tensors;
@@ -113,7 +122,7 @@ void ImagePreprocessModule::run() {
113122
this->outputs["rotary_cos"].data = output.rotary_cos;
114123
this->outputs["rotary_sin"].data = output.rotary_sin;
115124
}
116-
} else {
125+
} else if (has_image) {
117126
auto image1_data = get_input("image").as<ov::Tensor>();
118127
if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
119128
auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode(image1_data, ov::AnyMap{});

src/cpp/src/module_genai/modules/md_llm_inference_sdpa.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -458,10 +458,16 @@ void LLMInferenceSDPAModule::run() {
458458
ov::Tensor attention_mask(ov::element::i64, {batch, seq_len});
459459
std::fill_n(attention_mask.data<int64_t>(), batch * seq_len, int64_t{1});
460460

461-
// Determine VL mode: all three additional inputs must be present
462-
const bool is_vl = (this->inputs.find("visual_embeds") != this->inputs.end() &&
463-
this->inputs.find("visual_pos_mask") != this->inputs.end() &&
464-
this->inputs.find("grid_thw") != this->inputs.end());
461+
// Determine VL mode: all three additional inputs must be present AND have valid data.
462+
// In text-only mode, the keys exist in the inputs map (from YAML) but data is empty
463+
// because ImagePreprocessModule and VisionEncoderModule skipped execution.
464+
auto has_valid_input = [this](const std::string& name) {
465+
auto it = this->inputs.find(name);
466+
return it != this->inputs.end() && !it->second.data.empty();
467+
};
468+
const bool is_vl = has_valid_input("visual_embeds") &&
469+
has_valid_input("visual_pos_mask") &&
470+
has_valid_input("grid_thw");
465471

466472
ov::genai::modeling::models::Qwen3_5InputPlanner planner(m_model_config);
467473

src/cpp/src/module_genai/modules/md_text_encoder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ void TextEncoderModule::run() {
150150
}
151151
} else if (model_type == VLMModelType::QWEN3_5) {
152152
std::optional<ov::Tensor> grid_thw = std::nullopt;
153-
if (exists_input("grid_thw")) {
153+
if (exists_input("grid_thw") && !inputs["grid_thw"].data.empty()) {
154154
grid_thw = get_input("grid_thw").as<ov::Tensor>();
155155
}
156156

0 commit comments

Comments
 (0)