# Pipeline-wide settings.
# NOTE(review): model_type presumably selects the model family for every
# module below - confirm against the pipeline loader.
global_context:
  model_type: "qwen3_omni"

# Module graph. Each entry declares a module `type`, an optional `device` and
# `description`, its `inputs` (each wired to an upstream output through
# `source: <module>.<output>`), its `outputs`, and module-specific `params`.
# NOTE(review): the model_path values are relative paths; presumably they are
# resolved against the process working directory - confirm.
pipeline_modules:
  # Entry point: declares the three external inputs that downstream modules
  # reference as pipeline_params.<name>.
  pipeline_params:
    type: "ParameterModule"
    outputs:
      - name: "image"
        type: "OVTensor"
      - name: "prompt"
        type: "String"
      - name: "audio"
        type: "OVTensor"

  # Turns the input image tensor into the five tensors consumed by
  # vision_encoder; grid_thw is also consumed by prompt_encoder and llm.
  image_preprocessor:
    type: "ImagePreprocessModule"
    device: "CPU"
    description: "Image or Video preprocessing."
    inputs:
      - name: "image"
        type: "OVTensor"
        source: "pipeline_params.image"
    outputs:
      - name: "pixel_values"
        type: "OVTensor"
      - name: "grid_thw"
        type: "OVTensor"
      - name: "pos_embeds"
        type: "OVTensor"
      - name: "rotary_cos"
        type: "OVTensor"
      - name: "rotary_sin"
        type: "OVTensor"
    params:
      # Points at the model directory, not at a single model file.
      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"

  # Turns the input audio tensor into the two tensor lists consumed by
  # audio_encoder.
  audio_preprocessor:
    type: "AudioPreprocessModule"
    device: "CPU"
    description: "Audio preprocessing."
    inputs:
      - name: "audio"
        type: "OVTensor"
        source: "pipeline_params.audio"
    outputs:
      - name: "input_features"
        type: "VecOVTensor"
      - name: "feature_attention_mask"
        type: "VecOVTensor"
    params:
      # Points at the model directory, not at a single model file.
      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"

  # Encodes the preprocessed audio; its outputs feed prompt_encoder
  # (audio_features) and vision_encoder (both outputs).
  audio_encoder:
    type: "AudioEncoderModule"
    device: "GPU"
    description: "Audio encoder for Qwen 3-Omni."
    inputs:
      - name: "input_features"
        type: "VecOVTensor"
        source: "audio_preprocessor.input_features"
      - name: "feature_attention_mask"
        type: "VecOVTensor"
        source: "audio_preprocessor.feature_attention_mask"
    outputs:
      - name: "audio_features"
        type: "OVTensor"
      - name: "audio_feature_lengths"
        type: "OVTensor"
    params:
      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_audio_encoder.xml"

  # Produces input_ids and mask from the prompt.
  # NOTE(review): grid_thw and audio_features are presumably used to size the
  # image/audio placeholder tokens in the prompt - confirm.
  prompt_encoder:
    type: "TextEncoderModule"
    device: "GPU"
    inputs:
      - name: "prompt"
        type: "String"
        source: "pipeline_params.prompt"
      - name: "grid_thw"
        type: "OVTensor"
        source: "image_preprocessor.grid_thw"
      - name: "audio_features"
        type: "OVTensor"
        source: "audio_encoder.audio_features"
    outputs:
      - name: "input_ids"
        type: "OVTensor"
      - name: "mask"
        type: "OVTensor"
    params:
      # Points at the model directory, not at a single model file.
      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/"

  # Consumes all image_preprocessor outputs plus the prompt encoder and audio
  # encoder outputs; emits the image and audio embeddings, their position
  # masks, and the position data that llm consumes.
  vision_encoder:
    type: "VisionEncoderModule"
    device: "GPU"
    inputs:
      # Input name differs from the upstream output name (pixel_values).
      - name: "preprocessed_image"
        type: "OVTensor"
        source: "image_preprocessor.pixel_values"
      - name: "grid_thw"
        type: "OVTensor"
        source: "image_preprocessor.grid_thw"
      - name: "pos_embeds"
        type: "OVTensor"
        source: "image_preprocessor.pos_embeds"
      - name: "rotary_cos"
        type: "OVTensor"
        source: "image_preprocessor.rotary_cos"
      - name: "rotary_sin"
        type: "OVTensor"
        source: "image_preprocessor.rotary_sin"
      - name: "input_ids"
        type: "OVTensor"
        source: "prompt_encoder.input_ids"
      # Input name differs from the upstream output name (mask).
      - name: "attention_mask"
        type: "OVTensor"
        source: "prompt_encoder.mask"
      - name: "audio_features"
        type: "OVTensor"
        source: "audio_encoder.audio_features"
      - name: "audio_feature_lengths"
        type: "OVTensor"
        source: "audio_encoder.audio_feature_lengths"
    outputs:
      - name: "image_embedding"
        type: "OVTensor"
      - name: "visual_pos_mask"
        type: "OVTensor"
      - name: "position_ids"
        type: "OVTensor"
      - name: "rope_delta"
        type: "OVTensor"
      - name: "deepstack_embeds"
        type: "VecOVTensor"
      - name: "audio_embedding"
        type: "OVTensor"
      - name: "audio_pos_mask"
        type: "OVTensor"
    params:
      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_vision_model.xml"
      # NOTE(review): presumably must match the tokenizer's vision-start
      # special token id for this model - confirm against the model config.
      vision_start_token_id: 248053

  # Text generation stage; emits the generated text as a String.
  llm:
    type: "LLMInferenceSDPAModule"
    device: "GPU"
    inputs:
      - name: "input_ids"
        type: "OVTensor"
        source: "prompt_encoder.input_ids"
      # Input name differs from the upstream output name (image_embedding).
      - name: "visual_embeds"
        type: "OVTensor"
        source: "vision_encoder.image_embedding"
      - name: "visual_pos_mask"
        type: "OVTensor"
        source: "vision_encoder.visual_pos_mask"
      - name: "grid_thw"
        type: "OVTensor"
        source: "image_preprocessor.grid_thw"
      - name: "position_ids"
        type: "OVTensor"
        source: "vision_encoder.position_ids"
      - name: "rope_delta"
        type: "OVTensor"
        source: "vision_encoder.rope_delta"
      - name: "deepstack_embeds"
        type: "VecOVTensor"
        source: "vision_encoder.deepstack_embeds"
      # Input name differs from the upstream output name (audio_embedding).
      - name: "audio_embeds"
        type: "OVTensor"
        source: "vision_encoder.audio_embedding"
      - name: "audio_pos_mask"
        type: "OVTensor"
        source: "vision_encoder.audio_pos_mask"
    outputs:
      - name: "generated_text"
        type: "String"
    params:
      model_path: "./tests/module_genai/cpp/test_models/Qwen3-Omni-4B-Instruct-multilingual/qwen3_omni_text_model.xml"
      # NOTE(review): presumably the upper bound on generated tokens - confirm.
      max_new_tokens: 512

  # Pipeline exit point: takes the text produced by llm.
  pipeline_result:
    type: "ResultModule"
    description: "Collects final results and formats the output structure."
    inputs:
      - name: "generated_text"
        type: "String"
        source: "llm.generated_text"