Merged
61 commits
07f5441
Add qwen3_vl to vlm model type enum
yatarkan Jan 27, 2026
2732e8d
Expose merge_text_and_video_image_embeddings qwen2vl utils function
yatarkan Jan 30, 2026
a5b870b
Enable nested json param reading for vector type
yatarkan Jan 30, 2026
7e654b3
Add qwen3_vl specific vlm config
yatarkan Jan 30, 2026
75be9d2
Add qwen3_vl params to processor config
yatarkan Jan 30, 2026
7a4f8b9
Add classes for qwen3_vl
yatarkan Jan 30, 2026
f44f6c2
Add qwen3_vl classes to inputs embedder and vision encoder
yatarkan Jan 30, 2026
49e31ac
Enable extra inputs in lm_encoding
yatarkan Jan 30, 2026
f023416
Propagate lm extra inputs in stateful vlm pipeline
yatarkan Jan 30, 2026
b94d9fc
Merge branch 'master' into yt/qwen3-vl
yatarkan Feb 9, 2026
2dc6698
Fix signature after merge
yatarkan Feb 9, 2026
a7b8818
Add qwen3vl extra inputs to sequence group
yatarkan Feb 19, 2026
e974a35
Use qwen3vl extra inputs in CB model runner
yatarkan Feb 19, 2026
d116d1d
Propagate lm extra inputs to CB pipelines generate and add_request me…
yatarkan Feb 19, 2026
7bbe27c
Update generate and add_request signatures of CB inherited classes
yatarkan Feb 19, 2026
8a73df1
Add tiny-random-qwen3-vl model to python tests
yatarkan Feb 19, 2026
52bee9e
Add min tf version check for vlm pipeline tests
yatarkan Feb 20, 2026
eef0d57
Add steps for running vlm tests with qwen3-vl in CI
yatarkan Feb 20, 2026
01bb64b
Check sequence group type in forward for deepstack inputs aggregated …
yatarkan Feb 20, 2026
528d4ef
Merge branch 'master' into yt/qwen3-vl
yatarkan Feb 20, 2026
03dd46f
Add optimum intel installation from master
yatarkan Feb 20, 2026
6bba7f0
Update qwen3_vl default resolution in python tests, add skip for fail…
yatarkan Feb 25, 2026
196a135
Disable cleanup tokenization spaces for genai vs optimum python test
yatarkan Feb 25, 2026
a7b1597
Fix deepstack vision inputs for chunked prefill case in CB
yatarkan Feb 27, 2026
e84d588
Deep copy lm_extra_inputs tensors to avoid stale references in CB loop
yatarkan Feb 27, 2026
a3f767e
Merge branch 'master' into yt/qwen3-vl
yatarkan Feb 27, 2026
043fd7a
Add video processor config
yatarkan Mar 5, 2026
17af948
Add video processor config to vision encoder, extend encoded video st…
yatarkan Mar 5, 2026
c9331bf
Extract qwen2vl logic into methods for reuse
yatarkan Mar 5, 2026
e04f95a
Handle qwen3vl video processing with metadata, override and reuse qwe…
yatarkan Mar 5, 2026
0f6e269
Merge branch 'master' into yt/qwen3-vl
yatarkan Mar 5, 2026
7345d1a
Add Qwen3-VL to supported models in docs
yatarkan Mar 5, 2026
07b37c8
Add qwen3-vl to llm_bench visual_text_gen use case
yatarkan Mar 5, 2026
ff91f93
Fix trailing comma
yatarkan Mar 5, 2026
611033c
Fix typo in var name
yatarkan Mar 5, 2026
95d5e53
Fix copilot review comments
yatarkan Mar 5, 2026
d12544b
Pass video metadata struct to calculate_timestamps function, fix revi…
yatarkan Mar 5, 2026
fb28c75
Safe get prompt_ids optional tensor in CB
yatarkan Mar 6, 2026
2c1de78
Add lm extra inputs to add_request with video inputs
yatarkan Mar 6, 2026
3cc9ac0
Fix copyright year
yatarkan Mar 6, 2026
88d36a5
Move deep copy tensors map to utility function
yatarkan Mar 6, 2026
3f1f74b
Make variables const
yatarkan Mar 9, 2026
e0e1253
Remove unused var
yatarkan Mar 9, 2026
1393641
Move deepstack data aggregation and filling tensor to struct with fun…
yatarkan Mar 9, 2026
2b08b71
Add default initializers for DeepstackGroupData, remove unused var
yatarkan Mar 10, 2026
68a4973
Change return type to const ref
yatarkan Mar 10, 2026
a1202f9
Add qwen3-vl image and video tags to docs
yatarkan Mar 10, 2026
ee2ed7b
Fix name
yatarkan Mar 10, 2026
e389293
Pin optimum-intel commit
yatarkan Mar 10, 2026
acb70db
Fix docstring
yatarkan Mar 10, 2026
831244c
Fix review comment
yatarkan Mar 10, 2026
fe6ade2
Move qwen3vl utils to .cpp anonymous namespace, change fps type to fl…
yatarkan Mar 10, 2026
8a9d7b6
Fix default value
yatarkan Mar 10, 2026
0bfacd4
Add comment for video processor config fps
yatarkan Mar 11, 2026
3152801
Add qwen3-vl tests skip for resolutions corner cases
yatarkan Mar 11, 2026
05933bf
Merge branch 'master' into yt/qwen3-vl
yatarkan Mar 11, 2026
0caa12b
Remove unused original_frames_num from video metadata
yatarkan Mar 11, 2026
fc7bc6e
Merge branch 'master' into yt/qwen3-vl
yatarkan Mar 11, 2026
388e4f9
Merge branch 'master' into yt/qwen3-vl
yatarkan Mar 12, 2026
2f1a38b
Unskip qwen3-vl PA test cases
yatarkan Mar 12, 2026
daaa67a
Align lm extra inputs map copy for add_request API
yatarkan Mar 12, 2026
6 changes: 6 additions & 0 deletions .github/workflows/linux.yml
@@ -630,6 +630,12 @@ jobs:
python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "MiniCPM-o-2_6"
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
timeout: 60
- name: 'VLM (qwen3-vl)'
cmd: |
python -m pip install transformers==4.57.0 git+https://github.com/huggingface/optimum-intel.git@0566b76f094d4c3084e06d29a248b39a1bff3fa4
python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "qwen3-vl"
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
timeout: 60
defaults:
run:
shell: bash
6 changes: 6 additions & 0 deletions .github/workflows/manylinux_2_28.yml
@@ -553,6 +553,12 @@ jobs:
python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "MiniCPM-o-2_6"
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
timeout: 60
- name: 'VLM (qwen3-vl)'
cmd: |
python -m pip install transformers==4.57.0 git+https://github.com/huggingface/optimum-intel.git@0566b76f094d4c3084e06d29a248b39a1bff3fa4
python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "qwen3-vl"
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
timeout: 60
defaults:
run:
shell: bash
6 changes: 6 additions & 0 deletions .github/workflows/windows.yml
@@ -718,6 +718,12 @@ jobs:
python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "MiniCPM-o-2_6"
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
timeout: 60
- name: 'VLM (qwen3-vl)'
cmd: |
python -m pip install transformers==4.57.0 git+https://github.com/huggingface/optimum-intel.git@0566b76f094d4c3084e06d29a248b39a1bff3fa4
python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "qwen3-vl"
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
timeout: 60
defaults:
run:
shell: pwsh
@@ -47,6 +47,7 @@ export const VLM_MODELS: VLMModelType[] = [
{
name: 'nanoLLaVA',
links: ['https://huggingface.co/qnguyen3/nanoLLaVA'],
notesLink: '#nanollava-notes',
},
{
name: 'nanoLLaVA-1.5',
@@ -148,6 +149,25 @@ export const VLM_MODELS: VLMModelType[] = [
},
],
},
{
architecture: 'Qwen3-VL',
models: [
{
name: 'Qwen3-VL',
links: [
'https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct',
'https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking',
'https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct',
'https://huggingface.co/Qwen/Qwen3-VL-4B-Thinking',
'https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct',
'https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking',
'https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct',
'https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking',
],
notesLink: '#qwen3_vl-notes',
},
],
},
{
architecture: 'Gemma3ForConditionalGeneration',
models: [
8 changes: 8 additions & 0 deletions site/docs/supported-models/index.mdx
@@ -79,6 +79,14 @@ generation_config.set_eos_token_id(pipe.get_tokenizer().get_eos_token_id())
#### phi4mm {#phi4mm-notes}

Apply https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/78/files to fix the model export for `transformers>=4.50`

#### Qwen3-VL {#qwen3_vl-notes}

The model requires `transformers>=4.57` for the export with `optimum-cli`.

#### nanoLLaVA {#nanollava-notes}

The model requires `transformers>=4.48` for the export with `optimum-cli`.
:::
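The Qwen3-VL note above can be sketched as a shell session; the output directory name is arbitrary, and the invocation follows the usual `optimum-cli export openvino` pattern rather than anything specific to this PR:

```shell
# Pin transformers as the note above requires, with the OpenVINO extras of optimum.
python -m pip install "transformers>=4.57" "optimum[openvino]"

# Export the model to OpenVINO IR; "Qwen3-VL-2B-Instruct-ov" is an arbitrary output dir.
optimum-cli export openvino --model Qwen/Qwen3-VL-2B-Instruct Qwen3-VL-2B-Instruct-ov
```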

## Speech Recognition Models (Whisper-based)
@@ -15,7 +15,7 @@ The prompt can contain `<ov_genai_image_i>` with `i` replaced with an actual zer
1. InternVL2: `<image>\n`
2. llava-1.5-7b-hf: `<image>`
3. LLaVA-NeXT: `<image>`
4. LLaVa-NeXT-Video: `<image>`
4. LLaVA-NeXT-Video: `<image>`
5. nanoLLaVA: `<image>\n`
6. nanoLLaVA-1.5: `<image>\n`
7. MiniCPM-o-2_6: `<image>./</image>\n`
@@ -24,12 +24,14 @@ The prompt can contain `<ov_genai_image_i>` with `i` replaced with an actual zer
10. Phi-4-multimodal-instruct: `<|image_i|>\n` - the index starts with one
11. Qwen2-VL: `<|vision_start|><|image_pad|><|vision_end|>`
12. Qwen2.5-VL: `<|vision_start|><|image_pad|><|vision_end|>`
13. gemma-3-4b-it: `<start_of_image>`
13. Qwen3-VL: `<|vision_start|><|image_pad|><|vision_end|>`
14. gemma-3-4b-it: `<start_of_image>`

Model's native video tag can be used to refer to a video. These tags are:
1. LLaVa-NeXT-Video: `<video>`
1. LLaVA-NeXT-Video: `<video>`
2. Qwen2-VL: `<|vision_start|><|video_pad|><|vision_end|>`
2. Qwen2.5-VL: `<|vision_start|><|video_pad|><|vision_end|>`
3. Qwen2.5-VL: `<|vision_start|><|video_pad|><|vision_end|>`
4. Qwen3-VL: `<|vision_start|><|video_pad|><|vision_end|>`

If the prompt doesn't contain image or video tags, but images or videos are provided, the tags are prepended to the prompt.
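The fallback described in the sentence above can be sketched as a small helper; `prepend_missing_tags` and `QWEN_VL_IMAGE_TAG` are illustrative names for this sketch, not part of the OpenVINO GenAI API:

```python
# Native image tag shared by the Qwen2-VL, Qwen2.5-VL, and Qwen3-VL entries above.
QWEN_VL_IMAGE_TAG = "<|vision_start|><|image_pad|><|vision_end|>"

def prepend_missing_tags(prompt: str, num_images: int, tag: str = QWEN_VL_IMAGE_TAG) -> str:
    """Mimic the documented fallback: if the prompt contains no image tags
    but images are provided, prepend one tag per image to the prompt."""
    if num_images == 0 or tag in prompt:
        return prompt  # nothing to do, or the user already placed tags explicitly
    return tag * num_images + prompt
```

A prompt such as `"Describe both images."` with two images would thus become two `<|vision_start|><|image_pad|><|vision_end|>` tags followed by the original text, while a prompt that already carries a tag is left untouched.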
