Commit b556f14

Support Gemma4 model (openvinotoolkit#3644)
## Description

Depends on: huggingface/optimum-intel#1688. That optimum-intel PR depends on transformers v5 (**update**: transformers v5 support has been merged to optimum-intel).

### WWB Accuracy

- genai vs optimum-intel: 0.9682357
- genai vs transformers: 0.94821364
- optimum-intel vs transformers: 0.9387633

Fixes: openvinotoolkit#3653

The current implementation supports image and text inputs only. Ticket for video support implementation: 185850.

## Checklist:

- [x] This PR follows [GenAI Contributing guidelines](https://github.com/openvinotoolkit/openvino.genai?tab=contributing-ov-file#contributing).
- [x] Tests have been updated or added to cover the new code.
- [x] This PR fully addresses the ticket.
- [x] I have made corresponding changes to the documentation.
1 parent aeb8f62 commit b556f14

21 files changed

Lines changed: 601 additions & 80 deletions


.github/workflows/linux.yml

Lines changed: 8 additions & 0 deletions
```diff
@@ -652,6 +652,14 @@ jobs:
             python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "qwen3-vl"
           run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
           timeout: 60
+        - name: 'VLM (gemma4)'
+          cmd: |
+            python -m pip install --no-deps git+https://github.com/huggingface/optimum-intel.git@ff99d6e13774841bdd17ac0d4c8bd2d181cf7c27 # PR 1688
+            python -m pip install transformers==5.5.0
+            pip show transformers optimum-intel openvino_tokenizers openvino_genai
+            python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "gemma4"
+          run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
+          timeout: 60
     defaults:
       run:
         shell: bash
```

.github/workflows/manylinux_2_28.yml

Lines changed: 9 additions & 0 deletions
```diff
@@ -575,6 +575,15 @@ jobs:
             python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "qwen3-vl"
           run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
           timeout: 60
+        - name: 'VLM (gemma4)'
+          cmd: |
+            python -m pip install --no-deps git+https://github.com/huggingface/optimum-intel.git@ff99d6e13774841bdd17ac0d4c8bd2d181cf7c27 # PR 1688
+            python -m pip install transformers==5.5.0
+            pip show transformers optimum-intel openvino_tokenizers openvino_genai
+            python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "gemma4"
+          run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
+          timeout: 60
+
     defaults:
       run:
         shell: bash
```

.github/workflows/windows.yml

Lines changed: 8 additions & 0 deletions
```diff
@@ -740,6 +740,14 @@ jobs:
             python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "qwen3-vl"
           run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
           timeout: 60
+        - name: 'VLM (gemma4)'
+          cmd: |
+            python -m pip install --no-deps git+https://github.com/huggingface/optimum-intel.git@ff99d6e13774841bdd17ac0d4c8bd2d181cf7c27 # PR 1688
+            python -m pip install transformers==5.5.0
+            pip show transformers optimum-intel openvino_tokenizers openvino_genai
+            python -m pytest -s -v tests/python_tests/test_vlm_pipeline.py --override-ini cache_dir=/mount/caches/pytest/ -k "gemma4"
+          run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).visual_language.test }}
+          timeout: 60
     defaults:
       run:
         shell: pwsh
```

site/docs/supported-models/_components/vlm-models-table/models.ts

Lines changed: 12 additions & 0 deletions
```diff
@@ -193,4 +193,16 @@ export const VLM_MODELS: VLMModelType[] = [
       },
     ],
   },
+  {
+    architecture: 'Gemma4ForConditionalGeneration',
+    models: [
+      {
+        name: 'gemma4',
+        links: [
+          'https://huggingface.co/google/gemma-4-E2B-it',
+          'https://huggingface.co/google/gemma-4-E4B-it',
+        ],
+      },
+    ],
+  },
 ];
```

site/docs/supported-models/index.mdx

Lines changed: 6 additions & 0 deletions
```diff
@@ -87,6 +87,12 @@ Apply https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/78/
 2. Visual history is not preserved across rounds, so multi-turn interactions have limited visual context.
 3. If the number of input frames is not divisible by `mm_local_num_frames` (as defined in `config.json`), additional frames will be automatically padded by duplicating the last frame. For example, if there are 10 frames and `mm_local_num_frames = 4`, it will be padded to 12 frames.
 
+#### Gemma4 {#gemma4-notes}
+
+Gemma4 implementation supports text and image inputs only. Video input is not supported at the moment.
+
+The model requires `transformers==5.5.0` for the export with `optimum-cli`.
+
 #### Qwen3-VL {#qwen3_vl-notes}
 
 The model requires `transformers>=4.57` for the export with `optimum-cli`.
```
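As an aside on the padding rule restated in the context lines of this diff (the frame count is padded up to a multiple of `mm_local_num_frames` by duplicating the last frame), a minimal sketch of that arithmetic; `pad_frames` is a hypothetical helper, not part of the GenAI API:

```python
def pad_frames(frames, mm_local_num_frames):
    """Pad a frame list up to a multiple of mm_local_num_frames by
    duplicating the last frame, as the documented rule describes."""
    remainder = len(frames) % mm_local_num_frames
    if remainder == 0:
        return list(frames)
    return list(frames) + [frames[-1]] * (mm_local_num_frames - remainder)

# The documented example: 10 frames, mm_local_num_frames = 4 -> 12 frames.
frames = [f"frame{i}" for i in range(10)]
print(len(pad_frames(frames, 4)))  # 12
```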

src/cpp/src/lm_encoding.cpp

Lines changed: 4 additions & 1 deletion
```diff
@@ -86,7 +86,8 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
     std::optional<int64_t> rope_delta,
     const size_t max_kv_cache_size,
     const bool use_intermediate_remote_tensor,
-    const std::unordered_map<std::string, ov::Tensor>& lm_extra_inputs
+    const std::unordered_map<std::string, ov::Tensor>& lm_extra_inputs,
+    std::function<ov::Tensor(const ov::Tensor& new_input_ids)> per_layer_embeddings_callback
 ) {
     std::vector<GenerationHandle> generations;
     for (SequenceGroup::Ptr sequence_group : sequence_groups) {
@@ -261,6 +262,8 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
                 ov::Tensor new_visual_pos_masks{tensor.get_element_type(), {batch_size, 1}};
                 std::fill_n(new_visual_pos_masks.data<bool>(), new_visual_pos_masks.get_size(), false);
                 m_llm.set_tensor(name, new_visual_pos_masks);
+            } else if (name == "per_layer_inputs" && per_layer_embeddings_callback) {
+                m_llm.set_tensor(name, per_layer_embeddings_callback(new_input_ids));
             }
         }
     } else {
```
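The diff above threads an optional callback into the decode loop: when the LM declares a `per_layer_inputs` input and a callback was supplied, the tensor for that input is produced by the callback from the freshly sampled token ids. A rough Python sketch of the same dispatch pattern (the `decode_step` function and the embedding table are hypothetical, with no OpenVINO dependency):

```python
from typing import Callable, Optional

def decode_step(model_inputs: dict,
                extra_input_names: list,
                new_input_ids: list,
                per_layer_embeddings_callback: Optional[Callable] = None) -> dict:
    """Mimics the dispatch in the loop above: for each extra input the
    model declares, 'per_layer_inputs' is filled by the optional callback
    from the newly sampled token ids; other names are left untouched."""
    for name in extra_input_names:
        if name == "per_layer_inputs" and per_layer_embeddings_callback:
            model_inputs[name] = per_layer_embeddings_callback(new_input_ids)
    return model_inputs

# Hypothetical callback: look up one per-layer embedding row per token id.
table = {0: [0.0, 0.0], 1: [0.1, 0.2], 2: [0.3, 0.4]}
inputs = decode_step({}, ["per_layer_inputs"], [1, 2],
                     per_layer_embeddings_callback=lambda ids: [table[i] for i in ids])
print(inputs["per_layer_inputs"])  # [[0.1, 0.2], [0.3, 0.4]]
```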

src/cpp/src/lm_encoding.hpp

Lines changed: 7 additions & 7 deletions
```diff
@@ -3,15 +3,16 @@
 
 #pragma once
 
+#include <functional>
 #include <optional>
+
 #include "openvino/genai/llm_pipeline.hpp"
-#include "visual_language/embedding_model.hpp"
 #include "sampling/sampler.hpp"
+#include "visual_language/embedding_model.hpp"
 
 namespace ov {
 namespace genai {
 
-
 ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
     ov::InferRequest& m_llm,
     const ov::Tensor& input_ids,
@@ -26,13 +27,12 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
     std::optional<int64_t> rope_delta = std::nullopt,
     const size_t max_kv_cache_size = std::numeric_limits<size_t>::max(),
     const bool use_intermediate_remote_tensor = true,
-    const std::unordered_map<std::string, ov::Tensor>& lm_extra_inputs = {});
-
+    const std::unordered_map<std::string, ov::Tensor>& lm_extra_inputs = {},
+    std::function<ov::Tensor(const ov::Tensor& new_input_ids)> per_layer_embeddings_callback = nullptr);
 
 void align_cache_and_history(const ov::Tensor& new_chat_tokens, utils::CacheState& cache_state);
 
-
 TokenizedInputs get_chat_encoded_input(const ov::Tensor& new_chat_tokens, utils::CacheState& cache_state);
 
-}
-}
+}  // namespace genai
+}  // namespace ov
```

src/cpp/src/utils.cpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -239,6 +239,8 @@ ProcessorConfig from_any_map(
     read_anymap_param(config_map, "max_slice_nums", extracted_config.max_slice_nums);
     read_anymap_param(config_map, "norm_mean", extracted_config.norm_mean);
     read_anymap_param(config_map, "norm_std", extracted_config.norm_std);
+    read_anymap_param(config_map, "pooling_kernel_size", extracted_config.pooling_kernel_size);
+    read_anymap_param(config_map, "max_soft_tokens", extracted_config.max_soft_tokens);
     return extracted_config;
 }
```
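`read_anymap_param` only overwrites the target field when the key is present in the map, leaving the field's existing default otherwise. A rough Python equivalent of that behavior (the default values below are hypothetical, not the real `ProcessorConfig` defaults):

```python
def read_anymap_param(config_map: dict, name: str, current):
    """Return config_map[name] if the key exists, else keep the current value,
    mirroring the keep-default-when-absent semantics above."""
    return config_map.get(name, current)

class ProcessorConfig:
    pooling_kernel_size = 2   # hypothetical default
    max_soft_tokens = 256     # hypothetical default

cfg = ProcessorConfig()
config_map = {"max_soft_tokens": 512}  # only one key supplied
cfg.pooling_kernel_size = read_anymap_param(config_map, "pooling_kernel_size", cfg.pooling_kernel_size)
cfg.max_soft_tokens = read_anymap_param(config_map, "max_soft_tokens", cfg.max_soft_tokens)
print(cfg.pooling_kernel_size, cfg.max_soft_tokens)  # 2 512
```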

src/cpp/src/visual_language/gemma3/classes.cpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -106,6 +106,8 @@ NormalizedPrompt InputsEmbedderGemma3::normalize_prompt(const std::string& promp
         }
         expanded_tag += end_of_image + "\n\n";
 
+        // fixme: there seems to be an issue with how image_token is replaced. unified_prompt.find needs search_offset.
+        // refer to gemma4 implementation.
         unified_prompt.replace(unified_prompt.find(start_of_image), start_of_image.length(), expanded_tag);
     }
     return {std::move(unified_prompt), std::move(images_sequence), {}};
```
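The `fixme` comment above flags a real pitfall: `find` without a search offset always scans from the start of the string, so with several image tags in one prompt, or an expansion that itself contains the tag text, the wrong occurrence can be matched and replaced. A small Python sketch of the offset-tracking approach the comment refers to (tag and expansion strings here are hypothetical):

```python
def expand_tags(prompt: str, tag: str, expansions: list) -> str:
    """Replace successive occurrences of `tag` with per-image expansions,
    advancing a search offset so already-replaced text is never rescanned."""
    search_offset = 0
    for expanded in expansions:
        pos = prompt.find(tag, search_offset)
        if pos == -1:
            break
        prompt = prompt[:pos] + expanded + prompt[pos + len(tag):]
        # Continue the next search after the text we just inserted.
        search_offset = pos + len(expanded)
    return prompt

print(expand_tags("<img> and <img>", "<img>", ["[IMG0]", "[IMG1]"]))
# [IMG0] and [IMG1]
```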
