|
14 | 14 | # limitations under the License. |
15 | 15 |
|
16 | 16 | import copy |
| 17 | +import os |
17 | 18 | import re |
18 | 19 | from typing import Any |
19 | 20 |
|
20 | 21 | import pytest |
21 | 22 | import torch |
| 23 | +from PIL import Image |
22 | 24 | from prompts import get_long_prompt |
23 | | -from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer |
| 25 | +from transformers import AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer |
24 | 26 | from transformers.generation import StoppingCriteria |
25 | 27 |
|
26 | 28 | from optimum.neuron import NeuronModelForCausalLM, NeuronModelForImageTextToText |
@@ -328,6 +330,39 @@ def test_vlm_generation_greedy_expectations(any_vlm_generate_model: dict[str, An |
328 | 330 | assert neuron_outputs.shape[1] > inputs["input_ids"].shape[1] |
329 | 331 |
|
330 | 332 |
|
@is_inferentia_test
@requires_neuronx
def test_vlm_generation_with_image(any_vlm_generate_model: dict[str, Any]):
    """Test VLM greedy generation with a real image matches the HF CPU reference.

    Greedy-decodes the same (image, prompt) pair with the original HF
    checkpoint on CPU and with the exported Neuron model, then asserts the
    decoded continuations (prompt tokens stripped) are byte-identical.

    Args:
        any_vlm_generate_model: fixture dict providing ``model_id`` (the HF
            hub checkpoint) and ``neuron_model_path`` (the exported Neuron
            artifacts, which also hold the processor files).
    """
    # Fixture supplies both the reference checkpoint and the Neuron export.
    model_id = any_vlm_generate_model["model_id"]
    neuron_model_path = any_vlm_generate_model["neuron_model_path"]

    # Test image lives next to this file; force RGB so the processor sees a
    # consistent channel layout regardless of the JPEG's native mode.
    image_path = os.path.join(os.path.dirname(__file__), "tower_of_pisa.jpg")
    image = Image.open(image_path).convert("RGB")
    prompt = "Can you describe this image?"

    # Load the processor from the Neuron export so both models tokenize with
    # the exact same vocabulary/processing config.
    processor = AutoProcessor.from_pretrained(neuron_model_path)
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]  # bound once; used to strip the prompt below
    max_new_tokens = 20

    # CPU reference: inference only, so disable autograd bookkeeping.
    cpu_model = AutoModelForVision2Seq.from_pretrained(model_id)
    with torch.no_grad():
        cpu_outputs = cpu_model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
    cpu_text = processor.decode(cpu_outputs[0][prompt_len:], skip_special_tokens=True)

    # Neuron model under test, driven with identical greedy settings.
    neuron_model = NeuronModelForImageTextToText.from_pretrained(neuron_model_path)
    neuron_outputs = neuron_model.generate(
        inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        do_sample=False,
        max_new_tokens=max_new_tokens,
    )
    neuron_text = processor.decode(neuron_outputs[0][prompt_len:], skip_special_tokens=True)

    # Compare only the newly generated continuation, never the echoed prompt.
    assert neuron_text == cpu_text, f"Neuron: {neuron_text!r} | CPU: {cpu_text!r}"


| 365 | + |
331 | 366 | @is_inferentia_test |
332 | 367 | @requires_neuronx |
333 | 368 | @pytest.mark.parametrize("neuron_llm_config", ["llama-1x8192", "gemma3-1x8192"], indirect=True) |
|
0 commit comments