Skip to content

Commit 585a4dd

Browse files
tengomuchoclaude
and committed
test(smolvlm2): add image-grounded generation test with CPU comparison
Add test_vlm_generation_with_image that loads tower_of_pisa.jpg, processes it with AutoProcessor, generates with the Neuron VLM model and the HF CPU reference (AutoModelForVision2Seq), and asserts the decoded outputs match. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 28dde97 commit 585a4dd

File tree

2 files changed

+36
-1
lines changed

2 files changed

+36
-1
lines changed

tests/decoder/test_decoder_generation.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,15 @@
1414
# limitations under the License.
1515

1616
import copy
17+
import os
1718
import re
1819
from typing import Any
1920

2021
import pytest
2122
import torch
23+
from PIL import Image
2224
from prompts import get_long_prompt
23-
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
25+
from transformers import AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer
2426
from transformers.generation import StoppingCriteria
2527

2628
from optimum.neuron import NeuronModelForCausalLM, NeuronModelForImageTextToText
@@ -328,6 +330,39 @@ def test_vlm_generation_greedy_expectations(any_vlm_generate_model: dict[str, An
328330
assert neuron_outputs.shape[1] > inputs["input_ids"].shape[1]
329331

330332

333+
@is_inferentia_test
334+
@requires_neuronx
335+
def test_vlm_generation_with_image(any_vlm_generate_model: dict[str, Any]):
336+
"""Test VLM greedy generation with a real image matches the HF CPU reference."""
337+
image_path = os.path.join(os.path.dirname(__file__), "tower_of_pisa.jpg")
338+
image = Image.open(image_path).convert("RGB")
339+
prompt = "Can you describe this image?"
340+
341+
model_id = any_vlm_generate_model["model_id"]
342+
neuron_model_path = any_vlm_generate_model["neuron_model_path"]
343+
344+
processor = AutoProcessor.from_pretrained(neuron_model_path)
345+
inputs = processor(text=prompt, images=image, return_tensors="pt")
346+
max_new_tokens = 20
347+
348+
# CPU reference
349+
cpu_model = AutoModelForVision2Seq.from_pretrained(model_id)
350+
cpu_outputs = cpu_model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
351+
cpu_text = processor.decode(cpu_outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True)
352+
353+
# Neuron model
354+
neuron_model = NeuronModelForImageTextToText.from_pretrained(neuron_model_path)
355+
neuron_outputs = neuron_model.generate(
356+
inputs["input_ids"],
357+
pixel_values=inputs["pixel_values"],
358+
do_sample=False,
359+
max_new_tokens=max_new_tokens,
360+
)
361+
neuron_text = processor.decode(neuron_outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True)
362+
363+
assert neuron_text == cpu_text, f"Neuron: {neuron_text!r} | CPU: {cpu_text!r}"
364+
365+
331366
@is_inferentia_test
332367
@requires_neuronx
333368
@pytest.mark.parametrize("neuron_llm_config", ["llama-1x8192", "gemma3-1x8192"], indirect=True)

tests/decoder/tower_of_pisa.jpg

39.7 KB
Loading

0 commit comments

Comments
 (0)