|
14 | 14 | # limitations under the License. |
15 | 15 |
|
16 | 16 | import copy |
| 17 | +import os |
17 | 18 | import re |
18 | 19 | from typing import Any |
19 | 20 |
|
20 | 21 | import pytest |
21 | 22 | import torch |
| 23 | +from PIL import Image |
22 | 24 | from prompts import get_long_prompt |
23 | | -from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer |
| 25 | +from transformers import AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer |
24 | 26 | from transformers.generation import StoppingCriteria |
25 | 27 |
|
26 | 28 | from optimum.neuron import NeuronModelForCausalLM, NeuronModelForImageTextToText |
@@ -328,6 +330,39 @@ def test_vlm_generation_greedy_expectations(any_vlm_generate_model: dict[str, An |
328 | 330 | assert neuron_outputs.shape[1] > inputs["input_ids"].shape[1] |
329 | 331 |
|
330 | 332 |
|
@is_inferentia_test
@requires_neuronx
def test_vlm_generation_with_image(any_vlm_generate_model: dict[str, Any]):
    """Test VLM greedy generation with a real image matches the HF CPU reference.

    Greedy-decodes the same (image, prompt) pair with the original HF
    checkpoint on CPU and with the exported Neuron model, then asserts the
    decoded continuations (prompt tokens stripped) are byte-identical.

    Args:
        any_vlm_generate_model: fixture dict providing ``model_id`` (the HF
            hub checkpoint) and ``neuron_model_path`` (the exported Neuron
            artifacts, which also hold the processor files).
    """
    # Fixture supplies both the reference checkpoint and the Neuron export.
    model_id = any_vlm_generate_model["model_id"]
    neuron_model_path = any_vlm_generate_model["neuron_model_path"]

    # Test image lives next to this file; force RGB so the processor sees a
    # consistent channel layout regardless of the JPEG's native mode.
    image_path = os.path.join(os.path.dirname(__file__), "tower_of_pisa.jpg")
    image = Image.open(image_path).convert("RGB")
    prompt = "Can you describe this image?"

    # Load the processor from the Neuron export so both models tokenize with
    # the exact same vocabulary/processing config.
    processor = AutoProcessor.from_pretrained(neuron_model_path)
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]  # bound once; used to strip the prompt below
    max_new_tokens = 20

    # CPU reference: inference only, so disable autograd bookkeeping.
    cpu_model = AutoModelForVision2Seq.from_pretrained(model_id)
    with torch.no_grad():
        cpu_outputs = cpu_model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
    cpu_text = processor.decode(cpu_outputs[0][prompt_len:], skip_special_tokens=True)

    # Neuron model under test, driven with identical greedy settings.
    neuron_model = NeuronModelForImageTextToText.from_pretrained(neuron_model_path)
    neuron_outputs = neuron_model.generate(
        inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        do_sample=False,
        max_new_tokens=max_new_tokens,
    )
    neuron_text = processor.decode(neuron_outputs[0][prompt_len:], skip_special_tokens=True)

    # Compare only the newly generated continuation, never the echoed prompt.
    assert neuron_text == cpu_text, f"Neuron: {neuron_text!r} | CPU: {cpu_text!r}"


| 365 | + |
331 | 366 | @is_inferentia_test |
332 | 367 | @requires_neuronx |
333 | 368 | @pytest.mark.parametrize("neuron_llm_config", ["llama-1x8192", "gemma3-1x8192"], indirect=True) |
|
0 commit comments