Skip to content

Commit 7905fc8

Browse files
abheesht17 and The tunix Authors
authored and committed
Allow sampler to take in images
Verification: https://colab.research.google.com/gist/abheesht17/e3e31d7ff5bb302928494dcf48b77e5c/tunix-vlm-text-generation.ipynb In order to allow the text sampler to take in images, we only need to take care of the pre-fill phase, because the sampler is text+image-in, text-out. We do two things: - Call the image processor inside the sampler __call__ method so as to process the images. - We add a method to the Gemma 3 model class - `get_positions_and_attention_mask`. If the model has this method, it will be used inside the sampler during the pre-fill phase. It is necessary for vision models to have this method if any custom token processing is needed. PiperOrigin-RevId: 870890814
1 parent a3389dc commit 7905fc8

File tree

11 files changed

+517
-45
lines changed

11 files changed

+517
-45
lines changed

tests/generate/sampler_test.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,66 @@ def test_samples_padding_output(self, max_prompt_length, echo, return_logits):
111111
result_padded.tokens[i].shape[0], max_generation_steps
112112
)
113113

114+
def test_multimodal_samples(self):
  """Samples from prompts that interleave text with image placeholder tokens.

  Builds a toy multimodal transformer, wires a stub image processor into the
  sampler, and checks that generation with `images=` produces the expected
  deterministic token ids (seeded rngs make the output reproducible).
  """
  vocab = tc.MockVocab(is_multimodal=True)
  transformer = tc.ToyTransformer(
      config=tc.ModelConfig(
          vocab_size=vocab.GetPieceSize(), vision_config=tc.VisionConfig()
      ),
      # Fixed seed so the asserted token ids below are deterministic.
      rngs=nnx.Rngs(42),
  )

  class DummyImageProcessor:
    # Stub standing in for a real image processor; shape/dtype only.

    def __call__(self, images):
      # returns dummy processed images
      return np.ones((len(images), 1, 32, 32, 3), dtype=np.float32)

  image_processor = DummyImageProcessor()

  sampler = sampler_lib.Sampler(
      transformer=transformer,
      tokenizer=vocab,
      cache_config=sampler_lib.CacheConfig(
          cache_size=64,
          num_layers=4,
          num_kv_heads=4,
          head_dim=16,
      ),
      # The sampler invokes this on the raw `images` argument of __call__.
      image_processor=image_processor,
  )

  max_generation_steps = 8

  # We pass in 2 strings and 2 corresponding dummy images
  images = [
      np.zeros((32, 32, 3)),
      np.zeros((32, 32, 3)),
  ]

  result = sampler(
      [
          'quantization <soi> <img> <img> Tunix',
          '<soi> <img> <img> Parallax distributed',
      ],
      max_generation_steps=max_generation_steps,
      return_logits=True,
      max_prompt_length=8,
      # echo=True returns the prompt tokens followed by generated tokens.
      echo=True,
      images=images,
  )

  self.assertIsNotNone(result)
  self.assertReasonableTensor(result.tokens)
  self.assertReasonableTensor(result.logits)
  # Golden token ids: prompt (echoed) + 8 generated steps per sequence.
  np.testing.assert_allclose(
      result.tokens,
      np.array([
          [1, 21, 23, 22, 22, 14, 8, 25, 8, 25, 8, 25, 8, 25],
          [1, 23, 22, 22, 15, 18, 8, 25, 8, 25, 8, 25, 8, 25],
      ]),
  )
114174
@parameterized.named_parameters(
115175
dict(
116176
testcase_name='case1',

tests/models/gemma3/utils_test.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
import jax.numpy as jnp
1818
import numpy as np
1919
from tunix.models.gemma3 import utils
20-
from tunix.models.gemma3 import vision
20+
21+
22+
_TOKEN_PLACEHOLDER = 219
2123

2224

2325
class UtilsTest(parameterized.TestCase):
@@ -46,12 +48,14 @@ def test_get_positions_and_attention_mask_multimodal(self):
4648
tokens = jnp.array([[
4749
1,
4850
2,
49-
vision.TOKEN_PLACEHOLDER,
50-
vision.TOKEN_PLACEHOLDER,
51+
_TOKEN_PLACEHOLDER,
52+
_TOKEN_PLACEHOLDER,
5153
3,
5254
utils._PADDING_ID,
5355
]])
54-
result = utils.get_positions_and_attention_mask(tokens)
56+
result = utils.get_positions_and_attention_mask(
57+
tokens, token_placeholder_id=_TOKEN_PLACEHOLDER
58+
)
5559
positions = result['positions']
5660
attention_mask = result['attention_mask']
5761

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import dataclasses
2+
import os
3+
import tempfile
4+
from absl.testing import absltest
5+
from absl.testing import parameterized
6+
import numpy as np
7+
from PIL import Image
8+
from tunix.processors import image_processor
9+
10+
11+
@dataclasses.dataclass(slots=True, kw_only=True)
class DummyConfig:
  """Minimal stand-in for a model config consumed by `ImageProcessor`.

  Provides only the image-related fields the processor reads.
  """

  # Target output height, width (pixels) and channel count after resizing.
  image_height: int = 32
  image_width: int = 32
  image_channels: int = 3
  # Per-channel normalization constants; with mean == std == 127.5, uint8
  # pixel 0 maps to -1.0 (the tests below rely on exactly this mapping).
  image_mean: tuple[float, ...] = (127.5, 127.5, 127.5)
  image_std: tuple[float, ...] = (127.5, 127.5, 127.5)
20+
21+
class ImageProcessorTest(parameterized.TestCase):
  """Tests for `image_processor.ImageProcessor` preprocessing and batching."""

  def setUp(self):
    super().setUp()
    self.height = 32
    self.width = 32
    self.channels = 3
    config = DummyConfig(
        image_height=self.height,
        image_width=self.width,
        image_channels=self.channels,
    )
    self.processor = image_processor.ImageProcessor(config)

  def _create_dummy_image_file(self, filename='test_image.png'):
    """Saves an all-black 100x100 RGB PNG to a temp dir and returns its path."""
    img_array = np.zeros((100, 100, 3), dtype=np.uint8)
    img = Image.fromarray(img_array)
    try:
      # absltest's create_tempdir registers cleanup with the test case.
      temp_path = self.create_tempdir().full_path
    except Exception:  # pylint: disable=broad-except
      # Fallback for non-absltest runners. Bug fix: the previous code used
      # `tempfile.TemporaryDirectory().name`, whose finalizer deletes the
      # directory as soon as the object is garbage-collected (immediately
      # under CPython refcounting), so the save below could hit a missing
      # directory. mkdtemp() creates a directory with no auto-cleanup.
      temp_path = tempfile.mkdtemp()
    temp_file = os.path.join(temp_path, filename)
    img.save(temp_file)
    return temp_file

  def test_process_none_image(self):
    # A None image yields an all-zero placeholder of the configured shape.
    processed_image = self.processor.preprocess_image(None)
    self.assertEqual(
        processed_image.shape, (self.height, self.width, self.channels)
    )
    np.testing.assert_array_equal(processed_image, np.zeros((32, 32, 3)))

  def test_path_input(self):
    # Black pixels normalize to -1.0 under mean == std == 127.5.
    img_path = self._create_dummy_image_file()
    processed_image = self.processor.preprocess_image(img_path)
    self.assertEqual(
        processed_image.shape, (self.height, self.width, self.channels)
    )
    np.testing.assert_allclose(processed_image, -1.0 * np.ones((32, 32, 3)))

  def test_array_input(self):
    img_array = np.zeros((100, 100, 3), dtype=np.uint8)
    processed_image = self.processor.preprocess_image(img_array)
    self.assertEqual(
        processed_image.shape, (self.height, self.width, self.channels)
    )
    np.testing.assert_allclose(processed_image, -1.0 * np.ones((32, 32, 3)))

  @parameterized.named_parameters(
      dict(testcase_name='array', input_type='array'),
      dict(testcase_name='path', input_type='path'),
  )
  def test_call_one_image(self, input_type):
    if input_type == 'array':
      images = [np.zeros((100, 100, 3), dtype=np.uint8)]
    elif input_type == 'path':
      images = [self._create_dummy_image_file()]

    processed_images = self.processor(images=images)  # pylint: disable=undefined-variable
    self.assertLen(processed_images, 1)
    self.assertLen(processed_images[0], 1)
    self.assertEqual(
        processed_images[0][0].shape, (self.height, self.width, self.channels)  # pytype: disable=attribute-error
    )
    np.testing.assert_allclose(
        processed_images[0][0], -1.0 * np.ones((32, 32, 3))
    )

  def test_padding(self):
    # Ragged batches are padded so every row has the same number of images.
    img1 = np.zeros((100, 100, 3), dtype=np.uint8)
    img2 = np.zeros((50, 50, 3), dtype=np.uint8)
    images = [[img1], [img1, img2]]
    processed_images = self.processor(images=images)
    self.assertLen(processed_images, 2)
    self.assertLen(processed_images[0], 2)  # Padded to 2
    self.assertLen(processed_images[1], 2)
    np.testing.assert_allclose(
        processed_images[0][0], -1.0 * np.ones((32, 32, 3))
    )
    # Padded image should be zeros
    np.testing.assert_allclose(processed_images[0][1], np.zeros((32, 32, 3)))
    np.testing.assert_allclose(
        processed_images[1][0], -1.0 * np.ones((32, 32, 3))
    )
    np.testing.assert_allclose(
        processed_images[1][1], -1.0 * np.ones((32, 32, 3))
    )

  def test_call_with_none_in_batch(self):
    # None entries in a batch become all-zero placeholder images.
    images = [None, [np.zeros((100, 100, 3), dtype=np.uint8)]]
    processed_images = self.processor(images=images)
    self.assertLen(processed_images, 2)
    self.assertLen(processed_images[0], 1)
    self.assertLen(processed_images[1], 1)
    np.testing.assert_allclose(processed_images[0][0], np.zeros((32, 32, 3)))
    np.testing.assert_allclose(
        processed_images[1][0], -1.0 * np.ones((32, 32, 3))
    )
119+
120+
121+
if __name__ == '__main__':
  # Run the test suite via absltest when this file is executed directly.
  absltest.main()

0 commit comments

Comments
 (0)