Skip to content

Commit 992ba69

Browse files
abheesht17 and The tunix Authors
authored and committed
Add image processor
This PR adds a basic image processor, which takes in a batch of images and processes them (resizing, normalising, etc.).

Verification: https://colab.research.google.com/gist/abheesht17/3ca408a919bbda9d4400f6c30f193dcd/-tunix-vlm-image-processor-verification.ipynb

PiperOrigin-RevId: 867024240
1 parent b39d48b commit 992ba69

File tree

8 files changed

+311
-61
lines changed

8 files changed

+311
-61
lines changed

tests/models/gemma3/utils_test.py

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,19 @@
1717
import jax.numpy as jnp
1818
import numpy as np
1919
from tunix.models.gemma3 import utils
20-
from tunix.models.gemma3 import vision
20+
21+
22+
_TOKEN_PLACEHOLDER = 219
2123

2224

2325
class UtilsTest(parameterized.TestCase):
2426

25-
def test_get_positions_and_attention_mask_not_multimodal(self):
27+
def test_get_attention_mask_not_multimodal(self):
2628
tokens = jnp.array([[1, 2, 3, utils._PADDING_ID, utils._PADDING_ID]])
27-
result = utils.get_positions_and_attention_mask(tokens)
28-
positions = result['positions']
29-
attention_mask = result['attention_mask']
29+
attention_mask = utils.get_attention_mask(
30+
tokens, token_placeholder_id=_TOKEN_PLACEHOLDER
31+
)
3032

31-
expected_positions = jnp.array([[0, 1, 2, 2, 2]])
3233
expected_attention_mask = jnp.array(
3334
[[
3435
[1, 0, 0, 0, 0],
@@ -39,23 +40,21 @@ def test_get_positions_and_attention_mask_not_multimodal(self):
3940
]],
4041
dtype=jnp.bool_,
4142
)
42-
np.testing.assert_array_equal(positions, expected_positions)
4343
np.testing.assert_array_equal(attention_mask, expected_attention_mask)
4444

45-
def test_get_positions_and_attention_mask_multimodal(self):
45+
def test_get_attention_mask_multimodal(self):
4646
tokens = jnp.array([[
4747
1,
4848
2,
49-
vision.TOKEN_PLACEHOLDER,
50-
vision.TOKEN_PLACEHOLDER,
49+
_TOKEN_PLACEHOLDER,
50+
_TOKEN_PLACEHOLDER,
5151
3,
5252
utils._PADDING_ID,
5353
]])
54-
result = utils.get_positions_and_attention_mask(tokens)
55-
positions = result['positions']
56-
attention_mask = result['attention_mask']
54+
attention_mask = utils.get_attention_mask(
55+
tokens, token_placeholder_id=_TOKEN_PLACEHOLDER
56+
)
5757

58-
expected_positions = jnp.array([[0, 1, 2, 3, 4, 4]])
5958
expected_attention_mask = jnp.array(
6059
[[
6160
[1, 0, 0, 0, 0, 0],
@@ -67,19 +66,15 @@ def test_get_positions_and_attention_mask_multimodal(self):
6766
]],
6867
dtype=jnp.bool_,
6968
)
70-
np.testing.assert_array_equal(positions, expected_positions)
7169
np.testing.assert_array_equal(attention_mask, expected_attention_mask)
7270

73-
def test_get_positions_and_attention_mask_precomputed_mask(self):
71+
def test_get_attention_mask_precomputed_mask(self):
7472
tokens = jnp.array([[1, 2, 3, utils._PADDING_ID, utils._PADDING_ID]])
7573
inputs_mask = jnp.array([[1, 0, 1, 0, 0]])
76-
result = utils.get_positions_and_attention_mask(
77-
tokens, inputs_mask=inputs_mask
74+
attention_mask = utils.get_attention_mask(
75+
tokens, inputs_mask=inputs_mask, token_placeholder_id=_TOKEN_PLACEHOLDER
7876
)
79-
positions = result['positions']
80-
attention_mask = result['attention_mask']
8177

82-
expected_positions = jnp.array([[0, 0, 1, 1, 1]])
8378
expected_attention_mask = jnp.array(
8479
[[
8580
[1, 0, 0, 0, 0],
@@ -90,7 +85,6 @@ def test_get_positions_and_attention_mask_precomputed_mask(self):
9085
]],
9186
dtype=jnp.bool_,
9287
)
93-
np.testing.assert_array_equal(positions, expected_positions)
9488
np.testing.assert_array_equal(attention_mask, expected_attention_mask)
9589

9690

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import dataclasses
2+
import os
3+
import shutil
4+
import tempfile
5+
from absl.testing import absltest
6+
from absl.testing import parameterized
7+
import numpy as np
8+
from PIL import Image
9+
from tunix.processors import image_processor
10+
11+
12+
@dataclasses.dataclass(slots=True, kw_only=True)
13+
class DummyConfig:
14+
15+
image_height: int = 32
16+
image_width: int = 32
17+
image_channels: int = 3
18+
image_mean: tuple[float, ...] = (127.5, 127.5, 127.5)
19+
image_std: tuple[float, ...] = (127.5, 127.5, 127.5)
20+
21+
22+
class ImageProcessorTest(parameterized.TestCase):
23+
24+
def setUp(self):
25+
super().setUp()
26+
self.height = 32
27+
self.width = 32
28+
self.channels = 3
29+
config = DummyConfig(
30+
image_height=self.height,
31+
image_width=self.width,
32+
image_channels=self.channels,
33+
)
34+
self.processor = image_processor.ImageProcessor(config)
35+
36+
def _create_dummy_image_file(self, filename='test_image.png'):
37+
img_array = np.zeros((100, 100, 3), dtype=np.uint8)
38+
img = Image.fromarray(img_array)
39+
40+
temp_dir = tempfile.mkdtemp()
41+
self.addCleanup(lambda: shutil.rmtree(temp_dir))
42+
43+
temp_file = os.path.join(temp_dir, filename)
44+
img.save(temp_file)
45+
return temp_file
46+
47+
def test_process_none_image(self):
48+
processed_image = self.processor.preprocess_image(None)
49+
self.assertEqual(
50+
processed_image.shape, (self.height, self.width, self.channels)
51+
)
52+
np.testing.assert_array_equal(processed_image, np.zeros((32, 32, 3)))
53+
54+
def test_path_input(self):
55+
img_path = self._create_dummy_image_file()
56+
processed_image = self.processor.preprocess_image(img_path)
57+
self.assertEqual(
58+
processed_image.shape, (self.height, self.width, self.channels)
59+
)
60+
np.testing.assert_allclose(processed_image, -1.0 * np.ones((32, 32, 3)))
61+
62+
def test_array_input(self):
63+
img_array = np.zeros((100, 100, 3), dtype=np.uint8)
64+
processed_image = self.processor.preprocess_image(img_array)
65+
self.assertEqual(
66+
processed_image.shape, (self.height, self.width, self.channels)
67+
)
68+
np.testing.assert_allclose(processed_image, -1.0 * np.ones((32, 32, 3)))
69+
70+
@parameterized.named_parameters(
71+
dict(testcase_name='array', input_type='array'),
72+
dict(testcase_name='path', input_type='path'),
73+
)
74+
def test_call_one_image(self, input_type):
75+
if input_type == 'array':
76+
images = [np.zeros((100, 100, 3), dtype=np.uint8)]
77+
elif input_type == 'path':
78+
images = [self._create_dummy_image_file()]
79+
80+
processed_images = self.processor(images=images) # pylint: disable=undefined-variable
81+
self.assertLen(processed_images, 1)
82+
self.assertLen(processed_images[0], 1)
83+
self.assertEqual(
84+
processed_images[0][0].shape, (self.height, self.width, self.channels) # pytype: disable=attribute-error
85+
)
86+
np.testing.assert_allclose(
87+
processed_images[0][0], -1.0 * np.ones((32, 32, 3))
88+
)
89+
90+
def test_padding(self):
91+
img1 = np.zeros((100, 100, 3), dtype=np.uint8)
92+
img2 = np.zeros((50, 50, 3), dtype=np.uint8)
93+
images = [[img1], [img1, img2]]
94+
processed_images = self.processor(images=images)
95+
self.assertLen(processed_images, 2)
96+
self.assertLen(processed_images[0], 2) # Padded to 2
97+
self.assertLen(processed_images[1], 2)
98+
np.testing.assert_allclose(
99+
processed_images[0][0], -1.0 * np.ones((32, 32, 3))
100+
)
101+
# Padded image should be zeros
102+
np.testing.assert_allclose(processed_images[0][1], np.zeros((32, 32, 3)))
103+
np.testing.assert_allclose(
104+
processed_images[1][0], -1.0 * np.ones((32, 32, 3))
105+
)
106+
np.testing.assert_allclose(
107+
processed_images[1][1], -1.0 * np.ones((32, 32, 3))
108+
)
109+
110+
def test_call_with_none_in_batch(self):
111+
images = [None, [np.zeros((100, 100, 3), dtype=np.uint8)]]
112+
processed_images = self.processor(images=images)
113+
self.assertLen(processed_images, 2)
114+
self.assertLen(processed_images[0], 1)
115+
self.assertLen(processed_images[1], 1)
116+
np.testing.assert_allclose(processed_images[0][0], np.zeros((32, 32, 3)))
117+
np.testing.assert_allclose(
118+
processed_images[1][0], -1.0 * np.ones((32, 32, 3))
119+
)
120+
121+
122+
if __name__ == '__main__':
123+
absltest.main()

tunix/models/gemma3/model.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ class ModelConfig:
115115
QueryPreAttentionNormalisation.BY_ONE_OVER_SQRT_HEAD_DIM
116116
)
117117

118-
siglip_config: vision.SigLIPConfig | None = None
118+
vision_config: vision.SigLIPConfig | None = None
119119

120120
shd_config: ShardingConfig = ShardingConfig.get_default_sharding()
121121
remat_config: RematConfig = RematConfig.NONE
@@ -203,7 +203,7 @@ def _gemma3_4b(
203203
local_base_frequency=10_000,
204204
global_base_frequency=1_000_000,
205205
global_scale_factor=8.0,
206-
siglip_config=None if text_only else vision.SigLIPConfig(),
206+
vision_config=None if text_only else vision.SigLIPConfig(),
207207
shd_config=sharding_config,
208208
)
209209

@@ -245,7 +245,7 @@ def _gemma3_12b(
245245
local_base_frequency=10_000,
246246
global_base_frequency=1_000_000,
247247
global_scale_factor=8.0,
248-
siglip_config=None if text_only else vision.SigLIPConfig(),
248+
vision_config=None if text_only else vision.SigLIPConfig(),
249249
shd_config=sharding_config,
250250
)
251251

@@ -287,7 +287,7 @@ def _gemma3_27b(
287287
local_base_frequency=10_000,
288288
global_base_frequency=1_000_000,
289289
global_scale_factor=8.0,
290-
siglip_config=None if text_only else vision.SigLIPConfig(),
290+
vision_config=None if text_only else vision.SigLIPConfig(),
291291
shd_config=sharding_config,
292292
)
293293

@@ -911,9 +911,9 @@ class Gemma3(nnx.Module):
911911
def __init__(self, config: ModelConfig, *, rngs: nnx.Rngs):
912912
self.config = config
913913

914-
if config.siglip_config is not None:
914+
if config.vision_config is not None:
915915
self.vision_encoder = vision.SigLiP(
916-
config=config.siglip_config,
916+
config=config.vision_config,
917917
shd_config=config.shd_config.siglip,
918918
rngs=rngs,
919919
)
@@ -1009,7 +1009,7 @@ def _encode_and_get_inputs(
10091009
images: jaxtyping.Array | None = None, # (B, H, W, C) or (B, N, H, W, C)
10101010
) -> jaxtyping.Array:
10111011
"""Encode the text tokens, eventually including the vision embeddings."""
1012-
if images is not None:
1012+
if self.config.vision_config is not None and images is not None:
10131013
self._assert_support_mm()
10141014
if len(images.shape) == 4: # If num_images is 1, add an axis.
10151015
images = einops.rearrange(images, 'b h w c -> b 1 h w c')
@@ -1048,7 +1048,7 @@ def _merge_mm_embeddings(
10481048
merged_embeddings = merge_embeddings_lib.merge_embeddings(
10491049
text_embeddings=embeddings,
10501050
vision_embeddings=soft_embeddings,
1051-
mask=tokens == vision.TOKEN_PLACEHOLDER,
1051+
mask=tokens == self.config.vision_config.soft_token_placeholder_id, # pytype: disable=attribute-error
10521052
)
10531053

10541054
return merged_embeddings

tunix/models/gemma3/params.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def create_model_from_checkpoint(
9191
)
9292
params = ocp.StandardCheckpointer().restore(checkpoint_path)
9393
params = map_from_upstream_checkpoint(
94-
params, text_only=model_config.siglip_config is None
94+
params, text_only=model_config.vision_config is None
9595
)
9696

9797
if mesh is not None:

tunix/models/gemma3/params_safetensors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def _get_key_and_transform_mapping(cfg: model_lib.ModelConfig):
113113
}
114114

115115
# Vision Tower (SigLIP).
116-
if cfg.siglip_config is not None:
116+
if cfg.vision_config is not None:
117117
mapping.update({
118118
r"vision_tower\.vision_model\.embeddings\.patch_embedding\.weight": (
119119
"vision_encoder.siglip_encoder.embedding.kernel",

tunix/models/gemma3/utils.py

Lines changed: 5 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -18,33 +18,29 @@
1818

1919
import jax.numpy as jnp
2020
import jaxtyping
21-
from tunix.models.gemma3 import vision
2221

2322
_PADDING_ID = 0
2423

2524

26-
def get_positions_and_attention_mask(
25+
def get_attention_mask(
2726
tokens: jaxtyping.ArrayLike, # (B, L)
2827
*,
2928
inputs_mask: jaxtyping.ArrayLike | None = None, # (B, L, L')
29+
token_placeholder_id: int = 219,
3030
):
31-
"""Returns the positions and attention mask for the transformer."""
31+
"""Returns the attention mask for the transformer."""
3232
# Compute the mask
3333
if inputs_mask is None:
3434
inputs_mask = tokens != _PADDING_ID
35-
positions = _build_positions_from_mask(inputs_mask)
3635

3736
# The image tokens have bidirectional attention within themselves.
38-
bidirectional_mask = tokens == vision.TOKEN_PLACEHOLDER
37+
bidirectional_mask = tokens == token_placeholder_id
3938
attention_mask = make_causal_bidirectional_attention_mask(
4039
inputs_mask,
4140
bidirectional_mask=bidirectional_mask,
4241
)
4342

44-
return {
45-
'positions': positions,
46-
'attention_mask': attention_mask,
47-
}
43+
return attention_mask
4844

4945

5046
def make_causal_bidirectional_attention_mask(
@@ -153,21 +149,3 @@ def _add_bidirectional_mask(
153149
& (q_block_indices[..., None] > 0)
154150
)
155151
return attn_mask
156-
157-
158-
def _build_positions_from_mask(
159-
input_mask: jaxtyping.ArrayLike,
160-
) -> jaxtyping.ArrayLike:
161-
"""Computes the `positions` from the `input_mask`.
162-
163-
Args:
164-
input_mask: The tokens `input_mask`, True for non-padded tokens only.
165-
166-
Returns:
167-
The indices to use for RoPE and absolute position encodings for the given
168-
input mask.
169-
"""
170-
positions = jnp.cumsum(input_mask, axis=-1)
171-
# Subtract one for all positions from the first valid one as they are
172-
# 0-indexed
173-
return positions - (positions >= 1)

tunix/models/gemma3/vision.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@
2929
from tunix.utils import compat
3030
from tunix.utils import sharding_utils
3131

32-
TOKEN_PLACEHOLDER = 262144
33-
3432

3533
@dataclasses.dataclass(slots=True, frozen=True)
3634
class SigLIPShardingConfig:
@@ -84,9 +82,14 @@ class SigLIPConfig:
8482

8583
num_mm_tokens_per_image_prepool: int = 4096
8684
num_mm_tokens_per_image: int = 256
85+
86+
# Processor args
8787
image_height: int = 896
8888
image_width: int = 896
8989
image_channels: int = 3
90+
image_mean: tuple[float, ...] = (127.5, 127.5, 127.5)
91+
image_std: tuple[float, ...] = (127.5, 127.5, 127.5)
92+
soft_token_placeholder: int = 219
9093

9194
patch_size: tuple[int, int] = (14, 14)
9295
width: int = 1152

0 commit comments

Comments (0)