
Commit 2515f9c

Add qwen3 omni vision encoder
1 parent 341061f commit 2515f9c

9 files changed: +4658, -2114 lines

src/MaxText/configs/base.yml

Lines changed: 7 additions & 0 deletions
@@ -884,6 +884,13 @@ vision_output_dim_for_vit: 4096
 pixel_shuffle_ratio_for_vit: 0.5
 projector_dropout_for_vit: 0.0

+# Qwen3-OmniMoe vision encoder
+spatial_merge_size_for_vit: 2
+out_hidden_size_for_vit: 512
+temporal_patch_size_for_vit: 2
+num_position_embeddings_for_vit: 1024
+deepstack_visual_indexes_for_vit: []
+
 # Subslice shape in the form of "x,y,z" when using pathways (single controller).
 # Example: "8,8" to use a 8x8 subgrid (64 chips) of a full pod (16x16) of trillium.
 subslice_shape: ""
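These are framework-wide defaults; the model config below overrides most of them. As a rough illustration of what `deepstack_visual_indexes_for_vit` is meant to select, assuming it behaves as in the Hugging Face Qwen3-OmniMoe reference (listed vision-encoder layers contribute intermediate "deepstack" features in addition to the final output), here is a hypothetical sketch with made-up names, not the MaxText implementation:

```python
# Illustrative sketch only (hypothetical helper, not MaxText code): gather hidden
# states from the vision-encoder layers listed in deepstack_visual_indexes_for_vit
# so they can be fused with the language model later, alongside the final output.
def run_vision_encoder(vision_layers, patch_embeddings, deepstack_visual_indexes):
  deepstack_features = []
  hidden = patch_embeddings
  for idx, layer in enumerate(vision_layers):
    hidden = layer(hidden)
    if idx in deepstack_visual_indexes:
      deepstack_features.append(hidden)  # e.g. layers 7, 16, 24 for qwen3-omni-30b-a3b
  return hidden, deepstack_features      # empty list under the base.yml default []
```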

src/MaxText/configs/models/qwen3-omni-30b-a3b.yml

Lines changed: 19 additions & 1 deletion
@@ -34,7 +34,25 @@ base_moe_mlp_dim: 768
 norm_topk_prob: true

 # RoPE Settings
-rope_max_timescale: 10_000_000
+rope_max_timescale: 1_000_000
+max_position_embeddings: 65536

 # General Model Settings
 enable_dropout: False
+
+# Vision Encoder Configuration
+# Based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
+image_size_for_vit: 768
+hidden_size_for_vit: 1152
+intermediate_size_for_vit: 4304
+num_attention_heads_for_vit: 16
+num_hidden_layers_for_vit: 27
+num_channels_for_vit: 3
+patch_size_for_vit: 16
+temporal_patch_size_for_vit: 2
+spatial_merge_size_for_vit: 2
+out_hidden_size_for_vit: 2048
+num_position_embeddings_for_vit: 2304
+deepstack_visual_indexes_for_vit: [7, 16, 24]
+
+use_multimodal: true
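The vision geometry here is internally consistent: a 768x768 image cut into 16-pixel patches gives a 48x48 grid, i.e. 2304 patch positions, which matches num_position_embeddings_for_vit. A small sanity-check sketch in plain Python; the assumption that spatial_merge_size_for_vit merges 2x2 neighbouring patches before the out_hidden_size_for_vit projection follows the Hugging Face reference linked above:

```python
# Sanity check of the ViT geometry implied by qwen3-omni-30b-a3b.yml.
image_size = 768
patch_size = 16
spatial_merge_size = 2
num_position_embeddings = 2304

patches_per_side = image_size // patch_size   # 768 / 16 = 48
num_patches = patches_per_side**2             # 48 * 48 = 2304 positions
assert num_patches == num_position_embeddings

# Assuming a Qwen-style 2x2 spatial merge, the token count handed to the
# out_hidden_size_for_vit (2048) projection per image would be:
merged_tokens = (patches_per_side // spatial_merge_size) ** 2  # 24 * 24 = 576
print(num_patches, merged_tokens)
```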

src/MaxText/layers/attentions.py

Lines changed: 28 additions & 8 deletions
@@ -63,6 +63,7 @@
 from MaxText.layers.embeddings import (
     LLaMARotaryEmbedding,
     LlamaVisionRotaryEmbedding,
+    Qwen3OmniMoeVisionRotaryEmbedding,
     RotaryEmbedding,
     YarnRotaryEmbedding,
     Qwen3NextRotaryEmbedding,

@@ -705,6 +706,14 @@ def convert_dense_general_inputs_shape(
     axis = canonicalize_tuple(axis)
     return tuple(inputs_shape[ax] for ax in normalize_axes(axis, len(inputs_shape)))

+  def get_vision_rotary_embedding_class(self):
+    """Gets the rotary embedding class based on the model type."""
+    if self.config.model_name.startswith("qwen3-omni"):
+      return Qwen3OmniMoeVisionRotaryEmbedding
+    elif self.config.model_name.startswith("llama4"):
+      return LlamaVisionRotaryEmbedding
+    raise ValueError(f"Unsupported model type for vision rotary embedding: {self.config.model_name}")
+
   def init_rotary_embedding(self):
     """Initializes the rotary embeddings, handling different model types.

@@ -720,15 +729,16 @@ def init_rotary_embedding(self):
     rope_type = self.config.rope_type.lower()
     rope_use_scale = self.config.rope_use_scale
     if self.is_vision:
-      rotary_embedding = LlamaVisionRotaryEmbedding(
-          image_size=self.config.image_size_for_vit,
-          patch_size=self.config.patch_size_for_vit,
+      rotary_embbeding_class = self.get_vision_rotary_embedding_class()
+      rotary_embedding = rotary_embbeding_class(
          hidden_size=self.config.hidden_size_for_vit,
          num_attention_heads=self.config.num_attention_heads_for_vit,
+          spatial_merge_size=self.config.spatial_merge_size_for_vit,
          rope_theta=self.config.rope_theta_for_vit,
          fprop_dtype=self.dtype,
          rngs=self.rngs,
      )
+
     elif self.config.model_name.startswith("llama3.1") or rope_type.startswith("llama3.1"):
       rotary_embedding = LLaMARotaryEmbedding(
           min_timescale=self.config.rope_min_timescale,

@@ -784,18 +794,27 @@ def init_rotary_embedding(self):
       )
     return rotary_embedding

-  def apply_rotary_embedding(self, inputs: Array, inputs_positions: Optional[Array | None] = None):
+  def apply_rotary_embedding(
+      self, inputs: Array, inputs_positions: Optional[Array | None] = None, rope_kwargs: dict = None
+  ):
     """Applies rotary embeddings, handling different model types.

     Args:
       inputs: The input tensor to apply rotary embeddings to.
       inputs_positions: The positions of the inputs.
-      name: A name for the embedding layer.
+      rope_kwargs: A dictionary of keyword arguments for the rotary embedding.

     Returns:
       The input tensor with rotary embeddings applied.
     """
-    return self.rotary_embedding(inputs, inputs_positions)
+    if self.is_vision and self.config.model_name.startswith("qwen3-omni"):
+      # For Qwen3OmniMoe vision, pass static dimensions from kwargs
+      num_frames = rope_kwargs.get("num_frames")
+      height = rope_kwargs.get("height")
+      width = rope_kwargs.get("width")
+      return self.rotary_embedding(inputs, num_frames, height, width)
+    else:
+      return self.rotary_embedding(inputs, inputs_positions)

   def init_kv_caches(self, inputs_kv_shape: Tuple):
     """Initializes KVCache.

@@ -878,6 +897,7 @@ def __call__(
       slot: Optional[int] = None,
       page_state: Optional[page_manager.PageState] = None,
       bidirectional_mask: Any = None,
+      rope_kwargs: dict = None,
   ):
     """Applies Attention on the input data.

@@ -952,8 +972,8 @@
     use_qk_norm = self.use_qk_norm and use_rope

     if use_rope:
-      query = self.apply_rotary_embedding(query, inputs_positions=inputs_positions)
-      key = self.apply_rotary_embedding(key, inputs_positions=inputs_positions)
+      query = self.apply_rotary_embedding(query, inputs_positions=inputs_positions, rope_kwargs=rope_kwargs)
+      key = self.apply_rotary_embedding(key, inputs_positions=inputs_positions, rope_kwargs=rope_kwargs)

     if use_qk_norm and is_llama4_decoder_block:
       l2_norm = L2Norm(eps=self.config.normalization_layer_epsilon)
src/MaxText/layers/decoders.py

Lines changed: 86 additions & 16 deletions
@@ -97,11 +97,23 @@ def __call__(
     )

     if self.model_mode == MODEL_MODE_PREFILL:
-      logical_axis_names = ("activation_batch", "prefill_activation_length", "activation_embed")
+      logical_axis_names = (
+          "activation_batch",
+          "prefill_activation_length",
+          "activation_embed",
+      )
     elif self.config.expert_shard_attention_option == EP_AS_CONTEXT and self.model_mode == MODEL_MODE_TRAIN:
-      logical_axis_names = ("activation_batch_no_exp", "activation_length", "activation_embed")
+      logical_axis_names = (
+          "activation_batch_no_exp",
+          "activation_length",
+          "activation_embed",
+      )
     else:
-      logical_axis_names = ("activation_batch", "activation_length_no_exp", "activation_embed")
+      logical_axis_names = (
+          "activation_batch",
+          "activation_length_no_exp",
+          "activation_embed",
+      )

     if model_mode == MODEL_MODE_PREFILL:
       inputs = _maybe_shard_with_logical(inputs, logical_axis_names)

@@ -235,7 +247,11 @@ def __call__(
   ) -> jnp.ndarray:
     for lyr in range(self.num_decoder_layers):
       inputs = self.decoder_layer(
-          config=self.config, mesh=self.mesh, name=f"layers_{lyr}", quant=self.quant, model_mode=model_mode
+          config=self.config,
+          mesh=self.mesh,
+          name=f"layers_{lyr}",
+          quant=self.quant,
+          model_mode=model_mode,
       )(
           inputs,
           decoder_segment_ids,

@@ -269,7 +285,10 @@ def setup(self):
     pipeline_stage_module = self.get_pipeline_stage_module(self.decoder_layer)
     remat_policy = self.get_remat_policy()
     self.pipeline_module = pipeline.Pipeline(
-        config=self.config, mesh=self.mesh, layers=pipeline_stage_module, remat_policy=remat_policy
+        config=self.config,
+        mesh=self.mesh,
+        layers=pipeline_stage_module,
+        remat_policy=remat_policy,
     )

   def minimal_policy(self, with_context=False):

@@ -339,7 +358,11 @@ def get_remat_policy(self):
     elif cfg.remat_policy == "qkv_proj_offloaded":
       policy = jax.checkpoint_policies.save_and_offload_only_these_names(
           names_which_can_be_saved=[],
-          names_which_can_be_offloaded=["query_proj", "value_proj", "key_proj"],
+          names_which_can_be_offloaded=[
+              "query_proj",
+              "value_proj",
+              "key_proj",
+          ],
           offload_src="device",
           offload_dst="pinned_host",
       )

@@ -395,7 +418,10 @@ def get_decoder_layers(self):
         return [mixtral.MixtralDecoderLayerToLinen]
       case DecoderBlockType.DEEPSEEK:
         if self.config.use_batch_split_schedule:
-          return [deepseek_batchsplit.DeepSeekDenseLayer, deepseek_batchsplit.DeepSeekMoELayer]
+          return [
+              deepseek_batchsplit.DeepSeekDenseLayer,
+              deepseek_batchsplit.DeepSeekMoELayer,
+          ]
         else:
           return [deepseek.DeepSeekDenseLayer, deepseek.DeepSeekMoELayer]
       case DecoderBlockType.GEMMA:

@@ -447,7 +473,10 @@ def map_fn(path, value):
           block_layer,
           prevent_cse=maxtext_utils.should_prevent_cse_in_remat(self.config),
           policy=policy,
-          static_argnums=(4, 5),  # Deterministic and model mode are static arguments.
+          static_argnums=(
+              4,
+              5,
+          ),  # Deterministic and model mode are static arguments.
       )
       RemattedBlockLayers.append(layer)
     return RemattedBlockLayers

@@ -473,11 +502,25 @@ def get_norm_layer(self, num_features: int):
     ):
       return functools.partial(rms_norm, num_features=num_features, shard_mode=self.config.shard_mode)
     elif self.config.decoder_block == DecoderBlockType.GPT3:
-      return functools.partial(gpt3.gpt3_layer_norm, num_features=num_features, reductions_in_fp32=False, use_bias=True)
+      return functools.partial(
+          gpt3.gpt3_layer_norm,
+          num_features=num_features,
+          reductions_in_fp32=False,
+          use_bias=True,
+      )
     else:
       raise ValueError(f"Incorrect decoder_block name {self.config.decoder_block.value=}")

-  def scan_decoder_layers(self, cfg, decoder_layer, length, metadata_axis_name, mesh, in_axes_tuple, **kwargs):
+  def scan_decoder_layers(
+      self,
+      cfg,
+      decoder_layer,
+      length,
+      metadata_axis_name,
+      mesh,
+      in_axes_tuple,
+      **kwargs,
+  ):
     """scan decoder layers, calls `flax.linen.transforms.scan`"""
     initializing = self.is_mutable_collection("params")
     params_spec = cfg.param_scan_axis if initializing else ScanIn(cfg.param_scan_axis)

@@ -500,7 +543,11 @@ def scan_decoder_layers(self, cfg, decoder_layer, length, metadata_axis_name, me
         metadata_params={nn.PARTITION_NAME: metadata_axis_name},
     )
     return scan_fn(
-        config=cfg, mesh=mesh, name=metadata_axis_name, quant=self.quant, **kwargs  # pytype: disable=wrong-keyword-args
+        config=cfg,
+        mesh=mesh,
+        name=metadata_axis_name,
+        quant=self.quant,
+        **kwargs,  # pytype: disable=wrong-keyword-args
     )

   def get_pipeline_stage_module(self, decoder_blocks):

@@ -558,7 +605,13 @@ def _apply_embedding(

     # Merge the image embeddings with the text embeddings for multimodal models
     if image_embeddings is not None and cfg.use_multimodal:
-      if cfg.model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b", "llama4-17b-16e", "llama4-17b-128e"]:
+      if cfg.model_name in [
+          "gemma3-4b",
+          "gemma3-12b",
+          "gemma3-27b",
+          "llama4-17b-16e",
+          "llama4-17b-128e",
+      ]:
         y = multimodal_utils.merge_mm_embeddings(
             text_embeddings=y,
             vision_embeddings=image_embeddings,

@@ -751,7 +804,10 @@ def __call__(
       remaining_layers = self.config.num_decoder_layers - self.config.pipeline_parallel_layers
       if remaining_layers > 0:
         logical_axis_rules_pp_as_dp = sharding.logical_axis_rules_pp_act_as_dp(self.config.logical_axis_rules)
-        with self.mesh, nn.partitioning.axis_rules(logical_axis_rules_pp_as_dp):
+        with (
+            self.mesh,
+            nn.partitioning.axis_rules(logical_axis_rules_pp_as_dp),
+        ):
           y, _ = self.scan_decoder_layers(
               cfg,
               RemattedBlockLayers[0],

@@ -838,7 +894,11 @@ def __call__(
       for layer, num_layers, layer_prefix in zip(layers, num_layers_list, layer_prefixes):
         for index in range(num_layers):
           y = layer(
-              config=cfg, mesh=mesh, name=f"{layer_prefix}_{index}", quant=self.quant, model_mode=self.model_mode
+              config=cfg,
+              mesh=mesh,
+              name=f"{layer_prefix}_{index}",
+              quant=self.quant,
+              model_mode=self.model_mode,
          )(
              y,
              decoder_segment_ids,

@@ -868,7 +928,12 @@ def __call__(
       if cfg.decoder_block == DecoderBlockType.GPT_OSS:
         layer_kwargs = {"attention_type": gpt_oss.get_attention_type(layer_id=lyr)}
       layer = RemattedBlockLayer(
-          config=cfg, mesh=mesh, name=f"layers_{lyr}", quant=self.quant, model_mode=self.model_mode, **layer_kwargs
+          config=cfg,
+          mesh=mesh,
+          name=f"layers_{lyr}",
+          quant=self.quant,
+          model_mode=self.model_mode,
+          **layer_kwargs,
      )
      y = layer(
          y,

@@ -952,7 +1017,12 @@ def _apply_gemma3_scanned_blocks(
       rem_layer_kwargs = {"num_of_layers": num_remaining_layers}
       # pytype: disable=wrong-keyword-args
       layer = RemattedGemma3Block(
-          config=cfg, mesh=mesh, quant=self.quant, model_mode=self.model_mode, name="layers_remainder", **rem_layer_kwargs
+          config=cfg,
+          mesh=mesh,
+          quant=self.quant,
+          model_mode=self.model_mode,
+          name="layers_remainder",
+          **rem_layer_kwargs,
      )
      y, _ = layer(
          y,
