Skip to content

Commit edbbf29

Browse files
Migrate Decoder (Gemma3/Deepseek/Llama4) and utils to NNX
1 parent efce27d commit edbbf29

File tree

6 files changed

+212
-33
lines changed

6 files changed

+212
-33
lines changed

src/MaxText/configs/base.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1035,7 +1035,7 @@ subslice_shape: ""
10351035

10361036
# NNX
10371037
enable_nnx: false
1038-
pure_nnx_decoder: false
1038+
pure_nnx_decoder: true
10391039

10401040
################################## Qwen3-Next Specific Configs ##################################
10411041
# Kernel size for the 1D convolution in the Gated Delta Net

src/MaxText/layers/gemma3.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ def __init__(
9191

9292
batch_size, seq_len = max_utils.get_batch_seq_len_for_mode(config, model_mode)
9393
dummy_inputs_shape = (batch_size, seq_len, config.emb_dim)
94-
9594
self.pre_self_attention_norm = RMSNorm(
9695
num_features=config.emb_dim,
9796
dtype=config.dtype,
@@ -198,7 +197,6 @@ def __call__(
198197
inputs = inputs[0]
199198
inputs = nn.with_logical_constraint(inputs, self.activation_axis_names)
200199
inputs = checkpoint_name(inputs, "decoder_layer_input")
201-
202200
lnx = self.pre_self_attention_norm(inputs)
203201
lnx = nn.with_logical_constraint(lnx, self.activation_axis_names)
204202

src/MaxText/layers/nnx_decoders.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,10 @@
3333
from MaxText.common_types import DecoderBlockType, ShardMode, Config, EP_AS_CONTEXT
3434
from MaxText.common_types import MODEL_MODE_TRAIN, MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE
3535
from MaxText.sharding import create_sharding
36+
from maxtext.inference import page_manager
3637
from MaxText.layers import linears
3738
from MaxText.layers import initializers
3839
from MaxText.layers import quantizations
39-
from MaxText import multimodal_utils
4040
from MaxText import sharding
4141
from MaxText.layers.attentions import Attention
4242
from MaxText.layers.normalizations import RMSNorm
@@ -61,6 +61,7 @@
6161
from maxtext.inference import page_manager
6262
from maxtext.utils import max_logging
6363
from maxtext.utils import maxtext_utils
64+
from maxtext.multimodal import utils as mm_utils
6465

6566
# ------------------------------------------------------------------------------
6667
# The network: Decoder Definitions
@@ -284,19 +285,28 @@ def __init__(
284285
attention_pattern_length = len(gemma3.GEMMA3_ATTENTION_PATTERN)
285286
scan_length = config.num_decoder_layers // attention_pattern_length
286287
num_remaining_layers = config.num_decoder_layers % attention_pattern_length
288+
layer_kwargs = {"num_of_layers": attention_pattern_length}
289+
287290
rem_layer_kwargs = {"num_of_layers": num_remaining_layers}
288291

289292
RemattedGemma3Block = gemma3.Gemma3ScannableBlock
290293

291294
if scan_length > 0:
292-
self.layers = self._create_scanned_layers(RemattedGemma3Block, length=scan_length, rngs=rngs)
295+
self.layers = self._create_scanned_layers(RemattedGemma3Block, length=scan_length, rngs=rngs, **layer_kwargs)
293296
self.layers_remainder = RemattedGemma3Block(
294297
config=self.config, mesh=mesh, quant=self.quant, model_mode=self.model_mode, **rem_layer_kwargs, rngs=rngs
295298
) # pytype: disable=wrong-keyword-args
296299
else:
297300
layer_cls = decoder_block_classes[0]
298-
num_layers = config.num_decoder_layers
299-
self.layers = self._create_scanned_layers(layer_cls, length=num_layers, rngs=rngs)
301+
num_layers = int(config.num_decoder_layers / config.inhomogeneous_layer_cycle_interval)
302+
layer_kwargs = {}
303+
if config.decoder_block == DecoderBlockType.LLAMA4:
304+
layer_kwargs = {
305+
"nope_layer_interval": self.config.nope_layer_interval,
306+
"interleave_moe_layer_step": self.config.interleave_moe_layer_step,
307+
}
308+
309+
self.layers = self._create_scanned_layers(layer_cls, length=num_layers, rngs=rngs, **layer_kwargs)
300310
else:
301311
self.layers = nnx.List([])
302312
if self.is_deepseek:
@@ -366,7 +376,6 @@ def _apply_layers_sequentially(self, layers, x_in, *args, length: int, **kwargs)
366376

367377
layer_cls = layers.__class__ # Access the underlying class
368378
sig = inspect.signature(layer_cls.__call__)
369-
370379
# Filter kwargs to only include keys that exist in the layer's signature
371380
valid_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters or "kwargs" in sig.parameters}
372381

@@ -584,7 +593,7 @@ def _apply_embedding(
584593
"llama4-17b-128e",
585594
"qwen3-omni-30b-a3b",
586595
]:
587-
y = multimodal_utils.merge_mm_embeddings(
596+
y = mm_utils.merge_mm_embeddings(
588597
text_embeddings=y,
589598
multimodal_embeddings=image_embeddings,
590599
mask=bidirectional_mask,
@@ -596,7 +605,7 @@ def _apply_embedding(
596605

597606
if audio_embeddings is not None and cfg.use_audio:
598607
if cfg.model_name in ["qwen3-omni-30b-a3b"]:
599-
y = multimodal_utils.merge_mm_embeddings(
608+
y = mm_utils.merge_mm_embeddings(
600609
text_embeddings=y,
601610
multimodal_embeddings=audio_embeddings,
602611
mask=audio_masks,
@@ -698,7 +707,6 @@ def __call__(
698707
"previous_chunk": previous_chunk,
699708
"page_state": page_state,
700709
"slot": slot,
701-
"attention_metadata": attention_metadata,
702710
}
703711

704712
if cfg.decoder_block == DecoderBlockType.GEMMA3:
@@ -775,17 +783,12 @@ def _apply_gemma3_scanned_blocks(
775783
attention_pattern_length = len(gemma3.GEMMA3_ATTENTION_PATTERN)
776784
scan_length = cfg.num_decoder_layers // attention_pattern_length
777785

778-
layer_call_kwargs = {"bidirectional_mask": bidirectional_mask}
786+
layer_args = (decoder_segment_ids, decoder_positions, deterministic, model_mode)
787+
layer_kwargs = {"bidirectional_mask": bidirectional_mask}
779788

780789
# Apply the main scan over the full blocks
781790
if scan_length > 0:
782-
broadcast_args = (
783-
decoder_segment_ids,
784-
decoder_positions,
785-
deterministic,
786-
model_mode,
787-
)
788-
y, _ = self.layers(y, *broadcast_args, **layer_call_kwargs)
791+
y, _ = self._apply_layers_sequentially(self.layers, y, *layer_args, length=scan_length, **layer_kwargs)
789792

790793
# Apply any remaining layers that did not fit into a full scanned block
791794
num_remaining_layers = cfg.num_decoder_layers % attention_pattern_length
@@ -800,8 +803,9 @@ def _apply_gemma3_scanned_blocks(
800803
previous_chunk=previous_chunk,
801804
page_state=page_state,
802805
slot=slot,
803-
**layer_call_kwargs,
806+
**layer_kwargs,
804807
)
808+
805809
return y
806810

807811

src/MaxText/utils/ckpt_conversion/to_maxtext.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -385,14 +385,22 @@ def _build_single_axis_stacked_tensor(
385385
The final, assembled NumPy array for the MaxText parameter.
386386
"""
387387
tensors_to_stack = []
388+
# Heuristic to determine if we are stacking layers or experts.
389+
# If the number of items to stack equals the number of layers, it's a standard
390+
# scanned layer, and we use the configured param_scan_axis. Otherwise, it's
391+
# an unscanned MoE layer, and we stack along the expert axis (0).
392+
"""
393+
axis_to_stack = config.param_scan_axis if len(hf_source_keys) == config.base_num_decoder_layers else 0
394+
"""
388395

389-
if config.scan_layers:
390-
# If it's a standard scanned layer, we use the configured param_scan_axis.
391-
axis_to_stack = config.param_scan_axis
396+
# Workaround to load the HF model due to mismatched tensor ordering
397+
if len(hf_source_keys) == config.base_num_decoder_layers:
398+
if getattr(config, "enable_nnx", False):
399+
axis_to_stack = 0
400+
else:
401+
axis_to_stack = config.param_scan_axis
392402
else:
393-
# Otherwise, if an unscanned MoE layer, and we stack along the expert axis (0).
394403
axis_to_stack = 0
395-
396404
# The hook function needs the shape of an individual slice, not the full stacked tensor.
397405
# We calculate it by removing the stacking dimension from the final target shape.
398406
mt_slice_shape_list = list(target_shape)

tests/checkpoint_compare.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
import jax
2+
import jax.numpy as jnp
3+
import orbax.checkpoint as ocp
4+
import numpy as np
5+
from typing import Any, Dict, Sequence, Tuple
6+
from jax.tree_util import tree_flatten_with_path, keystr, tree_structure, tree_map_with_path
7+
from absl import app
8+
from absl import flags
9+
10+
11+
# Command-line flags: paths to the Orbax checkpoint "items" directories of the
# two checkpoints being compared (Linen baseline vs. migrated NNX model).
_LINEN_CKPT_PATH = flags.DEFINE_string(
    "linen_ckpt_path", None, "Path to the Linen model checkpoint items directory.", required=True
)
_NNX_CKPT_PATH = flags.DEFINE_string(
    "nnx_ckpt_path", None, "Path to the NNX model checkpoint items directory.", required=True
)
17+
18+
19+
def load_checkpoint_params(path: str) -> Dict[str, Any]:
  """Restore an Orbax checkpoint and return its parameter pytree.

  If the restored object is a dict that wraps the weights under a
  "params" key, the inner tree is returned; otherwise the restored
  tree itself is returned.

  Raises:
    ValueError: if Orbax returns None for the given path.
  """
  print(f"Loading checkpoint from: {path}")
  restored = ocp.PyTreeCheckpointer().restore(path)
  if restored is None:
    raise ValueError(f"Failed to restore checkpoint from {path}")
  if isinstance(restored, dict) and "params" in restored:
    restored = restored["params"]
  return restored
29+
30+
31+
def transform_nnx_params(nnx_params: Dict[str, Any]) -> Dict[str, Any]:
  """Return a copy of the NNX pytree with stacked-layer arrays transposed.

  Every array whose key path contains "layers" and that has rank >= 2 gets
  its first two axes swapped (remaining axes untouched); all other leaves
  pass through unchanged. Each decision is logged to stdout.
  """

  def _maybe_transpose(path, leaf):
    key_str = keystr(path)
    if "layers" in key_str and hasattr(leaf, "ndim") and leaf.ndim >= 2:
      print(f"TRANSPOSING: {key_str} with shape {leaf.shape}")
      perm = (1, 0) + tuple(range(2, leaf.ndim))
      return jnp.transpose(leaf, axes=perm)
    # Leaf is left as-is; explain why in the log.
    if "token_embedder" in key_str:
      print(f"SKIPPING Transpose: {key_str} because it is token_embedder")
    else:
      shape = getattr(leaf, "shape", "N/A")
      print(f"SKIPPING Transpose: {key_str} with shape {shape} (ndim < 2)")
    return leaf

  print("Applying transformations to NNX params...")
  return tree_map_with_path(_maybe_transpose, nnx_params)
51+
52+
53+
def get_tree_structure_info(tree: Dict[str, Any]):
  """Map each leaf's key-path string to a (shape, dtype-name) pair.

  Leaves without a ``shape``/``dtype`` attribute report "N/A" and their
  Python type name instead. Helper only used if structures differ.
  """
  info = {}
  for path, leaf in tree_flatten_with_path(tree)[0]:
    shape = getattr(leaf, "shape", "N/A")
    dtype = str(getattr(leaf, "dtype", type(leaf).__name__))
    info[keystr(path)] = (shape, dtype)
  return info
57+
58+
59+
def print_structure_diff(params1, params2):
  """Report leaf key-paths present in one tree but not the other.

  params1 is treated as the Linen tree, params2 as the NNX tree.
  """
  linen_keys = set(get_tree_structure_info(params1))
  nnx_keys = set(get_tree_structure_info(params2))

  for key in sorted(nnx_keys - linen_keys):
    print(f" + Added in NNX: {key}")
  for key in sorted(linen_keys - nnx_keys):
    print(f" - Missing in NNX: {key}")
69+
70+
71+
def compare_params(params1: Dict[str, Any], params2: Dict[str, Any]) -> bool:
  """Compare two parameter pytrees leaf-by-leaf with a per-leaf report.

  Returns True only when the tree structures match and every leaf pair
  agrees in shape, dtype, and value (within jnp.allclose tolerance).
  A structure mismatch short-circuits and prints the key-level diff.
  """
  if tree_structure(params1) != tree_structure(params2):
    print("[] Tree structures differ.")
    print_structure_diff(params1, params2)
    return False

  print("[] Tree structures are the same.")

  # Collect the key paths of mismatching leaves; empty list means all match.
  mismatches = []

  def _check_leaf(path, linen_leaf, nnx_leaf):
    key_str = keystr(path)
    try:
      linen_shape = getattr(linen_leaf, "shape", "N/A")
      nnx_shape = getattr(nnx_leaf, "shape", "N/A")
      if linen_shape != nnx_shape:
        print(f"[{key_str}] SHAPE MISMATCH: {linen_shape} vs {nnx_shape}")
        mismatches.append(key_str)
        return

      linen_dtype = getattr(linen_leaf, "dtype", type(linen_leaf))
      nnx_dtype = getattr(nnx_leaf, "dtype", type(nnx_leaf))
      if linen_dtype != nnx_dtype:
        print(f"[{key_str}] DTYPE MISMATCH: {linen_dtype} vs {nnx_dtype}")
        mismatches.append(key_str)
        return

      abs_diff = jnp.abs(linen_leaf - nnx_leaf)
      mean_diff = float(jnp.mean(abs_diff))
      max_diff = float(jnp.max(abs_diff))
      is_close = bool(jnp.allclose(linen_leaf, nnx_leaf))

      print(
          f"[{key_str}] "
          f"Shape(Linen/NNX): {linen_shape} / {nnx_shape} — "
          f"Mean abs diff: {mean_diff:.2e}, "
          f"Max abs diff: {max_diff:.2e}, "
          f"AllClose: {is_close}"
      )
      if not is_close:
        mismatches.append(key_str)

    except Exception as e:  # keep scanning remaining leaves on a per-leaf failure
      print(f"[{key_str}] Error during comparison: {e}")
      mismatches.append(key_str)

  tree_map_with_path(_check_leaf, params1, params2)

  return not mismatches
130+
131+
132+
def main(argv: Sequence[str]):
  """Load both checkpoints, align the NNX layout, and report differences."""
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  linen_path = _LINEN_CKPT_PATH.value
  nnx_path = _NNX_CKPT_PATH.value
  print(f"Linen Checkpoint Path: {linen_path}")
  print(f"NNX Checkpoint Path: {nnx_path}")

  print("Loading Linen params...")
  linen_params = load_checkpoint_params(linen_path)
  print("Loading NNX params...")
  nnx_params = load_checkpoint_params(nnx_path)

  # Guard clause: bail out early if either checkpoint failed to load.
  if linen_params is None or nnx_params is None:
    print("Failed to load params from one or both checkpoints.")
    return

  # Transpose the NNX stacked-layer arrays so both trees share one layout.
  transformed = transform_nnx_params(nnx_params)

  print("\nComparing Linen params with Transformed NNX params...")
  if compare_params(linen_params, transformed):
    print("\nCheckpoints are considered the same (within np.allclose tolerance) after transformation!")
  else:
    print("\nCheckpoints DIFFER after transformation.")
157+
158+
159+
if __name__ == "__main__":
  # absl entry point: app.run parses the flags defined above, then calls main(argv).
  app.run(main)

tests/unit/multi_token_prediction_test.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,23 @@ def setUp(self):
5555
devices_array = maxtext_utils.create_device_mesh(self.cfg)
5656
self.mesh = Mesh(devices_array, self.cfg.mesh_axes)
5757

58-
# Instantiate the Layer
59-
self.mtp_layer = multi_token_prediction.MultiTokenPredictionLayer(
60-
config=self.cfg,
61-
mesh=self.mesh,
62-
layer_number=TEST_LAYER_NUM,
63-
transformer_layer_module=DecoderLayer,
64-
rngs=self.rngs,
65-
)
58+
if self.cfg.pure_nnx:
59+
# Instantiate the Layer
60+
self.mtp_layer = multi_token_prediction.MultiTokenPredictionLayer(
61+
config=self.cfg,
62+
mesh=self.mesh,
63+
layer_number=TEST_LAYER_NUM,
64+
transformer_layer_module=DecoderLayer,
65+
rngs=self.rngs,
66+
)
67+
else:
68+
# Instantiate the Layer
69+
self.mtp_layer = multi_token_prediction.MultiTokenPredictionLayerLinen(
70+
config=self.cfg,
71+
mesh=self.mesh,
72+
layer_number=TEST_LAYER_NUM,
73+
transformer_layer_module=DecoderLayer,
74+
)
6675

6776
# Dimensions directly from the config object
6877
self.batch_size = int(self.cfg.per_device_batch_size)

0 commit comments

Comments
 (0)