
Commit b6996cb

justinjfu authored and Google-ML-Automation committed
[Pallas][Mosaic GPU] Use NDLoopInfo as the argument to the loop body of plgpu.nd_loop
PiperOrigin-RevId: 811413884
1 parent 85edead commit b6996cb

8 files changed (+59, -52 lines)

jax/_src/pallas/mosaic_gpu/helpers.py

Lines changed: 30 additions & 26 deletions
@@ -15,6 +15,7 @@
 """Helpers for Pallas Mosaic GPU kernels."""
 
 from collections.abc import Callable, Hashable, Sequence
+import dataclasses
 import functools
 import math
 from typing import TypeVar, overload
@@ -27,15 +28,18 @@
 _T = TypeVar("_T")
 
 
-@overload
-def nd_loop(
-    grid: Sequence[int],
-    *,
-    collective_axes: Sequence[Hashable] | Hashable,
-    tiling: Sequence[int] | None = None,
-    init_carry: None = None
-) -> Callable[[Callable[[Sequence[jax.Array]], None]], None]:
-  ...
+@dataclasses.dataclass(frozen=True, eq=False)
+class NDLoopInfo:
+  """Container dataclass for loop iteration information.
+
+  Attributes:
+    index: The grid indices corresponding to the current loop iteration.
+    local_index: The local iteration index.
+    num_local_steps: The total number of local iterations to run.
+  """
+  index: tuple[jax.Array, ...]
+  local_index: jax.Array | int
+  num_local_steps: jax.Array | int
 
 
 @overload
@@ -44,29 +48,30 @@ def nd_loop(
     *,
     collective_axes: Sequence[Hashable] | Hashable,
     tiling: Sequence[int] | None = None,
-    init_carry: _T
-) -> Callable[[Callable[[Sequence[jax.Array], _T], _T]], _T]:
+    init_carry: None = None
+) -> Callable[[Callable[[NDLoopInfo], None]], None]:
   ...
 
 
-# TODO(justinfu): Fix the type signature to include both carry and wave_step.
 @overload
 def nd_loop(
     grid: Sequence[int],
    *,
    collective_axes: Sequence[Hashable] | Hashable,
    tiling: Sequence[int] | None = None,
-    include_wave_step: bool
-) -> Callable[[Callable[[Sequence[jax.Array], jax.Array], None]], None]:
+    init_carry: _T
+) -> Callable[[Callable[[NDLoopInfo, _T], _T]], _T]:
   ...
 
 
-def nd_loop(grid, *, collective_axes,
-            tiling=None,
-            init_carry=None,
-            include_wave_step=False):
+def nd_loop(grid, *, collective_axes, tiling=None, init_carry=None):
   """A loop over a multi-dimensional grid partitioned along the given axes.
 
+  The body of the loop takes a single argument ``loop_info``, which is an
+  NDLoopInfo object containing index and iteration information. However, if a
+  carry is specified, the body will expect a second keyword argument ``carry``
+  containing the loop carry.
+
   For example, if ``collective_axes`` is ``"x"`` with :func:`lax.axis_size`
   equal to 4 and the grid is (2, 3), the implementation would produce the
   following iteration order
@@ -98,10 +103,6 @@ def nd_loop(grid, *, collective_axes,
   take and return the carry. If it's ``None`` then no carry argument is
   expected.
 
-  If ``include_wave_step`` is True then the body will be called with an
-  additional ``wave_step`` keyword argument that specifies the current
-  iteration local to the thread.
-
   See also:
    - :func:`jax.experimental.pallas.loop`: A loop over a single dimension.
  """
@@ -141,12 +142,15 @@ def wrapper(wave_step, carry):
        untiled_index.append(sub_idx + tile_idx * tile_dim)
      index = untiled_index
 
-    if include_wave_step:
-      body = functools.partial(body, wave_step=wave_step)
+    loop_info = NDLoopInfo(
+        index=tuple(index),
+        local_index=wave_step,
+        num_local_steps=upper
+    )
    if init_carry is None:
-      body(tuple(index))
+      body(loop_info)
    else:
-      return body(tuple(index), carry=carry)
+      return body(loop_info, carry=carry)
 
  upper = lax.div(grid_size, axis_size) + lax.convert_element_type(
      axis_index < grid_size % axis_size, axis_index.dtype
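
For reference, a minimal sketch of the new calling convention introduced above (not part of the commit itself; the kernel context, the "sm" axis name, and the trivial bodies are assumptions for illustration). Without init_carry the body receives a single NDLoopInfo; with init_carry it also receives a carry keyword argument and the decorator evaluates to the final carry.

# Hypothetical sketch, assumed to run inside a Pallas Mosaic GPU kernel
# that has an "sm" collective grid axis.
from jax.experimental.pallas import mosaic_gpu as plgpu

# Without a carry: the body takes only `loop_info`.
@plgpu.nd_loop((4, 33), collective_axes="sm")
def _(loop_info: plgpu.NDLoopInfo):
  i, j = loop_info.index              # grid indices of this iteration
  step = loop_info.local_index        # iteration index local to this "sm"
  total = loop_info.num_local_steps   # total local iterations for this "sm"
  del i, j, step, total               # placeholder body

# With a carry: the body also takes `carry` (passed by keyword) and returns
# the updated carry; the call evaluates the loop and yields the final carry.
num_visited = plgpu.nd_loop((4, 33), collective_axes="sm", init_carry=0)(
    lambda loop_info, carry: carry + 1
)

This mirrors the updated loop bodies in the op kernels and tests further down in this commit.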

jax/experimental/pallas/mosaic_gpu.py

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@
 from jax._src.pallas.mosaic_gpu.helpers import find_swizzle as find_swizzle
 from jax._src.pallas.mosaic_gpu.helpers import format_tcgen05_sparse_metadata as format_tcgen05_sparse_metadata
 from jax._src.pallas.mosaic_gpu.helpers import nd_loop as nd_loop
+from jax._src.pallas.mosaic_gpu.helpers import NDLoopInfo as NDLoopInfo
 from jax._src.pallas.mosaic_gpu.helpers import planar_snake as planar_snake
 from jax._src.pallas.mosaic_gpu.pipeline import emit_pipeline as emit_pipeline
 from jax._src.pallas.mosaic_gpu.pipeline import emit_pipeline_warp_specialized as emit_pipeline_warp_specialized
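
With this re-export, kernels can refer to the class through the public module rather than the private helpers path, which is how the op kernels below annotate their loop bodies. A one-line sketch (assumed usage):

from jax.experimental.pallas import mosaic_gpu as plgpu

def mn_loop(loop_info: plgpu.NDLoopInfo) -> None:  # annotation resolves via the public module
  ...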

jax/experimental/pallas/ops/gpu/blackwell_matmul_mgpu.py

Lines changed: 7 additions & 7 deletions
@@ -106,10 +106,10 @@ def kernel(a_gmem, b_gmem, out_gmem,
   is_lead_block = cluster_idx == 0
 
   @plgpu.nd_loop((m_iters * n_iters,),
-                 collective_axes="sm",
-                 include_wave_step=True)
-  def mn_loop(idx, wave_step):  # pylint: disable=unused-variable
-    (lin_idx,) = idx
+                 collective_axes="sm")
+  def mn_loop(loop_info: plgpu.NDLoopInfo):  # pylint: disable=unused-variable
+    (lin_idx,) = loop_info.index
+    local_index = loop_info.local_index
     m_index, n_index = plgpu.planar_snake(
         lin_idx,
         (m_iters, n_iters),
@@ -121,7 +121,7 @@ def mn_loop(idx, wave_step):  # pylint: disable=unused-variable
     block_slice_m = pl.ds(block_m_index * block_tile_m, block_tile_m)
     slice_m = pl.ds(m_index * tile_m, tile_m)
     slice_n = pl.ds(n_index * tile_n, tile_n)
-    acc_slot = lax.rem(wave_step, jnp.int32(2))
+    acc_slot = lax.rem(local_index, jnp.int32(2))
 
     @pl.when(wg_idx == COMPUTE_WG)
     def _():
@@ -134,7 +134,7 @@ def _loop_body(ki, _):
         slice_k = pl.ds(ki * tile_k, tile_k)
         slot = lax.rem(ki, max_concurrent_steps)
         @pl.when(jnp.logical_or(ki >= max_concurrent_steps,
-                                wave_step > 0))
+                                local_index > 0))
         def _():
           plgpu.barrier_wait(consumed_barrier.at[slot])
         plgpu.copy_gmem_to_smem(
@@ -153,7 +153,7 @@ def _():
         )
       lax.fori_loop(0, k_iters, _loop_body, None)
 
-      @pl.when(jnp.logical_and(warp_id == MMA_WARP, wave_step > 1))
+      @pl.when(jnp.logical_and(warp_id == MMA_WARP, local_index > 1))
       def _wait_store():
         plgpu.barrier_wait(store_done_barrier.at[acc_slot])
       @pl.when(jnp.logical_and(warp_id == MMA_WARP, is_lead_block))

jax/experimental/pallas/ops/gpu/blackwell_ragged_dot_mgpu.py

Lines changed: 9 additions & 9 deletions
@@ -51,7 +51,7 @@ def do_matmul(a_gmem,
               grid_indices: Sequence[jax.Array],
               wg_axis: str,
               collective_axes: tuple[str, ...],
-              wave_step: jax.Array,
+              local_index: jax.Array,
               config: TuningConfig,
               group_info: ragged_dot_mgpu.GroupInfo,
               a_smem, b_smem, acc_tmem, acc_smem,
@@ -91,7 +91,7 @@ def do_matmul(a_gmem,
   block_slice_m = pl.ds(block_m_index * block_tile_m, block_tile_m)
   slice_m = pl.ds(m_index * tile_m, tile_m)
   slice_n = pl.ds(n_index * tile_n, tile_n)
-  acc_slot = lax.rem(wave_step, jnp.int32(2))
+  acc_slot = lax.rem(local_index, jnp.int32(2))
   regs_layout = plgpu.Layout.TCGEN05
 
   @pl.when(wg_idx == COMPUTE_WG)
@@ -106,7 +106,7 @@ def _loop_body(ki, _):
       slice_k = pl.ds(ki * tile_k, tile_k)
       slot = lax.rem(ki, max_concurrent_steps)
       @pl.when(jnp.logical_or(ki >= max_concurrent_steps,
-                              wave_step > 0))
+                              local_index > 0))
       def _():
         plgpu.barrier_wait(consumed_barrier.at[slot])
       plgpu.copy_gmem_to_smem(
@@ -125,7 +125,7 @@ def _():
       )
     lax.fori_loop(0, k_iters, _loop_body, None)
 
-    @pl.when(jnp.logical_and(warp_id == MMA_WARP, wave_step > 1))
+    @pl.when(jnp.logical_and(warp_id == MMA_WARP, local_index > 1))
    def _wait_store():
      plgpu.barrier_wait(store_done_barrier.at[acc_slot])
    @pl.when(jnp.logical_and(warp_id == MMA_WARP, is_lead_block))
@@ -297,10 +297,10 @@ def kernel(a_gmem, b_gmem, group_sizes_gmem, out_gmem):
   )
   def _scoped(**ref_kwargs):
     @plgpu.nd_loop(grid=(linear_grid,),
-                   collective_axes="sm",
-                   include_wave_step=True)
-    def mn_loop(idx, wave_step):  # pylint: disable=unused-variable
-      linear_idx, = idx
+                   collective_axes="sm")
+    def mn_loop(loop_info: plgpu.NDLoopInfo):  # pylint: disable=unused-variable
+      linear_idx, = loop_info.index
+      local_index = loop_info.local_index  # type: ignore
      m_index, n_index = plgpu.planar_snake(
          linear_idx,
          (m_iters + num_groups - 1, n_iters),
@@ -318,7 +318,7 @@ def mn_loop(idx, wave_step):  # pylint: disable=unused-variable
          grid_indices=(group_info.block, n_index, cluster_idx),
          wg_axis="wg",
          collective_axes=("x",) if collective else (),
-          wave_step=wave_step,
+          local_index=local_index,  # type: ignore
          config=config,
          group_info=group_info,
          **ref_kwargs

jax/experimental/pallas/ops/gpu/collective_matmul_mgpu.py

Lines changed: 3 additions & 3 deletions
@@ -143,8 +143,8 @@ def get_pipeline(pipeline_body, compute_context):
     )
     def _pipeline_scope(pipeline_allocs):
       @plgpu.nd_loop((m_iters * n_iters,), collective_axes="sm")
-      def _mn_loop(idxs):
-        (lin_idx,) = idxs
+      def _mn_loop(loop_info: plgpu.NDLoopInfo):
+        (lin_idx,) = loop_info.index
         m_idx, n_idx = plgpu.planar_snake(
             lin_idx,
             (m_iters, n_iters),
@@ -158,7 +158,7 @@ def _mn_loop(idxs):
          wg_n_slice = slice(None)
        else:
          wg_m_slice = slice(None)
-          wg_n_slice = pl.ds(wg_idx * tile_n, tile_n)
+          wg_n_slice = pl.ds(wg_idx * tile_n, tile_n)  # type: ignore
 
        def compute_context(eval_pipeline):
          @functools.partial(

jax/experimental/pallas/ops/gpu/hopper_matmul_mgpu.py

Lines changed: 3 additions & 3 deletions
@@ -138,8 +138,8 @@ def _pipeline_scope(pipeline_allocs):
     wg_idx = lax.axis_index("wg")
     cta_idx = lax.axis_index("cluster")
     @plgpu.nd_loop((m_iters * n_iters,), collective_axes="cluster_grid")
-    def _mn_loop(idxs):
-      (lin_idx,) = idxs
+    def _mn_loop(loop_info: plgpu.NDLoopInfo):
+      (lin_idx,) = loop_info.index
       m_cluster_idx, n_cluster_idx = plgpu.planar_snake(
           lin_idx,
           (m_iters, n_iters),
@@ -159,7 +159,7 @@ def _mn_loop(idxs):
        wg_n_slice = slice(None)
      else:
        wg_m_slice = slice(None)
-        wg_n_slice = pl.ds(wg_idx * tile_n, tile_n)
+        wg_n_slice = pl.ds(wg_idx * tile_n, tile_n)  # type: ignore
 
      def compute_context(eval_pipeline):
        @functools.partial(

jax/experimental/pallas/ops/gpu/ragged_dot_mgpu.py

Lines changed: 2 additions & 2 deletions
@@ -126,8 +126,8 @@ def body(rows_per_expert_gmem, lhs_gmem, rhs_gmem, o_gmem):
   )
 
   @plgpu.nd_loop(grid, collective_axes="sm")
-  def mn_loop(idx):  # pylint: disable=unused-variable
-    block_ni, mi, remainder_ni = idx
+  def mn_loop(loop_info: plgpu.NDLoopInfo):  # pylint: disable=unused-variable
+    block_ni, mi, remainder_ni = loop_info.index
     ni = block_ni * pl.cdiv(n, block_n * grid_block_n) + remainder_ni
     group_info = GroupInfo.create(rows_per_expert_gmem, block_m, mi)
 
tests/pallas/mosaic_gpu_test.py

Lines changed: 4 additions & 2 deletions
@@ -1979,7 +1979,8 @@ def test_nd_loop_with_carry(self, sm_steps):
         grid_names=("sm",),
     )
     def kernel(o_ref, steps_ref):
-      def body(idx, carry):
+      def body(loop_info, carry):
+        idx = loop_info.index
         assert len(idx) == 3
         # We need to use `mode="clip"`, because the indices are not static.
         flat_idx = jnp.ravel_multi_index(idx, (sm_steps, 4, 33), mode="clip")
@@ -2022,7 +2023,8 @@ def test_nd_loop(self, sm_steps: int, tiling: int | None):
     )
     def kernel(o_ref):
       @plgpu.nd_loop((sm_steps, 4, 33), tiling=tiling, collective_axes="sm")
-      def _(idx):
+      def _(loop_info):
+        idx = loop_info.index
         assert len(idx) == 3
         # We need to use `mode="clip"`, because the indices are not static.
         grid = (sm_steps, 4, 33)
