
Commit 724eb02

bchetioui authored and Google-ML-Automation committed
[Pallas/Mosaic GPU] Add the reduction_scratch_bytes field to CompilerParams.
This field allows configuring the number of shared memory bytes to reserve for performing cross-warp reductions. The more bytes are allocated to such a reduction, the more registers can be reduced in parallel, yielding faster reductions.

PiperOrigin-RevId: 860115656
1 parent bd995da commit 724eb02
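For orientation, here is how a kernel opts into a larger reduction scratch. This is a minimal sketch, assuming a GPU where the Mosaic GPU lowering is active; the kernel body and shapes are illustrative and not part of this commit.

```python
import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu

def row_sum_kernel(x_ref, y_ref):
  # Reducing over axis 0 crosses warps, so it draws on the SMEM scratch
  # that reduction_scratch_bytes reserves.
  y_ref[...] = jnp.sum(x_ref[...], axis=0)

x = jnp.ones((128, 128), jnp.float32)
y = pl.pallas_call(
    row_sum_kernel,
    out_shape=jax.ShapeDtypeStruct((128,), jnp.float32),
    # 6144 bytes is the value the new docstring recommends for H100/B200;
    # the default is 128 * 4 * 4 = 2048 bytes.
    compiler_params=plgpu.CompilerParams(reduction_scratch_bytes=6144),
)(x)
```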

File tree

6 files changed (+85, -23 lines)


docs/pallas/CHANGELOG.md

Lines changed: 8 additions & 0 deletions
@@ -13,6 +13,14 @@ Remember to align the itemized text with the first line of an item within a list
 
 ## Unreleased
 
+* New features:
+
+  * Added a `reduction_scratch_bytes` field to
+    {class}`jax.experimental.pallas.mosaic_gpu.CompilerParams`. This gives users
+    control over how much shared memory Pallas is allowed to reserve for
+    cross-warp reductions on GPU. Increasing this value typically allows for
+    faster reductions.
+
 * Changes
 
   * The default lowering path on GPU now goes through Mosaic GPU. To keep using

jax/_src/pallas/mosaic_gpu/core.py

Lines changed: 6 additions & 0 deletions
@@ -102,6 +102,11 @@ class CompilerParams(pallas_core.CompilerParams):
       thread ever calls commit_smem(), reads from the committed SMEM and then
       issues an async copy overwriting that region (this is a very artificial
       and highly unlikely scenario).
+    reduction_scratch_bytes: The number of shared memory bytes to reserve as
+      scratch space for cross-warp reductions. The higher this value, the more
+      registers can be reduced in parallel. 2 * 128 * 6 * 4 = 6144 bytes is
+      typically a good value in order to extract most of the potential gains on
+      H100 and B200.
     profile_space: The number of profiler events that can be collected in a
       single invocation. It is undefined behavior if a thread collects more
       events than this.
@@ -112,6 +117,7 @@ class CompilerParams(pallas_core.CompilerParams):
   dimension_semantics: Sequence[DimensionSemantics] | None = None
   max_concurrent_steps: int = 1
   unsafe_no_auto_barriers: bool = False
+  reduction_scratch_bytes: int = 128 * 4 * 4
   profile_space: int = 0
   profile_dir: str = ""
   lowering_semantics: mgpu.core.LoweringSemantics = mgpu.core.LoweringSemantics.Lane
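As a sanity check on the two constants in this file (the per-factor breakdown below is my reading of the old `REDUCE_SCRATCH_ELEMS` comment removed from `lowering.py`, not something this docstring states):

```python
# Default: 128 lanes per warpgroup * 4-element vector * 4 bytes per element.
default_bytes = 128 * 4 * 4               # 2048 bytes
# For float32 this is exactly the old fixed scratch of 128 * 4 = 512 elements,
# so the default preserves the previous behavior for 4-byte dtypes.
default_f32_elems = default_bytes // 4    # 512
# The docstring's suggested value for H100/B200:
recommended_bytes = 2 * 128 * 6 * 4       # 6144 bytes
assert (default_bytes, default_f32_elems, recommended_bytes) == (2048, 512, 6144)
```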

jax/_src/pallas/mosaic_gpu/lowering.py

Lines changed: 17 additions & 11 deletions
@@ -90,6 +90,7 @@
 
 @dataclasses.dataclass(frozen=True, kw_only=True)
 class ResourceEstimatorContext:
+  reduction_scratch_bytes: int
   axis_names: _AxisNames
   lowering_semantics: mgpu.LoweringSemantics
 
@@ -361,7 +362,6 @@ def _run_scoped_resource_estimator(
         f"Unsupported memory space: {aval.memory_space}")
   return rs + _estimate_resources(ctx, jaxpr)
 
-REDUCE_SCRATCH_ELEMS = 128 * 4  # vector of 4 elements per lane in each WG
 
 @_register_resource_estimator(lax.reduce_sum_p)
 @_register_resource_estimator(lax.reduce_max_p)
@@ -370,10 +370,10 @@ def _reduce_resource_estimator(
     ctx: ResourceEstimatorContext, x_aval: jax_core.ShapedArray, *, axes,
     **kwargs
 ) -> Resources:
-  del ctx, axes  # Unused.
+  del x_aval, axes, kwargs  # Unused.
   # We don't need SMEM for some reductions, but it depends on the layout, so we
   # conservatively request the maximum scratch space we might need.
-  return Resources(smem_scratch_bytes=REDUCE_SCRATCH_ELEMS * x_aval.dtype.itemsize)
+  return Resources(smem_scratch_bytes=ctx.reduction_scratch_bytes)
 
 
 @dataclasses.dataclass(frozen=True)
@@ -420,6 +420,8 @@ class ModuleContext:
   mesh_info: pallas_utils.MeshInfo | None
   # See the documentation of unsafe_no_auto_barriers in CompilerParams.
   auto_barriers: bool
+  # See the documentation of reduction_scratch_bytes in CompilerParams.
+  reduction_scratch_bytes: int
   warp_axis_name: str | None = None
 
   @property
@@ -596,8 +598,9 @@ def replace(self, **changes: Any) -> LoweringRuleContext:
   @property
   def estimator_ctx(self) -> ResourceEstimatorContext:
     return ResourceEstimatorContext(
+        reduction_scratch_bytes=self.module_ctx.reduction_scratch_bytes,
         axis_names=self.module_ctx.axis_names,
-        lowering_semantics=self.module_ctx.lowering_semantics,
+        lowering_semantics=self.module_ctx.lowering_semantics
     )
 
 
@@ -886,6 +889,7 @@ def lower_jaxpr_to_module(
 
   rs = _estimate_resources(
       ResourceEstimatorContext(
+          reduction_scratch_bytes=params.reduction_scratch_bytes,
           axis_names=axis_names, lowering_semantics=lowering_semantics
       ),
       jaxpr,
@@ -982,6 +986,7 @@ def body(launch_ctx: mgpu.LaunchContext, *buffers: ir.Value):
             if jax_mesh is not None
             else None,
         auto_barriers=not params.unsafe_no_auto_barriers,
+        reduction_scratch_bytes=params.reduction_scratch_bytes,
     )
     del runtime_smem, grouped_barriers, runtime_barriers
     _ = lower_jaxpr_to_mosaic_gpu(
@@ -2592,12 +2597,11 @@ def _reduce_lowering_rule(op, ctx: LoweringRuleContext, x, *, axes, **kwargs):
     raise NotImplementedError("Multi-axis reductions not supported")
   reduced_dim = x.layout.tiling.tile_dimension(axes[0])
   if any(reduced_dim[d] for d in x.layout.partitioned_warp_dims):
-    size = x.layout.vector_length * 128  # a vector per lane in each WG.
-    if size > REDUCE_SCRATCH_ELEMS:
-      raise NotImplementedError(
-          f"Reduce scratch {size=} exceeds max={REDUCE_SCRATCH_ELEMS}"
-      )
-    scratch_ty = jax.ShapeDtypeStruct(shape=(size,), dtype=x_aval.dtype)
+    dtype_bitwidth = dtypes.itemsize_bits(x_aval.dtype)
+    if dtype_bitwidth % 8:
+      raise NotImplementedError("Sub-byte dtypes not supported")
+    scratch_elems = ctx.module_ctx.reduction_scratch_bytes * 8 // dtype_bitwidth
+    scratch_ty = jax.ShapeDtypeStruct(shape=(scratch_elems,), dtype=x_aval.dtype)
     ctx = ctx.module_ctx.scratch_view(scratch_ty)
   else:
     ctx = contextlib.nullcontext(None)
@@ -2645,7 +2649,9 @@ def _reduce_lowering_rule_wg(
   def i32_attr(value: int) -> ir.IntegerAttr:
     return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value)
   reduction.attributes["offset"] = i32_attr(ctx.module_ctx.smem_used_bytes)
-  reduction.attributes["scratch_size"] = i32_attr(REDUCE_SCRATCH_ELEMS)
+  # TODO(bchetioui): here, we could just donate all the remaining free SMEM that
+  # we have at this point in time.
+  reduction.attributes["scratch_size"] = i32_attr(ctx.module_ctx.reduction_scratch_bytes)
   return reduction.result
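To make the sizing concrete, here is the bytes-to-elements conversion from `_reduce_lowering_rule` above, evaluated by hand for two common dtypes (a worked example, not code from the commit):

```python
reduction_scratch_bytes = 128 * 4 * 4  # the default, 2048 bytes

# float32 (32-bit): 2048 * 8 // 32 = 512 scratch elements.
f32_elems = reduction_scratch_bytes * 8 // 32
# bfloat16 (16-bit): 2048 * 8 // 16 = 1024 scratch elements.
bf16_elems = reduction_scratch_bytes * 8 // 16
assert (f32_elems, bf16_elems) == (512, 1024)
# Sub-byte dtypes (bitwidth % 8 != 0, e.g. 4-bit) raise NotImplementedError.
```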

jax/experimental/mosaic/gpu/dialect_lowering.py

Lines changed: 2 additions & 8 deletions
@@ -758,15 +758,9 @@ def _vector_multi_dim_reduction_op_lowering_rule(
   if any(reduced_dim[d] for d in src.layout.partitioned_warp_dims):
     # cross-warp reductions require scratch space.
     dtype = op.source.type.element_type
-    size = src.layout.vector_length * 128  # a vector per lane in each WG.
-    scratch_size = ir.IntegerAttr(op.attributes["scratch_size"]).value
-    if size > scratch_size:
-      raise ValueError(
-          f"Required scratch space ({size}) is larger than the available"
-          f" scratch size ({scratch_size})"
-      )
+    allocation_size = ir.IntegerAttr(op.attributes["scratch_size"]).value * 8 // utils.bitwidth(dtype)
     scratch = _slice_smem(
-        ir.MemRefType.get([size], dtype, memory_space=utils.smem()),
+        ir.MemRefType.get([allocation_size], dtype, memory_space=utils.smem()),
         arith.constant(None, op.attributes["offset"]),
         ctx.smem_requested_bytes,
     )

jax/experimental/mosaic/gpu/fragmented_array.py

Lines changed: 9 additions & 4 deletions
@@ -2795,9 +2795,9 @@ def swizzle_warp_idx_fn(lane_idx: ir.Value, vec_len: int):
         num_banks % num_banks_per_output != 0
     ):
       raise NotImplementedError(
-        "Unoptimized configuration for cross-warp reduction: "
-        f"{self.mlir_dtype} with {vec_len=}"
-        )
+          "Unoptimized configuration for cross-warp reduction: "
+          f"{self.mlir_dtype} with {vec_len=}"
+      )
     # Define one row to be 128 bytes (32 banks of 4 bytes). For a given lane
     # index, we want to store the data coming from all 4 warps
     # contiguously in order to enable vectorized loads later on. If we
@@ -2981,7 +2981,12 @@ def reduce_stored(
     scratch_ty = ir.MemRefType(scratch.type)
     scratch_elems_per_register = WARPS_IN_WARPGROUP * unique_lanes * vec_len
     if scratch_ty.shape[0] < scratch_elems_per_register:
-      raise ValueError("Insufficient scratch space for cross-warp reduction")
+      available_bytes = scratch_ty.shape[0] * utils.bitwidth(scratch_ty.element_type) // 8
+      required_bytes = scratch_elems_per_register * utils.bitwidth(scratch_ty.element_type) // 8
+      raise ValueError(
+          f"Required reduction scratch size ({required_bytes} bytes) is "
+          f"larger than the available scratch size ({available_bytes} bytes)"
+      )
     if scratch_ty.get_strides_and_offset()[0] != [1]:
       raise ValueError("Expected scratch to be contiguous")
     num_concurrent_cross_warp_reductions = scratch_ty.shape[0] // scratch_elems_per_register
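The 1024-byte figure asserted in the tests below falls out of this check. The per-layout values here are my inference for the WGMMA float32 reduction used in the tests; the source does not spell them out:

```python
WARPS_IN_WARPGROUP = 4
unique_lanes = 32  # assumed: every lane of a warp holds distinct data
vec_len = 2        # assumed: WGMMA layout vector length

scratch_elems_per_register = WARPS_IN_WARPGROUP * unique_lanes * vec_len  # 256
required_bytes = scratch_elems_per_register * 4  # float32 -> 1024 bytes
assert required_bytes == 1024
```

With 1024 bytes of scratch, `num_concurrent_cross_warp_reductions` is 1, so registers are reduced one barrier round at a time; 2048 bytes doubles the concurrency, which is why the second test below sees fewer `BAR.SYNC` instructions in the SASS.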

tests/pallas/mosaic_gpu_test.py

Lines changed: 43 additions & 0 deletions
@@ -2474,6 +2474,49 @@ def kernel(x_ref, out_ref):
     self.assertAllClose(expected, jnp.sum(row))
     self.assertArraysEqual(result, jax.lax.broadcast_in_dim(expected, (128,), ()))
 
+  def test_reduction_fails_on_too_little_scratch_bytes_for_cross_warp_reduction(self):
+    @functools.partial(
+        self.pallas_call,
+        out_shape=jax.ShapeDtypeStruct((128,), jnp.float32),
+        compiler_params=plgpu.CompilerParams(reduction_scratch_bytes=0),
+    )
+    def kernel(x_ref, y_ref):
+      x_val = plgpu.load(x_ref, (), layout=plgpu.Layout.WGMMA, optimized=False)
+      y_ref[...] = jnp.sum(x_val, axis=0)
+
+    with self.assertRaisesRegex(
+        ValueError,
+        r"Required reduction scratch size \(1024 bytes\) is larger than the "
+        r"available scratch size \(0 bytes\)"
+    ):
+      kernel(jnp.zeros((128, 128), dtype=jnp.float32))
+
+  @jtu.thread_unsafe_test()  # Modifies ``os.environ``.
+  def test_reduction_with_more_scratch_uses_less_synchronization(self):
+    def run_kernel(x, scratch_bytes):
+      def kernel(x_ref, y_ref):
+        x_val = plgpu.load(x_ref, (), layout=plgpu.Layout.WGMMA, optimized=False)
+        y_ref[...] = jnp.sum(x_val, axis=0)
+      return self.pallas_call(
+          kernel,
+          out_shape=jax.ShapeDtypeStruct((128,), jnp.float32),
+          compiler_params=plgpu.CompilerParams(reduction_scratch_bytes=scratch_bytes)
+      )(x)
+
+    x = jax.random.uniform(jax.random.key(0), shape=(128, 128), dtype=jnp.float32)
+    with jtu.set_env(MOSAIC_GPU_DUMP_SASS="1"), jtu.capture_stdout() as sass0:
+      out0 = run_kernel(x, 1024).block_until_ready()
+
+    with jtu.set_env(MOSAIC_GPU_DUMP_SASS="1"), jtu.capture_stdout() as sass1:
+      out1 = run_kernel(x, 2 * 1024).block_until_ready()
+
+    self.assertAllClose(out0, jnp.sum(x, axis=0))
+    self.assertArraysEqual(out0, out1)
+
+    syncs0 = re.findall(r"BAR.SYNC", sass0())
+    syncs1 = re.findall(r"BAR.SYNC", sass1())
+    self.assertLess(len(syncs1), len(syncs0))
+
   @parameterized.product(
       layout=(
           plgpu.Layout.WGMMA,
