
Commit b992f6f

yaochengji authored and pgmoka committed
[Kernel] add group_offset and transpose_rhs support in gmm kernel (#9251)
1 parent e5d6e5a commit b992f6f


2 files changed: +69 -35


test/test_gmm.py

Lines changed: 42 additions & 24 deletions
@@ -23,12 +23,18 @@
 
 class MegabloxTest(unittest.TestCase):
 
-  def _reference_gmm(self, lhs: torch.Tensor, rhs: torch.Tensor,
-                     group_sizes: torch.Tensor) -> torch.Tensor:
+  def _reference_gmm(self,
+                     lhs: torch.Tensor,
+                     rhs: torch.Tensor,
+                     group_sizes: torch.Tensor,
+                     transpose_rhs: bool = False) -> torch.Tensor:
     start = 0
     out = []
     for i, size in enumerate(group_sizes):
-      result = lhs[start:start + size, :] @ rhs[i, :, :]
+      rhsi = rhs[i, :, :]
+      if transpose_rhs is True:
+        rhsi = torch.transpose(rhsi, 0, 1)
+      result = lhs[start:start + size, :] @ rhsi
       out.append(result)
       start += group_sizes[i]
     return torch.cat(out)
@@ -105,27 +111,39 @@ def test_gmm(self):
     for test_cache in [False, True]:
       for gmm_func in gmm_funcs:
         for test_case in self.tests_cases:
-          num_groups = test_case['num_groups']
-          k = test_case['k']
-          m = test_case['m']
-          n = test_case['n']
-          lhs_dtype = rhs_dtype = test_case['dtype']
-
-          lhs = torch.rand(m, k, dtype=lhs_dtype)
-          rhs = torch.rand(num_groups, k, n, dtype=rhs_dtype)
-          group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
-          ref_out = self._reference_gmm(lhs, rhs, group_sizes)
-
-          out = gmm_func(lhs.to("xla"), rhs.to("xla"), group_sizes.to("xla"))
-          # torch.compiled version of the gmm will cache the payload in dynamo layer
-          # hence won't trigger the trace_pallas cache
-          if test_cache and gmm_func != compiled_gmm:
-            old_cnt = xr.get_num_cached_compilation_graph()
-            # execute the same gmm func, expected to hit the cache
-            out = gmm_func(lhs.to("xla"), rhs.to("xla"), group_sizes.to("xla"))
-            new_cnt = xr.get_num_cached_compilation_graph()
-            self.assertEqual(old_cnt, new_cnt)
-          self.assertTrue(torch.allclose(ref_out, out.cpu()))
+          for transpose_rhs in [True, False]:
+            num_groups = test_case['num_groups']
+            k = test_case['k']
+            m = test_case['m']
+            n = test_case['n']
+            lhs_dtype = rhs_dtype = test_case['dtype']
+
+            lhs = torch.rand(m, k, dtype=lhs_dtype)
+            if transpose_rhs is False:
+              rhs = torch.rand(num_groups, k, n, dtype=rhs_dtype)
+            else:
+              rhs = torch.rand(num_groups, n, k, dtype=rhs_dtype)
+            group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
+            ref_out = self._reference_gmm(lhs, rhs, group_sizes, transpose_rhs)
+
+            out = gmm_func(
+                lhs.to("xla"),
+                rhs.to("xla"),
+                group_sizes.to("xla"),
+                transpose_rhs=transpose_rhs)
+            # torch.compiled version of the gmm will cache the payload in dynamo layer
+            # hence won't trigger the trace_pallas cache
+            if test_cache and gmm_func != compiled_gmm:
+              old_cnt = xr.get_num_cached_compilation_graph()
+              # execute the same gmm func, expected to hit the cache
+              out = gmm_func(
+                  lhs.to("xla"),
+                  rhs.to("xla"),
+                  group_sizes.to("xla"),
+                  transpose_rhs=transpose_rhs)
+              new_cnt = xr.get_num_cached_compilation_graph()
+              self.assertEqual(old_cnt, new_cnt)
+            self.assertTrue(torch.allclose(ref_out, out.cpu()))
 
     # Make sure gmm doesn't fallback.
     self.assertEqual(len(torch_xla._XLAC._get_executed_fallback_ops()), 0)
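
To make the new test behavior concrete, here is a minimal CPU-only sketch of the grouped matmul that the reference path above computes. It is not part of the commit, and the shapes are illustrative; with transpose_rhs=True, each rhs group is stored as [n, k] and transposed back to [k, n] before the matmul:

# CPU-only sketch of the grouped matmul semantics (not part of the commit).
import torch


def reference_gmm(lhs, rhs, group_sizes, transpose_rhs=False):
  # lhs: [m, k]; rhs: [num_groups, k, n], or [num_groups, n, k] when
  # transpose_rhs is True; group_sizes sums to m.
  start = 0
  out = []
  for i, size in enumerate(group_sizes):
    rhs_i = rhs[i]
    if transpose_rhs:
      rhs_i = rhs_i.transpose(0, 1)  # [n, k] -> [k, n]
    out.append(lhs[start:start + size] @ rhs_i)
    start += size
  return torch.cat(out)


lhs = torch.rand(8, 4)
rhs = torch.rand(2, 3, 4)  # [num_groups, n, k]
group_sizes = torch.tensor([5, 3], dtype=torch.int32)
print(reference_gmm(lhs, rhs, group_sizes, transpose_rhs=True).shape)  # [8, 3]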

torch_xla/experimental/custom_kernel.py

Lines changed: 27 additions & 11 deletions
@@ -1289,7 +1289,9 @@ def gmm(
     lhs: torch.Tensor,
     rhs: torch.Tensor,
     group_sizes: torch.Tensor,
-    tiling: Tuple[int, int, int] = (512, 512, 512)
+    tiling: Tuple[int, int, int] = (512, 512, 512),
+    group_offset: torch.Tensor | None = None,
+    transpose_rhs: bool = False,
 ) -> torch.Tensor:
   """Compute lhs[sizes[i-1]:sizes[i], :] @ rhs for each group 'i'.
 
@@ -1298,7 +1300,9 @@ def gmm(
     rhs: A 3d, torch.Tensor with shape [num_groups, k, n].
     group_sizes: A 1d, torch.Tensor with shape [num_groups] and torch.int32 dtype.
     tiling: 3-tuple of ints. The m, k and n-dimension tile sizes.
-
+    group_offset: The group in group sizes to start computing from. This is
+      particularly useful for when rhs num_groups is sharded.
+    transpose_rhs: True if the rhs needs to be transposed.
   Returns:
     A 2d, torch.Tensor with shape [m, n].
   """
@@ -1310,15 +1314,18 @@ def gmm(
   tm, tk, tn = min(tiling[0], m), min(tiling[1], k), min(tiling[2], n)
   preferred_element_type = lhs.dtype
   return xb.call_jax(gmm, (lhs, rhs, group_sizes, preferred_element_type,
-                           (tm, tk, tn)))
+                           (tm, tk, tn), group_offset),
+                     {"transpose_rhs": transpose_rhs})
 
 
 @requires_jax
 def tgmm(
     lhs: torch.Tensor,
     rhs: torch.Tensor,
     group_sizes: torch.Tensor,
-    tiling: Tuple[int, int, int] = (512, 512, 512)
+    tiling: Tuple[int, int, int] = (512, 512, 512),
+    group_offset: torch.Tensor | None = None,
+    num_actual_groups: int | None = None,
 ) -> torch.Tensor:
   """Compute lhs[:, sizes[i-1]:sizes[i]] @ rhs[sizes[i-1]:sizes[i], :].
 
@@ -1340,7 +1347,7 @@ def tgmm(
   tm, tk, tn = min(tiling[0], m), min(tiling[1], k), min(tiling[2], n)
   preferred_element_type = lhs.dtype
   return xb.call_jax(tgmm, (lhs, rhs, group_sizes, preferred_element_type,
-                            (tm, tk, tn)))
+                            (tm, tk, tn), group_offset, num_actual_groups))
 
 
 def gmm_backward(grad, lhs, rhs, group_sizes, tiling=(512, 512, 512)):
@@ -1547,7 +1554,7 @@ def ragged_paged_attention_non_xla(
 
 
 XLA_LIB.define(
-    "gmm(Tensor lhs, Tensor rhs, Tensor group_sizes, int[]? tiling=None) -> Tensor",
+    "gmm(Tensor lhs, Tensor rhs, Tensor group_sizes, int[]? tiling=None, Tensor? group_offset=None, bool transpose_rhs=False) -> Tensor",
 )
 
 
@@ -1557,28 +1564,37 @@ def gmm_xla(
     rhs: torch.Tensor,
     group_sizes: torch.Tensor,
     # pytorch custom op does not allow tuple type, use list instead
-    tiling: Optional[List[int]] = [512, 512, 512]):
+    tiling: Optional[List[int]] = [512, 512, 512],
+    group_offset: torch.Tensor | None = None,
+    transpose_rhs: bool = False):
+  if tiling is None:
+    tiling = [512, 512, 512]
   assert len(tiling) == 3, "tiling must be a list with 3 integers"
   assert lhs.dim() == 2, "lhs must be a 2d, torch.Tensor with shape [k, m]"
   assert rhs.dim(
   ) == 3, "rhs must be a A 3d torch.Tensor with shape [num_groups, k, n]"
   tiling = tuple(tiling)
-  return gmm(lhs, rhs, group_sizes, tiling)
+  return gmm(lhs, rhs, group_sizes, tiling, group_offset, transpose_rhs)
 
 
 @impl(XLA_LIB, "gmm", "CompositeExplicitAutograd")
 def gmm_non_xla(lhs: torch.Tensor,
                 rhs: torch.Tensor,
                 group_sizes: torch.Tensor,
-                tiling: Optional[List[int]] = [512, 512, 512]):
+                tiling: Optional[List[int]] = [512, 512, 512],
+                group_offset: torch.Tensor | None = None,
+                transpose_rhs: bool = False):
   # This will be called when dynamo use fake tensor to construct the fake output.
   # We need to make sure output tensor's shape is correct.
   if lhs.device != torch.device("meta"):
     warnings.warn(f'XLA gmm should only be applied to tensors on XLA device')
+  if tiling is None:
+    tiling = [512, 512, 512]
   assert len(tiling) == 3, "tiling must be a list with 3 integers"
   assert lhs.dim() == 2, "lhs must be a 2d, torch.Tensor with shape [k, m]"
   assert rhs.dim(
-  ) == 3, "rhs must be a A 3d torch.Tensor with shape [num_groups, k, n]"
+  ) == 3, "rhs must be a A 3d torch.Tensor with shape [num_groups, k, n] or [num_groups, n, k] when transpose_rhs is True"
+  rhs_dim_size = rhs.size()[1] if transpose_rhs is True else rhs.size()[2]
 
   # we only need to return the tensor with correct shape for meta tensor.
-  return torch.empty(lhs.size()[0], rhs.size()[2], device=lhs.device)
+  return torch.empty(lhs.size()[0], rhs_dim_size, device=lhs.device)
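
For completeness, a hedged usage sketch of the extended Python entry point. It assumes the import path mirrors the file path above, that a TPU/XLA device and the JAX-backed Pallas path are available, and that the sizes are illustrative; group_offset is only described in a comment since its exact shape is not spelled out in this diff:

# Usage sketch (not part of the commit); assumes a TPU/XLA device is available
# and that gmm is importable from the module shown above.
import torch
import torch_xla
from torch_xla.experimental.custom_kernel import gmm

num_groups, m, k, n = 4, 512, 128, 256
lhs = torch.rand(m, k, dtype=torch.bfloat16).to("xla")
# With transpose_rhs=True the kernel expects rhs laid out as
# [num_groups, n, k] rather than [num_groups, k, n].
rhs = torch.rand(num_groups, n, k, dtype=torch.bfloat16).to("xla")
group_sizes = torch.tensor([128, 128, 128, 128], dtype=torch.int32).to("xla")

out = gmm(lhs, rhs, group_sizes, transpose_rhs=True)  # shape [m, n]
# Per the new docstring, group_offset (a tensor giving the group to start
# computing from) can also be passed when rhs's num_groups dimension is
# sharded; it is left at its default (None) here.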
