Commit c19bc88

all test cases working
Parent: c9d30b6

2 files changed, +27 -20 lines changed

torchao/prototype/grouped_mm/__init__.py (+10 -5)
@@ -230,10 +230,9 @@ def backward(ctx, grad_output: torch.Tensor):
         # - A_scale shape: (1,K) or (B, 1, K)
         # - torch._scaled_grouped_mm requires scales without any empty dims, so squeeze A_scale.
         # - A scale shape: (K,) or (B, K)
+        A_col_major = A.transpose(-2, -1).contiguous().transpose(-2, -1)
         A_fp8_col_major = hp_tensor_to_float8_dynamic(
-            A.transpose(-2, -1)
-            .contiguous()
-            .transpose(-2, -1),  # Convert to column-major
+            A_col_major,
             float8_config.cast_config_input.target_dtype,
             linear_mm_config=LinearMMConfig(),
             gemm_input_role=GemmInputRole.INPUT,
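
A note on the refactor in the hunk above: the transpose -> contiguous -> transpose chain hoisted into A_col_major does not reorder values; it returns a tensor with the same shape and contents but column-major strides. A minimal standalone sketch (not part of the commit) showing the resulting layout:

    import torch

    # The double-transpose trick: same shape, same values, but
    # column-major (Fortran-order) strides after the round trip.
    A = torch.randn(4, 8)
    A_col_major = A.transpose(-2, -1).contiguous().transpose(-2, -1)
    print(A.stride())            # (8, 1)  row-major
    print(A_col_major.stride())  # (1, 4)  column-major
    assert torch.equal(A, A_col_major)  # values unchanged, only layout differs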
@@ -243,11 +242,17 @@ def backward(ctx, grad_output: torch.Tensor):
         )
         A_scale = A_fp8_col_major._scale.squeeze()
 
+        # Special case: for a 2D-2D grouped GEMM, the scales must be repeated by the
+        # number of groups, which is the size of the `offs` tensor.
+        if grad_output_t_fp8_row_major.ndim == 2 and A_fp8_col_major.ndim == 2:
+            grad_output_t_scale = grad_output_t_scale.repeat(offs.numel())
+            A_scale = A_scale.repeat(offs.numel())
+
         # Compute grad_B = grad_output_t @ A.
         #
-        # Case 1: A=2D, B=3D with A=(M,K), B^T=(B,K,N) case, output=(B,M,N)
+        # Case 1: A=2D, B=3D with A=(M,K), B^T=(B,K,N) case, output=(M,N) <-- special case, B reduced?
         # grad_B = grad_output_t @ A
-        # grad_B = (B,N,M) @ (B,M,K) = (B,N,K)
+        # grad_B = (N,M) @ (M,K) = (N,K) <-- do we need to repeat along dim0 so it's (B,N,K)?
         #
         # Case 2: A=3D, B=2D with A=(B,M,K), B^T=(K,N) case, output=(B,M,N)
         # grad_B = grad_output_t @ A
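
On the 2D-2D special case added above: the commit's reading is that each of the offs.numel() groups reuses the same squeezed scale vector, so .repeat() concatenates one copy per group. A rough sketch of just that shape arithmetic, under the assumption (not stated in the commit) that the squeezed scale is 1D of length N and group boundaries along K are delimited by offs:

    import torch

    # Assumed shapes for illustration only: a 1D row-wise scale vector and
    # an offs tensor marking n_groups group boundaries along K.
    n, k, n_groups = 16, 16, 4
    offs = torch.arange(k, n_groups * k + 1, k, dtype=torch.int32)  # (4,)
    scale = torch.rand(n)                         # squeezed scale, shape (N,)
    scale_per_group = scale.repeat(offs.numel())  # shape (N * n_groups,)
    assert scale_per_group.numel() == n * offs.numel()  # one copy per group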

torchao/prototype/grouped_mm/test_grouped_mm.py (+17 -15)
@@ -23,10 +23,12 @@ def test_grouped_gemm_2d_3d(use_fast_accum, strided):
     device = "cuda"
     s_int = int(strided)
     m, n, k, n_groups = 16, 32, 16, 4
-    a = torch.randn(m * n_groups, k * (1 + s_int), device=device, requires_grad=True)[:, :k]
-    b = torch.randn(n_groups * (1 + s_int), n, k * (1 + s_int), device=device, requires_grad=True)[
-        :: (1 + s_int), :, :k
+    a = torch.randn(m * n_groups, k * (1 + s_int), device=device, requires_grad=True)[
+        :, :k
     ]
+    b = torch.randn(
+        n_groups * (1 + s_int), n, k * (1 + s_int), device=device, requires_grad=True
+    )[:: (1 + s_int), :, :k]
     offs = torch.arange(m, n_groups * m + 1, m, device="cuda", dtype=torch.int32)
     result = _grouped_scaled_mm(
         a,
@@ -62,12 +64,12 @@ def test_grouped_gemm_3d_3d(use_fast_accum, strided):
     device = "cuda"
     s_int = int(strided)
     m, n, k, n_groups = 16, 32, 16, 4
-    a = torch.randn(n_groups * (1 + s_int), m, k * (1 + s_int), device=device, requires_grad=True)[
-        :: (1 + s_int), :, :k
-    ]
-    b = torch.randn(n_groups * (1 + s_int), n, k * (1 + s_int), device=device, requires_grad=True)[
-        :: (1 + s_int), :, :k
-    ]
+    a = torch.randn(
+        n_groups * (1 + s_int), m, k * (1 + s_int), device=device, requires_grad=True
+    )[:: (1 + s_int), :, :k]
+    b = torch.randn(
+        n_groups * (1 + s_int), n, k * (1 + s_int), device=device, requires_grad=True
+    )[:: (1 + s_int), :, :k]
     result = _grouped_scaled_mm(
         a,
         b.transpose(-2, -1),
@@ -99,12 +101,12 @@ def test_grouped_gemm_2d_2d(use_fast_accum, strided):
     out_dtype = torch.bfloat16
     device = "cuda"
     m, n, k, n_groups = 16, 16, 16, 4  # all sizes have to be divisible by 16
-    a = torch.randn(m, k * n_groups + k * int(strided), device=device, requires_grad=True)[
-        :, : k * n_groups
-    ]
-    b = torch.randn(n, k * n_groups + k * int(strided), device=device, requires_grad=True)[
-        :, : k * n_groups
-    ]
+    a = torch.randn(
+        m, k * n_groups + k * int(strided), device=device, requires_grad=True
+    )[:, : k * n_groups]
+    b = torch.randn(
+        n, k * n_groups + k * int(strided), device=device, requires_grad=True
+    )[:, : k * n_groups]
     offs = torch.arange(k, n_groups * k + 1, k, device=device, dtype=torch.int32)
 
     # Compute result.
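
These tests share one construction worth noting: the strided variants over-allocate one dimension by a factor of (1 + s_int) and then slice back down to the target size. The slice keeps the parent tensor's strides, so strided=True exercises the grouped GEMM on non-contiguous inputs. A small standalone check of that construction (names local to this sketch, CPU-only for brevity):

    import torch

    # Over-allocate columns, then slice: the view keeps the parent's
    # strides, so the result is non-contiguous.
    m, k, n_groups, s_int = 16, 16, 4, 1  # s_int = int(strided)
    a = torch.randn(m * n_groups, k * (1 + s_int))[:, :k]
    print(a.shape)            # torch.Size([64, 16])
    print(a.stride())         # (32, 1) -- row stride spans the full allocation
    print(a.is_contiguous())  # False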
