@@ -243,11 +243,17 @@ def backward(ctx, grad_output: torch.Tensor):
         )
         A_scale = A_fp8_col_major._scale.squeeze()
 
+        # Special case: for a 2D-2D grouped GEMM, the scales must be repeated by the number of groups,
+        # which is the size of the `offs` tensor.
+        if grad_output_t_fp8_row_major.ndim == 2 and A_fp8_col_major.ndim == 2:
+            grad_output_t_scale = grad_output_t_scale.repeat(offs.numel())
+            A_scale = A_scale.repeat(offs.numel())
+
         # Compute grad_B = grad_output_t @ A.
         #
-        # Case 1: A=2D, B=3D with A=(M,K), B^T=(B,K,N) case, output=(B,M,N)
+        # Case 1: A=2D, B=3D with A=(M,K), B^T=(B,K,N) case, output=(M,N) <-- special case, B reduced?
         # grad_B = grad_output_t @ A
-        # grad_B = (B,N,M) @ (B,M,K) = (B,N,K)
+        # grad_B = (N,M) @ (M,K) = (N,K) <-- do we need to repeat along dim0 so it's (B,N,K)?
         #
         # Case 2: A=3D, B=2D with A=(B,M,K), B^T=(K,N) case, output=(B,M,N)
         # grad_B = grad_output_t @ A
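A minimal sketch of the scale handling in the 2D-2D special case above (illustrative values only, not part of the PR): with tensorwise scaling each operand carries a single scale, and repeat(offs.numel()) replicates it so there is one scale entry per group defined by `offs`.

import torch

# Hypothetical example values; `offs` marks group offsets, so offs.numel() is the group count.
offs = torch.tensor([16, 32, 48])            # 3 groups
A_scale = torch.tensor([0.02])               # single tensorwise scale for A
grad_output_t_scale = torch.tensor([0.5])    # single tensorwise scale for grad_output_t

# Repeat each scalar scale once per group, mirroring the special case in the diff above.
A_scale = A_scale.repeat(offs.numel())                          # tensor([0.0200, 0.0200, 0.0200])
grad_output_t_scale = grad_output_t_scale.repeat(offs.numel())  # tensor([0.5000, 0.5000, 0.5000])

print(A_scale.shape, grad_output_t_scale.shape)                 # torch.Size([3]) torch.Size([3])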