@@ -88,6 +88,10 @@ def forward(
),
round_scales_to_power_of_2=float8_config.round_scales_to_power_of_2,
)
+ # A shape: (M, K)
+ # A_scale shape: (M, 1)
+ # squeeze A_scale to be 1D for the 2D operand of _scaled_grouped_mm
+ # A_scale shape: (M,)
# Convert high precision weight tensor to float8.
B_fp8 = hp_tensor_to_float8_dynamic(
@@ -101,6 +105,23 @@ def forward(
),
round_scales_to_power_of_2=float8_config.round_scales_to_power_of_2,
)
+ # B shape: (B, K, N) => compatible for matmul with A: (M, K) @ (B, K, N) = (B, M, N)
+ # B_scale shape: (B, K, 1) (using axiswise_dim=-1)
+ # squeeze B_scale to be 2D for the 3D operand of _scaled_grouped_mm
+ # B_scale shape: (B, K)
+
+ # This fails the check in _scaled_grouped_mm, "scale.size(1) == mat.size(1 + dim)" with dim=1 for matrix B, because K != N.
+ # scale check: https://github.com/pytorch/pytorch/blob/d25acac357ff8663a7787e57e6bc5e69987a8f9a/aten/src/ATen/native/cuda/Blas.cpp#L1530
+ # failure: https://github.com/pytorch/pytorch/blob/d25acac357ff8663a7787e57e6bc5e69987a8f9a/aten/src/ATen/native/cuda/Blas.cpp#L1458-L1461
+
+ # To solve this, I changed axiswise_dim to 1, so the scale shape becomes:
+ # B_scale shape: (B, 1, N)
+ # squeeze B_scale to be 2D for the 3D operand of _scaled_grouped_mm
+ # B_scale shape: (B, N)
+ # This passes the check in _scaled_grouped_mm.
+
+ # TODO: allowing axiswise_dim to be 1 breaks assumptions in torchao,
+ # so we need to either design long-term support for this or change the requirement in torch._scaled_grouped_mm.
# Store what we need for backward.
ctx.save_for_backward(A, B)
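As a sanity check on the comments above, here is a minimal, standalone Python sketch of the scale-shape bookkeeping (not part of the diff). The sizes M, K, N and the group count E are made up, the group dimension is named E instead of B to avoid clashing with the weight tensor name, and the _scaled_grouped_mm scale check is mimicked with plain asserts rather than by calling the private op.

import torch

M, K, N, E = 16, 32, 64, 4        # hypothetical sizes; E = number of groups/experts

A = torch.randn(M, K)             # 2D activation
A_scale = torch.randn(M, 1)       # row-wise scale, one value per row of A
A_scale = A_scale.squeeze(-1)     # (M,): 1D scale for the 2D operand

B = torch.randn(E, K, N)          # 3D weight, one (K, N) matrix per group

# axiswise_dim=-1 produces an (E, K, 1) scale, squeezed to (E, K).
# The check quoted above, scale.size(1) == mat.size(1 + dim) with dim=1,
# compares against mat.size(2) == N, so an (E, K) scale is rejected when K != N.
B_scale_neg1 = torch.randn(E, K, 1).squeeze(-1)   # (E, K)
assert B_scale_neg1.size(1) != B.size(2)          # K != N -> check fails

# axiswise_dim=1 produces an (E, 1, N) scale, squeezed to (E, N), which passes.
B_scale_dim1 = torch.randn(E, 1, N).squeeze(1)    # (E, N)
assert B_scale_dim1.size(1) == B.size(2)          # N == N -> check passes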