Skip to content

Commit c55e425

Browse files
timmoon10 and KshitijLakhani
authored and committed
[PyTorch] Debug weight matrix usages for dgrad GEMM (#1637)
Make sure that weight matrix has required usages for dgrad GEMM Signed-off-by: Tim Moon <[email protected]>
1 parent 8e0853a commit c55e425

File tree

2 files changed

+3
-4
lines changed

2 files changed

+3
-4
lines changed

transformer_engine/pytorch/module/layernorm_linear.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -327,9 +327,8 @@ def forward(
327327
ln_out.update_usage(rowwise_usage=False)
328328

329329
# Weight with column-wise usage is needed for dgrad GEMM.
330-
if inp.requires_grad:
331-
if isinstance(weightmat, QuantizedTensor):
332-
weightmat.update_usage(columnwise_usage=True)
330+
if isinstance(weightmat, QuantizedTensor):
331+
weightmat.update_usage(columnwise_usage=True)
333332

334333
if cpu_offloading:
335334
if fp8 and weightmat is not None:

transformer_engine/pytorch/module/layernorm_mlp.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ def forward(
415415
)
416416

417417
# Weight with column-wise usage is needed for dgrad GEMM.
418-
if is_grad_enabled and inp.requires_grad:
418+
if is_grad_enabled:
419419
if isinstance(fc1_weight_final, QuantizedTensor):
420420
fc1_weight_final.update_usage(columnwise_usage=True)
421421
if isinstance(fc2_weight_final, QuantizedTensor):

0 commit comments

Comments (0)