Skip to content

Commit da40911

Browse files
More cleanup
1 parent 975c356 commit da40911

File tree

3 files changed

+8
-7
lines changed

3 files changed

+8
-7
lines changed

bitsandbytes/autograd/_functions.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ def forward(
228228
subA = None
229229

230230
# 3. Int8 Matmul + Dequant + Bias
231-
output = torch.ops.bitsandbytes.int8_scaled_mm(CA, state.CB, SCA, state.SCB, bias=bias, dtype=A.dtype)
231+
output = torch.ops.bitsandbytes.int8_scaled_mm.default(CA, state.CB, SCA, state.SCB, bias=bias, dtype=A.dtype)
232232

233233
# 4. Mixed-precision decomposition matmul
234234
if subA is not None and state.subB is not None:
@@ -278,7 +278,7 @@ def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
278278
if req_gradB:
279279
Cgrad, _, _, SCgradt, _ = F.int8_double_quant(grad_output.to(torch.float16))
280280

281-
grad_B = torch.ops.bitsandbytes.int8_scaled_mm(
281+
grad_B = torch.ops.bitsandbytes.int8_scaled_mm.default(
282282
Cgrad.t().contiguous(),
283283
CAt.t(),
284284
SCgradt,

bitsandbytes/backends/cuda/ops.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -170,10 +170,11 @@ def _(
170170
A: torch.Tensor,
171171
threshold=0.0,
172172
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
173-
# TODO: Optimize/write CUDA kernel for this?
174-
175173
# Use CUDA kernel for rowwise and COO tensor
176-
quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant(A, threshold=threshold)
174+
quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
175+
A,
176+
threshold=threshold,
177+
)
177178

178179
# PyTorch impl for colwise
179180
col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)

bitsandbytes/functional.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -873,7 +873,7 @@ def dequantize_blockwise(
873873
)
874874
return out
875875

876-
return torch.ops.bitsandbytes.dequantize_blockwise(
876+
return torch.ops.bitsandbytes.dequantize_blockwise.default(
877877
A,
878878
absmax,
879879
quant_state.code.to(A.device),
@@ -2238,7 +2238,7 @@ def int8_vectorwise_dequant(A: torch.Tensor, stats: torch.Tensor):
22382238
`torch.Tensor` with dtype `torch.float32`: The dequantized tensor.
22392239
"""
22402240
# To dequantize we divide by 127, or multiply by the reciprocal.
2241-
return torch.ops.bitsandbytes.int8_vectorwise_dequant(A, stats)
2241+
return torch.ops.bitsandbytes.int8_vectorwise_dequant.default(A, stats)
22422242

22432243

22442244
def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):

0 commit comments

Comments (0)