Bugfix sparse decomp

matthewdouglas · matthewdouglas · commit 5b2348bf2d68 · 2024-12-02T23:10:09.000-05:00
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
@@ -2793,6 +2793,11 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):
             _get_tensor_stream(A),
         )
 
+    # Zero out values from outlier columns across all rows.
+    # The kernel already zeroes the outlier elements themselves, so with a single row there is nothing left to clear.
+    if rows > 1 and outlier_cols is not None:
+        out_row[:, outlier_cols] = 0
+
     return out_row, row_stats, outlier_cols
 
 
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
@@ -2145,7 +2145,7 @@ __global__ void kInt8VectorQuant(T * __restrict__ A, int8_t* out, float* rowStat
 
   // For sm50/sm52 and CUDA < 12.2 we need to do the reduction in fp32.
   // Otherwise `T` is `fp16`. This can be removed when Maxwell is dropped.
-#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR >= 2) || BNB_FP16_AVAILABLE && __CUDACC__
+#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR >= 2) || BNB_FP16_AVAILABLE
   using TReduction = T;
 #else
   using TReduction = float;

Original file line number	Diff line number	Diff line change
`@@ -2793,6 +2793,11 @@ def int8_vectorwise_quant(A: torch.Tensor, threshold=0.0):`
`2793`	`2793`	`_get_tensor_stream(A),`
`2794`	`2794`	`)`
`2795`	`2795`
	`2796`	`+ # Zero out values from outlier columns across all rows.`
	`2797`	`+ # The kernel already zeroes the outlier elements themselves, so with a single row there is nothing left to clear.`
	`2798`	`+ if rows > 1 and outlier_cols is not None:`
	`2799`	`+ out_row[:, outlier_cols] = 0`
	`2800`	`+`
`2796`	`2801`	`return out_row, row_stats, outlier_cols`
`2797`	`2802`
`2798`	`2803`