dropbox
diff --git a/‎README.md‎
Lines changed: 9 additions & 11 deletions b/‎README.md‎
Lines changed: 9 additions & 11 deletions
diff --git a/‎examples/benchmark_triton.py‎
Lines changed: 0 additions & 3 deletions b/‎examples/benchmark_triton.py‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎examples/triton_hqq_example.py‎
Lines changed: 9 additions & 7 deletions b/‎examples/triton_hqq_example.py‎
Lines changed: 9 additions & 7 deletions
diff --git a/‎gemlite/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎gemlite/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎gemlite/core.py‎
Lines changed: 27 additions & 13 deletions b/‎gemlite/core.py‎
Lines changed: 27 additions & 13 deletions
diff --git a/‎gemlite/dtypes.py‎
Lines changed: 12 additions & 0 deletions b/‎gemlite/dtypes.py‎
Lines changed: 12 additions & 0 deletions
@@ -33,6 +33,7 @@ Extensive performance results across different bitwidths, batch sizes, and devic
 - [Contributing](#contributing)
 
 # Recent Highlights
+- GemLite now supports bfloat16!
 - GemLite is now available in <a href="https://github.com/vllm-project/vllm/">vllm</a> via the <a href="https://github.com/mobiusml/hqq/">hqq</a> lib! 
 - GemLite is now integrated with <a href="https://github.com/pytorch/ao">TorchAO</a>/<a href="https://github.com/sgl-project/sglang">SGLang</a> for 4-bit quantization. Check out the <a href="https://pytorch.org/blog/accelerating-llm-inference/">blog post</a>!
 - **Major performance improvement**: especially on the A100 and H100.
@@ -61,16 +62,9 @@ pip install git+https://github.com/mobiusml/gemlite/
 import gemlite
 from gemlite import DType, GemLiteLinear
 
-#Set accumulation dtype (only do this once)
-#gemlite.set_acc_dtype(DType.FP32) #For A100/H100 (default)
-#gemlite.set_acc_dtype(DType.FP16) #For 3090/4090 (default)
-
 #Set default packing bitwidth: use 8-bit for larger batch-sizes on A100s/H100s
 #gemlite.set_packing_bitwidth(8)
 
-#Set autotune (by default uses powers of 2 up to 1024)
-#gemlite.set_autotune_setting(lambda M: M) #max-autotune example
-
 #Main constructor
 gemlite_linear = GemLiteLinear(
     W_nbits, #weight quantization bitwidth. supported: [8, 4, 2, 1]
@@ -124,6 +118,9 @@ import gemlite
 #Ignore pre-loaded configs - if you want to start from scratch (Optional)
 #gemlite.reset_config() 
 
+#Set autotune (by default uses powers of 2 up to 1024)
+#gemlite.set_autotune_setting(lambda M: M) #max-autotune example
+
 #Warm-up for A16W4 with group_size=64
 gemlite.helper.warmup(shapes=[(4096, 4096)], W_nbits=[4], group_sizes=[64], mode='static')
 
@@ -136,21 +133,22 @@ gemlite.cache_config('new_config.json')
 
 ## Deep Dive
 We implement various versions of the Triton kernels: 
-* <b><a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemv_A16fWnO16f_int32packing.py">GEMV</a></b>: This GEMV kernel splits the activations into 1D chunks, performs the dot product using `tl.sum`, and accumulates via atomic addition. It is primarily intended for use with small batch sizes (M < 16). As `tl.atomic_add` does not support bfloat16, this kernel is limited to float16.
+* <b><a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemv_A16fWnO16f_int32packing.py">GEMV</a></b>: This GEMV kernel splits the activations into 1D chunks, performs the dot product using `tl.sum`, and accumulates via atomic addition. It is primarily intended for a batch size of 1 (M == 1).
 
 * <b><a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemm_A16fWnO16f_int32packing.py">GEMM</a></b>: This GEMM kernel is implemented similarly to <a href="https://github.com/fpgaminer/GPTQ-triton">GPTQ-triton</a>. Since it uses tensor cores, activations must be padded with zeros along the batch dimension to fit at least 16 rows. It supports both float32 and float16 accumulation for fp16 inputs, but only float32 accumulation for bfloat16.
 
-* <b><a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemm_splitK_A16fWnO16f_int32packing.py">GEMM Split-K</a></b>: This Split-K GEMM kernel is implemented similarly to <a href="https://github.com/foundation-model-stack/foundation-model-stack/blob/triton/triton/kernels/gptq/splitk_dequant_gemm.py">the gptq Split-K version</a>. We build on the gemm version above and add another dimension in the grid which splits the K dimension into multiple jobs that calculate partial sums, which are atomically added and finally stored. Split-K performs particularly well for batched LLM decoding (batch-size between 1 and 32). 
+* <b><a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemm_splitK_A16fWnO16f_int32packing.py">GEMM Split-K</a></b>: This Split-K GEMM kernel is implemented similarly to <a href="https://github.com/foundation-model-stack/foundation-model-stack/blob/triton/triton/kernels/gptq/splitk_dequant_gemm.py">the gptq Split-K version</a>. We build on the gemm version above and add another dimension in the grid which splits the K dimension into multiple jobs that calculate partial sums, which are atomically added and finally stored. Split-K performs particularly well for batched LLM decoding (batch-size between 2 and 32). 
 
 * <b><a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemv_revsplitK_A16fWnO16f_int32packing.py">Gemv RevSplit-K</a></b>: 
 This algorithm, newly proposed in GemLite, takes the opposite approach to GEMM Split-K, but in a GEMV context. By doubling the workload per Triton program launched in the GEMV kernel, it reduces the frequency of loading scales/zeros and lowers the number of threads needed. As a result, this method delivers the best performance for batch-size=1 decoding. 
 
-All kernels are flexible, supporting 8, 4, 2, and 1-bit weight precisions as well as both fp16 and int8/fp8 activations.
+All kernels are flexible, supporting 8, 4, 2, and 1-bit weight precisions as well as float16, bfloat16 and int8/fp8 activations.
 
 ## Limitations
 * All kernels require a minimum group-size of 32.
-* The default accumulation DType for FP16 inputs is FP16. If you encounter precision issues, you can try <a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/core.py#L28">reverting to FP32</a>.
 * <b><a href="https://github.com/mobiusml/gemlite/blob/master/gemlite/triton_kernels/gemv_revsplitK_A16fWnO16f_int32packing.py">Gemv RevSplit-K</a></b>, which is the default kernel for batch-size=1, does not work with 1-bit weights packed as 32-bit with a group-size of 32. In this case, you should use 8-bit bitpacking via `.pack(...,packing_bitwidth=8)`, or revert to using the `GEMV` kernel instead.
+* On data-center GPUs (A100, H100, H200), 8-bit packing via `gemlite.set_packing_bitwidth(8)` is faster for larger batch sizes.
+* `bfloat16` is about 5-7% slower for `1 <= M <= 64` because its atomic addition falls back to an fp32 implementation. For `M == 1`, you can change the default GEMV kernel to the Split-K kernel, which may run faster depending on the GPU (confirmed faster on the A100, but slower on the H100): `gemlite.core.get_default_gemv = lambda W_nbits: 'GEMM_SPLITK' if (W_nbits < 8) else 'GEMV_SPLITK'`.
 
 ## Performance
 ### End-2-End Performance
 
@@ -19,9 +19,6 @@
 from gemlite.core import GemLiteLinearTriton, DType, set_autotune, GEMLITE_ACC_DTYPE
 set_autotune({'GEMV_REVSPLITK':True, 'GEMV_SPLITK': True, 'GEMV':True, 'GEMM_SPLITK':True, 'GEMM':True}, exhaustive=True, use_cuda_graph=False)
 
-GEMLITE_ACC_DTYPE[DType.FP16] = DType.FP32 #For A100/H100
-#GEMLITE_ACC_DTYPE[DType.FP16] = DType.FP16 #For 3090/4090
-
 device = 'cuda:0'
 compute_dtype = torch.float16
 
 
@@ -8,26 +8,28 @@ def check_valid(x, W, quant_linear, tol=1e-3):
 ############################################################################################
 from hqq.core.quantize import HQQLinear, BaseQuantizeConfig
 
-in_features, out_features = 4096*4, 4096*2
+in_features, out_features = 4096*4, 4096*4
 #W_nbits, group_size = 8, in_features 
-W_nbits, group_size = 4, 128 
-#W_nbits, group_size = 2, 128
+W_nbits, group_size = 4, 64 
+#W_nbits, group_size = 2, 64
+compute_dtype = torch.float16 #float16 / bfloat16
 
 linear       = torch.nn.Linear(in_features=in_features, out_features=out_features, bias=False, device='cpu')
 quant_config = BaseQuantizeConfig(nbits=W_nbits, group_size=group_size, quant_zero=False, quant_scale=False, axis=1)
-hqq_layer    = HQQLinear(linear, quant_config=quant_config, compute_dtype=torch.float16, device='cuda:0', del_orig=False) 
+hqq_layer    = HQQLinear(linear, quant_config=quant_config, compute_dtype=compute_dtype, device='cuda:0', del_orig=False) 
 
 orig_shape   = (out_features, in_features)
 W            = hqq_layer.dequantize().reshape(orig_shape)
 ############################################################################################
 
-from gemlite.core import GemLiteLinearTriton, DType
+from gemlite.core import GemLiteLinearTriton, DType, TORCH_TO_DTYPE
+gemlite_dtype = TORCH_TO_DTYPE[compute_dtype]
 gemlite_linear = GemLiteLinearTriton(W_nbits, 
                                     group_size=group_size, 
                                     in_features=in_features, 
                                     out_features=out_features, 
-                                    input_dtype=DType.FP16, 
-                                    output_dtype=DType.FP16)
+                                    input_dtype=gemlite_dtype, 
+                                    output_dtype=gemlite_dtype)
 
 W_q           = hqq_layer.unpack(dtype=torch.uint8).view(orig_shape)
 scales        = hqq_layer.meta['scale']
 
@@ -1,4 +1,4 @@
-__version__ = "0.4.3"
+__version__ = "0.4.4"
 __author__  = 'Dr. Hicham Badri'
 __credits__ = 'Mobius Labs GmbH'
 
 
@@ -28,8 +28,23 @@
 ###################################################################################################################################
 # Triton backend
 ###################################################################################################################################
-GEMLITE_ACC_DTYPE           = {DType.FP16: DType.FP32 if gpu_has_more_shared_memory() else DType.FP16, DType.FP8: DType.FP32, DType.FP8e5: DType.FP32, DType.INT8: DType.INT32}
-GEMLITE_TRITON_KERNELS      = [gemv_A16fWnO16f, gemv_revsplitK_A16fWnO16f, gemv_splitK_A16fWnO16f, gemm_splitK_A16fWnO16f, gemm_A16fWnO16f] 
+GEMLITE_ACC_DTYPE = {
+    DType.FP16: DType.FP32 if gpu_has_more_shared_memory() else DType.FP16,
+    DType.BF16: DType.FP32,
+    DType.FP32: DType.FP32,
+    DType.FP8: DType.FP32,
+    DType.FP8e5: DType.FP32,
+    DType.INT8: DType.INT32,
+}
+
+GEMLITE_TRITON_KERNELS = [
+    gemv_A16fWnO16f,
+    gemv_revsplitK_A16fWnO16f,
+    gemv_splitK_A16fWnO16f,
+    gemm_splitK_A16fWnO16f,
+    gemm_A16fWnO16f,
+]
+
 GEMLITE_TRITON_MAPPING      = {kernel.matmul_type : kernel for kernel in GEMLITE_TRITON_KERNELS}
 GEMLITE_TRITON_CONFIG_CACHE = {} #Global config cache for all the kernels
 GEMLITE_TRITON_CACHE        = {} #Cache used forward with warmup
@@ -94,11 +109,14 @@ def set_acc_dtype(dtype):
     assert dtype in [DType.FP16, DType.FP32], "Invalid dtype (should be DType.FP16 or DType.FP32)."
     GEMLITE_ACC_DTYPE[DType.FP16] = dtype
 
+#Return the default gemv kernel to use for M==1
+def get_default_gemv(W_nbits: int) -> str:
+    return 'GEMV_REVSPLITK' if (W_nbits < 8) else 'GEMV_SPLITK'
 ###################################################################################################################################
 #Main class
 class GemLiteLinearTriton(torch.nn.Module):
     SUPPORTED_BITS_TRITON = [1, 2, 4, 8, 16]
-    SUPPORTED_DTYPES      = [DType.FP16, DType.FP8, DType.FP8e5, DType.INT8]
+    SUPPORTED_DTYPES      = [DType.FP16, DType.BF16, DType.FP32, DType.FP8, DType.FP8e5, DType.INT8]
     MIN_SIZE              = 64
     PACKING_BITWIDTH      = 32 #Default packing bitwidth
 
@@ -144,8 +162,8 @@ def __init__(
 
         self.input_dtype   = input_dtype
         self.output_dtype  = output_dtype
-        self.compute_dtype = torch.float16
-        self.meta_dtype    = DType.FP16
+        self.compute_dtype = DTYPE_TO_TORCH[input_dtype.value]
+        self.meta_dtype    = input_dtype
         self.kernels       = GEMLITE_TRITON_KERNELS
 
         #Accumulation
@@ -161,18 +179,14 @@ def __init__(
             self.forward = self.forward_auto_no_warmup
 
         #Default GEMV for packed vs. non-packed data
-        self.default_gemv = self.get_default_gemv()
+        self.default_gemv = get_default_gemv(self.W_nbits)
 
         #Set torch flags
         try:
             torch._dynamo.config.inline_inbuilt_nn_modules = False #2.5.0 fix
         except:
             pass
 
-    #Returns the default gemv choice based on the config
-    def get_default_gemv(self):
-        return 'GEMV_REVSPLITK' if (self.W_nbits < 8) else 'GEMV_SPLITK'
-
     #Override this function to perform dynamic activation quantization
     def scale_activations(self, x: Tensor) -> Tuple[Tensor, Tensor]:
         return x, self.scales_x
@@ -188,8 +202,8 @@ def pack(self, W_q: Tensor, scales: Tensor, zeros: Union[Tensor, int], bias: Uni
 
         #Unpacked weights
         self.W_q = None
-        if(W_q.dtype in [torch.float16, torch.int8, torch.float8_e4m3fn, torch.float8_e5m2]):
-            if(W_q.dtype == torch.float16): 
+        if(W_q.dtype in [torch.float16, torch.bfloat16, torch.int8, torch.float8_e4m3fn, torch.float8_e5m2]):
+            if(W_q.dtype in [torch.float16, torch.bfloat16]): 
                 assert self.W_nbits == 16, "Invalid fp16 weights."
             else: 
                 assert self.W_nbits == 8, "Invalid 8-bit weights."
@@ -281,7 +295,7 @@ def pack(self, W_q: Tensor, scales: Tensor, zeros: Union[Tensor, int], bias: Uni
             self.scales = torch.tensor([[]], dtype=torch.int32, device=self.device)
 
         if(self.scales is not None):
-            self.meta_dtype = DType.FP32 if self.scales.dtype == torch.float32 else DType.FP16
+            self.meta_dtype = TORCH_TO_DTYPE[self.scales.dtype]
 
         #Force contiguous
         if(contiguous):
 
@@ -25,6 +25,18 @@ class DType(Enum):
     8: torch.float8_e5m2,
 }
 
+TORCH_TO_DTYPE = {
+    torch.float32: DType.FP32,
+    torch.float16: DType.FP16,
+    torch.bfloat16: DType.BF16,
+    torch.float8_e4m3fn: DType.FP8,
+    torch.int8: DType.INT8,
+    torch.uint8: DType.UINT8,
+    torch.int32: DType.INT32,
+    torch.uint32: DType.UINT32,
+    torch.float8_e5m2: DType.FP8e5,
+}
+
 TORCH_DTYPE_TO_TRITON = {
     torch.float16:       tl.float16,
     torch.float32:       tl.float32,
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "0.4.3"`
	`1`	`+__version__ = "0.4.4"`
`2`	`2`	`__author__ = 'Dr. Hicham Badri'`
`3`	`3`	`__credits__ = 'Mobius Labs GmbH'`
`4`	`4`