Skip to content

Commit aff7c10

Browse files
authored
Add fast CUDA MMQ GGUF kernels (huggingface#3465)
* Add fast CUDA MMQ GGUF kernels
* Adjust tolerance
1 parent b503458 commit aff7c10

21 files changed

Lines changed: 9197 additions & 5 deletions

candle-core/src/quantized/cuda.rs

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -721,11 +721,14 @@ impl QCudaStorage {
721721
storage: &CudaStorage,
722722
layout: &crate::Layout,
723723
) -> Result<(CudaStorage, crate::Shape)> {
724-
// Try the fast MMVQ path first (supports BF16//F16/F32, batch 1-8, all quant types, reuses per-device workspace).
724+
// Optimized MMVQ and MMQ paths (support most paths: BF16/F16/F32, batch 1-8, all quant types, reuses per-device workspace).
725725
if !FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) {
726726
if let Some(result) = super::fast_mmvq::try_fwd(self, self_shape, storage, layout)? {
727727
return Ok(result);
728728
}
729+
if let Some(result) = super::fast_mmq::try_fwd(self, self_shape, storage, layout)? {
730+
return Ok(result);
731+
}
729732
}
730733

731734
// Fallback

0 commit comments

Comments
 (0)