spiceai
diff --git a/candle-core/src/quantized/cuda.rs b/candle-core/src/quantized/cuda.rs
Lines changed: 4 additions & 1 deletion
@@ -721,11 +721,14 @@ impl QCudaStorage {
         storage: &CudaStorage,
         layout: &crate::Layout,
     ) -> Result<(CudaStorage, crate::Shape)> {
-        // Try the fast MMVQ path first (supports BF16//F16/F32, batch 1-8, all quant types, reuses per-device workspace).
+        // Try the optimized MMVQ path first, then the optimized MMQ path. These cover most cases (BF16/F16/F32, batch 1-8, all quant types) and reuse the per-device workspace.
         if !FORCE_DMMV.load(std::sync::atomic::Ordering::Relaxed) {
             if let Some(result) = super::fast_mmvq::try_fwd(self, self_shape, storage, layout)? {
                 return Ok(result);
             }
+            if let Some(result) = super::fast_mmq::try_fwd(self, self_shape, storage, layout)? {
+                return Ok(result);
+            }
         }
 
         // Fallback