
Commit 0cb7965

Fp qmv (#2984)
1 parent c9031c3 commit 0cb7965

22 files changed: +1050 -162 lines

mlx/backend/cpu/quantized.cpp

Lines changed: 156 additions & 29 deletions
@@ -14,6 +14,19 @@ namespace mlx::core {
 
 namespace {
 
+array ensure_row_contiguous(
+    const array& arr,
+    cpu::CommandEncoder& encoder,
+    Stream s) {
+  if (arr.flags().row_contiguous) {
+    return arr;
+  } else {
+    auto arr_cpy = contiguous_copy_cpu(arr, s);
+    encoder.add_temporary(arr_cpy);
+    return arr_cpy;
+  }
+};
+
 const static float FP4_LUT[16] = {
     +0.0f,
     +0.5f,
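This file-scope helper consolidates the per-call ensure_row_contiguous lambdas that QuantizedMatmul::eval_cpu and fast::Quantize::eval_cpu defined locally (both removed in the hunks below). Because it registers any copy as an encoder temporary itself, callers no longer have to thread a copied flag through, and the new QQMatmul::eval_cpu can reuse it directly.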
@@ -922,20 +935,9 @@ void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
   auto& scales_pre = inputs[2];
 
   auto& encoder = cpu::get_command_encoder(stream());
-  auto ensure_row_contiguous = [s = stream(), &encoder](const array& arr) {
-    if (arr.flags().row_contiguous) {
-      return arr;
-    } else {
-      auto arr_cpy = array(arr.shape(), arr.dtype(), nullptr, {});
-      copy_cpu(arr, arr_cpy, CopyType::General, s);
-      encoder.add_temporary(arr_cpy);
-      return arr_cpy;
-    }
-  };
-
-  auto x = ensure_row_contiguous(x_pre);
-  auto w = ensure_row_contiguous(w_pre);
-  auto scales = ensure_row_contiguous(scales_pre);
+  auto x = ensure_row_contiguous(x_pre, encoder, stream());
+  auto w = ensure_row_contiguous(w_pre, encoder, stream());
+  auto scales = ensure_row_contiguous(scales_pre, encoder, stream());
 
   out.set_data(allocator::malloc(out.nbytes()));
 
@@ -944,7 +946,7 @@ void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
   encoder.set_input_array(scales);
   encoder.set_output_array(out);
   if (mode_ == QuantizationMode::Affine) {
-    auto biases = ensure_row_contiguous(inputs[3]);
+    auto biases = ensure_row_contiguous(inputs[3], encoder, stream());
     encoder.set_input_array(biases);
     encoder.dispatch([out = array::unsafe_weak_copy(out),
                       x = array::unsafe_weak_copy(x),
@@ -1052,6 +1054,105 @@ void GatherQMM::eval_cpu(const std::vector<array>& inputs, array& out) {
   }
 }
 
+uint8_t to_fp8_e8m0(float x) {
+  if (!std::isfinite(x)) {
+    return 0xFF;
+  }
+  if (x < 0.0f) {
+    return 0x00;
+  }
+  float le = std::log2(x);
+  int n = int(std::round(le));
+
+  n = n < -127 ? -127 : n;
+  n = n > 127 ? 127 : n;
+  return static_cast<uint8_t>(n + 127);
+}
+
+uint8_t to_fp4_e2m1(float x) {
+  if (std::isnan(x)) {
+    return 0x7;
+  }
+
+  const uint8_t sign_bit = (std::signbit(x)) ? 0x8 : 0x0;
+  x = std::abs(x);
+
+  uint8_t bits;
+  if (x > 5.0f) {
+    bits = 0x7;
+  } else if (x >= 3.5f) {
+    bits = 0x6;
+  } else if (x > 2.5f) {
+    bits = 0x5;
+  } else if (x >= 1.75f) {
+    bits = 0x4;
+  } else if (x > 1.25f) {
+    bits = 0x3;
+  } else if (x >= 0.75f) {
+    bits = 0x2;
+  } else if (x > 0.25f) {
+    bits = 0x1;
+  } else {
+    bits = 0x0;
+  }
+  return bits | sign_bit;
+}
+
+template <typename T>
+void fp_quantize_dequantize(
+    const array& w_arr,
+    array& out_arr,
+    int bits,
+    int group_size,
+    size_t w_size) {
+  auto w = w_arr.data<T>();
+  auto out = out_arr.data<T>();
+
+  size_t n_groups = w_size / group_size;
+
+  for (size_t i = 0; i < n_groups; ++i) {
+    size_t idx = i * group_size;
+    float scale = -std::numeric_limits<float>::infinity();
+    for (int j = 0; j < group_size; ++j) {
+      scale = std::max(scale, std::abs(w[idx + j]));
+    }
+    scale /= bits == 4 ? 6.0f : 448.0f;
+    if (group_size == 16) {
+      scale = dequantize_scale<float, 16>(detail::ToFP8()(scale));
+    } else {
+      scale = dequantize_scale<float, 32>(to_fp8_e8m0(scale));
+    }
+
+    for (int j = 0; j < group_size; ++j) {
+      float w_el = scale == 0 ? 0.0f : w[idx + j] / scale;
+      float output;
+      if (bits == 8) {
+        output = detail::FromFP8()(detail::ToFP8()(w_el));
+      } else {
+        output = FP4_LUT[to_fp4_e2m1(w_el)];
+      }
+      out[idx + j] = static_cast<T>(scale * output);
+    }
+  }
+}
+
+void dispatch_quantize_dequantize(
+    const array& w,
+    array& out,
+    int bits,
+    int group_size) {
+  if (w.dtype() == float16) {
+    fp_quantize_dequantize<float16_t>(w, out, bits, group_size, w.size());
+  } else if (w.dtype() == bfloat16) {
+    fp_quantize_dequantize<bfloat16_t>(w, out, bits, group_size, w.size());
+  } else if (w.dtype() == float32) {
+    fp_quantize_dequantize<float>(w, out, bits, group_size, w.size());
+  } else {
+    throw std::runtime_error(
+        "[quantize_dequantize] Only supports floating point inputs");
+  }
+}
+
 template <typename T, typename U>
 void quantize(
     const T* w,
@@ -1136,26 +1237,15 @@ void dispatch_quantize(
 void fast::Quantize::eval_cpu(
     const std::vector<array>& inputs,
     std::vector<array>& outputs) {
-  auto ensure_row_contiguous = [s = stream()](const array& arr) {
-    if (arr.flags().row_contiguous) {
-      return std::make_pair(arr, false);
-    } else {
-      return std::make_pair(contiguous_copy_cpu(arr, s), true);
-    }
-  };
-
-  auto [w, copied] = ensure_row_contiguous(inputs[0]);
+  auto& encoder = cpu::get_command_encoder(stream());
+  auto w = ensure_row_contiguous(inputs[0], encoder, stream());
   auto& out = outputs[0];
   out.set_data(allocator::malloc(out.nbytes()));
 
   auto& scales = outputs[1];
   auto& biases = outputs[2];
   scales.set_data(allocator::malloc(scales.nbytes()));
   biases.set_data(allocator::malloc(biases.nbytes()));
-  auto& encoder = cpu::get_command_encoder(stream());
-  if (copied) {
-    encoder.add_temporary(w);
-  }
   encoder.set_input_array(w);
   encoder.set_input_array(scales);
   encoder.set_input_array(biases);
@@ -1238,6 +1328,43 @@ void fast::ConvertFP8::eval_cpu(
 }
 
 void QQMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
-  throw std::runtime_error("QQMatmul not implemented on CPU.");
+  auto& encoder = cpu::get_command_encoder(stream());
+
+  bool w_quantized = (inputs[1].dtype() == uint32);
+  if (w_quantized && inputs[0].shape(-2) == 1) {
+    bool donate_x = inputs[0].is_donatable();
+    auto x = ensure_row_contiguous(inputs[0], encoder, stream());
+    auto w = ensure_row_contiguous(inputs[1], encoder, stream());
+    auto scales = ensure_row_contiguous(inputs[2], encoder, stream());
+
+    out.set_data(allocator::malloc(out.nbytes()));
+
+    // If x is a copy it should be donatable
+    donate_x |= x.is_donatable();
+    auto xhat = donate_x
+        ? x
+        : array(allocator::malloc(x.nbytes()), x.shape(), x.dtype());
+    if (!donate_x) {
+      encoder.add_temporary(xhat);
+    }
+    encoder.set_input_array(x);
+    encoder.set_input_array(w);
+    encoder.set_input_array(scales);
+    encoder.set_output_array(out);
+    encoder.dispatch([out = array::unsafe_weak_copy(out),
+                      x = array::unsafe_weak_copy(x),
+                      xhat = array::unsafe_weak_copy(xhat),
+                      w = array::unsafe_weak_copy(w),
+                      scales = array::unsafe_weak_copy(scales),
+                      group_size_ = group_size_,
+                      bits_ = bits_]() mutable {
+      dispatch_quantize_dequantize(x, xhat, bits_, group_size_);
+      fp_qmm_dispatch(out, xhat, w, scales, group_size_, bits_, true);
+    });
+    return;
+  } else {
+    throw std::runtime_error("[QQMatmul] NYI for the general case");
+  }
 }
+
 } // namespace mlx::core
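The new QQMatmul CPU path above handles only the matrix-vector case (a single row in inputs[0] against uint32-packed weights): it fake-quantizes x into xhat with dispatch_quantize_dequantize, so the activations are rounded onto the same fp4/fp8 grid as the weights, then hands off to fp_qmm_dispatch (declared in another of the 22 changed files). The two scalar encoders it adds are easy to sanity-check in isolation: e8m0 is an exponent-only byte (value 2^(code - 127), 0xFF reserved for NaN), so to_fp8_e8m0 rounds log2(x) to the nearest integer and clamps it to [-127, 127]; e2m1 has a sign bit, two exponent bits, and one mantissa bit, so its magnitudes are {0, 0.5, 1, 1.5, 2, 3, 4, 6}, and the threshold chain in to_fp4_e2m1 sits at the midpoints between neighbors, with the mix of > and >= implementing round-half-to-even. A minimal standalone sketch (the main harness is illustrative, not part of the commit):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Mirrors the commit's to_fp8_e8m0: keep only a biased exponent.
uint8_t to_fp8_e8m0(float x) {
  if (!std::isfinite(x)) {
    return 0xFF;
  }
  if (x < 0.0f) {
    return 0x00;
  }
  float le = std::log2(x);
  int n = int(std::round(le));

  n = n < -127 ? -127 : n;
  n = n > 127 ? 127 : n;
  return static_cast<uint8_t>(n + 127);
}

// Positive e2m1 magnitudes, indexed by the low three bits of the fp4 code.
const float FP4_POS[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};

int main() {
  printf("e8m0(0.25) = %d\n", to_fp8_e8m0(0.25f)); // 125: 2^(125 - 127) = 0.25
  printf("e8m0(448)  = %d\n", to_fp8_e8m0(448.0f)); // 136: 448 rounds to 2^9
  // Tie-breaking in to_fp4_e2m1: 1.25 is the midpoint of 1.0 and 1.5 and the
  // chain uses '>', so 1.25 stays at 1.0 (even code); 3.5 is the midpoint of
  // 3.0 and 4.0 and the chain uses '>=', so 3.5 rounds up to 4.0 (even code).
  printf("max e2m1   = %g\n", FP4_POS[7]); // 6
  return 0;
}

The 6.0f and 448.0f divisors in fp_quantize_dequantize are exactly these format maxima: the per-group scale is chosen so the group's largest magnitude lands on the largest representable e2m1 or e4m3 value.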

mlx/backend/cuda/CMakeLists.txt

Lines changed: 7 additions & 4 deletions
@@ -56,7 +56,10 @@ target_sources(
     ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/quantized/fp_quantize.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qmv.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm_utils.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/quantized/convert_fp8.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
 
@@ -66,12 +69,12 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)
 # fp4 is not available on < 12.8
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8.0)
   target_include_directories(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/)
+  target_sources(mlx
+                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/no_qqmm_impl.cpp)
 else()
   target_sources(
-    mlx
-    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm.cpp
-            ${CMAKE_CURRENT_SOURCE_DIR}/quantized/cublas_qqmm.cpp
-            ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm_utils.cu)
+    mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm_impl.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/quantized/cublas_qqmm.cpp)
 endif()
 
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)

mlx/backend/cuda/primitives.cpp

Lines changed: 0 additions & 8 deletions
@@ -24,20 +24,12 @@ namespace mlx::core {
     throw std::runtime_error(#func " has no CUDA implementation."); \
   }
 
-#if CUDART_VERSION < 12080
-void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
-  throw std::runtime_error(
-      "[QQMatmul::eval_gpu] QQMM is only supported with CUDA 12.8 or higher.");
-}
-#endif
-
 NO_GPU(BlockMaskedMM)
 NO_GPU(FFT)
 NO_GPU(GatherQMM)
 NO_GPU(Hadamard)
 NO_GPU_MULTI(LUF)
 NO_GPU_MULTI(QRF)
-NO_GPU(QuantizedMatmul)
 NO_GPU(SegmentedMM)
 NO_GPU_MULTI(SVD)
 NO_GPU(Inverse)
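Both removals line up with the rest of the commit: QuantizedMatmul gains a CUDA implementation (the new quantized/qmv.cu registered in the CMakeLists change), so it leaves the NO_GPU list, and the CUDA-version stub for QQMatmul presumably moves into quantized/no_qqmm_impl.cpp, which is now compiled on toolkits older than 12.8 in place of this #if guard.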

mlx/backend/cuda/quantized/cuda_fp4.h

Lines changed: 17 additions & 0 deletions
@@ -81,3 +81,20 @@ struct __nv_fp4_e2m1 {
   }
   uint8_t __x{0};
 };
+
+struct __nv_fp4x4_e2m1 {
+  __device__ operator float4() {
+    float4 out;
+    auto bits = __high & 0xf;
+    out.x = float(*(__nv_fp4_e2m1*)(&bits));
+    bits = (__high >> 4) & 0xf;
+    out.y = float(*(__nv_fp4_e2m1*)(&bits));
+    bits = (__low) & 0xf;
+    out.z = float(*(__nv_fp4_e2m1*)(&bits));
+    bits = (__low >> 4) & 0xf;
+    out.w = float(*(__nv_fp4_e2m1*)(&bits));
+    return out;
+  }
+  uint8_t __high{0};
+  uint8_t __low{0};
+};
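The struct added here complements the single-element __nv_fp4_e2m1 wrapper in this compatibility header (which, per the CMakeLists change above, only goes on the include path for CUDA toolkits older than 12.8, where the real fp4 types are unavailable). The unpack order is fixed: x and y come from the low and high nibbles of __high, z and w from the low and high nibbles of __low. A host-side C++ sketch of the same bit layout, using the standard e2m1 value table (the harness is hypothetical, not part of the header):

#include <cstdint>
#include <cstdio>

// Decode one e2m1 nibble: the top bit is the sign, the low three bits index
// the magnitude table {0, 0.5, 1, 1.5, 2, 3, 4, 6}.
float fp4_to_float(uint8_t code) {
  static const float lut[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};
  float mag = lut[code & 0x7];
  return (code & 0x8) ? -mag : mag;
}

int main() {
  // Same nibble order as __nv_fp4x4_e2m1::operator float4().
  uint8_t high = 0x2b; // x = low nibble 0xb -> -1.5, y = high nibble 0x2 -> 1.0
  uint8_t low = 0x70;  // z = low nibble 0x0 -> 0.0, w = high nibble 0x7 -> 6.0
  printf(
      "x=%g y=%g z=%g w=%g\n",
      fp4_to_float(high & 0xf),
      fp4_to_float((high >> 4) & 0xf),
      fp4_to_float(low & 0xf),
      fp4_to_float((low >> 4) & 0xf));
  return 0;
}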
