
Commit 2ca533b

Fix compilation with CUDA 11 (#2331)
1 parent 4a9b29a commit 2ca533b

File tree

11 files changed: +116 -57 lines

mlx/backend/cuda/arg_reduce.cu

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/common/utils.h"
 #include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
 #include "mlx/backend/cuda/iterators/strided_iterator.cuh"
 #include "mlx/backend/cuda/kernel_utils.cuh"
 #include "mlx/dtype_utils.h"

mlx/backend/cuda/device.cpp

Lines changed: 17 additions & 10 deletions
@@ -264,19 +264,26 @@ void CommandEncoder::commit() {
   graph_key_ += std::to_string(graph_node_count_);
   graph_key_ += ".";
   graph_key_ += std::to_string(empty_node_count_);
-  auto [it, _] = graph_cache_.emplace(graph_key_, nullptr);
-  auto& graph_exec = it->second;
-
-  if (graph_exec != NULL) {
-    cudaGraphExecUpdateResultInfo update_result;
-    cudaGraphExecUpdate(graph_exec, graph_, &update_result);
-    if (update_result.result != cudaGraphExecUpdateSuccess) {
-      cudaGetLastError();
+
+  cudaGraphExec_t& graph_exec = graph_cache_[graph_key_];
+
+  if (graph_exec != nullptr) {
+    cudaGraphExecUpdateResult update_result;
+#if CUDART_VERSION >= 12000
+    cudaGraphExecUpdateResultInfo info;
+    cudaGraphExecUpdate(graph_exec, graph_, &info);
+    update_result = info.result;
+#else
+    cudaGraphNode_t error_node;
+    cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result);
+#endif // CUDART_VERSION >= 12000
+    if (update_result != cudaGraphExecUpdateSuccess) {
+      cudaGetLastError(); // reset error
       CHECK_CUDA_ERROR(cudaGraphExecDestroy(graph_exec));
-      graph_exec = NULL;
+      graph_exec = nullptr;
     }
   }
-  if (graph_exec == NULL) {
+  if (graph_exec == nullptr) {
     CHECK_CUDA_ERROR(
         cudaGraphInstantiate(&graph_exec, graph_, NULL, NULL, 0));
   }
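For context, here is a minimal standalone sketch of the same CUDART_VERSION guard: CUDA 12 reports the update status through a single cudaGraphExecUpdateResultInfo out-parameter, while CUDA 11 uses a cudaGraphNode_t error-node pointer plus a cudaGraphExecUpdateResult. Everything besides those two signatures (the scale kernel, the try_update helper, the capture loop) is illustrative and not part of MLX.

// Standalone sketch of the version guard above; scale and try_update are
// hypothetical names used only for this example.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

static bool try_update(cudaGraphExec_t exec, cudaGraph_t graph) {
  cudaGraphExecUpdateResult result;
#if CUDART_VERSION >= 12000
  cudaGraphExecUpdateResultInfo info;  // CUDA 12: one result-info struct
  cudaGraphExecUpdate(exec, graph, &info);
  result = info.result;
#else
  cudaGraphNode_t error_node;          // CUDA 11: separate out-parameters
  cudaGraphExecUpdate(exec, graph, &error_node, &result);
#endif
  if (result != cudaGraphExecUpdateSuccess) {
    cudaGetLastError();                // clear the sticky error
    return false;
  }
  return true;
}

int main() {
  int n = 1 << 20;
  float* d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  cudaGraphExec_t exec = nullptr;
  float factors[] = {2.f, 3.f};
  for (float factor : factors) {
    // Capture the work into a graph.
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    scale<<<(n + 255) / 256, 256, 0, stream>>>(d, factor, n);
    cudaStreamEndCapture(stream, &graph);

    // Refresh the cached executable graph when possible, otherwise rebuild it.
    if (exec == nullptr || !try_update(exec, graph)) {
      if (exec != nullptr) cudaGraphExecDestroy(exec);
      cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
    }
    cudaGraphLaunch(exec, stream);
    cudaGraphDestroy(graph);
  }
  cudaStreamSynchronize(stream);
  cudaGraphExecDestroy(exec);
  cudaStreamDestroy(stream);
  cudaFree(d);
  printf("done\n");
  return 0;
}

Compiled with nvcc against either toolkit, the second iteration exercises the update path; a failed update falls back to re-instantiating the executable graph, mirroring the logic in CommandEncoder::commit().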

mlx/backend/cuda/device/cast_op.cuh

Lines changed: 68 additions & 1 deletion
@@ -3,6 +3,8 @@
 #pragma once
 
 #include <cuComplex.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
 #include <thrust/iterator/transform_iterator.h>
 
 namespace mlx::core::cu {
@@ -17,6 +19,26 @@ struct CastOp {
   }
 };
 
+// Castings between complex and boolean.
+// TODO: Should make a custom complex type.
+template <>
+struct CastOp<cuComplex, bool> {
+  static constexpr bool is_castable = true;
+
+  __device__ bool operator()(cuComplex x) {
+    return x.x != 0 && x.y != 0;
+  }
+};
+
+template <>
+struct CastOp<bool, cuComplex> {
+  static constexpr bool is_castable = true;
+
+  __device__ cuComplex operator()(bool x) {
+    return x ? make_cuFloatComplex(1, 1) : make_cuFloatComplex(0, 0);
+  }
+};
+
 // Converting a complex number to real number discards the imaginary part.
 template <typename DstT>
 struct CastOp<
@@ -45,6 +67,7 @@ struct CastOp<
   }
 };
 
+// Do nothing when no casting is needed.
 template <typename SrcT, typename DstT>
 struct CastOp<
     SrcT,
@@ -57,9 +80,53 @@ struct CastOp<
   }
 };
 
+// In CUDA 11 the half types do not define conversions between some types,
+// provide fallbacks here.
+#if CUDART_VERSION < 12000
+template <typename SrcT, typename DstT>
+struct CastOp<
+    SrcT,
+    DstT,
+    cuda::std::enable_if_t<
+        !cuda::std::is_convertible_v<SrcT, DstT> &&
+        !cuda::std::is_same_v<SrcT, cuComplex> &&
+        (cuda::std::is_same_v<DstT, __half> ||
+         cuda::std::is_same_v<DstT, __nv_bfloat16>)>> {
+  static constexpr bool is_castable = true;
+
+  __device__ DstT operator()(SrcT x) {
+    return DstT(static_cast<float>(x));
+  }
+};
+
+template <typename SrcT, typename DstT>
+struct CastOp<
+    SrcT,
+    DstT,
+    cuda::std::enable_if_t<
+        !cuda::std::is_convertible_v<SrcT, DstT> &&
+        !cuda::std::is_same_v<DstT, cuComplex> &&
+        !cuda::std::is_same_v<DstT, __half> &&
+        !cuda::std::is_same_v<DstT, __nv_bfloat16> &&
+        (cuda::std::is_same_v<SrcT, __half> ||
+         cuda::std::is_same_v<SrcT, __nv_bfloat16>)>> {
+  static constexpr bool is_castable = true;
+
+  __device__ DstT operator()(SrcT x) {
+    return DstT(static_cast<float>(x));
+  }
+};
+#endif // CUDART_VERSION < 12000
+
+// Helper to deduce the SrcT.
+template <typename DstT, typename SrcT>
+inline __host__ __device__ auto cast_to(SrcT x) {
+  return CastOp<SrcT, DstT>{}(x);
+}
+
 // Return an iterator that cast the value to DstT using CastOp.
 template <typename DstT, typename Iterator>
-__host__ __device__ auto make_cast_iterator(Iterator it) {
+inline __host__ __device__ auto make_cast_iterator(Iterator it) {
   using SrcT = typename cuda::std::iterator_traits<Iterator>::value_type;
   if constexpr (std::is_same_v<SrcT, DstT>) {
     return it;
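The new cast_to<DstT>(x) helper and the CUDA 11 fallback specializations route otherwise-unsupported conversions through float, since both __half and __nv_bfloat16 can always be constructed from float. A minimal standalone illustration of that round-trip (to_extended_float and iota_bf16 are hypothetical names, not MLX code):

// Illustrative only: a float round-trip cast in the spirit of the CUDA 11
// fallback specializations above.
#include <cuda_bf16.h>
#include <cuda_fp16.h>

template <typename DstT, typename SrcT>
__device__ DstT to_extended_float(SrcT x) {
  // An unsupported SrcT -> DstT conversion is split into two steps that both
  // toolkits provide: SrcT -> float, then float -> __half/__nv_bfloat16.
  return DstT(static_cast<float>(x));
}

__global__ void iota_bf16(__nv_bfloat16* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // A direct int -> __nv_bfloat16 conversion may be missing on CUDA 11;
    // going through float compiles on both CUDA 11 and CUDA 12.
    out[i] = to_extended_float<__nv_bfloat16>(i);
  }
}

Call sites in the reduce kernels below switch from the old __cast<U, T>(x) helper to cast_to<U>(x), so they pick up these fallbacks automatically when building with CUDA 11.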

mlx/backend/cuda/device/utils.cuh

Lines changed: 6 additions & 6 deletions
@@ -99,20 +99,20 @@ struct Limits<
     return cuda::std::numeric_limits<T>::infinity();
   }
   static constexpr __host__ __device__ T min() {
-#if defined(__CUDA_ARCH__) || CUDART_VERSION >= 12000
-    return -cuda::std::numeric_limits<T>::infinity();
-#else
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
     return -cuda::std::numeric_limits<float>::infinity();
+#else
+    return -cuda::std::numeric_limits<T>::infinity();
 #endif
   }
   static constexpr __host__ __device__ T finite_max() {
     return cuda::std::numeric_limits<T>::max();
   }
   static constexpr __host__ __device__ T finite_min() {
-#if defined(__CUDA_ARCH__) || CUDART_VERSION >= 12000
-    return cuda::std::numeric_limits<T>::lowest();
-#else
+#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
     return cuda::std::numeric_limits<float>::lowest();
+#else
+    return cuda::std::numeric_limits<T>::lowest();
 #endif
   }
 };

mlx/backend/cuda/reduce/all_reduce.cu

Lines changed: 3 additions & 3 deletions
@@ -37,15 +37,15 @@ __global__ void all_reduce(T* in, U* out, size_t block_step, size_t size) {
   for (; i + block.size() * N <= check; i += block.size() * N) {
     cub::LoadDirectBlockedVectorized<T, N>(block.thread_rank(), in + i, vals);
     for (int j = 0; j < N; j++) {
-      accs[0] = op(accs[0], __cast<U, T>(vals[j]));
+      accs[0] = op(accs[0], cast_to<U>(vals[j]));
     }
   }
 
   if (i < check) {
     cub::LoadDirectBlocked(
-        block.thread_rank(), in + i, vals, check - i, __cast<T, U>(init));
+        block.thread_rank(), in + i, vals, check - i, cast_to<T>(init));
     for (int i = 0; i < N; i++) {
-      accs[0] = op(accs[0], __cast<U, T>(vals[i]));
+      accs[0] = op(accs[0], cast_to<U>(vals[i]));
     }
   }

mlx/backend/cuda/reduce/col_reduce.cu

Lines changed: 4 additions & 5 deletions
@@ -3,7 +3,6 @@
 #include <numeric>
 
 #include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"
 
 #include <cooperative_groups.h>
@@ -128,7 +127,7 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
     T vals[N_READS];
     cub::LoadDirectBlockedVectorized(thread_x, in + loop.location(), vals);
     for (int i = 0; i < N_READS; i++) {
-      totals[i] = op(totals[i], __cast<U, T>(vals[i]));
+      totals[i] = op(totals[i], cast_to<U>(vals[i]));
     }
     loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
   }
@@ -137,7 +136,7 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
     T vals[N_READS];
     cub::LoadDirectBlocked(thread_x, in + loop.location(), vals);
     for (int i = 0; i < N_READS; i++) {
-      totals[i] = op(totals[i], __cast<U, T>(vals[i]));
+      totals[i] = op(totals[i], cast_to<U>(vals[i]));
     }
     loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
   }
@@ -150,9 +149,9 @@ col_reduce_looped(T* in, U* out, const __grid_constant__ ColReduceArgs args) {
         in + loop.location(),
         vals,
         args.reduction_stride - tile_x * BN,
-        __cast<T, U>(ReduceInit<Op, T>::value()));
+        cast_to<T>(ReduceInit<Op, T>::value()));
     for (int i = 0; i < N_READS; i++) {
-      totals[i] = op(totals[i], __cast<U, T>(vals[i]));
+      totals[i] = op(totals[i], cast_to<U>(vals[i]));
     }
     loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
   }

mlx/backend/cuda/reduce/reduce_ops.cuh

Lines changed: 7 additions & 5 deletions
@@ -2,6 +2,8 @@
 
 #pragma once
 
+#include "mlx/backend/cuda/device/atomic_ops.cuh"
+#include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/device/utils.cuh"
 #include "mlx/backend/cuda/reduce/reduce_utils.cuh"
 
@@ -40,15 +42,15 @@ struct Sum {
   }
 
   __device__ void atomic_update(__nv_bfloat16* x, __nv_bfloat16 y) {
-    atomicAdd(x, y);
+    atomic_add(x, y);
   }
 
  __device__ void atomic_update(int* x, int y) {
-    atomicAdd(x, y);
+    atomic_add(x, y);
  }
 
  __device__ void atomic_update(float* x, float y) {
-    atomicAdd(x, y);
+    atomic_add(x, y);
  }
 };
 
@@ -152,7 +154,7 @@ struct ReduceInit<Sum, T> {
     if constexpr (cuda::std::is_same_v<T, cuComplex>) {
       return T{0, 0};
     } else {
-      return typename ReduceResult<Sum, T>::type{0};
+      return cast_to<typename ReduceResult<Sum, T>::type>(0);
     }
   }
 };
@@ -163,7 +165,7 @@ struct ReduceInit<Prod, T> {
     if constexpr (cuda::std::is_same_v<T, cuComplex>) {
       return T{1, 0};
     } else {
-      return typename ReduceResult<Prod, T>::type{1};
+      return cast_to<typename ReduceResult<Prod, T>::type>(1);
    }
  }
 };
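The atomic_update overloads now go through MLX's atomic_add wrapper from atomic_ops.cuh rather than calling atomicAdd directly; the wrapper's implementation is not part of this diff. As a rough sketch of why a wrapper helps for 16-bit types, assuming no native atomicAdd overload for __nv_bfloat16 is available on the target toolkit or architecture, the add can be emulated with an atomicCAS loop on the enclosing 32-bit word (hypothetical code, not MLX's implementation):

// Hypothetical CAS-based fallback: adds a __nv_bfloat16 value by retrying on
// the aligned 32-bit word that contains it.
#include <cuda_bf16.h>
#include <cstdint>

__device__ void atomic_add_bf16_fallback(__nv_bfloat16* addr, __nv_bfloat16 val) {
  uintptr_t raw_addr = reinterpret_cast<uintptr_t>(addr);
  unsigned int* word = reinterpret_cast<unsigned int*>(raw_addr & ~uintptr_t{3});
  bool high_half = (raw_addr & 2) != 0;  // which 16 bits of the word we own

  unsigned int old = *word;
  unsigned int assumed;
  do {
    assumed = old;
    unsigned short bits = high_half ? (unsigned short)(assumed >> 16)
                                    : (unsigned short)(assumed & 0xffffu);
    // Do the arithmetic in float, then round back to bfloat16.
    float sum = __bfloat162float(__ushort_as_bfloat16(bits)) + __bfloat162float(val);
    unsigned short new_bits = __bfloat16_as_ushort(__float2bfloat16(sum));
    unsigned int desired = high_half
        ? ((assumed & 0x0000ffffu) | ((unsigned int)new_bits << 16))
        : ((assumed & 0xffff0000u) | new_bits);
    old = atomicCAS(word, assumed, desired);
  } while (old != assumed);  // retry if another thread changed the word
}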

mlx/backend/cuda/reduce/reduce_utils.cuh

Lines changed: 0 additions & 16 deletions
@@ -55,22 +55,6 @@ __device__ void atomic_reduce(T* x, T y) {
   }
 }
 
-// TODO: Should make a custom complex type
-template <typename U, typename T>
-inline __device__ U __cast(T x) {
-  return static_cast<U>(x);
-}
-
-template <>
-inline __device__ bool __cast<bool, cuComplex>(cuComplex x) {
-  return x.x != 0 && x.y != 0;
-}
-
-template <>
-inline __device__ cuComplex __cast<cuComplex, bool>(bool x) {
-  return x ? make_cuFloatComplex(1, 1) : make_cuFloatComplex(0, 0);
-}
-
 template <typename T, int N, typename Block, typename Warp, typename Op>
 inline __device__ void
 block_reduce(Block block, Warp warp, T (&vals)[N], T* smem, Op op, T init) {

mlx/backend/cuda/reduce/row_reduce.cu

Lines changed: 7 additions & 8 deletions
@@ -3,7 +3,6 @@
 #include <numeric>
 
 #include "mlx/backend/cuda/device.h"
-#include "mlx/backend/cuda/device/cast_op.cuh"
 #include "mlx/backend/cuda/reduce/reduce.cuh"
 
 #include <cooperative_groups.h>
@@ -113,7 +112,7 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
           in + k * size + r * (block.size() * N),
           vals[k]);
       for (int j = 0; j < N; j++) {
-        accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
+        accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
      }
    }
  }
@@ -125,7 +124,7 @@ __global__ void row_reduce_simple(T* in, U* out, size_t n_rows, int size) {
           in + k * size + r * (block.size() * N),
           vals[k]);
       for (int j = 0; j < N; j++) {
-        accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
+        accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
      }
    }
  }
@@ -138,9 +137,9 @@
           in + k * size + final_offset,
           vals[k],
           size,
-          __cast<T, U>(init));
+          cast_to<T>(init));
       for (int j = 0; j < N; j++) {
-        accs[k] = op(accs[k], __cast<U, T>(vals[k][j]));
+        accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
      }
    }
  }
@@ -199,7 +198,7 @@ __global__ void row_reduce_looped(
         in + loop.location() + r * BLOCK_DIM * N_READS,
         vals);
     for (int i = 0; i < N_READS; i++) {
-      total[0] = op(total[0], __cast<U, T>(vals[i]));
+      total[0] = op(total[0], cast_to<U>(vals[i]));
     }
   }
   if (final_offset < args.row_size) {
@@ -209,9 +208,9 @@
         in + loop.location() + final_offset,
         vals,
         args.row_size - final_offset,
-        __cast<T, U>(init));
+        cast_to<T>(init));
     for (int i = 0; i < N_READS; i++) {
-      total[0] = op(total[0], __cast<U, T>(vals[i]));
+      total[0] = op(total[0], cast_to<U>(vals[i]));
    }
  }
  // TODO: Maybe block.sync() here?

mlx/backend/cuda/rms_norm.cu

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ __global__ void rms_norm(
   for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
     auto index = r * BLOCK_DIM + block.thread_rank();
     T xn[N_READS];
-    cub::LoadDirectBlocked(index, x, xn, axis_size, 0);
+    cub::LoadDirectBlocked(index, x, xn, axis_size, cast_to<T>(0));
     for (int i = 0; i < N_READS; ++i) {
       float t = static_cast<float>(xn[i]);
       normalizer += t * t;
@@ -130,7 +130,7 @@ __global__ void rms_norm_vjp(
     T wn[N_READS] = {};
     T gn[N_READS] = {};
     auto index = r * BLOCK_DIM + block.thread_rank();
-    cub::LoadDirectBlocked(index, x, xn, axis_size, 0);
+    cub::LoadDirectBlocked(index, x, xn, axis_size, cast_to<T>(0));
     cub::LoadDirectBlocked(index, g, gn, axis_size);
     cub::LoadDirectBlocked(index, strided_iterator(w, w_stride), wn, axis_size);
     for (int i = 0; i < N_READS; i++) {
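The only change in this file is the out-of-bounds default passed to cub::LoadDirectBlocked: a bare 0 relies on an implicit int-to-T conversion that the CUDA 11 half types do not always provide, so the default is now built with cast_to<T>(0). A small standalone sketch of the same guarded-load pattern (block_sum and its launch are illustrative, not MLX code):

// Illustrative guarded block load: elements past `size` are filled with an
// explicitly constructed T instead of a bare integer literal.
#include <cub/cub.cuh>
#include <cuda_fp16.h>

template <typename T, int N_READS>
__global__ void block_sum(const T* in, float* out, int size) {
  T vals[N_READS];
  // Thread t loads in[t * N_READS .. t * N_READS + N_READS - 1]; anything
  // beyond `size` becomes T(0.0f) (constructing from float also works for
  // __half and __nv_bfloat16 on CUDA 11).
  cub::LoadDirectBlocked(threadIdx.x, in, vals, size, T(0.0f));

  float acc = 0.0f;
  for (int i = 0; i < N_READS; ++i) {
    acc += static_cast<float>(vals[i]);
  }
  atomicAdd(out, acc);  // float atomicAdd is available on all supported archs
}

// Example launch: one block whose threads cover the whole row.
// block_sum<__half, 4><<<1, 256>>>(d_in, d_out, size);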
