Skip to content

Commit 3d5e17e

Browse files
authored
MLX_SWITCH macros to templates (#2320)
1 parent 33bf1a2 commit 3d5e17e

27 files changed

+702
-701
lines changed

mlx/backend/cuda/arg_reduce.cu

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -152,35 +152,29 @@ void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
152152
encoder.set_input_array(in);
153153
encoder.set_output_array(out);
154154
encoder.launch_kernel([&](cudaStream_t stream) {
155-
MLX_SWITCH_REAL_TYPES_CHECKED(in.dtype(), "ArgReduce", CTYPE, {
156-
using InType = cuda_type_t<CTYPE>;
155+
dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
156+
using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
157157
constexpr uint32_t N_READS = 4;
158-
MLX_SWITCH_BLOCK_DIM(cuda::ceil_div(axis_size, N_READS), BLOCK_DIM, {
159-
dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
160-
dim3 block_dims{BLOCK_DIM, 1, 1};
161-
auto kernel = &cu::arg_reduce_general<
162-
InType,
163-
cu::ArgMax<InType>,
164-
BLOCK_DIM,
165-
N_READS>;
166-
if (reduce_type_ == ArgReduce::ArgMin) {
167-
kernel = &cu::arg_reduce_general<
168-
InType,
169-
cu::ArgMin<InType>,
170-
BLOCK_DIM,
171-
N_READS>;
172-
}
173-
kernel<<<num_blocks, block_dims, 0, stream>>>(
174-
in.data<InType>(),
175-
out.data<uint32_t>(),
176-
out.size(),
177-
const_param(shape),
178-
const_param(in_strides),
179-
const_param(out_strides),
180-
ndim,
181-
axis_stride,
182-
axis_size);
183-
});
158+
dispatch_block_dim(
159+
cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
160+
dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
161+
auto kernel =
162+
cu::arg_reduce_general<T, cu::ArgMax<T>, block_dim(), N_READS>;
163+
if (reduce_type_ == ArgReduce::ArgMin) {
164+
kernel = cu::
165+
arg_reduce_general<T, cu::ArgMin<T>, block_dim(), N_READS>;
166+
}
167+
kernel<<<num_blocks, block_dim(), 0, stream>>>(
168+
in.data<T>(),
169+
out.data<uint32_t>(),
170+
out.size(),
171+
const_param(shape),
172+
const_param(in_strides),
173+
const_param(out_strides),
174+
ndim,
175+
axis_stride,
176+
axis_size);
177+
});
184178
});
185179
});
186180
}

mlx/backend/cuda/binary.cu

Lines changed: 52 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -140,54 +140,64 @@ void binary_op_gpu_inplace(
140140
encoder.set_input_array(b);
141141
encoder.set_output_array(out);
142142
encoder.launch_kernel([&](cudaStream_t stream) {
143-
MLX_SWITCH_ALL_TYPES(a.dtype(), CTYPE_IN, {
144-
MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, {
143+
dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
144+
dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
145+
using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
146+
using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
145147
if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
146148
using InType = cuda_type_t<CTYPE_IN>;
147149
using OutType = cuda_type_t<CTYPE_OUT>;
148150
auto bopt = get_binary_op_type(a, b);
149151
if (bopt == BinaryOpType::General) {
150-
auto [shape, strides] = collapse_contiguous_dims(a, b, out);
151-
auto& a_strides = strides[0];
152-
auto& b_strides = strides[1];
153-
bool large = a.data_size() > INT32_MAX ||
154-
b.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
155-
MLX_SWITCH_BOOL(large, LARGE, {
156-
using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
157-
int ndim = shape.size();
158-
if (ndim <= 3) {
159-
MLX_SWITCH_1_2_3(ndim, NDIM, {
160-
auto kernel =
161-
&cu::binary_g_nd<Op, InType, OutType, IdxT, NDIM>;
162-
auto [num_blocks, block_dims] =
163-
get_launch_args(kernel, out, large);
164-
kernel<<<num_blocks, block_dims, 0, stream>>>(
165-
a.data<InType>(),
166-
b.data<InType>(),
167-
out.data<OutType>(),
168-
out.size(),
169-
const_param<NDIM>(shape),
170-
const_param<NDIM>(a_strides),
171-
const_param<NDIM>(b_strides));
152+
dispatch_bool(
153+
a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
154+
out.data_size() > INT32_MAX,
155+
[&](auto large) {
156+
using IdxT = std::conditional_t<large(), int64_t, int32_t>;
157+
Shape shape;
158+
std::vector<Strides> strides;
159+
std::tie(shape, strides) =
160+
collapse_contiguous_dims(a, b, out);
161+
auto& a_strides = strides[0];
162+
auto& b_strides = strides[1];
163+
int ndim = shape.size();
164+
if (ndim <= 3) {
165+
dispatch_1_2_3(ndim, [&](auto dims_constant) {
166+
auto kernel = cu::binary_g_nd<
167+
Op,
168+
InType,
169+
OutType,
170+
IdxT,
171+
dims_constant()>;
172+
auto [num_blocks, block_dims] =
173+
get_launch_args(kernel, out, large());
174+
kernel<<<num_blocks, block_dims, 0, stream>>>(
175+
a.data<InType>(),
176+
b.data<InType>(),
177+
out.data<OutType>(),
178+
out.size(),
179+
const_param<dims_constant()>(shape),
180+
const_param<dims_constant()>(a_strides),
181+
const_param<dims_constant()>(b_strides));
182+
});
183+
} else {
184+
auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
185+
auto [num_blocks, block_dims] =
186+
get_launch_args(kernel, out, large());
187+
kernel<<<num_blocks, block_dims, 0, stream>>>(
188+
a.data<InType>(),
189+
b.data<InType>(),
190+
out.data<OutType>(),
191+
out.size(),
192+
const_param(shape),
193+
const_param(a_strides),
194+
const_param(b_strides),
195+
ndim);
196+
}
172197
});
173-
} else {
174-
auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
175-
auto [num_blocks, block_dims] =
176-
get_launch_args(kernel, out, large);
177-
kernel<<<num_blocks, block_dims, 0, stream>>>(
178-
a.data<InType>(),
179-
b.data<InType>(),
180-
out.data<OutType>(),
181-
out.size(),
182-
const_param(shape),
183-
const_param(a_strides),
184-
const_param(b_strides),
185-
ndim);
186-
}
187-
});
188198
} else {
189-
MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
190-
using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
199+
dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
200+
using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
191201
auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
192202
if (bopt == BinaryOpType::ScalarVector) {
193203
kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
@@ -197,7 +207,7 @@ void binary_op_gpu_inplace(
197207
kernel = cu::binary_vv<Op, InType, OutType, IdxT>;
198208
}
199209
auto [num_blocks, block_dims] = get_launch_args(
200-
kernel, out.data_size(), out.shape(), out.strides(), LARGE);
210+
kernel, out.data_size(), out.shape(), out.strides(), large());
201211
kernel<<<num_blocks, block_dims, 0, stream>>>(
202212
a.data<InType>(),
203213
b.data<InType>(),

mlx/backend/cuda/binary_two.cu

Lines changed: 54 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -138,57 +138,67 @@ void binary_op_gpu_inplace(
138138
encoder.set_output_array(out_a);
139139
encoder.set_output_array(out_b);
140140
encoder.launch_kernel([&](cudaStream_t stream) {
141-
MLX_SWITCH_ALL_TYPES(a.dtype(), CTYPE_IN, {
142-
MLX_SWITCH_ALL_TYPES(out_a.dtype(), CTYPE_OUT, {
141+
dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
142+
dispatch_all_types(out_a.dtype(), [&](auto out_type_tag) {
143+
using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
144+
using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
143145
if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
144146
using InType = cuda_type_t<CTYPE_IN>;
145147
using OutType = cuda_type_t<CTYPE_OUT>;
146148

147149
auto bopt = get_binary_op_type(a, b);
148150
if (bopt == BinaryOpType::General) {
149-
auto [shape, strides] = collapse_contiguous_dims(a, b, out_a);
150-
auto& a_strides = strides[0];
151-
auto& b_strides = strides[1];
152-
bool large = a.data_size() > INT32_MAX ||
153-
b.data_size() > INT32_MAX || out_a.data_size() > INT32_MAX;
154-
MLX_SWITCH_BOOL(large, LARGE, {
155-
using IdxT = std::conditional_t<LARGE, int64_t, int32_t>;
156-
int ndim = shape.size();
157-
if (ndim <= 3) {
158-
MLX_SWITCH_1_2_3(ndim, NDIM, {
159-
auto kernel =
160-
cu::binary_g_nd<Op, InType, OutType, IdxT, NDIM>;
161-
auto [num_blocks, block_dims] =
162-
get_launch_args(kernel, out_a, large);
163-
kernel<<<num_blocks, block_dims, 0, stream>>>(
164-
a.data<InType>(),
165-
b.data<InType>(),
166-
out_a.data<OutType>(),
167-
out_b.data<OutType>(),
168-
out_a.size(),
169-
const_param<NDIM>(shape),
170-
const_param<NDIM>(a_strides),
171-
const_param<NDIM>(b_strides));
151+
dispatch_bool(
152+
a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
153+
out_a.data_size() > INT32_MAX,
154+
[&](auto large) {
155+
using IdxT = std::conditional_t<large(), int64_t, int32_t>;
156+
Shape shape;
157+
std::vector<Strides> strides;
158+
std::tie(shape, strides) =
159+
collapse_contiguous_dims(a, b, out_a);
160+
auto& a_strides = strides[0];
161+
auto& b_strides = strides[1];
162+
int ndim = shape.size();
163+
if (ndim <= 3) {
164+
dispatch_1_2_3(ndim, [&](auto dims_constant) {
165+
auto kernel = cu::binary_g_nd<
166+
Op,
167+
InType,
168+
OutType,
169+
IdxT,
170+
dims_constant()>;
171+
auto [num_blocks, block_dims] =
172+
get_launch_args(kernel, out_a, large());
173+
kernel<<<num_blocks, block_dims, 0, stream>>>(
174+
a.data<InType>(),
175+
b.data<InType>(),
176+
out_a.data<OutType>(),
177+
out_b.data<OutType>(),
178+
out_a.size(),
179+
const_param<dims_constant()>(shape),
180+
const_param<dims_constant()>(a_strides),
181+
const_param<dims_constant()>(b_strides));
182+
});
183+
} else {
184+
auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
185+
auto [num_blocks, block_dims] =
186+
get_launch_args(kernel, out_a, large());
187+
kernel<<<num_blocks, block_dims, 0, stream>>>(
188+
a.data<InType>(),
189+
b.data<InType>(),
190+
out_a.data<OutType>(),
191+
out_b.data<OutType>(),
192+
out_a.size(),
193+
const_param(shape),
194+
const_param(a_strides),
195+
const_param(b_strides),
196+
ndim);
197+
}
172198
});
173-
} else {
174-
auto kernel = cu::binary_g<Op, InType, OutType, IdxT>;
175-
auto [num_blocks, block_dims] =
176-
get_launch_args(kernel, out_a, large);
177-
kernel<<<num_blocks, block_dims, 0, stream>>>(
178-
a.data<InType>(),
179-
b.data<InType>(),
180-
out_a.data<OutType>(),
181-
out_b.data<OutType>(),
182-
out_a.size(),
183-
const_param(shape),
184-
const_param(a_strides),
185-
const_param(b_strides),
186-
ndim);
187-
}
188-
});
189199
} else {
190-
MLX_SWITCH_BOOL(out_a.data_size() > UINT32_MAX, LARGE, {
191-
using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
200+
dispatch_bool(out_a.data_size() > INT32_MAX, [&](auto large) {
201+
using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
192202
auto kernel = cu::binary_ss<Op, InType, OutType, IdxT>;
193203
if (bopt == BinaryOpType::ScalarVector) {
194204
kernel = cu::binary_sv<Op, InType, OutType, IdxT>;
@@ -202,7 +212,7 @@ void binary_op_gpu_inplace(
202212
out_a.data_size(),
203213
out_a.shape(),
204214
out_a.strides(),
205-
LARGE);
215+
large());
206216
kernel<<<num_blocks, block_dims, 0, stream>>>(
207217
a.data<InType>(),
208218
b.data<InType>(),

mlx/backend/cuda/copy/copy.cuh

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,6 @@
1010

1111
namespace mlx::core {
1212

13-
#define MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, ...) \
14-
MLX_SWITCH_ALL_TYPES(in.dtype(), CTYPE_IN, { \
15-
MLX_SWITCH_ALL_TYPES(out.dtype(), CTYPE_OUT, { \
16-
using InType = cuda_type_t<CTYPE_IN>; \
17-
using OutType = cuda_type_t<CTYPE_OUT>; \
18-
__VA_ARGS__; \
19-
}); \
20-
})
21-
2213
void copy_contiguous(
2314
cu::CommandEncoder& encoder,
2415
CopyType ctype,

mlx/backend/cuda/copy/copy_contiguous.cu

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,23 @@ void copy_contiguous(
3636
int64_t in_offset,
3737
int64_t out_offset) {
3838
encoder.launch_kernel([&](cudaStream_t stream) {
39-
MLX_SWITCH_COPY_TYPES(in, out, InType, OutType, {
40-
MLX_SWITCH_BOOL(out.data_size() > UINT32_MAX, LARGE, {
41-
using IdxT = std::conditional_t<LARGE, int64_t, uint32_t>;
42-
auto kernel = cu::copy_s<InType, OutType, IdxT>;
43-
if (ctype == CopyType::Vector) {
44-
kernel = cu::copy_v<InType, OutType, IdxT>;
45-
}
46-
auto [num_blocks, block_dims] = get_launch_args(
47-
kernel, out.data_size(), out.shape(), out.strides(), LARGE);
48-
kernel<<<num_blocks, block_dims, 0, stream>>>(
49-
in.data<InType>() + in_offset,
50-
out.data<OutType>() + out_offset,
51-
out.data_size());
39+
dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
40+
dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
41+
dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
42+
using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
43+
using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
44+
using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
45+
auto kernel = cu::copy_s<InType, OutType, IdxT>;
46+
if (ctype == CopyType::Vector) {
47+
kernel = cu::copy_v<InType, OutType, IdxT>;
48+
}
49+
auto [num_blocks, block_dims] = get_launch_args(
50+
kernel, out.data_size(), out.shape(), out.strides(), large());
51+
kernel<<<num_blocks, block_dims, 0, stream>>>(
52+
in.data<InType>() + in_offset,
53+
out.data<OutType>() + out_offset,
54+
out.data_size());
55+
});
5256
});
5357
});
5458
});

0 commit comments

Comments (0)