Fix SDPA kernel bug on macOS 13.3 SDK (#805)

jagrit06 · web-flow · commit ec8a4864fac9 · 2024-03-07T10:18:09.000-08:00
* Move SDPA kernel to allocate threadgroup memory statically and allow macOS 13.3 SDK builds

* Style
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -77,8 +77,10 @@ elseif (MLX_BUILD_METAL)
     set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14.2_iOS17.2.zip)
   elseif (${MACOS_VERSION} GREATER_EQUAL 14.0)
     set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS14_iOS17-beta.zip)
+  elseif (${MACOS_VERSION} GREATER_EQUAL 13.3)
+    set(METAL_CPP_URL https://developer.apple.com/metal/cpp/files/metal-cpp_macOS13.3_iOS16.4.zip)
   else ()
-    message(FATAL_ERROR "MLX requires macOS >= 13.5 to be built with MLX_BUILD_METAL=ON")
+    message(FATAL_ERROR "MLX requires macOS >= 13.3 to be built with MLX_BUILD_METAL=ON")
   endif()
 
   FetchContent_Declare(
diff --git a/mlx/backend/metal/kernels/scaled_dot_product_attention.metal b/mlx/backend/metal/kernels/scaled_dot_product_attention.metal
@@ -13,10 +13,12 @@ template<typename T, typename T2, typename T4, uint16_t TILE_SIZE_CONST, uint16_
                               device float* O_partials [[buffer(5)]],
                               device float* p_lse [[buffer(6)]],
                               device float* p_maxes [[buffer(7)]],
-                              threadgroup T* threadgroup_block [[threadgroup(0)]],
                               uint simd_lane_id [[thread_index_in_simdgroup]],
                               uint simd_group_id [[simdgroup_index_in_threadgroup]],
                               uint3 tid [[threadgroup_position_in_grid]]) {
+
+    threadgroup T threadgroup_block[32768 / sizeof(T)];
+
     constexpr const size_t DK = 128;
     constexpr const ulong SIMDGROUP_MATRIX_LOAD_FACTOR = 8;
     constexpr const size_t THREADS_PER_SIMDGROUP = 32;
@@ -356,7 +358,6 @@ template [[host_name("fast_inference_sdpa_compute_partials_" #itype "_" #tile_si
     device float* O_partials [[buffer(5)]], \
     device float* p_lse [[buffer(6)]], \
     device float* p_maxes [[buffer(7)]], \
-    threadgroup itype *threadgroup_block [[threadgroup(0)]], \
     uint simd_lane_id [[thread_index_in_simdgroup]], \
     uint simd_group_id [[simdgroup_index_in_threadgroup]], \
     uint3 tid [[threadgroup_position_in_grid]]);
diff --git a/mlx/backend/metal/scaled_dot_product_attention.cpp b/mlx/backend/metal/scaled_dot_product_attention.cpp
@@ -97,8 +97,6 @@ void sdpa_metal(
   set_array_buffer(compute_encoder, p_lse, 6);
   set_array_buffer(compute_encoder, p_rowmaxes, 7);
 
-  constexpr const uint tgroupMemorySize = 32768;
-  compute_encoder->setThreadgroupMemoryLength(tgroupMemorySize, 0);
   compute_encoder->dispatchThreadgroups(grid_dims, group_dims);
 
   {

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -97,8 +97,6 @@ void sdpa_metal(` |
| `97` | `97` | `set_array_buffer(compute_encoder, p_lse, 6);` |
| `98` | `98` | `set_array_buffer(compute_encoder, p_rowmaxes, 7);` |
| `99` | `99` |  |
| `100` |  | `- constexpr const uint tgroupMemorySize = 32768;` |
| `101` |  | `- compute_encoder->setThreadgroupMemoryLength(tgroupMemorySize, 0);` |
| `102` | `100` | `compute_encoder->dispatchThreadgroups(grid_dims, group_dims);` |
| `103` | `101` |  |
| `104` | `102` | `{` |