@@ -407,6 +407,8 @@ struct ggml_backend_opencl_context {
407407
408408 cl_bool non_uniform_workgroups;
409409 size_t image_max_buffer_size;
410+ size_t image2d_max_width;
411+ size_t image2d_max_height;
410412
411413 cl_context context;
412414 cl_command_queue queue;
@@ -420,6 +422,11 @@ struct ggml_backend_opencl_context {
420422 ggml_cl_buffer prealloc_src0;
421423 ggml_cl_buffer prealloc_src1;
422424
425+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
426+ ggml_cl_buffer prealloc_adreno_xmem_const;
427+ bool adreno_xmem_gemm_enabled = false;
428+ #endif
429+
423430 // prealloc buffers for MoE router table preprocess
424431 bool toggle_reorder = false;
425432 ggml_cl_buffer prealloc_post_router;
@@ -538,6 +545,10 @@ struct ggml_backend_opencl_context {
538545 cl_kernel kernel_mul_mat_f16_f32;
539546 cl_kernel kernel_mul_mat_f16_f32_l4;
540547 cl_kernel kernel_mul_mat_f16_f32_tiled;
548+ cl_kernel kernel_adreno_xmem_pack_src_f32;
549+ cl_kernel kernel_adreno_xmem_prepack_weight_f16;
550+ cl_kernel kernel_gemm_xmem_f16_f32_os8;
551+ cl_kernel kernel_adreno_xmem_store_dst_f32;
541552 cl_kernel kernel_mul_mm_f16_f32_kqv;
542553 cl_kernel kernel_mul_mm_f16_f32_kq;
543554 cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
@@ -1554,6 +1565,32 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
15541565 GGML_LOG_CONT(".");
15551566 }
15561567
1568+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1569+ // gemm_xmem_f16_f32_os8
1570+ {
1571+ #ifdef GGML_OPENCL_EMBED_KERNELS
1572+ const std::string kernel_src {
1573+ #include "gemm_xmem_f16_f32_os8.cl.h"
1574+ };
1575+ #else
1576+ const std::string kernel_src = read_file("gemm_xmem_f16_f32_os8.cl");
1577+ #endif
1578+ cl_program prog =
1579+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1580+
1581+ CL_CHECK((backend_ctx->kernel_adreno_xmem_pack_src_f32 =
1582+ clCreateKernel(prog, "adreno_xmem_pack_src_f32", &err), err));
1583+ CL_CHECK((backend_ctx->kernel_adreno_xmem_prepack_weight_f16 =
1584+ clCreateKernel(prog, "adreno_xmem_prepack_weight_f16", &err), err));
1585+ CL_CHECK((backend_ctx->kernel_gemm_xmem_f16_f32_os8 =
1586+ clCreateKernel(prog, "kernel_gemm_xmem_f16_f32_os8", &err), err));
1587+ CL_CHECK((backend_ctx->kernel_adreno_xmem_store_dst_f32 =
1588+ clCreateKernel(prog, "adreno_xmem_store_dst_f32", &err), err));
1589+ CL_CHECK(clReleaseProgram(prog));
1590+ GGML_LOG_CONT(".");
1591+ }
1592+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
1593+
15571594 // mul_mm_f32_f32_l4_lm
15581595 {
15591596#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3473,6 +3510,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
34733510 clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
34743511 GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size);
34753512
3513+ clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL);
3514+ clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL);
3515+ GGML_LOG_INFO("ggml_opencl: device max image2d size: %lu x %lu\n", backend_ctx->image2d_max_width, backend_ctx->image2d_max_height);
3516+
34763517 clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
34773518 GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);
34783519
@@ -3511,6 +3552,16 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
35113552 GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
35123553#endif // GGML_OPENCL_USE_ADRENO_KERNELS
35133554
3555+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3556+ backend_ctx->adreno_xmem_gemm_enabled = getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr &&
3557+ backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
3558+ if (getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr) {
3559+ GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM %s\n",
3560+ backend_ctx->adreno_xmem_gemm_enabled ?
3561+ "enabled (temporary weight prepack)" : "requested but unsupported by this driver");
3562+ }
3563+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
3564+
35143565 // determine whether to use large buffer for Adreno
35153566 backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
35163567 backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
@@ -9920,6 +9971,169 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
99209971 backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
99219972}
99229973
9974+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
9975+ static bool ggml_cl_can_use_adreno_xmem_gemm_f16_f32(
9976+ const ggml_backend_opencl_context * backend_ctx,
9977+ const ggml_tensor * src0,
9978+ const ggml_tensor * src1,
9979+ const ggml_tensor * dst) {
9980+ if (!backend_ctx->adreno_xmem_gemm_enabled) {
9981+ return false;
9982+ }
9983+ if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
9984+ return false;
9985+ }
9986+ if (src0->type != GGML_TYPE_F16 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
9987+ return false;
9988+ }
9989+ if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
9990+ return false;
9991+ }
9992+ if (src0->ne[2] != 1 || src0->ne[3] != 1 ||
9993+ src1->ne[2] != 1 || src1->ne[3] != 1 ||
9994+ dst->ne[2] != 1 || dst->ne[3] != 1) {
9995+ return false;
9996+ }
9997+ const int K = src0->ne[0];
9998+ const int M = src0->ne[1];
9999+ const int N = src1->ne[1];
10000+ if (src1->ne[0] != K || dst->ne[0] != M || dst->ne[1] != N) {
10001+ return false;
10002+ }
10003+ if (N <= 1 || M < 64 || N < 16 || K < 64) {
10004+ return false;
10005+ }
10006+ if ((K % 8) != 0) {
10007+ return false;
10008+ }
10009+ const int kpack = K / 4;
10010+ const int npack = CEIL_DIV(M, 4);
10011+ if (static_cast<size_t>(N) > backend_ctx->image2d_max_width ||
10012+ static_cast<size_t>(kpack) > backend_ctx->image2d_max_height) {
10013+ return false;
10014+ }
10015+ if (static_cast<size_t>(N) > backend_ctx->image2d_max_width ||
10016+ static_cast<size_t>(npack) > backend_ctx->image2d_max_height) {
10017+ return false;
10018+ }
10019+ return true;
10020+ }
10021+
10022+ static void ggml_cl_mul_mat_f16_f32_adreno_xmem(
10023+ ggml_backend_t backend,
10024+ const ggml_tensor * src0,
10025+ const ggml_tensor * src1,
10026+ ggml_tensor * dst) {
10027+ ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *)backend->context;
10028+
10029+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10030+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
10031+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10032+
10033+ const cl_ulong offset0 = extra0->offset + src0->view_offs;
10034+ const cl_ulong offset1 = extra1->offset + src1->view_offs;
10035+ const cl_ulong offsetd = extrad->offset + dst->view_offs;
10036+
10037+ const int K = src0->ne[0];
10038+ const int M = src0->ne[1];
10039+ const int N = src1->ne[1];
10040+ const int kpack = K / 4;
10041+ const int npack = CEIL_DIV(M, 4);
10042+ const int os = 8;
10043+
10044+ const size_t xmem_bytes = 6144;
10045+ const size_t weight_bytes = static_cast<size_t>(kpack) * static_cast<size_t>(npack) * 4u * sizeof(cl_half4);
10046+
10047+ backend_ctx->prealloc_adreno_xmem_const.allocate(backend_ctx->context, xmem_bytes);
10048+
10049+ cl_int err = CL_SUCCESS;
10050+ cl_image_format fmt = {};
10051+ fmt.image_channel_order = CL_RGBA;
10052+ fmt.image_channel_data_type = CL_HALF_FLOAT;
10053+
10054+ cl_image_desc desc_src = {};
10055+ desc_src.image_type = CL_MEM_OBJECT_IMAGE2D;
10056+ desc_src.image_width = static_cast<size_t>(N);
10057+ desc_src.image_height = static_cast<size_t>(kpack);
10058+ cl_mem src_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_src, nullptr, &err);
10059+ CL_CHECK(err);
10060+
10061+ cl_image_desc desc_dst = {};
10062+ desc_dst.image_type = CL_MEM_OBJECT_IMAGE2D;
10063+ desc_dst.image_width = static_cast<size_t>(N);
10064+ desc_dst.image_height = static_cast<size_t>(npack);
10065+ cl_mem dst_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_dst, nullptr, &err);
10066+ CL_CHECK(err);
10067+
10068+ cl_mem weights = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, weight_bytes, nullptr, &err);
10069+ CL_CHECK(err);
10070+
10071+ cl_kernel prepack = backend_ctx->kernel_adreno_xmem_prepack_weight_f16;
10072+ CL_CHECK(clSetKernelArg(prepack, 0, sizeof(cl_mem), &weights));
10073+ CL_CHECK(clSetKernelArg(prepack, 1, sizeof(cl_mem), &extra0->data_device));
10074+ CL_CHECK(clSetKernelArg(prepack, 2, sizeof(cl_ulong), &offset0));
10075+ CL_CHECK(clSetKernelArg(prepack, 3, sizeof(int), &K));
10076+ CL_CHECK(clSetKernelArg(prepack, 4, sizeof(int), &M));
10077+ CL_CHECK(clSetKernelArg(prepack, 5, sizeof(int), &kpack));
10078+ CL_CHECK(clSetKernelArg(prepack, 6, sizeof(int), &npack));
10079+ CL_CHECK(clSetKernelArg(prepack, 7, sizeof(int), &os));
10080+ size_t lws = 256;
10081+ size_t max_wg = backend_ctx->get_kernel_workgroup_size(prepack);
10082+ if (lws > max_wg) {
10083+ lws = max_wg;
10084+ }
10085+ size_t gws = CEIL_DIV(static_cast<size_t>(kpack) * static_cast<size_t>(npack), lws) * lws;
10086+ backend_ctx->enqueue_ndrange_kernel(prepack, 1, &gws, &lws, dst);
10087+
10088+ cl_kernel pack_src = backend_ctx->kernel_adreno_xmem_pack_src_f32;
10089+ CL_CHECK(clSetKernelArg(pack_src, 0, sizeof(cl_mem), &extra1->data_device));
10090+ CL_CHECK(clSetKernelArg(pack_src, 1, sizeof(cl_ulong), &offset1));
10091+ CL_CHECK(clSetKernelArg(pack_src, 2, sizeof(cl_mem), &src_img));
10092+ CL_CHECK(clSetKernelArg(pack_src, 3, sizeof(int), &K));
10093+ CL_CHECK(clSetKernelArg(pack_src, 4, sizeof(int), &N));
10094+ size_t pack_src_lws[2] = { 16, 16 };
10095+ size_t pack_src_gws[2] = {
10096+ CEIL_DIV(static_cast<size_t>(N), pack_src_lws[0])*pack_src_lws[0],
10097+ CEIL_DIV(static_cast<size_t>(kpack), pack_src_lws[1])*pack_src_lws[1]
10098+ };
10099+ backend_ctx->enqueue_ndrange_kernel(pack_src, 2, pack_src_gws, pack_src_lws, dst);
10100+
10101+ cl_kernel gemm = backend_ctx->kernel_gemm_xmem_f16_f32_os8;
10102+ CL_CHECK(clSetKernelArg(gemm, 0, sizeof(cl_mem), &weights));
10103+ CL_CHECK(clSetKernelArg(gemm, 1, sizeof(cl_mem), &backend_ctx->prealloc_adreno_xmem_const.buffer));
10104+ CL_CHECK(clSetKernelArg(gemm, 2, sizeof(cl_mem), &src_img));
10105+ CL_CHECK(clSetKernelArg(gemm, 3, sizeof(cl_mem), &dst_img));
10106+ CL_CHECK(clSetKernelArg(gemm, 4, sizeof(int), &N));
10107+ CL_CHECK(clSetKernelArg(gemm, 5, sizeof(int), &npack));
10108+ CL_CHECK(clSetKernelArg(gemm, 6, sizeof(int), &kpack));
10109+ const size_t z_values = CEIL_DIV(static_cast<size_t>(npack), static_cast<size_t>(os));
10110+ size_t gemm_lws[3] = { 64, 1, 1 };
10111+ size_t gemm_gws[3] = {
10112+ z_values*gemm_lws[0],
10113+ CEIL_DIV(static_cast<size_t>(N), gemm_lws[0]),
10114+ 1
10115+ };
10116+ backend_ctx->enqueue_ndrange_kernel(gemm, 3, gemm_gws, gemm_lws, dst);
10117+
10118+ cl_kernel store_dst = backend_ctx->kernel_adreno_xmem_store_dst_f32;
10119+ CL_CHECK(clSetKernelArg(store_dst, 0, sizeof(cl_mem), &dst_img));
10120+ CL_CHECK(clSetKernelArg(store_dst, 1, sizeof(cl_mem), &extrad->data_device));
10121+ CL_CHECK(clSetKernelArg(store_dst, 2, sizeof(cl_ulong), &offsetd));
10122+ CL_CHECK(clSetKernelArg(store_dst, 3, sizeof(int), &M));
10123+ CL_CHECK(clSetKernelArg(store_dst, 4, sizeof(int), &N));
10124+ size_t store_lws[2] = { 16, 16 };
10125+ size_t store_gws[2] = {
10126+ CEIL_DIV(static_cast<size_t>(N), store_lws[0])*store_lws[0],
10127+ CEIL_DIV(static_cast<size_t>(npack), store_lws[1])*store_lws[1]
10128+ };
10129+ backend_ctx->enqueue_ndrange_kernel(store_dst, 2, store_gws, store_lws, dst);
10130+
10131+ CL_CHECK(clReleaseMemObject(weights));
10132+ CL_CHECK(clReleaseMemObject(dst_img));
10133+ CL_CHECK(clReleaseMemObject(src_img));
10134+ }
10135+ #endif // GGML_OPENCL_USE_ADRENO_KERNELS
10136+
992310137static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
992410138 GGML_TENSOR_BINARY_OP_LOCALS;
992510139 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@@ -11681,6 +11895,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
1168111895 return;
1168211896 }
1168311897 case GGML_TYPE_F16: {
11898+ #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
11899+ if (ggml_cl_can_use_adreno_xmem_gemm_f16_f32(backend_ctx, src0, src1, dst)) {
11900+ ggml_cl_mul_mat_f16_f32_adreno_xmem(backend, src0, src1, dst);
11901+ return;
11902+ }
11903+ #endif
1168411904 kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
1168511905 nth0 = 128; // calculated as (BM*BN)/(TM*TN)
1168611906
0 commit comments