Skip to content

Commit a9883db

Browse files
happyyzyYour Name
andauthored
opencl: add opt-in Adreno xmem F16xF32 GEMM for prefill (ggml-org#22755)
* ggml-opencl: add Adreno xmem F16xF32 GEMM for prefill * ggml-opencl: address Adreno xmem review comments * ggml-opencl: align xmem gemm kernel naming --------- Co-authored-by: Your Name <your@email.com>
1 parent cce09f0 commit a9883db

3 files changed

Lines changed: 457 additions & 0 deletions

File tree

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,10 @@ set(GGML_OPENCL_KERNELS
176176
flash_attn_f32
177177
)
178178

179+
if (GGML_OPENCL_USE_ADRENO_KERNELS)
180+
list(APPEND GGML_OPENCL_KERNELS gemm_xmem_f16_f32_os8)
181+
endif ()
182+
179183
foreach (K ${GGML_OPENCL_KERNELS})
180184
ggml_opencl_add_kernel(${K})
181185
endforeach()

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,8 @@ struct ggml_backend_opencl_context {
407407

408408
cl_bool non_uniform_workgroups;
409409
size_t image_max_buffer_size;
410+
size_t image2d_max_width;
411+
size_t image2d_max_height;
410412

411413
cl_context context;
412414
cl_command_queue queue;
@@ -420,6 +422,11 @@ struct ggml_backend_opencl_context {
420422
ggml_cl_buffer prealloc_src0;
421423
ggml_cl_buffer prealloc_src1;
422424

425+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
426+
ggml_cl_buffer prealloc_adreno_xmem_const;
427+
bool adreno_xmem_gemm_enabled = false;
428+
#endif
429+
423430
// prealloc buffers for MoE router table preprocess
424431
bool toggle_reorder = false;
425432
ggml_cl_buffer prealloc_post_router;
@@ -538,6 +545,10 @@ struct ggml_backend_opencl_context {
538545
cl_kernel kernel_mul_mat_f16_f32;
539546
cl_kernel kernel_mul_mat_f16_f32_l4;
540547
cl_kernel kernel_mul_mat_f16_f32_tiled;
548+
cl_kernel kernel_adreno_xmem_pack_src_f32;
549+
cl_kernel kernel_adreno_xmem_prepack_weight_f16;
550+
cl_kernel kernel_gemm_xmem_f16_f32_os8;
551+
cl_kernel kernel_adreno_xmem_store_dst_f32;
541552
cl_kernel kernel_mul_mm_f16_f32_kqv;
542553
cl_kernel kernel_mul_mm_f16_f32_kq;
543554
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
@@ -1554,6 +1565,32 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
15541565
GGML_LOG_CONT(".");
15551566
}
15561567

1568+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1569+
// gemm_xmem_f16_f32_os8
1570+
{
1571+
#ifdef GGML_OPENCL_EMBED_KERNELS
1572+
const std::string kernel_src {
1573+
#include "gemm_xmem_f16_f32_os8.cl.h"
1574+
};
1575+
#else
1576+
const std::string kernel_src = read_file("gemm_xmem_f16_f32_os8.cl");
1577+
#endif
1578+
cl_program prog =
1579+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1580+
1581+
CL_CHECK((backend_ctx->kernel_adreno_xmem_pack_src_f32 =
1582+
clCreateKernel(prog, "adreno_xmem_pack_src_f32", &err), err));
1583+
CL_CHECK((backend_ctx->kernel_adreno_xmem_prepack_weight_f16 =
1584+
clCreateKernel(prog, "adreno_xmem_prepack_weight_f16", &err), err));
1585+
CL_CHECK((backend_ctx->kernel_gemm_xmem_f16_f32_os8 =
1586+
clCreateKernel(prog, "kernel_gemm_xmem_f16_f32_os8", &err), err));
1587+
CL_CHECK((backend_ctx->kernel_adreno_xmem_store_dst_f32 =
1588+
clCreateKernel(prog, "adreno_xmem_store_dst_f32", &err), err));
1589+
CL_CHECK(clReleaseProgram(prog));
1590+
GGML_LOG_CONT(".");
1591+
}
1592+
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
1593+
15571594
// mul_mm_f32_f32_l4_lm
15581595
{
15591596
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3473,6 +3510,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
34733510
clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
34743511
GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size);
34753512

3513+
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL);
3514+
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL);
3515+
GGML_LOG_INFO("ggml_opencl: device max image2d size: %lu x %lu\n", backend_ctx->image2d_max_width, backend_ctx->image2d_max_height);
3516+
34763517
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
34773518
GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size);
34783519

@@ -3511,6 +3552,16 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
35113552
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
35123553
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
35133554

3555+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3556+
backend_ctx->adreno_xmem_gemm_enabled = getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr &&
3557+
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
3558+
if (getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr) {
3559+
GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM %s\n",
3560+
backend_ctx->adreno_xmem_gemm_enabled ?
3561+
"enabled (temporary weight prepack)" : "requested but unsupported by this driver");
3562+
}
3563+
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
3564+
35143565
// determine whether to use large buffer for Adreno
35153566
backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
35163567
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
@@ -9920,6 +9971,169 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
99209971
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
99219972
}
99229973

9974+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
9975+
static bool ggml_cl_can_use_adreno_xmem_gemm_f16_f32(
9976+
const ggml_backend_opencl_context * backend_ctx,
9977+
const ggml_tensor * src0,
9978+
const ggml_tensor * src1,
9979+
const ggml_tensor * dst) {
9980+
if (!backend_ctx->adreno_xmem_gemm_enabled) {
9981+
return false;
9982+
}
9983+
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
9984+
return false;
9985+
}
9986+
if (src0->type != GGML_TYPE_F16 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
9987+
return false;
9988+
}
9989+
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
9990+
return false;
9991+
}
9992+
if (src0->ne[2] != 1 || src0->ne[3] != 1 ||
9993+
src1->ne[2] != 1 || src1->ne[3] != 1 ||
9994+
dst->ne[2] != 1 || dst->ne[3] != 1) {
9995+
return false;
9996+
}
9997+
const int K = src0->ne[0];
9998+
const int M = src0->ne[1];
9999+
const int N = src1->ne[1];
10000+
if (src1->ne[0] != K || dst->ne[0] != M || dst->ne[1] != N) {
10001+
return false;
10002+
}
10003+
if (N <= 1 || M < 64 || N < 16 || K < 64) {
10004+
return false;
10005+
}
10006+
if ((K % 8) != 0) {
10007+
return false;
10008+
}
10009+
const int kpack = K / 4;
10010+
const int npack = CEIL_DIV(M, 4);
10011+
if (static_cast<size_t>(N) > backend_ctx->image2d_max_width ||
10012+
static_cast<size_t>(kpack) > backend_ctx->image2d_max_height) {
10013+
return false;
10014+
}
10015+
if (static_cast<size_t>(N) > backend_ctx->image2d_max_width ||
10016+
static_cast<size_t>(npack) > backend_ctx->image2d_max_height) {
10017+
return false;
10018+
}
10019+
return true;
10020+
}
10021+
10022+
static void ggml_cl_mul_mat_f16_f32_adreno_xmem(
10023+
ggml_backend_t backend,
10024+
const ggml_tensor * src0,
10025+
const ggml_tensor * src1,
10026+
ggml_tensor * dst) {
10027+
ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *)backend->context;
10028+
10029+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10030+
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
10031+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10032+
10033+
const cl_ulong offset0 = extra0->offset + src0->view_offs;
10034+
const cl_ulong offset1 = extra1->offset + src1->view_offs;
10035+
const cl_ulong offsetd = extrad->offset + dst->view_offs;
10036+
10037+
const int K = src0->ne[0];
10038+
const int M = src0->ne[1];
10039+
const int N = src1->ne[1];
10040+
const int kpack = K / 4;
10041+
const int npack = CEIL_DIV(M, 4);
10042+
const int os = 8;
10043+
10044+
const size_t xmem_bytes = 6144;
10045+
const size_t weight_bytes = static_cast<size_t>(kpack) * static_cast<size_t>(npack) * 4u * sizeof(cl_half4);
10046+
10047+
backend_ctx->prealloc_adreno_xmem_const.allocate(backend_ctx->context, xmem_bytes);
10048+
10049+
cl_int err = CL_SUCCESS;
10050+
cl_image_format fmt = {};
10051+
fmt.image_channel_order = CL_RGBA;
10052+
fmt.image_channel_data_type = CL_HALF_FLOAT;
10053+
10054+
cl_image_desc desc_src = {};
10055+
desc_src.image_type = CL_MEM_OBJECT_IMAGE2D;
10056+
desc_src.image_width = static_cast<size_t>(N);
10057+
desc_src.image_height = static_cast<size_t>(kpack);
10058+
cl_mem src_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_src, nullptr, &err);
10059+
CL_CHECK(err);
10060+
10061+
cl_image_desc desc_dst = {};
10062+
desc_dst.image_type = CL_MEM_OBJECT_IMAGE2D;
10063+
desc_dst.image_width = static_cast<size_t>(N);
10064+
desc_dst.image_height = static_cast<size_t>(npack);
10065+
cl_mem dst_img = clCreateImage(backend_ctx->context, CL_MEM_READ_WRITE, &fmt, &desc_dst, nullptr, &err);
10066+
CL_CHECK(err);
10067+
10068+
cl_mem weights = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, weight_bytes, nullptr, &err);
10069+
CL_CHECK(err);
10070+
10071+
cl_kernel prepack = backend_ctx->kernel_adreno_xmem_prepack_weight_f16;
10072+
CL_CHECK(clSetKernelArg(prepack, 0, sizeof(cl_mem), &weights));
10073+
CL_CHECK(clSetKernelArg(prepack, 1, sizeof(cl_mem), &extra0->data_device));
10074+
CL_CHECK(clSetKernelArg(prepack, 2, sizeof(cl_ulong), &offset0));
10075+
CL_CHECK(clSetKernelArg(prepack, 3, sizeof(int), &K));
10076+
CL_CHECK(clSetKernelArg(prepack, 4, sizeof(int), &M));
10077+
CL_CHECK(clSetKernelArg(prepack, 5, sizeof(int), &kpack));
10078+
CL_CHECK(clSetKernelArg(prepack, 6, sizeof(int), &npack));
10079+
CL_CHECK(clSetKernelArg(prepack, 7, sizeof(int), &os));
10080+
size_t lws = 256;
10081+
size_t max_wg = backend_ctx->get_kernel_workgroup_size(prepack);
10082+
if (lws > max_wg) {
10083+
lws = max_wg;
10084+
}
10085+
size_t gws = CEIL_DIV(static_cast<size_t>(kpack) * static_cast<size_t>(npack), lws) * lws;
10086+
backend_ctx->enqueue_ndrange_kernel(prepack, 1, &gws, &lws, dst);
10087+
10088+
cl_kernel pack_src = backend_ctx->kernel_adreno_xmem_pack_src_f32;
10089+
CL_CHECK(clSetKernelArg(pack_src, 0, sizeof(cl_mem), &extra1->data_device));
10090+
CL_CHECK(clSetKernelArg(pack_src, 1, sizeof(cl_ulong), &offset1));
10091+
CL_CHECK(clSetKernelArg(pack_src, 2, sizeof(cl_mem), &src_img));
10092+
CL_CHECK(clSetKernelArg(pack_src, 3, sizeof(int), &K));
10093+
CL_CHECK(clSetKernelArg(pack_src, 4, sizeof(int), &N));
10094+
size_t pack_src_lws[2] = { 16, 16 };
10095+
size_t pack_src_gws[2] = {
10096+
CEIL_DIV(static_cast<size_t>(N), pack_src_lws[0])*pack_src_lws[0],
10097+
CEIL_DIV(static_cast<size_t>(kpack), pack_src_lws[1])*pack_src_lws[1]
10098+
};
10099+
backend_ctx->enqueue_ndrange_kernel(pack_src, 2, pack_src_gws, pack_src_lws, dst);
10100+
10101+
cl_kernel gemm = backend_ctx->kernel_gemm_xmem_f16_f32_os8;
10102+
CL_CHECK(clSetKernelArg(gemm, 0, sizeof(cl_mem), &weights));
10103+
CL_CHECK(clSetKernelArg(gemm, 1, sizeof(cl_mem), &backend_ctx->prealloc_adreno_xmem_const.buffer));
10104+
CL_CHECK(clSetKernelArg(gemm, 2, sizeof(cl_mem), &src_img));
10105+
CL_CHECK(clSetKernelArg(gemm, 3, sizeof(cl_mem), &dst_img));
10106+
CL_CHECK(clSetKernelArg(gemm, 4, sizeof(int), &N));
10107+
CL_CHECK(clSetKernelArg(gemm, 5, sizeof(int), &npack));
10108+
CL_CHECK(clSetKernelArg(gemm, 6, sizeof(int), &kpack));
10109+
const size_t z_values = CEIL_DIV(static_cast<size_t>(npack), static_cast<size_t>(os));
10110+
size_t gemm_lws[3] = { 64, 1, 1 };
10111+
size_t gemm_gws[3] = {
10112+
z_values*gemm_lws[0],
10113+
CEIL_DIV(static_cast<size_t>(N), gemm_lws[0]),
10114+
1
10115+
};
10116+
backend_ctx->enqueue_ndrange_kernel(gemm, 3, gemm_gws, gemm_lws, dst);
10117+
10118+
cl_kernel store_dst = backend_ctx->kernel_adreno_xmem_store_dst_f32;
10119+
CL_CHECK(clSetKernelArg(store_dst, 0, sizeof(cl_mem), &dst_img));
10120+
CL_CHECK(clSetKernelArg(store_dst, 1, sizeof(cl_mem), &extrad->data_device));
10121+
CL_CHECK(clSetKernelArg(store_dst, 2, sizeof(cl_ulong), &offsetd));
10122+
CL_CHECK(clSetKernelArg(store_dst, 3, sizeof(int), &M));
10123+
CL_CHECK(clSetKernelArg(store_dst, 4, sizeof(int), &N));
10124+
size_t store_lws[2] = { 16, 16 };
10125+
size_t store_gws[2] = {
10126+
CEIL_DIV(static_cast<size_t>(N), store_lws[0])*store_lws[0],
10127+
CEIL_DIV(static_cast<size_t>(npack), store_lws[1])*store_lws[1]
10128+
};
10129+
backend_ctx->enqueue_ndrange_kernel(store_dst, 2, store_gws, store_lws, dst);
10130+
10131+
CL_CHECK(clReleaseMemObject(weights));
10132+
CL_CHECK(clReleaseMemObject(dst_img));
10133+
CL_CHECK(clReleaseMemObject(src_img));
10134+
}
10135+
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
10136+
992310137
static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
992410138
GGML_TENSOR_BINARY_OP_LOCALS;
992510139
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@@ -11681,6 +11895,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
1168111895
return;
1168211896
}
1168311897
case GGML_TYPE_F16: {
11898+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
11899+
if (ggml_cl_can_use_adreno_xmem_gemm_f16_f32(backend_ctx, src0, src1, dst)) {
11900+
ggml_cl_mul_mat_f16_f32_adreno_xmem(backend, src0, src1, dst);
11901+
return;
11902+
}
11903+
#endif
1168411904
kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
1168511905
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
1168611906

0 commit comments

Comments
 (0)