@@ -558,7 +558,7 @@ struct ggml_backend_opencl_context {
558558 cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
559559 cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
560560 cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
561- cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
561+ cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32;
562562 cl_kernel kernel_mul_mat_f32_f32;
563563 cl_kernel kernel_mul_mat_f16_f16;
564564 cl_kernel kernel_mul_mat_f16_f32_1row;
@@ -639,7 +639,7 @@ struct ggml_backend_opencl_context {
639639 cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
640640 cl_kernel kernel_upscale;
641641 cl_kernel kernel_upscale_bilinear;
642- cl_kernel kernel_concat_f32;
642+ cl_kernel kernel_concat_f32, kernel_concat_f32_pack ;
643643 cl_kernel kernel_conv_2d_f16;
644644 cl_kernel kernel_conv_2d_f32;
645645 cl_kernel kernel_conv_2d_f16_f32;
@@ -1121,6 +1121,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
11211121 CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
11221122 CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
11231123 CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
1124+ CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err));
11241125 CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
11251126 GGML_LOG_CONT(".");
11261127 }
@@ -2615,6 +2616,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
26152616 cl_program prog =
26162617 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
26172618 CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
2619+ CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err));
26182620 CL_CHECK(clReleaseProgram(prog));
26192621 GGML_LOG_CONT(".");
26202622 }
@@ -8552,7 +8554,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
85528554 nth *= 2;
85538555 }
85548556
8555- size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
8557+ int nchunks = 1;
8558+ if (src0->type == GGML_TYPE_F32) {
8559+ const int chunk_target = nth * 4;
8560+ nchunks = (ne00 + chunk_target - 1) / chunk_target;
8561+ nchunks = MAX(1, MIN(nchunks, 64));
8562+ }
8563+
8564+ size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12};
85568565 size_t local_work_size[] = {(size_t)nth, 1, 1};
85578566
85588567 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
@@ -11128,7 +11137,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
1112811137
1112911138 int nth = MIN(64, ne0);
1113011139
11131- cl_kernel kernel = backend_ctx->kernel_concat_f32;
11140+ const bool concat_pack = (dim == 0 && ne0 < 32);
11141+ cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack
11142+ : backend_ctx->kernel_concat_f32;
1113211143
1113311144 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
1113411145 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -11155,10 +11166,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
1115511166 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
1115611167 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
1115711168
11158- size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
11159- size_t local_work_size[] = {(size_t)nth, 1, 1};
11169+ if (concat_pack) {
11170+ // packed kernel needs the dst dims to unflatten its 1-D row index.
11171+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1));
11172+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2));
11173+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3));
11174+
11175+ const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
11176+ const int base = MIN(64, maxwg);
11177+ const int tpr = MIN(ne0, base); // threads per row
11178+ const int rpw = MAX(1, base / tpr); // rows per workgroup
11179+ const int lsz = tpr * rpw;
11180+ const int nrows = ne1*ne2*ne3;
11181+ const int nwg = (nrows + rpw - 1) / rpw;
11182+ size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
11183+ size_t local_work_size[] = {(size_t)lsz, 1, 1};
11184+ backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
11185+ } else {
11186+ size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
11187+ size_t local_work_size[] = {(size_t)nth, 1, 1};
1116011188
11161- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
11189+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
11190+ }
1116211191}
1116311192
1116411193static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -14536,7 +14565,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
1453614565 } else if (backend_ctx->gpu_family == ADRENO) {
1453714566 nth0 = 64;
1453814567 nth1 = 2;
14539- ndst = 4 ;
14568+ ndst = 16 ;
1454014569 } else {
1454114570 GGML_ASSERT(false && "TODO: Unknown GPU");
1454214571 }
@@ -16633,7 +16662,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
1663316662 kernel = backend_ctx->kernel_cpy_f32_f16;
1663416663 break;
1663516664 case GGML_TYPE_F32:
16636- kernel = backend_ctx->kernel_cpy_f32_f32;
16665+ kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack
16666+ : backend_ctx->kernel_cpy_f32_f32;
1663716667 break;
1663816668 default:
1663916669 GGML_ASSERT(false && "not implemented");
@@ -16685,12 +16715,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
1668516715 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
1668616716 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
1668716717
16688- const int nth = MIN(64, ne00);
16718+ if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) {
16719+ const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
16720+ const int base = MIN(64, maxwg);
16721+ const int tpr = MIN(ne00, base); // threads per row
16722+ const int rpw = MAX(1, base / tpr); // rows per workgroup
16723+ const int lsz = tpr * rpw; // <= base <= maxwg
16724+ const int nrows = ne01*ne02*ne03;
16725+ const int nwg = (nrows + rpw - 1) / rpw;
1668916726
16690- size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
16691- size_t local_work_size[] = {(size_t)nth, 1, 1};
16727+ size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
16728+ size_t local_work_size[] = {(size_t)lsz, 1, 1};
16729+
16730+ backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1);
16731+ } else {
16732+ const int nth = MIN(64, ne00);
1669216733
16693- backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
16734+ size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
16735+ size_t local_work_size[] = {(size_t)nth, 1, 1};
16736+
16737+ backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
16738+ }
1669416739}
1669516740
1669616741static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
0 commit comments