Skip to content

Commit 308f61c

Browse files
authored
opencl: improve get_rows, cpy, concat and q6_k flat gemv (#24160)
* opencl: allow multiple workgroups for large rows * opencl: improve small cpy * opencl: packed concat for small input * opencl: tweak flat q6_K gemv, increase N_DST and remap threads
1 parent da87e9b commit 308f61c

5 files changed

Lines changed: 247 additions & 86 deletions

File tree

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,7 @@ struct ggml_backend_opencl_context {
558558
cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
559559
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
560560
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
561-
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
561+
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32;
562562
cl_kernel kernel_mul_mat_f32_f32;
563563
cl_kernel kernel_mul_mat_f16_f16;
564564
cl_kernel kernel_mul_mat_f16_f32_1row;
@@ -639,7 +639,7 @@ struct ggml_backend_opencl_context {
639639
cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
640640
cl_kernel kernel_upscale;
641641
cl_kernel kernel_upscale_bilinear;
642-
cl_kernel kernel_concat_f32;
642+
cl_kernel kernel_concat_f32, kernel_concat_f32_pack;
643643
cl_kernel kernel_conv_2d_f16;
644644
cl_kernel kernel_conv_2d_f32;
645645
cl_kernel kernel_conv_2d_f16_f32;
@@ -1121,6 +1121,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
11211121
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
11221122
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
11231123
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
1124+
CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err));
11241125
CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
11251126
GGML_LOG_CONT(".");
11261127
}
@@ -2615,6 +2616,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
26152616
cl_program prog =
26162617
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
26172618
CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
2619+
CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err));
26182620
CL_CHECK(clReleaseProgram(prog));
26192621
GGML_LOG_CONT(".");
26202622
}
@@ -8552,7 +8554,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
85528554
nth *= 2;
85538555
}
85548556

8555-
size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
8557+
int nchunks = 1;
8558+
if (src0->type == GGML_TYPE_F32) {
8559+
const int chunk_target = nth * 4;
8560+
nchunks = (ne00 + chunk_target - 1) / chunk_target;
8561+
nchunks = MAX(1, MIN(nchunks, 64));
8562+
}
8563+
8564+
size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12};
85568565
size_t local_work_size[] = {(size_t)nth, 1, 1};
85578566

85588567
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
@@ -11128,7 +11137,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
1112811137

1112911138
int nth = MIN(64, ne0);
1113011139

11131-
cl_kernel kernel = backend_ctx->kernel_concat_f32;
11140+
const bool concat_pack = (dim == 0 && ne0 < 32);
11141+
cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack
11142+
: backend_ctx->kernel_concat_f32;
1113211143

1113311144
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
1113411145
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -11155,10 +11166,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
1115511166
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
1115611167
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
1115711168

11158-
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
11159-
size_t local_work_size[] = {(size_t)nth, 1, 1};
11169+
if (concat_pack) {
11170+
// packed kernel needs the dst dims to unflatten its 1-D row index.
11171+
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1));
11172+
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2));
11173+
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3));
11174+
11175+
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
11176+
const int base = MIN(64, maxwg);
11177+
const int tpr = MIN(ne0, base); // threads per row
11178+
const int rpw = MAX(1, base / tpr); // rows per workgroup
11179+
const int lsz = tpr * rpw;
11180+
const int nrows = ne1*ne2*ne3;
11181+
const int nwg = (nrows + rpw - 1) / rpw;
11182+
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
11183+
size_t local_work_size[] = {(size_t)lsz, 1, 1};
11184+
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
11185+
} else {
11186+
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
11187+
size_t local_work_size[] = {(size_t)nth, 1, 1};
1116011188

11161-
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
11189+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
11190+
}
1116211191
}
1116311192

1116411193
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -14536,7 +14565,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
1453614565
} else if (backend_ctx->gpu_family == ADRENO) {
1453714566
nth0 = 64;
1453814567
nth1 = 2;
14539-
ndst = 4;
14568+
ndst = 16;
1454014569
} else {
1454114570
GGML_ASSERT(false && "TODO: Unknown GPU");
1454214571
}
@@ -16633,7 +16662,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
1663316662
kernel = backend_ctx->kernel_cpy_f32_f16;
1663416663
break;
1663516664
case GGML_TYPE_F32:
16636-
kernel = backend_ctx->kernel_cpy_f32_f32;
16665+
kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack
16666+
: backend_ctx->kernel_cpy_f32_f32;
1663716667
break;
1663816668
default:
1663916669
GGML_ASSERT(false && "not implemented");
@@ -16685,12 +16715,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
1668516715
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
1668616716
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
1668716717

16688-
const int nth = MIN(64, ne00);
16718+
if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) {
16719+
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
16720+
const int base = MIN(64, maxwg);
16721+
const int tpr = MIN(ne00, base); // threads per row
16722+
const int rpw = MAX(1, base / tpr); // rows per workgroup
16723+
const int lsz = tpr * rpw; // <= base <= maxwg
16724+
const int nrows = ne01*ne02*ne03;
16725+
const int nwg = (nrows + rpw - 1) / rpw;
1668916726

16690-
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
16691-
size_t local_work_size[] = {(size_t)nth, 1, 1};
16727+
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
16728+
size_t local_work_size[] = {(size_t)lsz, 1, 1};
16729+
16730+
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1);
16731+
} else {
16732+
const int nth = MIN(64, ne00);
1669216733

16693-
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
16734+
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
16735+
size_t local_work_size[] = {(size_t)nth, 1, 1};
16736+
16737+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
16738+
}
1669416739
}
1669516740

1669616741
static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

ggml/src/ggml-opencl/kernels/concat.cl

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,70 @@ kernel void kernel_concat_f32(
4949
*y = *x;
5050
}
5151
}
52+
53+
kernel void kernel_concat_f32_pack(
54+
global const char * src0,
55+
ulong offset0,
56+
global const char * src1,
57+
ulong offset1,
58+
global char * dst,
59+
ulong offsetd,
60+
int ne00,
61+
int ne01,
62+
int ne02,
63+
int ne03,
64+
ulong nb00,
65+
ulong nb01,
66+
ulong nb02,
67+
ulong nb03,
68+
ulong nb10,
69+
ulong nb11,
70+
ulong nb12,
71+
ulong nb13,
72+
int ne0,
73+
ulong nb0,
74+
ulong nb1,
75+
ulong nb2,
76+
ulong nb3,
77+
int dim,
78+
int ne1,
79+
int ne2,
80+
int ne3
81+
) {
82+
src0 = src0 + offset0;
83+
src1 = src1 + offset1;
84+
dst = dst + offsetd;
85+
86+
int lsz = get_local_size(0);
87+
int tpr = min(ne0, lsz); // threads per row
88+
int rpw = lsz / tpr; // rows per workgroup
89+
int lid = get_local_id(0);
90+
int row = get_group_id(0)*rpw + lid / tpr;
91+
int lane = lid - (lid / tpr) * tpr;
92+
93+
int nrows = ne1*ne2*ne3;
94+
if (row >= nrows) {
95+
return;
96+
}
97+
98+
int i1 = row % ne1;
99+
int t = row / ne1;
100+
int i2 = t % ne2;
101+
int i3 = t / ne2;
102+
103+
int o[4] = {0, 0, 0, 0};
104+
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
105+
106+
for (int i0 = lane; i0 < ne0; i0 += tpr) {
107+
global const float * x;
108+
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
109+
x = (global const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
110+
} else {
111+
x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
112+
}
113+
114+
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
115+
116+
*y = *x;
117+
}
118+
}

ggml/src/ggml-opencl/kernels/cpy.cl

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32(
183183
}
184184
}
185185

186+
kernel void kernel_cpy_f32_f32_pack(
187+
global float * src0,
188+
ulong offset0,
189+
global float * dst,
190+
ulong offsetd,
191+
int ne00,
192+
int ne01,
193+
int ne02,
194+
int ne03,
195+
ulong nb00,
196+
ulong nb01,
197+
ulong nb02,
198+
ulong nb03,
199+
int ne0,
200+
int ne1,
201+
int ne2,
202+
int ne3,
203+
ulong nb0,
204+
ulong nb1,
205+
ulong nb2,
206+
ulong nb3
207+
) {
208+
src0 = (global float*)((global char*)src0 + offset0);
209+
dst = (global float*)((global char*)dst + offsetd);
210+
211+
int lsz = get_local_size(0);
212+
int tpr = min(ne00, lsz); // threads per row
213+
int rpw = lsz / tpr; // rows per workgroup
214+
int lid = get_local_id(0);
215+
int row = get_group_id(0)*rpw + lid / tpr;
216+
int lane = lid - (lid / tpr) * tpr;
217+
218+
int nrows = ne01*ne02*ne03;
219+
if (row >= nrows) {
220+
return;
221+
}
222+
223+
int i01 = row % ne01;
224+
int t = row / ne01;
225+
int i02 = t % ne02;
226+
int i03 = t / ne02;
227+
228+
// linear index of the first element of this row, unflattened over dst dims
229+
long n = (long)row * ne00;
230+
int i3 = (int)(n / ((long)ne2*ne1*ne0));
231+
long rm = n - (long)i3*ne2*ne1*ne0;
232+
int i2 = (int)(rm / ((long)ne1*ne0));
233+
rm -= (long)i2*ne1*ne0;
234+
int i1 = (int)(rm / ne0);
235+
int i0 = (int)(rm - (long)i1*ne0);
236+
237+
global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
238+
239+
for (int i00 = lane; i00 < ne00; i00 += tpr) {
240+
global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
241+
dst_data[i00] = src[0];
242+
}
243+
}
244+
186245
kernel void kernel_cpy_i32_i32(
187246
global int * src0,
188247
ulong offset0,

ggml/src/ggml-opencl/kernels/get_rows.cl

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32(
8282
src1 = (global int*)((global char*)src1 + offset1);
8383
dst = (global float*)((global char*)dst + offsetd);
8484

85-
int i10 = get_group_id(0);
86-
int i11 = get_group_id(1);
87-
int i12 = get_group_id(2);
85+
int nchunks = get_num_groups(0) / ne10;
86+
int g = get_group_id(0);
87+
int i10 = g / nchunks;
88+
int chunk = g - i10 * nchunks;
89+
int i11 = get_group_id(1);
90+
int i12 = get_group_id(2);
8891

8992
int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];
9093

9194
int i02 = i11;
9295
int i03 = i12;
9396

94-
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
95-
if (ind >= ne00) {
96-
return;
97-
}
98-
((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
99-
((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
97+
global float * dst_row = (global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1);
98+
global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03);
99+
100+
int span = (ne00 + nchunks - 1) / nchunks;
101+
int start = chunk * span;
102+
int end = min(start + span, ne00);
103+
104+
for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) {
105+
dst_row[ind] = src_row[ind];
100106
}
101107
}
102108

0 commit comments

Comments
 (0)