Skip to content

Commit 914dde7

Browse files
authored by ggerganov
ggml : unary ops support non-cont src0 + metal F16 unary ops (ggml-org#19511)
* ggml : unary ops support non-contiguous src0
* metal : support F16 unary ops + fix ELU
1 parent 3136a84 commit 914dde7

File tree

6 files changed

+164
-78
lines changed

6 files changed

+164
-78
lines changed

ggml/src/ggml-cpu/ops.cpp

Lines changed: 104 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2096,10 +2096,14 @@ static void ggml_compute_forward_gelu_f32(
20962096

20972097
const ggml_tensor * src0 = dst->src[0];
20982098

2099-
assert(ggml_is_contiguous_1(src0));
2100-
assert(ggml_is_contiguous_1(dst));
2099+
assert(ggml_is_contiguous_rows(src0));
21012100
assert(ggml_are_same_shape(src0, dst));
21022101

2102+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
2103+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
2104+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
2105+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
2106+
21032107
const int ith = params->ith;
21042108
const int nth = params->nth;
21052109

@@ -2113,10 +2117,14 @@ static void ggml_compute_forward_gelu_f32(
21132117
const int ir0 = dr*ith;
21142118
const int ir1 = MIN(ir0 + dr, nr);
21152119

2116-
for (int i1 = ir0; i1 < ir1; i1++) {
2120+
for (int ir = ir0; ir < ir1; ++ir) {
2121+
const int i3 = ir/(ne02*ne01);
2122+
const int i2 = (ir - i3*ne02*ne01)/ne01;
2123+
const int i1 = (ir - i3*ne02*ne01 - i2*ne01);
2124+
21172125
ggml_vec_gelu_f32(nc,
2118-
(float *) ((char *) dst->data + i1*( dst->nb[1])),
2119-
(float *) ((char *) src0->data + i1*(src0->nb[1])));
2126+
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1),
2127+
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
21202128

21212129
#ifndef NDEBUG
21222130
for (int k = 0; k < nc; k++) {
@@ -2135,10 +2143,14 @@ static void ggml_compute_forward_gelu_f16(
21352143

21362144
const ggml_tensor * src0 = dst->src[0];
21372145

2138-
assert(ggml_is_contiguous_1(src0));
2139-
assert(ggml_is_contiguous_1(dst));
2146+
assert(ggml_is_contiguous_rows(src0));
21402147
assert(ggml_are_same_shape(src0, dst));
21412148

2149+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
2150+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
2151+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
2152+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
2153+
21422154
const int ith = params->ith;
21432155
const int nth = params->nth;
21442156

@@ -2152,10 +2164,14 @@ static void ggml_compute_forward_gelu_f16(
21522164
const int ir0 = dr*ith;
21532165
const int ir1 = MIN(ir0 + dr, nr);
21542166

2155-
for (int i1 = ir0; i1 < ir1; i1++) {
2167+
for (int ir = ir0; ir < ir1; ++ir) {
2168+
const int i3 = ir/(ne02*ne01);
2169+
const int i2 = (ir - i3*ne02*ne01)/ne01;
2170+
const int i1 = (ir - i3*ne02*ne01 - i2*ne01);
2171+
21562172
ggml_vec_gelu_f16(nc,
2157-
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
2158-
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
2173+
(ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1),
2174+
(ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
21592175

21602176
#ifndef NDEBUG
21612177
for (int k = 0; k < nc; k++) {
@@ -2276,10 +2292,14 @@ static void ggml_compute_forward_gelu_erf_f32(
22762292

22772293
const ggml_tensor * src0 = dst->src[0];
22782294

2279-
assert(ggml_is_contiguous_1(src0));
2280-
assert(ggml_is_contiguous_1(dst));
2295+
assert(ggml_is_contiguous_rows(src0));
22812296
assert(ggml_are_same_shape(src0, dst));
22822297

2298+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
2299+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
2300+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
2301+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
2302+
22832303
const int ith = params->ith;
22842304
const int nth = params->nth;
22852305

@@ -2293,10 +2313,14 @@ static void ggml_compute_forward_gelu_erf_f32(
22932313
const int ir0 = dr*ith;
22942314
const int ir1 = MIN(ir0 + dr, nr);
22952315

2296-
for (int i1 = ir0; i1 < ir1; i1++) {
2316+
for (int ir = ir0; ir < ir1; ++ir) {
2317+
const int i3 = ir/(ne02*ne01);
2318+
const int i2 = (ir - i3*ne02*ne01)/ne01;
2319+
const int i1 = (ir - i3*ne02*ne01 - i2*ne01);
2320+
22972321
ggml_vec_gelu_erf_f32(nc,
2298-
(float *) ((char *) dst->data + i1*( dst->nb[1])),
2299-
(float *) ((char *) src0->data + i1*(src0->nb[1])));
2322+
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1),
2323+
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
23002324

23012325
#ifndef NDEBUG
23022326
for (int k = 0; k < nc; k++) {
@@ -2315,10 +2339,14 @@ static void ggml_compute_forward_gelu_erf_f16(
23152339

23162340
const ggml_tensor * src0 = dst->src[0];
23172341

2318-
assert(ggml_is_contiguous_1(src0));
2319-
assert(ggml_is_contiguous_1(dst));
2342+
assert(ggml_is_contiguous_rows(src0));
23202343
assert(ggml_are_same_shape(src0, dst));
23212344

2345+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
2346+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
2347+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
2348+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
2349+
23222350
const int ith = params->ith;
23232351
const int nth = params->nth;
23242352

@@ -2332,10 +2360,14 @@ static void ggml_compute_forward_gelu_erf_f16(
23322360
const int ir0 = dr*ith;
23332361
const int ir1 = MIN(ir0 + dr, nr);
23342362

2335-
for (int i1 = ir0; i1 < ir1; i1++) {
2363+
for (int ir = ir0; ir < ir1; ++ir) {
2364+
const int i3 = ir/(ne02*ne01);
2365+
const int i2 = (ir - i3*ne02*ne01)/ne01;
2366+
const int i1 = (ir - i3*ne02*ne01 - i2*ne01);
2367+
23362368
ggml_vec_gelu_erf_f16(nc,
2337-
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
2338-
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
2369+
(ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1),
2370+
(ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
23392371

23402372
#ifndef NDEBUG
23412373
for (int k = 0; k < nc; k++) {
@@ -2379,10 +2411,14 @@ static void ggml_compute_forward_gelu_quick_f32(
23792411

23802412
const ggml_tensor * src0 = dst->src[0];
23812413

2382-
assert(ggml_is_contiguous_1(src0));
2383-
assert(ggml_is_contiguous_1(dst));
2414+
assert(ggml_is_contiguous_rows(src0));
23842415
assert(ggml_are_same_shape(src0, dst));
23852416

2417+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
2418+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
2419+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
2420+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
2421+
23862422
const int ith = params->ith;
23872423
const int nth = params->nth;
23882424

@@ -2396,10 +2432,14 @@ static void ggml_compute_forward_gelu_quick_f32(
23962432
const int ir0 = dr*ith;
23972433
const int ir1 = MIN(ir0 + dr, nr);
23982434

2399-
for (int i1 = ir0; i1 < ir1; i1++) {
2435+
for (int ir = ir0; ir < ir1; ++ir) {
2436+
const int i3 = ir/(ne02*ne01);
2437+
const int i2 = (ir - i3*ne02*ne01)/ne01;
2438+
const int i1 = (ir - i3*ne02*ne01 - i2*ne01);
2439+
24002440
ggml_vec_gelu_quick_f32(nc,
2401-
(float *) ((char *) dst->data + i1*( dst->nb[1])),
2402-
(float *) ((char *) src0->data + i1*(src0->nb[1])));
2441+
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1),
2442+
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
24032443

24042444
#ifndef NDEBUG
24052445
for (int k = 0; k < nc; k++) {
@@ -2418,10 +2458,14 @@ static void ggml_compute_forward_gelu_quick_f16(
24182458

24192459
const ggml_tensor * src0 = dst->src[0];
24202460

2421-
assert(ggml_is_contiguous_1(src0));
2422-
assert(ggml_is_contiguous_1(dst));
2461+
assert(ggml_is_contiguous_rows(src0));
24232462
assert(ggml_are_same_shape(src0, dst));
24242463

2464+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
2465+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
2466+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
2467+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
2468+
24252469
const int ith = params->ith;
24262470
const int nth = params->nth;
24272471

@@ -2435,10 +2479,14 @@ static void ggml_compute_forward_gelu_quick_f16(
24352479
const int ir0 = dr*ith;
24362480
const int ir1 = MIN(ir0 + dr, nr);
24372481

2438-
for (int i1 = ir0; i1 < ir1; i1++) {
2482+
for (int ir = ir0; ir < ir1; ++ir) {
2483+
const int i3 = ir/(ne02*ne01);
2484+
const int i2 = (ir - i3*ne02*ne01)/ne01;
2485+
const int i1 = (ir - i3*ne02*ne01 - i2*ne01);
2486+
24392487
ggml_vec_gelu_quick_f16(nc,
2440-
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
2441-
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
2488+
(ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1),
2489+
(ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
24422490

24432491
#ifndef NDEBUG
24442492
for (int k = 0; k < nc; k++) {
@@ -2482,10 +2530,14 @@ static void ggml_compute_forward_silu_f32(
24822530

24832531
const ggml_tensor * src0 = dst->src[0];
24842532

2485-
assert(ggml_is_contiguous_1(src0));
2486-
assert(ggml_is_contiguous_1(dst));
2533+
assert(ggml_is_contiguous_rows(src0));
24872534
assert(ggml_are_same_shape(src0, dst));
24882535

2536+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
2537+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
2538+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
2539+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
2540+
24892541
const int ith = params->ith;
24902542
const int nth = params->nth;
24912543

@@ -2499,10 +2551,14 @@ static void ggml_compute_forward_silu_f32(
24992551
const int ir0 = dr*ith;
25002552
const int ir1 = MIN(ir0 + dr, nr);
25012553

2502-
for (int i1 = ir0; i1 < ir1; i1++) {
2554+
for (int ir = ir0; ir < ir1; ++ir) {
2555+
const int i3 = ir/(ne02*ne01);
2556+
const int i2 = (ir - i3*ne02*ne01)/ne01;
2557+
const int i1 = (ir - i3*ne02*ne01 - i2*ne01);
2558+
25032559
ggml_vec_silu_f32(nc,
2504-
(float *) ((char *) dst->data + i1*( dst->nb[1])),
2505-
(float *) ((char *) src0->data + i1*(src0->nb[1])));
2560+
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1),
2561+
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
25062562

25072563
#ifndef NDEBUG
25082564
for (int k = 0; k < nc; k++) {
@@ -2521,10 +2577,14 @@ static void ggml_compute_forward_silu_f16(
25212577

25222578
const ggml_tensor * src0 = dst->src[0];
25232579

2524-
assert(ggml_is_contiguous_1(src0));
2525-
assert(ggml_is_contiguous_1(dst));
2580+
assert(ggml_is_contiguous_rows(src0));
25262581
assert(ggml_are_same_shape(src0, dst));
25272582

2583+
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
2584+
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
2585+
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
2586+
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
2587+
25282588
const int ith = params->ith;
25292589
const int nth = params->nth;
25302590

@@ -2538,10 +2598,14 @@ static void ggml_compute_forward_silu_f16(
25382598
const int ir0 = dr*ith;
25392599
const int ir1 = MIN(ir0 + dr, nr);
25402600

2541-
for (int i1 = ir0; i1 < ir1; i1++) {
2601+
for (int ir = ir0; ir < ir1; ++ir) {
2602+
const int i3 = ir/(ne02*ne01);
2603+
const int i2 = (ir - i3*ne02*ne01)/ne01;
2604+
const int i1 = (ir - i3*ne02*ne01 - i2*ne01);
2605+
25422606
ggml_vec_silu_f16(nc,
2543-
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
2544-
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
2607+
(ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1),
2608+
(ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
25452609

25462610
#ifndef NDEBUG
25472611
for (int k = 0; k < nc; k++) {

ggml/src/ggml-cpu/unary-ops.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ template <float (*op)(float), typename src0_t, typename dst_t>
111111
static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
112112
const ggml_tensor * src0 = dst->src[0];
113113

114-
GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
114+
GGML_ASSERT(ggml_is_contiguous_rows(src0) && ggml_is_contiguous_rows(dst) && ggml_are_same_shape(src0, dst));
115115

116116
GGML_TENSOR_UNARY_OP_LOCALS
117117

ggml/src/ggml-metal/ggml-metal-device.m

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,7 +1019,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
10191019
case GGML_OP_SIN:
10201020
case GGML_OP_COS:
10211021
case GGML_OP_LOG:
1022-
return ggml_is_contiguous_rows(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1022+
return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
10231023
case GGML_OP_UNARY:
10241024
switch (ggml_get_unary_op(op)) {
10251025
case GGML_UNARY_OP_TANH:
@@ -1039,7 +1039,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
10391039
case GGML_UNARY_OP_EXP:
10401040
case GGML_UNARY_OP_SOFTPLUS:
10411041
case GGML_UNARY_OP_EXPM1:
1042-
return ggml_is_contiguous_rows(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1042+
return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
10431043
default:
10441044
return false;
10451045
}

0 commit comments

Comments
 (0)