|
| 1 | +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. |
| 2 | +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. |
| 3 | +// |
| 4 | +// SPDX-License-Identifier: BSD-2-Clause |
| 5 | +// |
| 6 | +// This file is part of CEED: http://github.com/ceed |
| 7 | + |
| 8 | +#include <ceed.h> |
| 9 | +#include <ceed/backend.h> |
| 10 | +#ifdef __ARM_FEATURE_SVE |
| 11 | +#include <arm_sve.h> |
| 12 | +#endif |
| 13 | +#include <stdbool.h> |
| 14 | + |
| 15 | +#include "ceed-sve.h" |
| 16 | + |
| 17 | +//------------------------------------------------------------------------------ |
| 18 | +// Blocked Tensor Contract |
| 19 | +//------------------------------------------------------------------------------ |
| 20 | +static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, |
| 21 | + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v, |
| 22 | + const CeedInt JJ) { |
| 23 | + CeedInt t_stride_0 = B, t_stride_1 = 1; |
| 24 | + if (t_mode == CEED_TRANSPOSE) { |
| 25 | + t_stride_0 = 1; |
| 26 | + t_stride_1 = J; |
| 27 | + } |
| 28 | + |
| 29 | + for (CeedInt a = 0; a < A; a++) { |
| 30 | + for (CeedInt b = 0; b < B; b++) { |
| 31 | + // Blocks of JJ rows |
| 32 | + for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { |
| 33 | + for (CeedInt jj = 0; jj < JJ; jj++) { // unroll |
| 34 | + // C vectorization by compiler |
| 35 | + for (int32_t c = 0; c < C; c += svcntd()) { |
| 36 | + svbool_t pg = svwhilelt_b32(c, C); |
| 37 | + // Load u, v into vectors |
| 38 | + svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]); |
| 39 | + svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]); |
| 40 | + // Basis matrix value |
| 41 | + float tq = t[(j + jj) * t_stride_0 + b * t_stride_1]; |
| 42 | + // fmadd |
| 43 | + svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq)); |
| 44 | + } |
| 45 | + } |
| 46 | + } |
| 47 | + } |
| 48 | + } |
| 49 | + |
| 50 | + // Remainder of rows |
| 51 | + CeedInt j = (J / JJ) * JJ; |
| 52 | + if (j < J) { |
| 53 | + for (CeedInt a = 0; a < A; a++) { |
| 54 | + for (CeedInt b = 0; b < B; b++) { |
| 55 | + // Blocks of JJ rows |
| 56 | + for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled |
| 57 | + // C vectorization by compiler |
| 58 | + for (int32_t c = 0; c < C; c += svcntd()) { |
| 59 | + svbool_t pg = svwhilelt_b32(c, C); |
| 60 | + // Load u, v into vectors |
| 61 | + svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]); |
| 62 | + svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]); |
| 63 | + // Basis matrix value |
| 64 | + float tq = t[(j + jj) * t_stride_0 + b * t_stride_1]; |
| 65 | + // fmadd |
| 66 | + svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq)); |
| 67 | + } |
| 68 | + } |
| 69 | + } |
| 70 | + } |
| 71 | + } |
| 72 | + |
| 73 | + return CEED_ERROR_SUCCESS; |
| 74 | +} |
| 75 | + |
| 76 | +//------------------------------------------------------------------------------ |
| 77 | +// Blocked Tensor Contract |
| 78 | +//------------------------------------------------------------------------------ |
| 79 | +static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, |
| 80 | + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v, |
| 81 | + const CeedInt JJ) { |
| 82 | + CeedInt t_stride_0 = B, t_stride_1 = 1; |
| 83 | + if (t_mode == CEED_TRANSPOSE) { |
| 84 | + t_stride_0 = 1; |
| 85 | + t_stride_1 = J; |
| 86 | + } |
| 87 | + |
| 88 | + for (CeedInt a = 0; a < A; a++) { |
| 89 | + for (CeedInt b = 0; b < B; b++) { |
| 90 | + for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { |
| 91 | + for (CeedInt jj = 0; jj < JJ; jj++) { // unroll |
| 92 | + v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b]; |
| 93 | + } |
| 94 | + } |
| 95 | + } |
| 96 | + } |
| 97 | + |
| 98 | + CeedInt j = (J / JJ) * JJ; |
| 99 | + if (j < J) { |
| 100 | + for (CeedInt a = 0; a < A; a++) { |
| 101 | + for (CeedInt b = 0; b < B; b++) { |
| 102 | + for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled |
| 103 | + v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b]; |
| 104 | + } |
| 105 | + } |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + return CEED_ERROR_SUCCESS; |
| 110 | +} |
| 111 | + |
| 112 | +//------------------------------------------------------------------------------ |
| 113 | +// Tensor Contract - Common Sizes |
| 114 | +//------------------------------------------------------------------------------ |
| 115 | +static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, |
| 116 | + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) { |
| 117 | + return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8); |
| 118 | +} |
| 119 | +static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, |
| 120 | + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) { |
| 121 | + return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8); |
| 122 | +} |
| 123 | + |
| 124 | +//------------------------------------------------------------------------------ |
| 125 | +// Tensor Contract Apply |
| 126 | +//------------------------------------------------------------------------------ |
| 127 | +static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t, |
| 128 | + CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) { |
| 129 | + if (!add) { |
| 130 | + for (CeedInt q = 0; q < A * J * C; q++) v[q] = (float)0.0; |
| 131 | + } |
| 132 | + |
| 133 | + if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v); |
| 134 | + else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v); |
| 135 | + |
| 136 | + return CEED_ERROR_SUCCESS; |
| 137 | +} |
| 138 | + |
| 139 | +//------------------------------------------------------------------------------ |
| 140 | +// Tensor Contract Create |
| 141 | +//------------------------------------------------------------------------------ |
| 142 | +int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract) { |
| 143 | + Ceed ceed; |
| 144 | + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); |
| 145 | + |
| 146 | + CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve)); |
| 147 | + |
| 148 | + return CEED_ERROR_SUCCESS; |
| 149 | +} |
| 150 | + |
| 151 | +//------------------------------------------------------------------------------ |
0 commit comments