Skip to content

Commit 0e8e068

Browse files
committed
sve - initial SVE backend framework
1 parent 54624e1 commit 0e8e068

File tree

8 files changed

+421
-0
lines changed

8 files changed

+421
-0
lines changed

Makefile

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ blocked.c := $(sort $(wildcard backends/blocked/*.c))
240240
ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
241241
opt.c := $(sort $(wildcard backends/opt/*.c))
242242
avx.c := $(sort $(wildcard backends/avx/*.c))
243+
sve.c := $(sort $(wildcard backends/sve/*.c))
243244
xsmm.c := $(sort $(wildcard backends/xsmm/*.c))
244245
cuda.c := $(sort $(wildcard backends/cuda/*.c))
245246
cuda.cpp := $(sort $(wildcard backends/cuda/*.cpp))
@@ -308,6 +309,7 @@ info:
308309
$(info ------------------------------------)
309310
$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
310311
$(info AVX_STATUS = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
312+
$(info SVE_STATUS = $(SVE_STATUS)$(call backend_status,$(SVE_BACKENDS)))
311313
$(info XSMM_DIR = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
312314
$(info OCCA_DIR = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
313315
$(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
@@ -359,6 +361,17 @@ ifneq ($(AVX),)
359361
BACKENDS_MAKE += $(AVX_BACKENDS)
360362
endif
361363

364+
# SVE Backends
365+
SVE_STATUS = Disabled
366+
AVX_FLAG := $(if $(filter clang,$(CC_VENDOR)),+sve,-msve)
367+
SVE := $(filter $(SVE_FLAG),$(shell $(CC) $(CFLAGS:-M%=) -v -E -x c /dev/null 2>&1))
368+
SVE_BACKENDS = /cpu/self/sve/serial /cpu/self/sve/blocked
369+
ifneq ($(SVE),)
370+
SVE_STATUS = Enabled
371+
libceed.c += $(sve.c)
372+
BACKENDS_MAKE += $(SVE_BACKENDS)
373+
endif
374+
362375
# Collect list of libraries and paths for use in linking and pkg-config
363376
PKG_LIBS =
364377
# Stubs that will not be RPATH'd

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,8 @@ There are multiple supported backends, which can be selected at runtime in the e
139139
| `/cpu/self/opt/blocked` | Blocked optimized C implementation | Yes |
140140
| `/cpu/self/avx/serial` | Serial AVX implementation | Yes |
141141
| `/cpu/self/avx/blocked` | Blocked AVX implementation | Yes |
142+
| `/cpu/self/sve/serial` | Serial SVE implementation | Yes |
143+
| `/cpu/self/sve/blocked` | Blocked SVE implementation | Yes |
142144
||
143145
| **CPU Valgrind** |
144146
| `/cpu/self/memcheck/*` | Memcheck backends, undefined value checks | Yes |
@@ -180,6 +182,8 @@ The `/cpu/self/opt/*` backends are written in pure C and use partial e-vectors t
180182

181183
The `/cpu/self/avx/*` backends rely upon AVX instructions to provide vectorized CPU performance.
182184

185+
The `/cpu/self/sve/*` backends rely upon SVE instructions to provide vectorized CPU performance.
186+
183187
The `/cpu/self/memcheck/*` backends rely upon the [Valgrind](http://valgrind.org/) Memcheck tool to help verify that user QFunctions have no undefined values.
184188
To use, run your code with Valgrind and the Memcheck backends, e.g. `valgrind ./build/ex1 -ceed /cpu/self/ref/memcheck`.
185189
A 'development' or 'debugging' version of Valgrind with headers is required to use this backend.

backends/ceed-backend-list.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,7 @@ CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
2828
CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
2929
CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
3030
CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
31+
CEED_BACKEND(CeedRegister_Sve_Serial, 1, "/cpu/self/sve/serial")
32+
CEED_BACKEND(CeedRegister_Sve_Blocked, 1, "/cpu/self/sve/blocked")
3133
CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
3234
CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")

backends/sve/ceed-sve-blocked.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2+
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3+
//
4+
// SPDX-License-Identifier: BSD-2-Clause
5+
//
6+
// This file is part of CEED: http://github.com/ceed
7+
8+
#include <ceed.h>
9+
#include <ceed/backend.h>
10+
#include <stdbool.h>
11+
#include <string.h>
12+
13+
#include "ceed-sve.h"
14+
15+
//------------------------------------------------------------------------------
16+
// Backend Init
17+
//------------------------------------------------------------------------------
18+
static int CeedInit_Sve(const char *resource, Ceed ceed) {
19+
CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve") && strcmp(resource, "/cpu/self/sve/blocked"), ceed,
20+
CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource);
21+
CeedCallBackend(CeedSetDeterministic(ceed, true));
22+
23+
// Create reference CEED that implementation will be dispatched through unless overridden
24+
Ceed ceed_ref;
25+
CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
26+
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
27+
28+
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
29+
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
30+
} else {
31+
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve);
32+
}
33+
34+
return CEED_ERROR_SUCCESS;
35+
}
36+
37+
//------------------------------------------------------------------------------
38+
// Backend Register
39+
//------------------------------------------------------------------------------
40+
CEED_INTERN int CeedRegister_Sve_Blocked(void) { return CeedRegister("/cpu/self/sve/blocked", CeedInit_Sve, 30); }
41+
//------------------------------------------------------------------------------

backends/sve/ceed-sve-serial.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2+
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3+
//
4+
// SPDX-License-Identifier: BSD-2-Clause
5+
//
6+
// This file is part of CEED: http://github.com/ceed
7+
8+
#include <ceed.h>
9+
#include <ceed/backend.h>
10+
#include <stdbool.h>
11+
#include <string.h>
12+
13+
#include "ceed-sve.h"
14+
15+
//------------------------------------------------------------------------------
16+
// Backend Init
17+
//------------------------------------------------------------------------------
18+
static int CeedInit_Sve(const char *resource, Ceed ceed) {
19+
CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve/serial"), ceed, CEED_ERROR_BACKEND,
20+
"SVE backend cannot use resource: %s", resource);
21+
CeedCallBackend(CeedSetDeterministic(ceed, true));
22+
23+
// Create reference CEED that implementation will be dispatched through unless overridden
24+
Ceed ceed_ref;
25+
CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
26+
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
27+
28+
if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
29+
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
30+
} else {
31+
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve));
32+
}
33+
34+
return CEED_ERROR_SUCCESS;
35+
}
36+
37+
//------------------------------------------------------------------------------
38+
// Backend Register
39+
//------------------------------------------------------------------------------
40+
CEED_INTERN int CeedRegister_Sve_Serial(void) { return CeedRegister("/cpu/self/sve/serial", CeedInit_Sve, 35); }
41+
42+
//------------------------------------------------------------------------------

backends/sve/ceed-sve-tensor-f32.c

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2+
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3+
//
4+
// SPDX-License-Identifier: BSD-2-Clause
5+
//
6+
// This file is part of CEED: http://github.com/ceed
7+
8+
#include <ceed.h>
9+
#include <ceed/backend.h>
10+
#ifdef __ARM_FEATURE_SVE
11+
#include <arm_sve.h>
12+
#endif
13+
#include <stdbool.h>
14+
15+
#include "ceed-sve.h"
16+
17+
//------------------------------------------------------------------------------
18+
// Blocked Tensor Contract
19+
//------------------------------------------------------------------------------
20+
static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
21+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
22+
const CeedInt JJ) {
23+
CeedInt t_stride_0 = B, t_stride_1 = 1;
24+
if (t_mode == CEED_TRANSPOSE) {
25+
t_stride_0 = 1;
26+
t_stride_1 = J;
27+
}
28+
29+
for (CeedInt a = 0; a < A; a++) {
30+
for (CeedInt b = 0; b < B; b++) {
31+
// Blocks of JJ rows
32+
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
33+
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
34+
// C vectorization by compiler
35+
for (int32_t c = 0; c < C; c += svcntd()) {
36+
svbool_t pg = svwhilelt_b32(c, C);
37+
// Load u, v into vectors
38+
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
39+
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
40+
// Basis matrix value
41+
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
42+
// fmadd
43+
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
44+
}
45+
}
46+
}
47+
}
48+
}
49+
50+
// Remainder of rows
51+
CeedInt j = (J / JJ) * JJ;
52+
if (j < J) {
53+
for (CeedInt a = 0; a < A; a++) {
54+
for (CeedInt b = 0; b < B; b++) {
55+
// Blocks of JJ rows
56+
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
57+
// C vectorization by compiler
58+
for (int32_t c = 0; c < C; c += svcntd()) {
59+
svbool_t pg = svwhilelt_b32(c, C);
60+
// Load u, v into vectors
61+
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
62+
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
63+
// Basis matrix value
64+
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
65+
// fmadd
66+
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
67+
}
68+
}
69+
}
70+
}
71+
}
72+
73+
return CEED_ERROR_SUCCESS;
74+
}
75+
76+
//------------------------------------------------------------------------------
77+
// Blocked Tensor Contract
78+
//------------------------------------------------------------------------------
79+
static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
80+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
81+
const CeedInt JJ) {
82+
CeedInt t_stride_0 = B, t_stride_1 = 1;
83+
if (t_mode == CEED_TRANSPOSE) {
84+
t_stride_0 = 1;
85+
t_stride_1 = J;
86+
}
87+
88+
for (CeedInt a = 0; a < A; a++) {
89+
for (CeedInt b = 0; b < B; b++) {
90+
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
91+
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
92+
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
93+
}
94+
}
95+
}
96+
}
97+
98+
CeedInt j = (J / JJ) * JJ;
99+
if (j < J) {
100+
for (CeedInt a = 0; a < A; a++) {
101+
for (CeedInt b = 0; b < B; b++) {
102+
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
103+
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
104+
}
105+
}
106+
}
107+
}
108+
109+
return CEED_ERROR_SUCCESS;
110+
}
111+
112+
//------------------------------------------------------------------------------
113+
// Tensor Contract - Common Sizes
114+
//------------------------------------------------------------------------------
115+
static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
116+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
117+
return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8);
118+
}
119+
static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
120+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
121+
return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8);
122+
}
123+
124+
//------------------------------------------------------------------------------
125+
// Tensor Contract Apply
126+
//------------------------------------------------------------------------------
127+
static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
128+
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
129+
if (!add) {
130+
for (CeedInt q = 0; q < A * J * C; q++) v[q] = (float)0.0;
131+
}
132+
133+
if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v);
134+
else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v);
135+
136+
return CEED_ERROR_SUCCESS;
137+
}
138+
139+
//------------------------------------------------------------------------------
140+
// Tensor Contract Create
141+
//------------------------------------------------------------------------------
142+
int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract) {
143+
Ceed ceed;
144+
CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
145+
146+
CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve));
147+
148+
return CEED_ERROR_SUCCESS;
149+
}
150+
151+
//------------------------------------------------------------------------------

0 commit comments

Comments
 (0)