sve - initial SVE backend framework

jeremylt · jeremylt · commit 0e8e068155d9 · 2023-06-15T10:28:22.000-06:00
diff --git a/Makefile b/Makefile
@@ -240,6 +240,7 @@ blocked.c      := $(sort $(wildcard backends/blocked/*.c))
 ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
 opt.c          := $(sort $(wildcard backends/opt/*.c))
 avx.c          := $(sort $(wildcard backends/avx/*.c))
+sve.c          := $(sort $(wildcard backends/sve/*.c))
 xsmm.c         := $(sort $(wildcard backends/xsmm/*.c))
 cuda.c         := $(sort $(wildcard backends/cuda/*.c))
 cuda.cpp       := $(sort $(wildcard backends/cuda/*.cpp))
@@ -308,6 +309,7 @@ info:
 	$(info ------------------------------------)
 	$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
 	$(info AVX_STATUS    = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
+	$(info SVE_STATUS    = $(SVE_STATUS)$(call backend_status,$(SVE_BACKENDS)))
 	$(info XSMM_DIR      = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
 	$(info OCCA_DIR      = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
 	$(info MAGMA_DIR     = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
@@ -359,6 +361,17 @@ ifneq ($(AVX),)
   BACKENDS_MAKE += $(AVX_BACKENDS)
 endif
 
+# SVE Backends
+SVE_STATUS = Disabled
+AVX_FLAG  := $(if $(filter clang,$(CC_VENDOR)),+sve,-msve)
+SVE       := $(filter $(SVE_FLAG),$(shell $(CC) $(CFLAGS:-M%=) -v -E -x c /dev/null 2>&1))
+SVE_BACKENDS = /cpu/self/sve/serial /cpu/self/sve/blocked
+ifneq ($(SVE),)
+  SVE_STATUS = Enabled
+  libceed.c += $(sve.c)
+  BACKENDS_MAKE += $(SVE_BACKENDS)
+endif
+
 # Collect list of libraries and paths for use in linking and pkg-config
 PKG_LIBS =
 # Stubs that will not be RPATH'd
diff --git a/README.md b/README.md
@@ -139,6 +139,8 @@ There are multiple supported backends, which can be selected at runtime in the e
 | `/cpu/self/opt/blocked`    | Blocked optimized C implementation                | Yes                   |
 | `/cpu/self/avx/serial`     | Serial AVX implementation                         | Yes                   |
 | `/cpu/self/avx/blocked`    | Blocked AVX implementation                        | Yes                   |
+| `/cpu/self/sve/serial`     | Serial SVE implementation                         | Yes                   |
+| `/cpu/self/sve/blocked`    | Blocked SVE implementation                        | Yes                   |
 ||
 | **CPU Valgrind**           |
 | `/cpu/self/memcheck/*`     | Memcheck backends, undefined value checks         | Yes                   |
@@ -180,6 +182,8 @@ The `/cpu/self/opt/*` backends are written in pure C and use partial e-vectors t
 
 The `/cpu/self/avx/*` backends rely upon AVX instructions to provide vectorized CPU performance.
 
+The `/cpu/self/sve/*` backends rely upon SVE instructions to provide vectorized CPU performance.
+
 The `/cpu/self/memcheck/*` backends rely upon the [Valgrind](http://valgrind.org/) Memcheck tool to help verify that user QFunctions have no undefined values.
 To use, run your code with Valgrind and the Memcheck backends, e.g. `valgrind ./build/ex1 -ceed /cpu/self/ref/memcheck`.
 A 'development' or 'debugging' version of Valgrind with headers is required to use this backend.
diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h
@@ -28,5 +28,7 @@ CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
 CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
 CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
 CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
+CEED_BACKEND(CeedRegister_Sve_Serial, 1, "/cpu/self/sve/serial")
+CEED_BACKEND(CeedRegister_Sve_Blocked, 1, "/cpu/self/sve/blocked")
 CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
 CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
diff --git a/backends/sve/ceed-sve-blocked.c b/backends/sve/ceed-sve-blocked.c
@@ -0,0 +1,41 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "ceed-sve.h"
+
+//------------------------------------------------------------------------------
+// Backend Init
+//------------------------------------------------------------------------------
+static int CeedInit_Sve(const char *resource, Ceed ceed) {
+  CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve") && strcmp(resource, "/cpu/self/sve/blocked"), ceed,
+            CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource);
+  CeedCallBackend(CeedSetDeterministic(ceed, true));
+
+  // Create reference CEED that implementation will be dispatched through unless overridden
+  Ceed ceed_ref;
+  CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
+  CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+
+  if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
+  } else {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve);
+  }
+
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Backend Register
+//------------------------------------------------------------------------------
+CEED_INTERN int CeedRegister_Sve_Blocked(void) { return CeedRegister("/cpu/self/sve/blocked", CeedInit_Sve, 30); }
+//------------------------------------------------------------------------------
diff --git a/backends/sve/ceed-sve-serial.c b/backends/sve/ceed-sve-serial.c
@@ -0,0 +1,42 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "ceed-sve.h"
+
+//------------------------------------------------------------------------------
+// Backend Init
+//------------------------------------------------------------------------------
+static int CeedInit_Sve(const char *resource, Ceed ceed) {
+  CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve/serial"), ceed, CEED_ERROR_BACKEND,
+            "SVE backend cannot use resource: %s", resource);
+  CeedCallBackend(CeedSetDeterministic(ceed, true));
+
+  // Create reference CEED that implementation will be dispatched through unless overridden
+  Ceed ceed_ref;
+  CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
+  CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+
+  if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
+  } else {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve));
+  }
+
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Backend Register
+//------------------------------------------------------------------------------
+CEED_INTERN int CeedRegister_Sve_Serial(void) { return CeedRegister("/cpu/self/sve/serial", CeedInit_Sve, 35); }
+
+//------------------------------------------------------------------------------
diff --git a/backends/sve/ceed-sve-tensor-f32.c b/backends/sve/ceed-sve-tensor-f32.c
@@ -0,0 +1,151 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+#include <stdbool.h>
+
+#include "ceed-sve.h"
+
+//------------------------------------------------------------------------------
+// Blocked Tensor Contract
+//------------------------------------------------------------------------------
+static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                                 CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
+                                                 const CeedInt JJ) {
+  CeedInt t_stride_0 = B, t_stride_1 = 1;
+  if (t_mode == CEED_TRANSPOSE) {
+    t_stride_0 = 1;
+    t_stride_1 = J;
+  }
+
+  for (CeedInt a = 0; a < A; a++) {
+    for (CeedInt b = 0; b < B; b++) {
+      // Blocks of JJ rows
+      for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
+        for (CeedInt jj = 0; jj < JJ; jj++) {  // unroll
+          // C vectorization by compiler
+          for (int32_t c = 0; c < C; c += svcntd()) {
+            svbool_t pg = svwhilelt_b32(c, C);
+            // Load u, v into vectors
+            svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
+            svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
+            // Basis matrix value
+            float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
+            // fmadd
+            svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
+          }
+        }
+      }
+    }
+  }
+
+  // Remainder of rows
+  CeedInt j = (J / JJ) * JJ;
+  if (j < J) {
+    for (CeedInt a = 0; a < A; a++) {
+      for (CeedInt b = 0; b < B; b++) {
+        // Blocks of JJ rows
+        for (CeedInt jj = 0; jj < J - j; jj++) {  // not unrolled
+          // C vectorization by compiler
+          for (int32_t c = 0; c < C; c += svcntd()) {
+            svbool_t pg = svwhilelt_b32(c, C);
+            // Load u, v into vectors
+            svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
+            svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
+            // Basis matrix value
+            float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
+            // fmadd
+            svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
+          }
+        }
+      }
+    }
+  }
+
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Blocked Tensor Contract
+//------------------------------------------------------------------------------
+static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                                CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
+                                                const CeedInt JJ) {
+  CeedInt t_stride_0 = B, t_stride_1 = 1;
+  if (t_mode == CEED_TRANSPOSE) {
+    t_stride_0 = 1;
+    t_stride_1 = J;
+  }
+
+  for (CeedInt a = 0; a < A; a++) {
+    for (CeedInt b = 0; b < B; b++) {
+      for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
+        for (CeedInt jj = 0; jj < JJ; jj++) {  // unroll
+          v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
+        }
+      }
+    }
+  }
+
+  CeedInt j = (J / JJ) * JJ;
+  if (j < J) {
+    for (CeedInt a = 0; a < A; a++) {
+      for (CeedInt b = 0; b < B; b++) {
+        for (CeedInt jj = 0; jj < J - j; jj++) {  // not unrolled
+          v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
+        }
+      }
+    }
+  }
+
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Tensor Contract - Common Sizes
+//------------------------------------------------------------------------------
+static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                            CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
+  return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8);
+}
+static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                           CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
+  return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8);
+}
+
+//------------------------------------------------------------------------------
+// Tensor Contract Apply
+//------------------------------------------------------------------------------
+static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                       CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
+  if (!add) {
+    for (CeedInt q = 0; q < A * J * C; q++) v[q] = (float)0.0;
+  }
+
+  if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v);
+  else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v);
+
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Tensor Contract Create
+//------------------------------------------------------------------------------
+int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract) {
+  Ceed ceed;
+  CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
+
+  CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve));
+
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/sve/ceed-sve-tensor-f64.c b/backends/sve/ceed-sve-tensor-f64.c
diff --git a/backends/sve/ceed-sve.h b/backends/sve/ceed-sve.h