diff --git a/.github/actions/palace-ci/action.yml b/.github/actions/palace-ci/action.yml index 6d47c2d472..88706f5a7e 100644 --- a/.github/actions/palace-ci/action.yml +++ b/.github/actions/palace-ci/action.yml @@ -79,7 +79,7 @@ runs: MPI_IMPL="openmpi" C_CXX_COMPILER="${{ inputs.toolchain }}" fi - PALACE_SPEC="local.palace@develop+superlu-dist+mumps+sundials+strumpack+slepc+arpack${{ inputs.variant }} ${CUDA_ARGS}" + PALACE_SPEC="local.palace@develop+libxsmm+superlu-dist+mumps+sundials+strumpack+slepc+arpack${{ inputs.variant }} ${CUDA_ARGS}" cat << EOF > spack.yaml spack: diff --git a/spack_repo/local/packages/palace/package.py b/spack_repo/local/packages/palace/package.py index 6a8eb784db..16bc62d8ce 100644 --- a/spack_repo/local/packages/palace/package.py +++ b/spack_repo/local/packages/palace/package.py @@ -21,6 +21,7 @@ class Palace(CMakePackage, CudaPackage, ROCmPackage): maintainers("hughcars", "simlap", "cameronrutherford", "sbozzolo", "phdum") version("develop", branch="main") + version("0.16.0", tag="v0.16.0", commit="869ee5ced4850384410a7aeebc7c25f4c01be161") version("0.15.0", tag="v0.15.0", commit="b6762777d85a06072fdf4cc96e8a365da73df170") version("0.14.0", tag="v0.14.0", commit="a428a3a32dbbd6a2a6013b3b577016c3e9425abc") version("0.13.0", tag="v0.13.0", commit="a61c8cbe0cacf496cde3c62e93085fae0d6299ac") @@ -142,6 +143,13 @@ class Palace(CMakePackage, CudaPackage, ROCmPackage): depends_on("gslib+shared", when="+shared") depends_on("gslib~shared", when="~shared") + # Keep the BLAS threading model consistent with Palace's: without OpenMP, + # require non-threaded BLAS to avoid background thread pools (e.g., OpenBLAS + # OpenMP threads) that degrade performance; with OpenMP, require OpenMP BLAS. 
+ for blas in ("openblas", "amdblis", "blis"): + depends_on(f"{blas} threads=none", when=f"~openmp ^[virtuals=blas] {blas}") + depends_on(f"{blas} threads=openmp", when=f"+openmp ^[virtuals=blas] {blas}") + depends_on("metis@5:") depends_on("metis+shared", when="+shared") depends_on("metis~shared", when="~shared") diff --git a/spack_repo/patches/pr3292_mfem.diff b/spack_repo/patches/pr3292_mfem.diff deleted file mode 100644 index 77a6e3f918..0000000000 --- a/spack_repo/patches/pr3292_mfem.diff +++ /dev/null @@ -1,162 +0,0 @@ -diff --git a/repos/spack_repo/builtin/packages/mfem/mfem-4.9.patch b/repos/spack_repo/builtin/packages/mfem/mfem-4.9.patch -index d5f1a15080..5b8b352ee2 100644 ---- a/repos/spack_repo/builtin/packages/mfem/mfem-4.9.patch -+++ b/repos/spack_repo/builtin/packages/mfem/mfem-4.9.patch -@@ -1,5 +1,5 @@ - diff --git a/fem/fe/fe_base.cpp b/fem/fe/fe_base.cpp --index 535fdaca8b..281de35fac 100644 -+index 535fdaca8b..b951c9bdd1 100644 - --- a/fem/fe/fe_base.cpp - +++ b/fem/fe/fe_base.cpp - @@ -382,11 +382,7 @@ const DofToQuad &FiniteElement::GetDofToQuad(const IntegrationRule &ir, -@@ -15,14 +15,14 @@ index 535fdaca8b..281de35fac 100644 - if (!d2q) - { - #ifdef MFEM_THREAD_SAFE --@@ -661,14 +657,22 @@ void ScalarFiniteElement::ScalarLocalL2Restriction( -+@@ -661,58 +657,66 @@ void ScalarFiniteElement::ScalarLocalL2Restriction( - void NodalFiniteElement::CreateLexicographicFullMap(const IntegrationRule &ir) - const - { - + // Get the FULL version of the map. This call contains omp critical region, - + // so it is done before the critical region below. - + auto &d2q = GetDofToQuad(ir, DofToQuad::FULL); -- -+ - #if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) - #pragma omp critical (DofToQuad) - #endif -@@ -30,17 +30,91 @@ index 535fdaca8b..281de35fac 100644 - - // Get the FULL version of the map. - - auto &d2q = GetDofToQuad(ir, DofToQuad::FULL); - - //Undo the native ordering which is what FiniteElement::GetDofToQuad returns. 
--+ // If the new Dof2Quad is already present, e.g. added in a previous call --+ // or added by another omp thread, return. --+ if (DofToQuad::SearchArray(dof2quad_array, ir, --+ DofToQuad::LEXICOGRAPHIC_FULL)) --+ { return; } --+ --+ // Undo the native ordering which is what FiniteElement::GetDofToQuad --+ // returns. -- auto *d2q_new = new DofToQuad(d2q); -- d2q_new->mode = DofToQuad::LEXICOGRAPHIC_FULL; -- const int nqpt = ir.GetNPoints(); -+- auto *d2q_new = new DofToQuad(d2q); -+- d2q_new->mode = DofToQuad::LEXICOGRAPHIC_FULL; -+- const int nqpt = ir.GetNPoints(); -++ // Do work only if the new Dof2Quad is not already present, e.g. added in a -++ // previous call or added by another omp thread. -++ if (!DofToQuad::SearchArray(dof2quad_array, ir, -++ DofToQuad::LEXICOGRAPHIC_FULL)) -++ { -++ // Undo the native ordering which is what FiniteElement::GetDofToQuad -++ // returns. -++ auto *d2q_new = new DofToQuad(d2q); -++ d2q_new->mode = DofToQuad::LEXICOGRAPHIC_FULL; -++ const int nqpt = ir.GetNPoints(); -+ -+- const int b_dim = (range_type == VECTOR) ? dim : 1; -++ const int b_dim = (range_type == VECTOR) ? 
dim : 1; -+ -+- for (int i = 0; i < nqpt; i++) -+- { -+- for (int d = 0; d < b_dim; d++) -++ for (int i = 0; i < nqpt; i++) -+ { -+- for (int j = 0; j < dof; j++) -++ for (int d = 0; d < b_dim; d++) -+ { -+- const double val = d2q.B[i + nqpt*(d+b_dim*lex_ordering[j])]; -+- d2q_new->B[i+nqpt*(d+b_dim*j)] = val; -+- d2q_new->Bt[j+dof*(i+nqpt*d)] = val; -++ for (int j = 0; j < dof; j++) -++ { -++ const double val = d2q.B[i + nqpt*(d+b_dim*lex_ordering[j])]; -++ d2q_new->B[i+nqpt*(d+b_dim*j)] = val; -++ d2q_new->Bt[j+dof*(i+nqpt*d)] = val; -++ } -+ } -+ } -+- } -+ -+- const int g_dim = [this]() -+- { -+- switch (deriv_type) -++ const int g_dim = [this]() -+ { -+- case GRAD: return dim; -+- case DIV: return 1; -+- case CURL: return cdim; -+- default: return 0; -+- } -+- }(); -++ switch (deriv_type) -++ { -++ case GRAD: return dim; -++ case DIV: return 1; -++ case CURL: return cdim; -++ default: return 0; -++ } -++ }(); -+ -+- for (int i = 0; i < nqpt; i++) -+- { -+- for (int d = 0; d < g_dim; d++) -++ for (int i = 0; i < nqpt; i++) -+ { -+- for (int j = 0; j < dof; j++) -++ for (int d = 0; d < g_dim; d++) -+ { -+- const double val = d2q.G[i + nqpt*(d+g_dim*lex_ordering[j])]; -+- d2q_new->G[i+nqpt*(d+g_dim*j)] = val; -+- d2q_new->Gt[j+dof*(i+nqpt*d)] = val; -++ for (int j = 0; j < dof; j++) -++ { -++ const double val = d2q.G[i + nqpt*(d+g_dim*lex_ordering[j])]; -++ d2q_new->G[i+nqpt*(d+g_dim*j)] = val; -++ d2q_new->Gt[j+dof*(i+nqpt*d)] = val; -++ } -+ } -+ } -+- } -+ -+- dof2quad_array.Append(d2q_new); -++ dof2quad_array.Append(d2q_new); -++ } -+ } -+ } -+ - @@ -724,13 +728,7 @@ const DofToQuad &NodalFiniteElement::GetDofToQuad(const IntegrationRule &ir, - #pragma omp critical (DofToQuad) - #endif -@@ -78,7 +152,7 @@ index a579faf835..ad356bcd2d 100644 - --- a/fem/fe/fe_base.hpp - +++ b/fem/fe/fe_base.hpp - @@ -222,6 +222,12 @@ public: -- -+ - /// Returns absolute value of the maps - DofToQuad Abs() const; - + -@@ -88,12 +162,12 @@ index a579faf835..ad356bcd2d 100644 - + 
const IntegrationRule &ir, - + DofToQuad::Mode mode); - }; -- -+ - /// Describes the function space on each element - @@ -1376,6 +1382,21 @@ public: - void InvertLinearTrans(ElementTransformation &trans, - const IntegrationPoint &pt, Vector &x); -- -+ - + - +// static inline method - +inline DofToQuad *DofToQuad::SearchArray( -@@ -110,5 +184,5 @@ index a579faf835..ad356bcd2d 100644 - +} - + - } // namespace mfem -- -+ - #endif diff --git a/spack_repo/patches/pr3778_libxsmm.diff b/spack_repo/patches/pr3778_libxsmm.diff new file mode 100644 index 0000000000..b6786480fb --- /dev/null +++ b/spack_repo/patches/pr3778_libxsmm.diff @@ -0,0 +1,13 @@ +diff --git a/repos/spack_repo/builtin/packages/libxsmm/package.py b/repos/spack_repo/builtin/packages/libxsmm/package.py +index dcd4f38782..f07d7c9731 100644 +--- a/repos/spack_repo/builtin/packages/libxsmm/package.py ++++ b/repos/spack_repo/builtin/packages/libxsmm/package.py +@@ -120,7 +120,7 @@ class Libxsmm(MakefilePackage): + "FC={0}".format(spack_fc), + "PREFIX=%s" % prefix, + ] +- if spec.target.family == "aarch64": ++ if spec.satisfies("@1.17-cp2k") and spec.target.family == "aarch64": + make_args += ["PLATFORM=1"] + else: + make_args += ["SYM=1"] diff --git a/spack_repo/patches/pr3783_strumpack.diff b/spack_repo/patches/pr3783_strumpack.diff new file mode 100644 index 0000000000..29302cb827 --- /dev/null +++ b/spack_repo/patches/pr3783_strumpack.diff @@ -0,0 +1,24 @@ +diff --git a/repos/spack_repo/builtin/packages/strumpack/package.py b/repos/spack_repo/builtin/packages/strumpack/package.py +index aaf525901d..dbc9f9458e 100644 +--- a/repos/spack_repo/builtin/packages/strumpack/package.py ++++ b/repos/spack_repo/builtin/packages/strumpack/package.py +@@ -58,7 +58,7 @@ class Strumpack(CMakePackage, CudaPackage, ROCmPackage): + variant("shared", default=True, description="Build shared libraries") + variant("mpi", default=True, description="Use MPI") + variant( +- "openmp", default=True, description="Enable thread 
parallellism via tasking with OpenMP" ++ "openmp", default=True, description="Enable thread parallelism via tasking with OpenMP" + ) + variant("parmetis", default=True, description="Enable use of ParMetis") + variant("scotch", default=False, description="Enable use of Scotch") +@@ -79,7 +79,9 @@ class Strumpack(CMakePackage, CudaPackage, ROCmPackage): + depends_on("mpi", when="+mpi") + depends_on("blas") + depends_on("lapack") +- depends_on("openblas threads=openmp", when="^[virtuals=blas] openblas") ++ for blas in ("openblas", "amdblis", "blis"): ++ depends_on(f"{blas} threads=openmp", when=f"+openmp ^[virtuals=blas] {blas}") ++ depends_on(f"{blas} threads=none", when=f"~openmp ^[virtuals=blas] {blas}") + depends_on("scalapack", when="+mpi") + depends_on("metis") + depends_on("parmetis", when="+parmetis")