diff --git a/.github/actions/palace-ci/action.yml b/.github/actions/palace-ci/action.yml
index 6d47c2d472..88706f5a7e 100644
--- a/.github/actions/palace-ci/action.yml
+++ b/.github/actions/palace-ci/action.yml
@@ -79,7 +79,7 @@ runs:
           MPI_IMPL="openmpi"
           C_CXX_COMPILER="${{ inputs.toolchain }}"
         fi
-        PALACE_SPEC="local.palace@develop+superlu-dist+mumps+sundials+strumpack+slepc+arpack${{ inputs.variant }} ${CUDA_ARGS}"
+        PALACE_SPEC="local.palace@develop+libxsmm+superlu-dist+mumps+sundials+strumpack+slepc+arpack${{ inputs.variant }} ${CUDA_ARGS}"
 
         cat << EOF > spack.yaml
         spack:
diff --git a/spack_repo/local/packages/palace/package.py b/spack_repo/local/packages/palace/package.py
index 6a8eb784db..16bc62d8ce 100644
--- a/spack_repo/local/packages/palace/package.py
+++ b/spack_repo/local/packages/palace/package.py
@@ -21,6 +21,7 @@ class Palace(CMakePackage, CudaPackage, ROCmPackage):
     maintainers("hughcars", "simlap", "cameronrutherford", "sbozzolo", "phdum")
 
     version("develop", branch="main")
+    version("0.16.0", tag="v0.16.0", commit="869ee5ced4850384410a7aeebc7c25f4c01be161")
     version("0.15.0", tag="v0.15.0", commit="b6762777d85a06072fdf4cc96e8a365da73df170")
     version("0.14.0", tag="v0.14.0", commit="a428a3a32dbbd6a2a6013b3b577016c3e9425abc")
     version("0.13.0", tag="v0.13.0", commit="a61c8cbe0cacf496cde3c62e93085fae0d6299ac")
@@ -142,6 +143,13 @@ class Palace(CMakePackage, CudaPackage, ROCmPackage):
         depends_on("gslib+shared", when="+shared")
         depends_on("gslib~shared", when="~shared")
 
+    # Match BLAS threading to Palace's OpenMP setting. Without OpenMP, require
+    # non-threaded BLAS to avoid background thread pools (e.g., OpenBLAS OpenMP
+    # threads) that degrade performance; with OpenMP, require threads=openmp.
+    for blas in ("openblas", "amdblis", "blis"):
+        depends_on(f"{blas} threads=none", when=f"~openmp ^[virtuals=blas] {blas}")
+        depends_on(f"{blas} threads=openmp", when=f"+openmp ^[virtuals=blas] {blas}")
+
     depends_on("metis@5:")
     depends_on("metis+shared", when="+shared")
     depends_on("metis~shared", when="~shared")
diff --git a/spack_repo/patches/pr3292_mfem.diff b/spack_repo/patches/pr3292_mfem.diff
deleted file mode 100644
index 77a6e3f918..0000000000
--- a/spack_repo/patches/pr3292_mfem.diff
+++ /dev/null
@@ -1,162 +0,0 @@
-diff --git a/repos/spack_repo/builtin/packages/mfem/mfem-4.9.patch b/repos/spack_repo/builtin/packages/mfem/mfem-4.9.patch
-index d5f1a15080..5b8b352ee2 100644
---- a/repos/spack_repo/builtin/packages/mfem/mfem-4.9.patch
-+++ b/repos/spack_repo/builtin/packages/mfem/mfem-4.9.patch
-@@ -1,5 +1,5 @@
- diff --git a/fem/fe/fe_base.cpp b/fem/fe/fe_base.cpp
--index 535fdaca8b..281de35fac 100644
-+index 535fdaca8b..b951c9bdd1 100644
- --- a/fem/fe/fe_base.cpp
- +++ b/fem/fe/fe_base.cpp
- @@ -382,11 +382,7 @@ const DofToQuad &FiniteElement::GetDofToQuad(const IntegrationRule &ir,
-@@ -15,14 +15,14 @@ index 535fdaca8b..281de35fac 100644
-        if (!d2q)
-        {
-  #ifdef MFEM_THREAD_SAFE
--@@ -661,14 +657,22 @@ void ScalarFiniteElement::ScalarLocalL2Restriction(
-+@@ -661,58 +657,66 @@ void ScalarFiniteElement::ScalarLocalL2Restriction(
-  void NodalFiniteElement::CreateLexicographicFullMap(const IntegrationRule &ir)
-  const
-  {
- +   // Get the FULL version of the map. This call contains omp critical region,
- +   // so it is done before the critical region below.
- +   auto &d2q = GetDofToQuad(ir, DofToQuad::FULL);
-- 
-+
-  #if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP)
-     #pragma omp critical (DofToQuad)
-  #endif
-@@ -30,17 +30,91 @@ index 535fdaca8b..281de35fac 100644
- -      // Get the FULL version of the map.
- -      auto &d2q = GetDofToQuad(ir, DofToQuad::FULL);
- -      //Undo the native ordering which is what FiniteElement::GetDofToQuad returns.
--+      // If the new Dof2Quad is already present, e.g. added in a previous call
--+      // or added by another omp thread, return.
--+      if (DofToQuad::SearchArray(dof2quad_array, ir,
--+                                 DofToQuad::LEXICOGRAPHIC_FULL))
--+      { return; }
--+
--+      // Undo the native ordering which is what FiniteElement::GetDofToQuad
--+      // returns.
--       auto *d2q_new = new DofToQuad(d2q);
--       d2q_new->mode = DofToQuad::LEXICOGRAPHIC_FULL;
--       const int nqpt = ir.GetNPoints();
-+-      auto *d2q_new = new DofToQuad(d2q);
-+-      d2q_new->mode = DofToQuad::LEXICOGRAPHIC_FULL;
-+-      const int nqpt = ir.GetNPoints();
-++      // Do work only if the new Dof2Quad is not already present, e.g. added in a
-++      // previous call or added by another omp thread.
-++      if (!DofToQuad::SearchArray(dof2quad_array, ir,
-++                                  DofToQuad::LEXICOGRAPHIC_FULL))
-++      {
-++         // Undo the native ordering which is what FiniteElement::GetDofToQuad
-++         // returns.
-++         auto *d2q_new = new DofToQuad(d2q);
-++         d2q_new->mode = DofToQuad::LEXICOGRAPHIC_FULL;
-++         const int nqpt = ir.GetNPoints();
-+
-+-      const int b_dim = (range_type == VECTOR) ? dim : 1;
-++         const int b_dim = (range_type == VECTOR) ? dim : 1;
-+
-+-      for (int i = 0; i < nqpt; i++)
-+-      {
-+-         for (int d = 0; d < b_dim; d++)
-++         for (int i = 0; i < nqpt; i++)
-+          {
-+-            for (int j = 0; j < dof; j++)
-++            for (int d = 0; d < b_dim; d++)
-+             {
-+-               const double val = d2q.B[i + nqpt*(d+b_dim*lex_ordering[j])];
-+-               d2q_new->B[i+nqpt*(d+b_dim*j)] = val;
-+-               d2q_new->Bt[j+dof*(i+nqpt*d)] = val;
-++               for (int j = 0; j < dof; j++)
-++               {
-++                  const double val = d2q.B[i + nqpt*(d+b_dim*lex_ordering[j])];
-++                  d2q_new->B[i+nqpt*(d+b_dim*j)] = val;
-++                  d2q_new->Bt[j+dof*(i+nqpt*d)] = val;
-++               }
-+             }
-+          }
-+-      }
-+
-+-      const int g_dim = [this]()
-+-      {
-+-         switch (deriv_type)
-++         const int g_dim = [this]()
-+          {
-+-            case GRAD: return dim;
-+-            case DIV: return 1;
-+-            case CURL: return cdim;
-+-            default: return 0;
-+-         }
-+-      }();
-++            switch (deriv_type)
-++            {
-++               case GRAD: return dim;
-++               case DIV: return 1;
-++               case CURL: return cdim;
-++               default: return 0;
-++            }
-++         }();
-+
-+-      for (int i = 0; i < nqpt; i++)
-+-      {
-+-         for (int d = 0; d < g_dim; d++)
-++         for (int i = 0; i < nqpt; i++)
-+          {
-+-            for (int j = 0; j < dof; j++)
-++            for (int d = 0; d < g_dim; d++)
-+             {
-+-               const double val = d2q.G[i + nqpt*(d+g_dim*lex_ordering[j])];
-+-               d2q_new->G[i+nqpt*(d+g_dim*j)] = val;
-+-               d2q_new->Gt[j+dof*(i+nqpt*d)] = val;
-++               for (int j = 0; j < dof; j++)
-++               {
-++                  const double val = d2q.G[i + nqpt*(d+g_dim*lex_ordering[j])];
-++                  d2q_new->G[i+nqpt*(d+g_dim*j)] = val;
-++                  d2q_new->Gt[j+dof*(i+nqpt*d)] = val;
-++               }
-+             }
-+          }
-+-      }
-+
-+-      dof2quad_array.Append(d2q_new);
-++         dof2quad_array.Append(d2q_new);
-++      }
-+    }
-+ }
-+
- @@ -724,13 +728,7 @@ const DofToQuad &NodalFiniteElement::GetDofToQuad(const IntegrationRule &ir,
-     #pragma omp critical (DofToQuad)
-  #endif
-@@ -78,7 +152,7 @@ index a579faf835..ad356bcd2d 100644
- --- a/fem/fe/fe_base.hpp
- +++ b/fem/fe/fe_base.hpp
- @@ -222,6 +222,12 @@ public:
-- 
-+
-     /// Returns absolute value of the maps
-     DofToQuad Abs() const;
- +
-@@ -88,12 +162,12 @@ index a579faf835..ad356bcd2d 100644
- +      const IntegrationRule &ir,
- +      DofToQuad::Mode mode);
-  };
-- 
-+
-  /// Describes the function space on each element
- @@ -1376,6 +1382,21 @@ public:
-  void InvertLinearTrans(ElementTransformation &trans,
-                         const IntegrationPoint &pt, Vector &x);
-- 
-+
- +
- +// static inline method
- +inline DofToQuad *DofToQuad::SearchArray(
-@@ -110,5 +184,5 @@ index a579faf835..ad356bcd2d 100644
- +}
- +
-  } // namespace mfem
-- 
-+
-  #endif
diff --git a/spack_repo/patches/pr3778_libxsmm.diff b/spack_repo/patches/pr3778_libxsmm.diff
new file mode 100644
index 0000000000..b6786480fb
--- /dev/null
+++ b/spack_repo/patches/pr3778_libxsmm.diff
@@ -0,0 +1,13 @@
+diff --git a/repos/spack_repo/builtin/packages/libxsmm/package.py b/repos/spack_repo/builtin/packages/libxsmm/package.py
+index dcd4f38782..f07d7c9731 100644
+--- a/repos/spack_repo/builtin/packages/libxsmm/package.py
++++ b/repos/spack_repo/builtin/packages/libxsmm/package.py
+@@ -120,7 +120,7 @@ class Libxsmm(MakefilePackage):
+             "FC={0}".format(spack_fc),
+             "PREFIX=%s" % prefix,
+         ]
+-        if spec.target.family == "aarch64":
++        if spec.satisfies("@1.17-cp2k") and spec.target.family == "aarch64":
+             make_args += ["PLATFORM=1"]
+         else:
+             make_args += ["SYM=1"]
diff --git a/spack_repo/patches/pr3783_strumpack.diff b/spack_repo/patches/pr3783_strumpack.diff
new file mode 100644
index 0000000000..29302cb827
--- /dev/null
+++ b/spack_repo/patches/pr3783_strumpack.diff
@@ -0,0 +1,24 @@
+diff --git a/repos/spack_repo/builtin/packages/strumpack/package.py b/repos/spack_repo/builtin/packages/strumpack/package.py
+index aaf525901d..dbc9f9458e 100644
+--- a/repos/spack_repo/builtin/packages/strumpack/package.py
++++ b/repos/spack_repo/builtin/packages/strumpack/package.py
+@@ -58,7 +58,7 @@ class Strumpack(CMakePackage, CudaPackage, ROCmPackage):
+     variant("shared", default=True, description="Build shared libraries")
+     variant("mpi", default=True, description="Use MPI")
+     variant(
+-        "openmp", default=True, description="Enable thread parallellism via tasking with OpenMP"
++        "openmp", default=True, description="Enable thread parallelism via tasking with OpenMP"
+     )
+     variant("parmetis", default=True, description="Enable use of ParMetis")
+     variant("scotch", default=False, description="Enable use of Scotch")
+@@ -79,7 +79,9 @@ class Strumpack(CMakePackage, CudaPackage, ROCmPackage):
+     depends_on("mpi", when="+mpi")
+     depends_on("blas")
+     depends_on("lapack")
+-    depends_on("openblas threads=openmp", when="^[virtuals=blas] openblas")
++    for blas in ("openblas", "amdblis", "blis"):
++        depends_on(f"{blas} threads=openmp", when=f"+openmp ^[virtuals=blas] {blas}")
++        depends_on(f"{blas} threads=none", when=f"~openmp ^[virtuals=blas] {blas}")
+     depends_on("scalapack", when="+mpi")
+     depends_on("metis")
+     depends_on("parmetis", when="+parmetis")