From 1e277478ab80341e7b58fd2d1f972dd6b32dc63b Mon Sep 17 00:00:00 2001
From: Jean-Luc Fattebert <fattebertj@ornl.gov>
Date: Tue, 26 May 2026 08:32:43 -0400
Subject: [PATCH 1/2] Code in support of Davidson

* clean up Davidson code
* add some options in ReplicatedMatrix
* save eigenvectors of H in ProjectedMatrices
---
 src/DavidsonSolver.cc      | 77 ++++++++++++++++++--------------------
 src/DavidsonSolver.h       |  5 ++-
 src/ProjectedMatrices.cc   |  6 +--
 src/ProjectedMatrices.h    | 12 ++++++
 src/ProjectedMatrices2N.cc |  4 +-
 src/ProjectedMatrices2N.h  |  3 +-
 src/ReplicatedMatrix.cc    | 27 +++++++++++++
 src/ReplicatedMatrix.h     |  8 +++-
 8 files changed, 92 insertions(+), 50 deletions(-)
diff --git a/src/DavidsonSolver.cc b/src/DavidsonSolver.cc
index 4804af64..d6bd1d06 100644
--- a/src/DavidsonSolver.cc
+++ b/src/DavidsonSolver.cc
@@ -44,6 +44,21 @@ double evalEntropy(ProjectedMatricesInterface* projmatrices,
     return ts;
 }
 
+// print 10 eigenvalues / row
+void printEigenvalues(const std::vector<double>& eval, std::ostream& os)
+{
+    os << std::fixed << std::setprecision(4);
+    for (unsigned int i = 0; i < eval.size(); i++)
+    {
+        os << eval[i];
+        if (i % 10 == 9)
+            os << std::endl;
+        else
+            os << "  ";
+    }
+    os << std::endl;
+}
+
 template <class OrbitalsType, class MatrixType>
 DavidsonSolver<OrbitalsType, MatrixType>::DavidsonSolver(std::ostream& os,
     Ions& ions, Hamiltonian<OrbitalsType>* hamiltonian, Rho<OrbitalsType>* rho,
@@ -225,8 +240,9 @@ double DavidsonSolver<OrbitalsType, MatrixType>::evaluateDerivative(
 
 template <class OrbitalsType, class MatrixType>
 void DavidsonSolver<OrbitalsType, MatrixType>::buildTarget2N_MVP(
-    MatrixType& h11, MatrixType& h12, MatrixType& h21, MatrixType& h22,
-    MatrixType& s11, MatrixType& s22, MatrixType& target)
+    const MatrixType& h11, const MatrixType& h12, const MatrixType& h21,
+    const MatrixType& h22, const MatrixType& s11, const MatrixType& s22,
+    MatrixType& target)
 {
     target_tm_.start();
 
@@ -427,7 +443,6 @@ int DavidsonSolver<OrbitalsType, MatrixType>::solve(
         OrbitalsType hphi("Davidson_hphi", orbitals);
         MatrixType dm2Ninit("dm2N", 2 * numst_, 2 * numst_);
         std::vector<DISTMATDTYPE> eval(2 * numst_);
-        MatrixType evect("EigVect", 2 * numst_, 2 * numst_);
 
         MatrixType dm11("dm11", numst_, numst_);
         MatrixType dm12("dm12", numst_, numst_);
@@ -681,21 +696,13 @@ int DavidsonSolver<OrbitalsType, MatrixType>::solve(
         } // inner iterations
 
         // update orbitals
+        MatrixType evect("EigVect", 2 * numst_, 2 * numst_);
         proj_mat2N_->diagonalizeDM(eval, evect);
 #if 1
         if (mmpi.PE0() && ct.verbose > 2)
         {
             os_ << "Eigenvalues of Interpolated DM: " << std::endl;
-            os_ << std::fixed << std::setprecision(4);
-            for (unsigned int i = 0; i < eval.size(); i++)
-            {
-                os_ << eval[i];
-                if (i % 10 == 9)
-                    os_ << std::endl;
-                else
-                    os_ << "  ";
-            }
-            os_ << std::endl;
+            printEigenvalues(eval, os_);
             double tot = 0.;
             for (int i = 0; i < numst_; i++)
             {
@@ -717,18 +724,22 @@ int DavidsonSolver<OrbitalsType, MatrixType>::solve(
         //        2*numst_); swapColumnsVect(evect, proj_mat2N_->getMatHB(),
         //        eval, z2N );
 
-        dm12.getsub(evect, numst_, numst_, 0, numst_);
-        dm22.getsub(evect, numst_, numst_, numst_, numst_);
-
         // replace orbitals with eigenvectors corresponding to largest
         // eigenvalues of DM
-        if (mmpi.PE0() && ct.verbose > 2)
-            os_ << "Update trial eigenvectors..." << std::endl;
-        orbitals.multiply_by_matrix(dm12, 0., orbitals);
-        work_orbitals.multiply_by_matrix(dm22, 1., orbitals);
-        orbitals.incrementIterativeIndex();
-        orbitals.incrementIterativeIndex();
-        work_orbitals.incrementIterativeIndex(2);
+        {
+            MatrixType v12("v12", numst_, numst_);
+            MatrixType v22("v22", numst_, numst_);
+            v12.getsub(evect, numst_, numst_, 0, numst_);
+            v22.getsub(evect, numst_, numst_, numst_, numst_);
+
+            if (mmpi.PE0() && ct.verbose > 2)
+                os_ << "Update trial eigenvectors..." << std::endl;
+            orbitals.multiply_by_matrix(v12, 0., orbitals);
+            work_orbitals.multiply_by_matrix(v22, 1., orbitals);
+            orbitals.incrementIterativeIndex();
+            orbitals.incrementIterativeIndex();
+            work_orbitals.incrementIterativeIndex(2);
+        }
 
         std::vector<double> new_occ(numst_);
         double tocc              = 0.;
@@ -771,14 +782,7 @@ int DavidsonSolver<OrbitalsType, MatrixType>::solve(
         if( onpe0 )
         {
             os_<<"New occupations: "<<endl;
-            os_<<fixed<<setprecision(4);
-            for(int i=0;i<numst_;i++)
-            {
-                os_<<new_occ[i];
-                if( i%10==9 )os_<<endl;
-                else         os_<<"  ";
-            }
-            os_<<endl;
+            printEigenvalues(new_occ, os_);
         }
 #endif
 #if 1
@@ -806,16 +810,7 @@ int DavidsonSolver<OrbitalsType, MatrixType>::solve(
             && ct.verbose > 2)
         {
             os_ << "Final Occupations: " << std::endl;
-            os_ << std::fixed << std::setprecision(4);
-            for (int i = 0; i < numst_; i++)
-            {
-                os_ << new_occ[i];
-                if (i % 10 == 9)
-                    os_ << std::endl;
-                else
-                    os_ << "  ";
-            }
-            os_ << std::endl;
+            printEigenvalues(new_occ, os_);
         }
 #endif
 
diff --git a/src/DavidsonSolver.h b/src/DavidsonSolver.h
index 512cab5b..c058c90e 100644
--- a/src/DavidsonSolver.h
+++ b/src/DavidsonSolver.h
@@ -57,8 +57,9 @@ class DavidsonSolver
     int checkConvergence(const double e0, const int it, const double tol);
     double evaluateDerivative(
         MatrixType& dm2Ninit, MatrixType& delta_dm, const double ts0);
-    void buildTarget2N_MVP(MatrixType& h11, MatrixType& h12, MatrixType& h21,
-        MatrixType& h22, MatrixType& s11, MatrixType& s22, MatrixType& target);
+    void buildTarget2N_MVP(const MatrixType& h11, const MatrixType& h12,
+        const MatrixType& h21, const MatrixType& h22, const MatrixType& s11,
+        const MatrixType& s22, MatrixType& target);
     // void buildTarget2N_new(MatrixType& h11,
     //    MatrixType& h12,
     //    MatrixType& h21,
diff --git a/src/ProjectedMatrices.cc b/src/ProjectedMatrices.cc
index 50a6cd21..df18bcc8 100644
--- a/src/ProjectedMatrices.cc
+++ b/src/ProjectedMatrices.cc
@@ -390,18 +390,18 @@ void ProjectedMatrices<MatrixType>::updateDMwithEigenstates()
     if (mmpi.PE0() && ct.verbose > 1)
         (*MPIdata::sout) << "ProjectedMatrices: Compute DM using eigenstates\n";
 
-    MatrixType zz("Z", dim_, dim_);
+    if (ev_ == nullptr) ev_.reset(new MatrixType("ev", dim_, dim_));
 
     // solves generalized eigenvalue problem
     // and return solution in zz and val
-    solveGenEigenProblem(zz);
+    solveGenEigenProblem(*ev_);
     computeChemicalPotentialAndOccupations();
     if (mmpi.instancePE0() && ct.verbose > 1)
         std::cout << "Final mu_ = " << 0.5 * mu_ << " [Ha]" << std::endl;
 
     // Build the density matrix X
     // X = Z * gamma * Z^T
-    buildDM(zz);
+    buildDM(*ev_);
 }
 
 //"replicated" implementation of SP2.
diff --git a/src/ProjectedMatrices.h b/src/ProjectedMatrices.h
index 9827c7ef..da1bf112 100644
--- a/src/ProjectedMatrices.h
+++ b/src/ProjectedMatrices.h
@@ -97,6 +97,12 @@ class ProjectedMatrices : public ProjectedMatricesInterface
      */
     std::unique_ptr<DensityMatrix<MatrixType>> dm_;
 
+    /*!
+     * Matrix of eigenvectors used to build DM
+     * (if used)
+     */
+    std::unique_ptr<MatrixType> ev_;
+
     /*!
      * Gram matrix of orbitals overlaps
      */
@@ -242,7 +248,13 @@ class ProjectedMatrices : public ProjectedMatricesInterface
     }
 
     void assignH(const MatrixType& matH) { *matH_ = matH; }
+
     const MatrixType& getH() { return *matH_; }
+    const MatrixType& getEigenvectors()
+    {
+        assert(ev_ != nullptr);
+        return *ev_;
+    }
 
     void setHB2H() override
     {
diff --git a/src/ProjectedMatrices2N.cc b/src/ProjectedMatrices2N.cc
index 60f2cc80..3fc53e8b 100644
--- a/src/ProjectedMatrices2N.cc
+++ b/src/ProjectedMatrices2N.cc
@@ -31,8 +31,8 @@ ProjectedMatrices2N<MatrixType>::~ProjectedMatrices2N()
 }
 
 template <class MatrixType>
-void ProjectedMatrices2N<MatrixType>::assignBlocksH(
-    MatrixType& h11, MatrixType& h12, MatrixType& h21, MatrixType& h22)
+void ProjectedMatrices2N<MatrixType>::assignBlocksH(const MatrixType& h11,
+    const MatrixType& h12, const MatrixType& h21, const MatrixType& h22)
 {
     ProjectedMatrices<MatrixType>::matH_->assign(h11, 0, 0);
     ProjectedMatrices<MatrixType>::matH_->assign(h12, 0, bdim_);
diff --git a/src/ProjectedMatrices2N.h b/src/ProjectedMatrices2N.h
index a87bf83f..b3cdc900 100644
--- a/src/ProjectedMatrices2N.h
+++ b/src/ProjectedMatrices2N.h
@@ -24,7 +24,8 @@ class ProjectedMatrices2N : public ProjectedMatrices<MatrixType>
 
     ~ProjectedMatrices2N() override;
 
-    void assignBlocksH(MatrixType&, MatrixType&, MatrixType&, MatrixType&);
+    void assignBlocksH(const MatrixType&, const MatrixType&, const MatrixType&,
+        const MatrixType&);
 
     void iterativeUpdateDMwithEigenstates(
         const double occ_width, const bool flag_reduce_T = true);
diff --git a/src/ReplicatedMatrix.cc b/src/ReplicatedMatrix.cc
index 04c56e0d..062f13ad 100644
--- a/src/ReplicatedMatrix.cc
+++ b/src/ReplicatedMatrix.cc
@@ -17,6 +17,7 @@
 using Memory                = MemorySpace::Memory<double, MemorySpace::Device>;
 constexpr double gpuroundup = 32;
 #else
+#include "MGmol_blas1.h"
 #include "blas3_c.h"
 #include "fc_mangle.h"
 #include "lapack_c.h"
@@ -687,6 +688,32 @@ int ReplicatedMatrix::iamax(const int j, double& val)
     return indx;
 }
 
+double ReplicatedMatrix::nrm2(const int j)
+{
+#ifdef HAVE_MAGMA
+    (void)os;
+    std::cerr << "ReplicatedMatrix::nrm2() not implemented" << std::endl;
+    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+#else
+    int ione = 1;
+    return DNRM2(&dim_, data_.get() + j * ld_, &ione);
+#endif
+}
+
+void ReplicatedMatrix::swapColumns(const int i, const int j)
+{
+#ifdef HAVE_MAGMA
+    (void)os;
+    std::cerr << "ReplicatedMatrix::swapColumns) not implemented" << std::endl;
+    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+#else
+    std::vector<double> tmp(data_.get() + i * ld_, data_.get() + (i + 1) * ld_);
+
+    memcpy(data_.get() + i * ld_, data_.get() + j * ld_, ld_ * sizeof(double));
+    memcpy(data_.get() + j * ld_, tmp.data(), ld_ * sizeof(double));
+#endif
+}
+
 void ReplicatedMatrix::setVal(const int i, const int j, const double val)
 {
 #ifdef HAVE_MAGMA
diff --git a/src/ReplicatedMatrix.h b/src/ReplicatedMatrix.h
index ff72b296..891e41eb 100644
--- a/src/ReplicatedMatrix.h
+++ b/src/ReplicatedMatrix.h
@@ -18,6 +18,8 @@ class ReplicatedVector;
 #include <string>
 #include <vector>
 
+// Square matrices
+// MPI operations assume replication by each MPI task
 class ReplicatedMatrix
 {
     static MPI_Comm comm_;
@@ -29,7 +31,7 @@ class ReplicatedMatrix
     // leading dimension for storage
     size_t ld_;
 
-    // matrix data
+    // matrix data stored in column major order
     std::unique_ptr<double, void (*)(double*)> data_;
 
     std::string name_;
@@ -123,7 +125,11 @@ class ReplicatedMatrix
 
     void setVal(const int i, const int j, const double val);
     void setDiagonal(const std::vector<double>& diag_values);
+
     int iamax(const int j, double& val);
+    double nrm2(const int j);
+    void swapColumns(const int i, const int j);
+
     double norm(char ty);
     double traceProduct(const ReplicatedMatrix&) const;
     void shift(const double);

From d3ddf4d0b2e1bfe73e43da9556022f0cee832516 Mon Sep 17 00:00:00 2001
From: Jean-Luc Fattebert <fattebertj@ornl.gov>
Date: Fri, 29 May 2026 16:18:25 -0400
Subject: [PATCH 2/2] Use eigenvectors of H in Davidson

---
 src/DavidsonSolver.cc                 | 140 ++++++++++++++++++++++++--
 src/DavidsonSolver.h                  |   7 ++
 src/ReplicatedMatrix.cc               |  46 +++++----
 src/ReplicatedMatrix.h                |   3 +-
 tests/Davidson/davidson.cfg           |   2 +-
 tests/DavidsonMixing/davidson.cfg     |   2 +-
 tests/DavidsonReplicated/davidson.cfg |   2 +-
 7 files changed, 172 insertions(+), 30 deletions(-)

diff --git a/src/DavidsonSolver.cc b/src/DavidsonSolver.cc
index d6bd1d06..a2502198 100644
--- a/src/DavidsonSolver.cc
+++ b/src/DavidsonSolver.cc
@@ -25,6 +25,7 @@
 #include "tools.h"
 
 #include <iomanip>
+#include <numeric> // for std::iota
 
 template <class OrbitalsType, class MatrixType>
 Timer DavidsonSolver<OrbitalsType, MatrixType>::solve_tm_(
@@ -250,11 +251,11 @@ void DavidsonSolver<OrbitalsType, MatrixType>::buildTarget2N_MVP(
 
     proj_mat2N_->assignBlocksH(h11, h12, h21, h22);
 
+    // Build s2N using diagonal blocks s11 and s22
+    // assuming s12 = s21 = 0
     // if( onpe0 )os_<<"Build S2N..."<<endl;
     MatrixType s2N("s2N", 2 * numst_, 2 * numst_);
     s2N.assign(s11, 0, 0);
-    // s2N.assign(s12,0,numst_);
-    // s2N.assign(s21,numst_,0);
     s2N.assign(s22, numst_, numst_);
 
     int orbitals_index = 0;
@@ -279,6 +280,129 @@ void DavidsonSolver<OrbitalsType, MatrixType>::buildTarget2N_MVP(
     target_tm_.stop();
 }
 
+// to differentiate very low occupation vectors, extract those with
+// lowest energy
+template <class OrbitalsType, class MatrixType>
+void DavidsonSolver<OrbitalsType, MatrixType>::reorderEigenvectors(
+    MatrixType& evect, std::vector<double>& eval)
+{
+    Control& ct = *(Control::instance());
+    MGmol_MPI& mmpi(*(MGmol_MPI::instance()));
+
+    // get eigenvectors of H used to build target DM
+    const MatrixType& ev(proj_mat2N_->getEigenvectors());
+
+    MatrixType p("p", 2 * numst_, 2 * numst_);
+    p.gemm('t', 'n', 1, ev, evect, 0);
+
+#ifndef NDEBUG
+    // verify that the sum of the elements squrade in each column is 1
+    for (int j = 0; j < 2 * numst_; j++)
+    {
+        double a = p.nrm2(j);
+        std::cout << j << "  " << a * a << std::endl;
+        assert(std::abs(a - 1) < 1.e-2);
+    }
+#endif
+
+    // now find evect with largest overlap with first numst_ columns of
+    // ev zero out elements associated with last numst_ columns of ev
+    // (high energy ones)
+    for (int i = numst_; i < 2 * numst_; i++)
+        for (int j = 0; j < 2 * numst_; j++)
+            p.setVal(i, j, 0);
+
+    // do not include eigenstates with overlap below threshold
+    const double overlap_threshold = 0.2;
+    // keep vectors associated with occupation above threshold unchanged
+    const double occ_threshold = 0.01;
+    std::vector<double> w(2 * numst_);
+    int count = 0;
+    for (int j = 0; j < 2 * numst_; j++)
+    {
+        // first compute the norm of the projection onto ev lowest energy states
+        double a = p.nrm2(j);
+        w[j]     = a * a;
+        // std::cout << j << "  " << w[j] << std::endl;
+        //  Set weight to 0 if below tolerance
+        //  By doing that we exclude eigenvectors with very little
+        //  overlap with lowest eigenstates of H in subsequent reordering
+        if (w[j] < overlap_threshold) w[j] = 0;
+
+        if (eval[j] > occ_threshold) w[j] = 1;
+
+        if (w[j] > 0 && w[j] < 1) count++;
+    }
+    if (mmpi.instancePE0() && ct.verbose > 1)
+    {
+        std::cout << "Sort " << count << " eigenvectors" << std::endl;
+        for (int j = 0; j < 2 * numst_; j++)
+        {
+            if (w[j] > 0 && w[j] < 1)
+                std::cout << "eval [" << j << "] = " << eval[j] << std::endl;
+        }
+    }
+
+    // Create a vector of indices [0, 1, 2, ...]
+    std::vector<int> indices(w.size());
+    std::iota(indices.begin(), indices.end(), 0);
+
+    // Sort indices based on values in 'w'
+    std::sort(indices.begin(), indices.end(),
+        [&](int i1, int i2) { return w[i1] < w[i2]; });
+    for (int idx : indices)
+    {
+        if (w[idx] > 0 && w[idx] < 1 && mmpi.instancePE0() && ct.verbose > 2)
+            std::cout << "Original Index: " << idx << ", overlap " << w[idx]
+                      << std::endl;
+    }
+
+    // Broadcast permutations to make sure all MPI tasks use the same
+    MPI_Comm comm = mmpi.commSameSpin();
+    MPI_Bcast(indices.data(), indices.size(), MPI_INT, 0, comm);
+
+    // permutation of eigenvalues
+    {
+        std::vector<double> tmp_eval = eval;
+        int j                        = 0;
+        for (int idx : indices)
+        {
+            if (idx != j && w[idx] > 0 && w[idx] < 1 && mmpi.instancePE0()
+                && ct.verbose > 2)
+                std::cout << idx << " <-> " << j << std::endl;
+            eval[j] = tmp_eval[idx];
+            j++;
+        }
+    }
+
+    // permutation of eigenvectors in evect
+    {
+        MatrixType perm("perm", 2 * numst_, 2 * numst_);
+        int j = 0;
+        for (int idx : indices)
+        {
+            perm.setVal(idx, j, 1);
+            j++;
+        }
+        MatrixType tmp("tmp", 2 * numst_, 2 * numst_);
+        tmp.gemm('n', 'n', 1, evect, perm, 0);
+        evect = tmp;
+    }
+#if 0
+    // repeat to check result
+    p.gemm('t', 'n', 1, ev, evect, 0);
+    for (int i = numst_; i < 2 * numst_; i++)
+        for (int j = 0; j < 2 * numst_; j++)
+            p.setVal(i, j, 0);
+    for (int j = 0; j < 2 * numst_; j++)
+    {
+        double a = p.nrm2(j);
+        w[j]     = a * a;
+        std::cout << j << "  " << w[j] << std::endl;
+    }
+#endif
+};
+
 // template <class OrbitalsType, class MatrixType>
 // void DavidsonSolver<OrbitalsType,MatrixType>::buildTarget2N_new(
 //    MatrixType& h11,
@@ -592,7 +716,9 @@ int DavidsonSolver<OrbitalsType, MatrixType>::solve(
             MatrixType delta_dm("delta_dm", 2 * numst_, 2 * numst_);
 
             buildTarget2N_MVP(h11, h12, h21, h22, s11, s22, target);
-            if (mmpi.instancePE0() && ct.verbose > 1 && (outer_it % 10 == 0))
+            const int out_freq = ct.verbose > 2 ? 1 : 10;
+            if (mmpi.instancePE0() && ct.verbose > 1
+                && (outer_it % out_freq == 0))
             {
                 proj_mat2N_->printEigenvalues(os_);
                 proj_mat2N_->printOccupations(os_);
@@ -698,6 +824,7 @@ int DavidsonSolver<OrbitalsType, MatrixType>::solve(
         // update orbitals
         MatrixType evect("EigVect", 2 * numst_, 2 * numst_);
         proj_mat2N_->diagonalizeDM(eval, evect);
+
 #if 1
         if (mmpi.PE0() && ct.verbose > 2)
         {
@@ -720,12 +847,9 @@ int DavidsonSolver<OrbitalsType, MatrixType>::solve(
 
         // to differentiate very low occupation vectors, extract those with
         // lowest energy
-        //        MatrixType  z2N("z2N",2*numst_,
-        //        2*numst_); swapColumnsVect(evect, proj_mat2N_->getMatHB(),
-        //        eval, z2N );
+        reorderEigenvectors(evect, eval);
 
-        // replace orbitals with eigenvectors corresponding to largest
-        // eigenvalues of DM
+        // replace orbitals with rightmost numst_ columns of evect
         {
             MatrixType v12("v12", numst_, numst_);
             MatrixType v22("v22", numst_, numst_);
diff --git a/src/DavidsonSolver.h b/src/DavidsonSolver.h
index c058c90e..1b11a118 100644
--- a/src/DavidsonSolver.h
+++ b/src/DavidsonSolver.h
@@ -70,6 +70,13 @@ class DavidsonSolver
     //    const std::vector<DISTMATDTYPE>& auxenergies, const double kbT,
     //    const double eta, MatrixType& target);
 
+    /*!
+     * To differentiate very low occupation vectors, extract those with
+     * lowest energy by reordering evect/eval
+     * (rightmost vectors will be used in next outer itertaion)
+     */
+    void reorderEigenvectors(MatrixType& evect, std::vector<double>& eval);
+
 public:
     DavidsonSolver(std::ostream& os, Ions& ions,
         Hamiltonian<OrbitalsType>* hamiltonian, Rho<OrbitalsType>* rho,
diff --git a/src/ReplicatedMatrix.cc b/src/ReplicatedMatrix.cc
index 062f13ad..9f0e72d6 100644
--- a/src/ReplicatedMatrix.cc
+++ b/src/ReplicatedMatrix.cc
@@ -170,6 +170,30 @@ void ReplicatedMatrix::consolidate()
 #endif
 }
 
+void ReplicatedMatrix::bcast(const int root)
+{
+    assert(comm_ != MPI_COMM_NULL);
+
+#ifdef HAVE_MAGMA
+    std::vector<double> mat(dim_ * ld_);
+    auto& magma_singleton = MagmaSingleton::get_magma_singleton();
+
+    // copy from GPU to CPU
+    magma_dgetmatrix(
+        dim_, dim_, data_.get(), ld_, mat.data(), ld_, magma_singleton.queue_);
+    double* data = mat.data();
+#else
+    double* data = data_.get();
+#endif
+    MPI_Bcast(data, dim_ * ld_, MPI_DOUBLE, root, comm_);
+
+#ifdef HAVE_MAGMA
+    // copy from CPU to GPU
+    magma_dsetmatrix(
+        dim_, dim_, data, ld_, data_.get(), ld_, magma_singleton.queue_);
+#endif
+}
+
 void ReplicatedMatrix::assign(
     const ReplicatedMatrix& src, const int ib, const int jb)
 {
@@ -181,10 +205,10 @@ void ReplicatedMatrix::assign(
     magma_dcopymatrix(src.dim_, src.dim_, src.data_.get(), src.ld_,
         data_.get() + jb * ld_ + ib, ld_, magma_singleton.queue_);
 #else
-    char uplo = 'a';
-    int dim   = src.dim_;
-    int lda   = src.ld_;
-    int ldb   = ld_;
+    char uplo    = 'a';
+    int dim      = src.dim_;
+    int lda      = src.ld_;
+    int ldb      = ld_;
     DLACPY(&uplo, &dim, &dim, src.data_.get(), &lda,
         data_.get() + jb * ld_ + ib, &ldb);
 #endif
@@ -700,20 +724,6 @@ double ReplicatedMatrix::nrm2(const int j)
 #endif
 }
 
-void ReplicatedMatrix::swapColumns(const int i, const int j)
-{
-#ifdef HAVE_MAGMA
-    (void)os;
-    std::cerr << "ReplicatedMatrix::swapColumns) not implemented" << std::endl;
-    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
-#else
-    std::vector<double> tmp(data_.get() + i * ld_, data_.get() + (i + 1) * ld_);
-
-    memcpy(data_.get() + i * ld_, data_.get() + j * ld_, ld_ * sizeof(double));
-    memcpy(data_.get() + j * ld_, tmp.data(), ld_ * sizeof(double));
-#endif
-}
-
 void ReplicatedMatrix::setVal(const int i, const int j, const double val)
 {
 #ifdef HAVE_MAGMA
diff --git a/src/ReplicatedMatrix.h b/src/ReplicatedMatrix.h
index 891e41eb..d2fd2305 100644
--- a/src/ReplicatedMatrix.h
+++ b/src/ReplicatedMatrix.h
@@ -90,6 +90,8 @@ class ReplicatedMatrix
     // sum up values across MPI tasks
     void consolidate();
 
+    void bcast(const int root);
+
     void axpy(const double alpha, const ReplicatedMatrix& a);
 
     void init(const double* const ha, const int lda);
@@ -128,7 +130,6 @@ class ReplicatedMatrix
 
     int iamax(const int j, double& val);
     double nrm2(const int j);
-    void swapColumns(const int i, const int j);
 
     double norm(char ty);
     double traceProduct(const ReplicatedMatrix&) const;
diff --git a/tests/Davidson/davidson.cfg b/tests/Davidson/davidson.cfg
index e34f407b..f21d0221 100644
--- a/tests/Davidson/davidson.cfg
+++ b/tests/Davidson/davidson.cfg
@@ -20,7 +20,7 @@ solver=PCG
 type=QUENCH
 [Quench]
 solver=Davidson
-max_steps=200
+max_steps=50
 atol=1.e-8
 [Orbitals]
 nempty=10
diff --git a/tests/DavidsonMixing/davidson.cfg b/tests/DavidsonMixing/davidson.cfg
index 39c7d457..7d1be6ab 100644
--- a/tests/DavidsonMixing/davidson.cfg
+++ b/tests/DavidsonMixing/davidson.cfg
@@ -19,7 +19,7 @@ pseudopotential=pseudo.H_ONCV_PBE_SG15
 type=QUENCH
 [Quench]
 solver=Davidson
-max_steps=100
+max_steps=50
 atol=1.e-8
 [Orbitals]
 nempty=1
diff --git a/tests/DavidsonReplicated/davidson.cfg b/tests/DavidsonReplicated/davidson.cfg
index 87f0ea1f..c6beff67 100644
--- a/tests/DavidsonReplicated/davidson.cfg
+++ b/tests/DavidsonReplicated/davidson.cfg
@@ -20,7 +20,7 @@ solver=PCG
 type=QUENCH
 [Quench]
 solver=Davidson
-max_steps=200
+max_steps=50
 atol=1.e-8
 [Orbitals]
 nempty=10