diff --git a/Source/FieldSolver/ImplicitSolvers/WarpXSolverDOF.H b/Source/FieldSolver/ImplicitSolvers/WarpXSolverDOF.H
index a2a2b9c5116..2c2b2005f01 100644
--- a/Source/FieldSolver/ImplicitSolvers/WarpXSolverDOF.H
+++ b/Source/FieldSolver/ImplicitSolvers/WarpXSolverDOF.H
@@ -24,9 +24,7 @@ class WarpX;
 struct WarpXSolverDOF
 {
     amrex::Vector<std::array<std::unique_ptr<amrex::iMultiFab>,3>> m_array;
-    amrex::Vector<std::array<std::unique_ptr<amrex::iMultiFab>,3>> m_array_lhs;
     amrex::Vector<std::unique_ptr<amrex::iMultiFab>> m_scalar;
-    amrex::Vector<std::unique_ptr<amrex::iMultiFab>> m_scalar_lhs;
 
     warpx::fields::FieldType m_array_type = warpx::fields::FieldType::None;
     warpx::fields::FieldType m_scalar_type = warpx::fields::FieldType::None;
@@ -35,6 +33,9 @@ struct WarpXSolverDOF
     amrex::Long m_nDoFs_g = 0; /*!< Global nDOF */
 
     void Define ( WarpX* const, int, const std::string&, const std::string&);
+
+    void fill_local_dof (amrex::iMultiFab& dof, amrex::iMultiFab const& mask);
+    void fill_global_dof ();
 };
 
 #endif
diff --git a/Source/FieldSolver/ImplicitSolvers/WarpXSolverDOF.cpp b/Source/FieldSolver/ImplicitSolvers/WarpXSolverDOF.cpp
index e521325a0d7..06b8737e927 100644
--- a/Source/FieldSolver/ImplicitSolvers/WarpXSolverDOF.cpp
+++ b/Source/FieldSolver/ImplicitSolvers/WarpXSolverDOF.cpp
@@ -11,7 +11,10 @@
 #include <ablastr/utils/SignalHandling.H>
 #include <ablastr/warn_manager/WarnManager.H>
 
+#include <AMReX_Scan.H>
+
 using warpx::fields::FieldType;
+using namespace amrex;
 
 void WarpXSolverDOF::Define ( WarpX* const        a_WarpX,
                               const int           a_num_amr_levels,
@@ -41,10 +44,7 @@ void WarpXSolverDOF::Define ( WarpX* const        a_WarpX,
 
     m_array.resize(a_num_amr_levels);
     m_scalar.resize(a_num_amr_levels);
-    m_array_lhs.resize(a_num_amr_levels);
-    m_scalar_lhs.resize(a_num_amr_levels);
 
-    amrex::Long offset = 0;
     m_nDoFs_l = 0;
 
     // Define the 3D vector field data container
@@ -62,26 +62,9 @@ void WarpXSolverDOF::Define ( WarpX* const        a_WarpX,
                                                                      this_array[n]->DistributionMap(),
                                                                      2*ncomp, // {local, global} for each comp
                                                                      this_array[n]->nGrowVect() );
-                m_nDoFs_g += this_array[n]->boxArray().numPts()*ncomp;
-
-                m_array[lev][n]->setVal(-1.0);
-                amrex::Long offset_mf = 0;
-                for (amrex::MFIter mfi(*m_array[lev][n]); mfi.isValid(); ++mfi) {
-                    auto bx = mfi.tilebox();
-                    auto dof_arr = m_array[lev][n]->array(mfi);
-                    ParallelFor( bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-                    {
-                        for (int v = 0; v < ncomp; v++) {
-                            dof_arr(i,j,k,2*v) = bx.index(amrex::IntVect(AMREX_D_DECL(i, j, k))) * ncomp
-                                                 + v
-                                                 + offset_mf
-                                                 + offset;
-                        }
-                    });
-                    offset_mf += bx.numPts()*ncomp;
-                }
-                offset += offset_mf;
-                m_nDoFs_l += offset_mf;
+
+                auto* mask = a_WarpX->getFieldDotMaskPointer(m_array_type, lev, ablastr::fields::Direction{n});
+                fill_local_dof(*m_array[lev][n], *mask);
             }
         }
 
@@ -101,106 +84,137 @@ void WarpXSolverDOF::Define ( WarpX* const        a_WarpX,
                                                                this_mf->DistributionMap(),
                                                                2*ncomp, // {local, global} for each comp
                                                                this_mf->nGrowVect() );
-            m_nDoFs_g += this_mf->boxArray().numPts()*ncomp;
-
-            m_scalar[lev]->setVal(-1.0);
-            amrex::Long offset_mf = 0;
-            for (amrex::MFIter mfi(*m_scalar[lev]); mfi.isValid(); ++mfi) {
-                auto bx = mfi.tilebox();
-                auto dof_arr = m_scalar[lev]->array(mfi);
-                ParallelFor( bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-                {
-                    for (int v = 0; v < ncomp; v++) {
-                        dof_arr(i,j,k,2*v) = bx.index(amrex::IntVect(AMREX_D_DECL(i, j, k))) * ncomp
-                                             + v
-                                             + offset_mf
-                                             + offset;
-                    }
-                });
-                offset_mf += bx.numPts()*ncomp;
-            }
-            offset += offset_mf;
-            m_nDoFs_l += offset_mf;
+
+            auto* mask = a_WarpX->getFieldDotMaskPointer(m_scalar_type, lev, ablastr::fields::Direction{0});
+            fill_local_dof(*m_scalar[lev], *mask);
         }
 
     }
 
-    auto nDoFs_g = m_nDoFs_l;
-    amrex::ParallelDescriptor::ReduceLongSum(&nDoFs_g,1);
-    WARPX_ALWAYS_ASSERT_WITH_MESSAGE(
-        m_nDoFs_g == nDoFs_g,
-        "WarpXSolverDOF::Define(): something has gone wrong in DoF counting");
-
-    auto num_procs = amrex::ParallelDescriptor::NProcs();
-    auto my_proc = amrex::ParallelDescriptor::MyProc();
-    amrex::Vector<int> dof_proc_arr(num_procs,0);
-    dof_proc_arr[my_proc] = m_nDoFs_l;
-    amrex::ParallelDescriptor::ReduceIntSum(dof_proc_arr.data(), num_procs);
+    fill_global_dof();
 
-    int offset_global = 0;
-    for (int i = 0; i < my_proc; i++) { offset_global += dof_proc_arr[i]; }
-
-    if (m_array_type != FieldType::None) {
-        for (int lev = 0; lev < a_num_amr_levels; ++lev) {
-            const ablastr::fields::VectorField this_array = a_WarpX->m_fields.get_alldirs(a_vector_type_name, lev);
-            for (int n = 0; n < 3; n++) {
-                auto ncomp = this_array[n]->nComp();
-                for (amrex::MFIter mfi(*m_array[lev][n]); mfi.isValid(); ++mfi) {
-                    auto bx = mfi.tilebox();
-                    auto dof_arr = m_array[lev][n]->array(mfi);
-                    ParallelFor( bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-                    {
-                        for (int v = 0; v < ncomp; v++) {
-                            dof_arr(i,j,k,2*v+1) = dof_arr(i,j,k,2*v) + offset_global;
-                        }
-                    });
+    for (int lev = 0; lev < a_num_amr_levels; ++lev) {
+        for (int n = 0; n < 3; n++) {
+            if (auto* dof = m_array[lev][n].get()) {
+                for (int comp = 1; comp < dof->nComp(); comp += 2) { // Only call this on global id
+                    dof->FillBoundaryAndSync(comp, 1, dof->nGrowVect(), a_WarpX->Geom(lev).periodicity());
                 }
             }
         }
+        if (auto* dof = m_scalar[lev].get()) {
+            for (int comp = 1; comp < dof->nComp(); comp += 2) { // Only call this on global id
+                dof->FillBoundaryAndSync(comp, 1, dof->nGrowVect(), a_WarpX->Geom(lev).periodicity());
+            }
+        }
     }
-    if (m_scalar_type != FieldType::None) {
-        for (int lev = 0; lev < a_num_amr_levels; ++lev) {
-            const amrex::MultiFab* this_mf = a_WarpX->m_fields.get(a_scalar_type_name,lev);
-            auto ncomp = this_mf->nComp();
-            for (amrex::MFIter mfi(*m_scalar[lev]); mfi.isValid(); ++mfi) {
-                auto bx = mfi.tilebox();
-                auto dof_arr = m_scalar[lev]->array(mfi);
-                ParallelFor( bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
-                {
-                    for (int v = 0; v < ncomp; v++) {
-                        dof_arr(i,j,k,2*v+1) = dof_arr(i,j,k,2*v) + offset_global;
+
+    amrex::Print() << "Defined DOF object for linear solves (total DOFs = " << m_nDoFs_g << ").\n";
+}
+
+void WarpXSolverDOF::fill_local_dof (iMultiFab& dof, iMultiFab const& mask)
+{
+    int ncomp = dof.nComp() / 2; // /2 because both local and global ids are stored in dof
+
+    AMREX_ALWAYS_ASSERT(dof.boxArray().numPts()*ncomp < static_cast<Long>(std::numeric_limits<int>::max()));
+
+    dof.setVal(std::numeric_limits<int>::lowest());
+
+#ifdef AMREX_USE_MPI
+    int nprocs = ParallelDescriptor::NProcs();
+#endif
+
+    for (MFIter mfi(dof); mfi.isValid(); ++mfi) {
+        Box const& vbx = mfi.validbox();
+        int npts = vbx.numPts();
+        BoxIndexer boxindex(vbx);
+        auto const& m = mask.const_array(mfi);
+        auto const& d = dof.array(mfi);
+        auto start_id = m_nDoFs_l;
+        auto ndofs = Scan::PrefixSum<int>(
+            npts,
+            [=] AMREX_GPU_DEVICE (int offset) -> int
+            {
+                auto [i,j,k] = boxindex(offset);
+                return m(i,j,k) ? 1 : 0;
+            },
+            [=] AMREX_GPU_DEVICE (int offset, int ps)
+            {
+                auto [i,j,k] = boxindex(offset);
+                if (m(i,j,k)) {
+                    d(i,j,k,0) = ps + start_id;
+#ifdef AMREX_USE_MPI
+                    if (nprocs == 1)
+#endif
+                    {
+                        d(i,j,k,1) = ps + start_id;
                     }
-                });
-            }
+                }
+            },
+            Scan::Type::exclusive, Scan::retSum);
+        if (ncomp > 1) {
+            ParallelFor(vbx, ncomp-1, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n)
+            {
+                if (m(i,j,k)) {
+                    d(i,j,k,2*(n+1)) = d(i,j,k,0) + ndofs*(n+1);
+#ifdef AMREX_USE_MPI
+                    if (nprocs == 1)
+#endif
+                    {
+                        d(i,j,k,2*(n+1)+1) = d(i,j,k,0) + ndofs*(n+1);
+                    }
+                }
+            });
         }
+        m_nDoFs_l += Long(ndofs)*ncomp;
     }
+}
 
-    if (m_array_type != FieldType::None) {
-        for (int lev = 0; lev < a_num_amr_levels; ++lev) {
-            const auto& geom = a_WarpX->Geom(lev);
-            for (int n = 0; n < 3; n++) {
-                m_array_lhs[lev][n] = std::make_unique<amrex::iMultiFab>(m_array[lev][n]->boxArray(),
-                                                                         m_array[lev][n]->DistributionMap(),
-                                                                         m_array[lev][n]->nComp(),
-                                                                         0 );
-                amrex::iMultiFab::Copy(*m_array_lhs[lev][n], *m_array[lev][n], 0, 0, m_array[lev][n]->nComp(), 0);
-                m_array[lev][n]->FillBoundary(geom.periodicity());
-                // do NOT call FillBoundary() on m_array_lhs
+void WarpXSolverDOF::fill_global_dof ()
+{
+#ifndef AMREX_USE_MPI
+    m_nDoFs_g = m_nDoFs_l;
+#else
+    int nprocs = ParallelDescriptor::NProcs();
+    if (nprocs == 1) {
+        m_nDoFs_g = m_nDoFs_l;
+    } else {
+        Vector<Long> ndofs_allprocs(nprocs);
+        MPI_Allgather(&m_nDoFs_l, 1, ParallelDescriptor::Mpi_typemap<Long>::type(),
+                      ndofs_allprocs.data(), 1, ParallelDescriptor::Mpi_typemap<Long>::type(),
+                      ParallelDescriptor::Communicator());
+        Long proc_begin = 0;
+        int myproc = ParallelDescriptor::MyProc();
+        m_nDoFs_g = 0;
+        for (int iproc = 0; iproc < nprocs; ++iproc) {
+            if (iproc < myproc) {
+                proc_begin += ndofs_allprocs[iproc];
             }
+            m_nDoFs_g += ndofs_allprocs[iproc];
         }
-    }
-    if (m_scalar_type != FieldType::None) {
-        for (int lev = 0; lev < a_num_amr_levels; ++lev) {
-            m_scalar_lhs[lev] = std::make_unique<amrex::iMultiFab>(m_scalar[lev]->boxArray(),
-                                                                   m_scalar[lev]->DistributionMap(),
-                                                                   m_scalar[lev]->nComp(),
-                                                                   0 );
-            amrex::iMultiFab::Copy(*m_scalar_lhs[lev], *m_scalar[lev], 0, 0, m_scalar[lev]->nComp(), 0);
-            const auto& geom = a_WarpX->Geom(lev);
-            m_scalar[lev]->FillBoundary(geom.periodicity());
-            // do NOT call FillBoundary() on m_scalar_lhs
+        for (auto& x : m_array) {
+            for (auto& y : x) {
+                if (y) {
+                    auto const& dof = y->arrays();
+                    auto ncomp = y->nComp() / 2;
+                    ParallelFor(*y, IntVect(0), ncomp, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k, int n)
+                    {
+                        dof[b](i,j,k,2*n+1) = dof[b](i,j,k,2*n) + int(proc_begin);
+                    });
+                }
+            }
+        }
+        for (auto& x : m_scalar) {
+            if (x) {
+                auto const& dof = x->arrays();
+                auto ncomp = x->nComp() / 2;
+                ParallelFor(*x, IntVect(0), ncomp, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k, int n)
+                {
+                    dof[b](i,j,k,2*n+1) = dof[b](i,j,k,2*n) + int(proc_begin);
+                });
+            }
         }
+        Gpu::streamSynchronize();
     }
+#endif
 
-    amrex::Print() << "Defined DOF object for linear solves (total DOFs = " << m_nDoFs_g << ").\n";
 }
diff --git a/Source/FieldSolver/ImplicitSolvers/WarpXSolverVec.cpp b/Source/FieldSolver/ImplicitSolvers/WarpXSolverVec.cpp
index 81738019f6c..030e7e3bfff 100644
--- a/Source/FieldSolver/ImplicitSolvers/WarpXSolverVec.cpp
+++ b/Source/FieldSolver/ImplicitSolvers/WarpXSolverVec.cpp
@@ -160,10 +160,13 @@ void WarpXSolverVec::copyFrom ( const amrex::Real* const a_arr)
                     {
                         for (int v = 0; v < ncomp; v++) {
                             int dof = dof_arr(i,j,k,2*v); // local
-                            data_arr(i,j,k,v) = a_arr[dof];
+                            if (dof >= 0) {
+                                data_arr(i,j,k,v) = a_arr[dof];
+                            }
                         }
                     });
                 }
+                m_array_vec[lev][n]->FillBoundaryAndSync(m_WarpX->Geom(lev).periodicity());
             }
         }
         if (m_scalar_type != FieldType::None) {
@@ -176,10 +179,13 @@ void WarpXSolverVec::copyFrom ( const amrex::Real* const a_arr)
                 {
                     for (int v = 0; v < ncomp; v++) {
                         int dof = dof_arr(i,j,k,2*v); // local
-                        data_arr(i,j,k,v) = a_arr[dof];
+                        if (dof >= 0) {
+                            data_arr(i,j,k,v) = a_arr[dof];
+                        }
                     }
                 });
             }
+            m_scalar_vec[lev]->FillBoundaryAndSync(m_WarpX->Geom(lev).periodicity());
         }
     }
 }
@@ -205,7 +211,9 @@ void WarpXSolverVec::copyTo ( amrex::Real* const a_arr) const
                     {
                         for (int v = 0; v < ncomp; v++) {
                             int dof = dof_arr(i,j,k,2*v); // local
-                            a_arr[dof] = data_arr(i,j,k,v);
+                            if (dof >= 0) {
+                                a_arr[dof] = data_arr(i,j,k,v);
+                            }
                         }
                     });
                 }
@@ -221,7 +229,9 @@ void WarpXSolverVec::copyTo ( amrex::Real* const a_arr) const
                 {
                     for (int v = 0; v < ncomp; v++) {
                         int dof = dof_arr(i,j,k,2*v); // local
-                        a_arr[dof] = data_arr(i,j,k,v);
+                        if (dof >= 0) {
+                            a_arr[dof] = data_arr(i,j,k,v);
+                        }
                     }
                 });
             }
diff --git a/Source/NonlinearSolvers/MatrixPC.H b/Source/NonlinearSolvers/MatrixPC.H
index 7c96ceb9fa8..de54e940bbf 100644
--- a/Source/NonlinearSolvers/MatrixPC.H
+++ b/Source/NonlinearSolvers/MatrixPC.H
@@ -361,7 +361,6 @@ int MatrixPC<T,Ops>::Assemble (const T& a_U)
     // Get DOF object from a_U
     const auto& dofs_obj = a_U.getDOFsObject();
     const auto& dofs_mfarrvec = dofs_obj->m_array;
-    const auto& dofs_lhs_mfarrvec = dofs_obj->m_array_lhs;
     AMREX_ALWAYS_ASSERT(m_ndofs_l == dofs_obj->m_nDoFs_l);
     AMREX_ALWAYS_ASSERT(m_ndofs_g == dofs_obj->m_nDoFs_g);
 
@@ -383,8 +382,6 @@ int MatrixPC<T,Ops>::Assemble (const T& a_U)
     auto c_indices_g_ptr = m_c_indices_g.data();
     auto a_ij_ptr = m_a_ij.data();
 
-    const auto ndofs_l = m_ndofs_l;
-    const auto ndofs_g = m_ndofs_g;
     const auto nnz_max = m_pc_mat_nnz;
     auto nnz_actual = nnz_max;
 
@@ -404,11 +401,9 @@ int MatrixPC<T,Ops>::Assemble (const T& a_U)
             for (amrex::MFIter mfi(*dofs_mfarrvec[lev][dir]); mfi.isValid(); ++mfi) {
 
                 auto bx = mfi.tilebox();
-                auto dof_lhs_arr = dofs_lhs_mfarrvec[lev][dir]->const_array(mfi);
-
-#if defined(WARPX_DIM_1D_Z)
                 auto dof_arr = dofs_mfarrvec[lev][dir]->const_array(mfi);
-#elif defined(WARPX_DIM_RCYLINDER)
+
+#if defined(WARPX_DIM_RCYLINDER)
                 amrex::ignore_unused(dxi);
                 WARPX_ABORT_WITH_MESSAGE("MatrixPC<T,Ops>::Assemble() not yet implemented for WARPX_DIM_RCYLINDER");
 #elif defined(WARPX_DIM_RSPHERE)
@@ -418,7 +413,6 @@ int MatrixPC<T,Ops>::Assemble (const T& a_U)
                 amrex::ignore_unused(dxi);
                 WARPX_ABORT_WITH_MESSAGE("MatrixPC<T,Ops>::Assemble() not yet implemented for WARPX_DIM_RZ");
 #elif defined(WARPX_DIM_XZ)
-                auto dof_arr = dofs_mfarrvec[lev][dir]->const_array(mfi);
                 int tdir = -1;
                 if  (dir == 0) { tdir = 2; }
                 else if (dir == 2) { tdir = 0; }
@@ -440,16 +434,16 @@ int MatrixPC<T,Ops>::Assemble (const T& a_U)
                 if (thetaDt > 0.0) {
                     ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
                     {
+                        int ridx_l = dof_arr(i,j,k,0);
+                        if (ridx_l < 0) { return; }
+
                         int icol = 0;
-                        int ridx_l = dof_lhs_arr(i,j,k,0);
-                        int ridx_g = dof_lhs_arr(i,j,k,1);
-                        AMREX_ALWAYS_ASSERT((ridx_l >= 0) && (ridx_l < ndofs_l));
-                        AMREX_ALWAYS_ASSERT((ridx_g >= 0) && (ridx_g < ndofs_g));
+                        int ridx_g = dof_arr(i,j,k,1);
 
                         r_indices_g_ptr[ridx_l] = ridx_g;
 
                         {
-                            int cidx_g_lhs = dof_lhs_arr(i,j,k,1);
+                            int cidx_g_lhs = dof_arr(i,j,k,1);
                             amrex::Real val = 1.0;
                             auto flag = MatrixPCUtils::insertOrAdd( cidx_g_lhs, val,
                                                                     &c_indices_g_ptr[ridx_l*nnz_max],
@@ -638,12 +632,13 @@ int MatrixPC<T,Ops>::Assemble (const T& a_U)
 
                     ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
                     {
-                        int ridx_l = dof_lhs_arr(i,j,k,0);
-                        AMREX_ALWAYS_ASSERT((ridx_l >= 0) && (ridx_l < ndofs_l));
+                        int ridx_l = dof_arr(i,j,k,0);
+                        if (ridx_l < 0) { return; }
+
                         int icol = num_nz_ptr[ridx_l];
 
                         {
-                            int cidx_g_lhs = dof_lhs_arr(i,j,k,1);
+                            int cidx_g_lhs = dof_arr(i,j,k,1);
                             amrex::Real val = sigma_ii_arr(i,j,k,0) - 1.0;
                             auto flag = MatrixPCUtils::insertOrAdd( cidx_g_lhs, val,
                                                                     &c_indices_g_ptr[ridx_l*nnz_max],