diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 55f57cd0ba..4c90c7c7bf 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -144,6 +144,126 @@ function build_projection_coefficients(T::Type, psp::NormConservingPsp)
     proj_coeffs
 end
 
+"""
+Projector set of a single atom (independent of the atom's position),
+and the structure factor for the atom position. Used inside NonlocalProjectors
+such that the projector set can be reused for multiple atoms in the same atom group.
+"""
+struct AtomProjectors{VT <: AbstractVector, PT <: AbstractMatrix}
+    # nbasis
+    structure_factors::VT
+    # nbasis x nproj
+    projectors::PT
+end
+
+"""
+Matrix-like type to represent the nonlocal projection vectors P without
+allocating the full matrix.
+This type extends AbstractMatrix, but it does not implement all
+the required methods, only those that were shown to be needed.
+In particular, random access to the matrix elements is not supported.
+"""
+struct NonlocalProjectors{T <: Number,
+                          PT <: AtomProjectors,
+                         } <: AbstractMatrix{T}
+    atoms::Vector{PT}
+
+    function NonlocalProjectors(atoms::Vector{PT}) where {PT}
+        # also serves as a check for length(atoms) > 0
+        at1 = first(atoms)
+        new{promote_type(eltype(at1.structure_factors),
+                         eltype(at1.projectors)),
+            PT}(atoms)
+    end
+end
+
+function Base.size(P::NonlocalProjectors)
+    n = length(first(P.atoms).structure_factors)
+    m = sum(size(at.projectors, 2) for at in P.atoms)
+    (n, m)
+end
+function Base.Matrix(P::NonlocalProjectors{T}) where {T}
+    n, m = size(P)
+    out = Matrix{T}(undef, n, m)
+    iproj = 1
+    for at in P.atoms
+        for proj in eachcol(at.projectors)
+            out[:, iproj] .= at.structure_factors .* proj
+            iproj += 1
+        end
+    end
+    out
+end
+
+function Base.show(io::IO, P::NonlocalProjectors)
+    print(io, "DFTK.NonlocalProjectors{")
+    show(io, P.atoms)
+    print(io, "}")
+end
+function Base.show(io::IO, ::MIME"text/plain", P::NonlocalProjectors)
+    print(io, summary(P))
+end
+
+# Add a level of indirection here to avoid ambiguity with the mul! method provided by Julia.
+LinearAlgebra.mul!(C::AbstractVector, A::Adjoint{<:Any, <:NonlocalProjectors},
+                   ψk::AbstractVector) = _mul!(C, A, ψk)
+LinearAlgebra.mul!(C::AbstractMatrix, A::Adjoint{<:Any, <:NonlocalProjectors},
+                   ψk::AbstractMatrix) = _mul!(C, A, ψk)
+
+LinearAlgebra.mul!(C::AbstractVector, A::NonlocalProjectors, B::AbstractVector,
+                   α::Number, β::Number) = _mul!(C, A, B, α, β)
+LinearAlgebra.mul!(C::AbstractMatrix, A::NonlocalProjectors, B::AbstractMatrix,
+                   α::Number, β::Number) = _mul!(C, A, B, α, β)
+
+function _mul!(C::AbstractVecOrMat, A::Adjoint{<:Any, <:NonlocalProjectors},
+               ψk::AbstractVecOrMat)
+    if size(C, 1) != size(A, 1) || size(A, 2) != size(ψk, 1) || size(ψk, 2) != size(C, 2)
+        throw(DimensionMismatch(lazy"A has size $(size(A)), B has size $(size(ψk)), C has size $(size(C))"))
+    end
+
+    iproj = 1
+    for at in A.parent.atoms
+        for proj in eachcol(at.projectors)
+            for iband in axes(ψk, 2)
+                C[iproj, iband] = conj(dot(@view(ψk[:, iband]),
+                                           Diagonal(at.structure_factors),
+                                           proj))
+            end
+            iproj += 1
+        end
+    end
+    C
+end
+
+function _mul!(C::AbstractArray, A::NonlocalProjectors{T}, B::AbstractArray{BT},
+               α::Number, β::Number) where {T, BT}
+    if size(C, 1) != size(A, 1) || size(A, 2) != size(B, 1) || size(B, 2) != size(C, 2)
+        throw(DimensionMismatch(lazy"A has size $(size(A)), B has size $(size(B)), C has size $(size(C))"))
+    end
+
+    C .*= β
+
+    maxproj = maximum(at -> size(at.projectors, 2), A.atoms)
+    # TODO: could store in a threadsafe cache to avoid repeated allocations
+    Pbuffer = Matrix{T}(undef, size(A, 1), maxproj)
+
+    iproj = 1
+    for at in A.atoms
+        nproj = size(at.projectors, 2)
+
+        Pwork = @view Pbuffer[:, 1:nproj]
+        Pwork .= at.structure_factors .* at.projectors
+        Bwork = if BT <: AbstractVector
+            @view(B[iproj:iproj+nproj-1])
+        else
+            @view(B[iproj:iproj+nproj-1, :])
+        end
+        mul!(C, Pwork, Bwork, α, 1)
+
+        iproj += nproj
+    end
+    C
+end
 
 @doc raw"""
 Build projection vectors for a atoms array generated by term_nonlocal
@@ -171,36 +291,31 @@ function build_projection_vectors(basis::PlaneWaveBasis{T}, kpt::Kpoint,
                                   psps::AbstractVector{<: NormConservingPsp},
                                   psp_positions) where {T}
     unit_cell_volume = basis.model.unit_cell_volume
-    n_proj = count_n_proj(psps, psp_positions)
     n_G    = length(G_vectors(basis, kpt))
-    proj_vectors = zeros(Complex{eltype(psp_positions[1][1])}, n_G, n_proj)
     G_plus_k = to_cpu(Gplusk_vectors(basis, kpt))
 
     # Compute the columns of proj_vectors = 1/√Ω \hat proj_i(k+G)
     # Since the proj_i are translates of each others, \hat proj_i(k+G) decouples as
     # \hat proj_i(p) = ∫ proj(r-R) e^{-ip·r} dr = e^{-ip·R} \hat proj(p).
     # The first term is the structure factor, the second the form factor.
-    offset = 0  # offset into proj_vectors
-    for (psp, positions) in zip(psps, psp_positions)
+    atom_projectors = reduce(vcat, map(zip(psps, psp_positions)) do (psp, positions)
         # Compute position-independent form factors
         G_plus_k_cart = to_cpu(Gplusk_vectors_cart(basis, kpt))
-        form_factors = build_projector_form_factors(psp, G_plus_k_cart)
+        psp_form_factors = build_projector_form_factors(psp, G_plus_k_cart)
+        psp_form_factors ./= sqrt(unit_cell_volume)
+        # Offload potential values to a device (like a GPU),
+        # and make sure to share this allocation for all atoms in the group
+        psp_form_factors = to_device(basis.architecture, psp_form_factors)
 
         # Combine with structure factors
-        for r in positions
+        map(positions) do r
             # k+G in this formula can also be G, this only changes an unimportant phase factor
-            structure_factors = map(p -> cis2pi(-dot(p, r)), G_plus_k)
-            @views for iproj = 1:count_n_proj(psp)
-                proj_vectors[:, offset+iproj] .=
-                    structure_factors .* form_factors[:, iproj] ./ sqrt(unit_cell_volume)
-            end
-            offset += count_n_proj(psp)
+            structure_factors = to_device(basis.architecture, map(p -> cis2pi(-dot(p, r)), G_plus_k))
+            AtomProjectors(structure_factors, psp_form_factors)
         end
-    end
-    @assert offset == n_proj
+    end)
 
-    # Offload potential values to a device (like a GPU)
-    to_device(basis.architecture, proj_vectors)
+    NonlocalProjectors(atom_projectors)
 end
 
 """
@@ -282,7 +397,7 @@ function PDPψk(basis, positions, psp_groups, kpt, kpt_minus_q, ψk)
     D = build_projection_coefficients(basis, psp_groups)
     P = build_projection_vectors(basis, kpt, psp_groups, positions)
     P_minus_q = build_projection_vectors(basis, kpt_minus_q, psp_groups, positions)
-    P * (D * P_minus_q' * ψk)
+    P * (D * (P_minus_q' * ψk))
 end
 
 function compute_dynmat_δH(::TermAtomicNonlocal, basis::PlaneWaveBasis{T}, ψ, occupation,
diff --git a/src/terms/operators.jl b/src/terms/operators.jl
index 40c8a523e6..bad43b413f 100644
--- a/src/terms/operators.jl
+++ b/src/terms/operators.jl
@@ -110,7 +110,7 @@ end
 function apply!(Hψ, op::NonlocalOperator, ψ)
     mul!(Hψ.fourier, op.P, (op.D * (op.P' * ψ.fourier)), 1, 1)
 end
-Matrix(op::NonlocalOperator) = op.P * op.D * op.P'
+Matrix(op::NonlocalOperator) = op.P * op.D * Matrix(op.P)'
 
 """
 Magnetic field operator A⋅(-i∇).
diff --git a/test/nonlocal.jl b/test/nonlocal.jl
new file mode 100644
index 0000000000..bc55722eb5
--- /dev/null
+++ b/test/nonlocal.jl
@@ -0,0 +1,34 @@
+@testitem "Test NonlocalProjectors" tags=[:psp] setup=[mPspUpf] begin
+    using DFTK
+    using LinearAlgebra
+
+    for (element, psp) in mPspUpf.upf_pseudos
+        lattice = 5 * I(3)
+        el = ElementPsp(element, psp)
+        atoms = [el, el, el]
+        positions = [zeros(3), 1/3 .* ones(3), 2/3 .* ones(3)]
+        model = model_DFT(lattice, atoms, positions; functionals=LDA())
+        basis = PlaneWaveBasis(model; Ecut=5, kgrid=[2, 2, 2])
+
+        n_bands = 7
+        ψ = [DFTK.random_orbitals(basis, kpt, n_bands) for kpt in basis.kpoints]
+        occ = [2.0 * ones(n_bands) for _ in basis.kpoints]
+        ρ = DFTK.compute_density(basis, ψ, occ)
+
+        energies, ham = DFTK.energy_hamiltonian(basis, ψ, occ; ρ)
+
+        for (hamblock, ψk) in zip(ham.blocks, ψ)
+            nonloc = hamblock.nonlocal_op
+            Hψk = zero(ψk)
+            DFTK.apply!((; fourier=Hψk), nonloc, (; fourier=ψk))
+
+            nonloc_dense = Matrix(nonloc)
+            Hψk_dense = nonloc_dense * ψk
+
+            Pψk = nonloc.P' * ψk
+            DPψk = nonloc.D * Pψk
+            @assert nonloc.P * DPψk ≈ Matrix(nonloc.P) * DPψk atol=1e-10
+            @assert Hψk ≈ Hψk_dense atol=1e-10
+        end
+    end
+end