exanauts · amontoison · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025
diff --git a/docs/src/api.md b/docs/src/api.md
@@ -28,6 +28,7 @@ VecchiaMLE.generate_xyGrid
 VecchiaMLE.generate_safe_xyGrid
 VecchiaMLE.MadNLP_Print_Level
 VecchiaMLE.Int_to_Mode
+VecchiaMLE.Int_to_KKT_System
 VecchiaMLE.Uni_Error
 VecchiaMLE.IndexReorder
 VecchiaMLE.IndexMaxMin

diff --git a/src/VecchiaMLE.jl b/src/VecchiaMLE.jl
@@ -15,6 +15,7 @@ const KA = KernelAbstractions
 include("VecchiaMLE_defines.jl")
 include("VecchiaMLE_utils.jl")
 include("VecchiaMLE_input.jl")
+include("models/kkt/VecchiaKKT.jl")
 include("VecchiaMLE_kernels.jl")
 include("models/VecchiaMLE_NLPModel.jl")
 

diff --git a/src/VecchiaMLE_defines.jl b/src/VecchiaMLE_defines.jl
@@ -90,6 +90,7 @@ The fields to the struct are as follows:\n
 * `Number_of_Samples`: The Number of Samples the user gives input to the program.
 * `MadNLP_print_level`: The print level to the optimizer. Can be ignored, and will default to ERROR.
 * `mode`: The opertaing mode to the analysis(GPU or CPU). The mapping is [1: 'CPU', 2: 'GPU'].
+* `KKT_System`: Use the default Sparse KKT System or the custom VecchiaKKTSystem. The mapping is [1: 'Sparse System', 2: 'VecchiaKKTSystem'].
 
 """
 mutable struct VecchiaMLEInput{M}
@@ -99,8 +100,9 @@ mutable struct VecchiaMLEInput{M}
     Number_of_Samples::Int
     MadNLP_print_level::Int
     mode::Int
+    KKT_System::Int
 
-    function VecchiaMLEInput(n::Int, k::Int, samples::M, Number_of_Samples::Int, MadNLP_print_Level::Int=5, mode::Int=1) where {M<:AbstractMatrix}
-        return new{M}(n, k, samples, Number_of_Samples, MadNLP_print_Level, mode)
+    function VecchiaMLEInput(n::Int, k::Int, samples::M, Number_of_Samples::Int, MadNLP_print_Level::Int=5, mode::Int=1, KKT_System::Int=1) where {M<:AbstractMatrix}
+        return new{M}(n, k, samples, Number_of_Samples, MadNLP_print_Level, mode, KKT_System)
     end
 end
diff --git a/src/VecchiaMLE_input.jl b/src/VecchiaMLE_input.jl
@@ -57,8 +57,16 @@ function ExecuteModel!(iVecchiaMLE::VecchiaMLEInput, ptGrid::AbstractVector, pre
         model = get_vecchia_model(iVecchiaMLE, ptGrid)
     end
 
-    diags.solve_model_time = @elapsed begin
-        output = madnlp(model, print_level=MadNLP_Print_Level(iVecchiaMLE.MadNLP_print_level))
+    diags.solve_model_time = @elapsed begin        
+        if iVecchiaMLE.KKT_System == 1
+            output = madnlp(model, print_level=MadNLP_Print_Level(iVecchiaMLE.MadNLP_print_level))
+        elseif iVecchiaMLE.KKT_System == 2
+            output = madnlp(model, 
+                print_level=MadNLP_Print_Level(iVecchiaMLE.MadNLP_print_level), 
+                kkt_system = VecchiaKKTSystem, 
+                linear_solver=MadNLPGPU.CUDSSSolver
+            )
+        end
     end
 
 

diff --git a/src/VecchiaMLE_kernels.jl b/src/VecchiaMLE_kernels.jl
@@ -143,7 +143,6 @@ function vecchia_generate_hess_tri_structure!(nnzh::Int, n::Int, colptr_diff::Cu
     fill!(hrows, one(Int))
     fill!(hcols, one(Int))
 
-
     # launch the kernel
     backend = KA.get_backend(hrows)
     kernel = vecchia_generate_hess_tri_structure_kernel!(backend)
@@ -154,7 +153,7 @@ function vecchia_generate_hess_tri_structure!(nnzh::Int, n::Int, colptr_diff::Cu
     view(hrows, 2:n) .+= cumsum(f.(view(colptr_diff, 1:n-1)))
     view(hcols, 2:n) .+= cumsum(view(colptr_diff, 1:n-1))
 
-    kernel(nnzh, n, colptr_diff, view(hrows, 1:n), view(hcols, 1:n), hrows, hcols, ndrange = n)
+    kernel(nnzh, n, colptr_diff, hrows, hcols, hrows, hcols, ndrange = n)
 
     KA.synchronize(backend)
 

diff --git a/src/VecchiaMLE_utils.jl b/src/VecchiaMLE_utils.jl
@@ -26,12 +26,14 @@ end
     Samples_Matrix = generate_Samples(MatCov::AbstractMatrix, 
                                       n::Integer,
                                       Number_of_Samples::Int;
-                                      mode::VecchiaMLE.COMPUTE_MODE )
+                                      mode::VecchiaMLE.COMPUTE_MODE)
 
     Generate a number of samples according to the given Covariance Matrix MatCov.
     Note the samples are given as mean zero. 
     If a CUDA compatible device is detected, the samples are generated on the GPU
     and transferred back to the CPU. 
+    Note that this isn't the most optimized, and should only be used for small sample generation 
+    (i.e., n < 100, Number_of_Samples < 100).
 
 ## Input arguments
 
@@ -62,6 +64,7 @@ function generate_Samples(MatCov::AbstractMatrix, n::Integer, Number_of_Samples:
     return V
 end
 
+
 """
     Covariance_Matrix = generate_MatCov(n::Integer,
                                         params::AbstractArray,
@@ -174,7 +177,7 @@ function MadNLP_Print_Level(pLevel::Integer)::MadNLP.LogLevels
 end
 
 """
-    cpu_mode = Int_to_Mode(n::Integer)
+    cpu_mode = Int_to_Mode(n::Int)
 
     A helper function to convert an integer to a COMPUTE_MODE.
     The mapping is [1: 'CPU', 2: 'GPU'].
@@ -196,14 +199,36 @@ function Int_to_Mode(n::Int)::COMPUTE_MODE
     return CPU
 end
 
+"""
+    kkt_system = Int_to_KKT_System(n::Int)
+
+    A helper function to determine which KKT system to use.
+    The mapping is [1: 'Sparse System', 2: 'VecchiaKKTSystem'].
+
+## Input arguments
+* `n`: the given system as an int;
+
+## Output Arguments
+*`kkt_system` : The chosen KKT system. 
+"""
+function Int_to_KKT_System(n::Int)::MadNLP.AbstractKKTSystem
+    if n == 1
+        return MadNLP.AbstractSparseKKTSystem
+    elseif n == 2
+        return VecchiaKKTSystem
+    else 
+        return MadNLP.AbstractSparseKKTSystem
+    end
+end
+
 """
     Error = Uni_Error(TCov::AbstractMatrix,
                       L::AbstractMatrix)
     Generates the "Univariate KL Divergence" from the Given True Covariane matrix 
     and Approximate Covariance matrix. The output is the result of the following 
-    optimization problem: sup(a^T X_A || a^⊺ X_T). The solution is 
-                f(μ) = ln(μ) + (2μ)^{-2} - 0.5,
-    where μ is the largest or smallest eigenvalue of the matrix Σ_A^{-1}Σ_T, whichever
+    optimization problem: sup(aᵀ X_A || aᵀ X_T). The solution is 
+                f(μ) = ln(μ) + (2μ)⁻² - 0.5,
+    where μ is the largest or smallest eigenvalue of the matrix Σ_A⁻¹Σ_T, whichever
     maximizes the function f(μ). Note: Σ_A , Σ_T are the respective approximate and true
     covariance matrices, and X_A, X_T are samples from the respective Distributions (mean zero).    
 
@@ -559,15 +584,15 @@ The current checks are:\n
 * `iVecchiaMLE`: The filled-out VecchiaMLEInput struct. See VecchiaMLEInput struct for more details. 
 * `ptGrid`: The grid of point locations in 2D space. Must be a Vector of 2D Vectors! 
 """
-function sanitize_input!(iVecchiaMLE::VecchiaMLEInput, ptGrid::T) where T <: Union{AbstractVector, Nothing} 
+function sanitize_input!(iVecchiaMLE::VecchiaMLEInput, ptGrid::T) where T <: Union{AbstractVector, Nothing}
     @assert iVecchiaMLE.n > 0 "The dimension n must be strictly positive!"
     @assert iVecchiaMLE.k <= iVecchiaMLE.n^2 "The number of conditioning neighbors must be less than n^2 !"
     @assert size(iVecchiaMLE.samples, 1) > 0 "Samples must be nonempty!"
     @assert (iVecchiaMLE.MadNLP_print_level in 1:5) "MadNLP Print Level not in 1:5!"
     @assert size(iVecchiaMLE.samples, 2) == iVecchiaMLE.n^2 "samples must be of size Number_of_Samples x n^2!"
     @assert size(iVecchiaMLE.samples, 1) == iVecchiaMLE.Number_of_Samples "samples must be of size Number_of_Samples x n^2!"
-    @assert iVecchiaMLE.mode in [1, 2] "Operation mode not valid! must be in [1, 2]." 
-
+    @assert iVecchiaMLE.mode in [1, 2] "Operation mode not valid! must be in [1, 2]."
+    @assert iVecchiaMLE.KKT_System in [1, 2] "KKT_System not properly specified! Must be in [1, 2]."
     if typeof(iVecchiaMLE.samples) <: Matrix && iVecchiaMLE.mode == 2
         iVecchiaMLE.samples = CuMatrix{Float64}(iVecchiaMLE.samples)
     end
@@ -581,5 +606,5 @@ function sanitize_input!(iVecchiaMLE::VecchiaMLEInput, ptGrid::T) where T <: Uni
         @assert length(pt) == 2 "Position $(i) in ptGrid is not 2 dimensional!"
     end
 
-    return ptGrid::Vector{Vector{Float64}}
+    return ptGrid
 end
diff --git a/src/models/VecchiaMLE_NLPModel.jl b/src/models/VecchiaMLE_NLPModel.jl
@@ -59,7 +59,7 @@ function create_vecchia_cache(samples::AbstractMatrix, k::Int, ptGrid::AbstractV
     if S != Vector{Float64}
 
         offsets = cumsum([0; m[1:end-1]]) |> CuVector{Int}
-        B = [CuMatrix{T}(undef, 0, 0)]
+        B = [CuMatrix{T}(undef, m[j], m[j]) for j in 1:n]
         rowsL = CuVector{Int}(rowsL)
         colsL = CuVector{Int}(colsL)
         colptrL = CuVector{Int}(colptrL)

diff --git a/src/models/kkt/VecchiaKKT.jl b/src/models/kkt/VecchiaKKT.jl
@@ -0,0 +1,195 @@
+using CUDA.CUSOLVER
+#=
+    Custom KKT System for the Vecchia optimiation problem. 
+    TODO: Update KKT System function?
+=#
+
+
+struct VecchiaKKTSystem{T, VT, MT<:AbstractMatrix{T}, QN<:MadNLP.AbstractHessian{T, VT}, LS} <: MadNLP.AbstractCondensedKKTSystem{T, VT, MT, QN}
+
+    # number of nonzeros in L
+    p::Int
+
+    # number of constraints
+    n::Int
+
+    # number of nonzeros in hessian. Not p+n!
+    nnzh::Int
+
+    # Diagonal Matrices (Stored as vectors)
+    # Diagonal matrix NB⁻¹Nᵀ
+    NBNT::VT
+
+    # Vector of Vectors which holds s_j
+    S::Vector{VT}
+
+    # holds sizes of s_j ~> side length of each sub-block in hessian
+    # cumsum format
+    m::Vector{Int}
+
+    # Jacobian values
+    jac::VT
+    # Hessian Values
+    hess::VT
+
+    # Scrap Buffer 
+    buffer::VT
+
+    linear_solver::LS
+
+    # constraint vector
+    c::VT 
+end
+
+#=
+    TODO: Fill out
+    Things to do:
+        1. Query MadNLPSolver for c, grad_y_Lagrange, grad_z_Lagrange, x, lagrange multipliers
+        2. Allocate stuff (buffer)
+        3. Calculate NBNT, S, Λ, D.
+        4. Create VecchiaKKT
+        5. return
+ =#
+
+ function MadNLP.create_kkt_system(
+    ::Type{VecchiaKKTSystem},
+    cb::MadNLP.SparseCallback{T,VT},
+    ind_cons,
+    linear_solver::Type;
+    opt_linear_solver=MadNLP.default_options(linear_solver),
+    hessian_approximation=MadNLP.ExactHessian,
+    qn_options=MadNLP.QuasiNewtonOptions(),
+    ) where {T,VT}
+
+    nlp = cb.nlp
+    n = nlp.cache.n
+    p = get_nvar(nlp) - n
+
+
+    # Get S, need potrf_batch and potrs_batch.
+    # How do I do this?
+    B_cpy = copy(nlp.cache.B)
+
+    # port m to CPU right now, i don't know exactly how to get this?    
+    m = zeros(length(nlp.cache.m))
+    copyto!(m, nlp.cache.m)
+
+    # What is m?
+    println("m\n", m)
+
+    S = [CuVector{T}([T(i == 1) for i in 1:m[j]]) for j in eachindex(m)]
+    println("S\n", S)
+
+    println("length S: ", length(S))
+    println("lengths: ", [length(s) for s in S])
+    println("length B_cpy", length(B_cpy))
+    println("lengths:", [size(B) for B in B_cpy])
+    CUDA.CUSOLVER.potrfBatched!('U', B_cpy)
+    CUDA.CUSOLVER.potrsBatched!('U', B_cpy, S)
+
+    CUDA.@allowscalar NBNT = CuVector{T}([S[j][1] for j in eachindex(m)])
+
+
+    # NOTE: MadNLP automatically updates these values  
+    hess = CUDA.zeros(T, p+n)
+    jac = CUDA.zeros(T, 2*n)
+
+    # Setting the buffer avoids edge cases for small k values 
+    buffer = CUDA.zeros(T, max(p+n, 3*n))
+
+    # Query for constraint vector
+    c = # ???
+
+    return VecchiaKKTSystem{T, VT, AbstractMatrix{T}, MadNLP.AbstractHessian{T, VT}, typeof(linear_solver)}(
+        p,
+        n,
+        p+n,
+        NBNT, 
+        S, 
+        m, 
+        jac,
+        hess,
+        buffer,
+        linear_solver,
+        c
+    )
+end
+
+#=
+    Things to do:
+        1. Solve Δz
+        2. Solve Δλ
+        3. Solve Δy
+        4. return true
+
+        w = (Δy, Δz, Δλ)
+        Δy is length kkt.p
+        Δz is length kkt.n
+        Δλ is length kkt.n
+        ~> w is length kkt.p + 2*kkt.n  
+=#
+function MadNLP.solve!(kkt::VecchiaKKTSystem, w::MadNLP.AbstractKKTVector)
+
+    # Hold here for the moment. 
+    Λ = view(kkt.jac, (1:kkt.n).+kkt.p)
+    D = view(kkt.hess, (1:kkt.n).+kkt.p) ./ Λ
+
+    # get buffer
+    buffer = kkt.buffer
+
+    # Solve for Δz
+    # RHS ~> save to buffer
+    buffer_dz_ptr = view(buffer, 1:p)
+    view(buffer_dz_ptr, 1:kkt.n) .= view(w, (1:kkt.n).+kkt.p)
+
+    # TODO: MAYBE WRONG!!!
+    # TODO: This is for sure wrong, since the subviews are over the colptr, not just m either. 
+    subviews = [view(buffer_dz_ptr, kkt.S[i] : (kkt.S[i+1] - 1)) for i in eachindex(kkt.S)]
+    view(buffer_dz_ptr, 1:length(kkt.S)) .= dot.(kkt.S, subviews)
+
+    buffer_dz_ptr = view(buffer, 1:kkt.n)
+    buffer_dz_ptr .= kkt.c .- buffer_dz_ptr # TODO: kkt.c won't work. c is stored in w (where?), wait for francois.
+
+    # Need product for later, save off to other part of buffer 
+    buffer_cmNBgyL = view(buffer, (1:kkt.n).+2*kkt.n)
+    buffer_cmNBgyL .= copy(buffer_dz_ptr)
+
+    buffer_dz_ptr .= buffer_dz_ptr ./ NBNT
+    buffer_dz_ptr .= kkt.grad_z_Lagrange - buffer_dz_ptr
+
+    # building matrix ΛD + D(NB⁻¹Nᵀ)⁻¹D
+    # Should be same size as grad_z_Lagrange
+    buffer_DNBNTD_ptr = view(buffer, (1:kkt.n).+kkt.n)
+
+    buffer_DNBNTD_ptr .= D ./ kkt.NBNT
+    buffer_DNBNTD_ptr ./= D
+    buffer_DNBNTD_ptr .+= view(kkt.hess, kkt.nnzh-kkt.n:kkt.nnzh) # View is on ΛD in Hessian of Lagrangian
+
+    # Now solve for Δz. After Δy in w.
+    view(w, (1:kkt.n).+kkt.p) .= - buffer_dz_ptr ./ buffer_DNBNTD_ptr 
+
+    # buffer can now be overwritten 
+    # solve for Δλ
+    # same size as buffer_dz_ptr! 
+    buffer_dlm_ptr = view(buffer, 1:kkt.n)
+    buffer_dlm_ptr .= D .* view(w, (1:kkt.n).+kkt.p)
+    buffer_dlm_ptr .+= buffer_cmNBgyL
+
+    view(w, (1:kkt.n).+(kkt.n+kkt.p)) .= buffer_dlm_ptr ./ kkt.NBNT
+
+    # buffer can now be overwritten
+    # solve for Δy
+    buffer_dy_ptr = view(buffer, 1:kkt.p)
+
+    # view onto diagonal to add Δλ entries
+    # Note negative (of the Nᵀ) cancels out with the other negative
+    buffer_diag_y = view(buffer_dy_ptr, kkt.m)
+    buffer_diag_y .+= view(w, (1:kkt.n).+(kkt.n+kkt.p))
+    # That SHOULD be the RHS of Δy
+
+    # The S_j's are the nonzero elements of B⁻¹. 
+    # Fill in Δy.
+    view(w, 1:kkt.p) .= dot.(kkt.S[1:length(kkt.m)], Ref(buffer_diag_y))
+
+    return true
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -18,13 +18,16 @@ include("models/VecchiaMLE_models.jl")
 include("test_cpu_compatible_with_jump.jl")
 include("test_cpu_diagnostics.jl")
 include("test_memory_allocation_cpu.jl")
+include("test_custom_kkt_cpu.jl")
+include("test_kkt_system_comparable_cpu.jl")
+include("test_model_inputs.jl")
+include("test_abnormal_ptGrid.jl")
 
 if CUDA.has_cuda()
     include("test_gpu_compatible_with_jump.jl")
     include("test_gpu_diagnostics.jl")
     include("test_cpu_compatible_with_gpu.jl")
     include("test_memory_allocation_gpu.jl")
+    include("test_custom_kkt_gpu.jl")
+    include("test_kkt_system_comparable_gpu.jl")
 end
-
-include("test_model_inputs.jl")
-include("test_abnormal_ptGrid.jl")