L1 Gauss Seidel preconditioner #191
Merged

termi-official merged 76 commits into JuliaHealth:main from Abdelrahman912:l1-gs-smoother on May 9, 2025. Changes from 50 of the 76 commits are shown below.
Commits (all authored by Abdelrahman912 unless noted otherwise):

2382a8f  init l1 smoother
e8991ba  minor fix
a127bc0  init cuda prec setup
d216e34  init working cuda
086ed78  fix partition limit indices
99b1ab6  add comment
64a247c  minor change
343c99d  minor adjustment for csc
ee66f73  add cuda csr
fbb3338  check symmetry for csc
2b2dc17  Merge branch 'main' into l1-gs-smoother
5e1203a  rm unnecessary code
bc1cec3  Merge branch 'main' into l1-gs-smoother
c8cc291  add cpu version
457c110  Merge branch 'add-multi-threading-l1-prec' into l1-gs-smoother
7f17845  Merge branch 'main' into l1-gs-smoother
7c9b474  Merge branch 'main' into l1-gs-smoother
42026cd  init ka, working but buggy
1b3ce6f  Merge branch 'main' into l1-gs-smoother
8a5675b  Merge branch 'ka-porting' into l1-gs-smoother
476928b  Merge branch 'main' into l1-gs-smoother
8bcca25  fix ka buggy code
77f7148  add tests
aecbf67  minor fix
1aa8986  update manifest
552f8b8  Merge branch 'main' into l1-gs-smoother
dc8d4e4  merge cpu and gpu
24b72ab  Merge branch 'main' into l1-gs-smoother
a87c66f  minor fix
5778fe9  add Preconditioners submodule
b104cfb  remove unnecessary module reference
0e93f05  add cpu symmetric test
51fa896  add test path
1f56bad  minor fix
226f55a  set nparts to be ncores
292aacc  precompute blocks
36f5754  separate CPU GPU tests
33f1de7  fix ci
3b52869  minor fix
a056a67  add symmetric test
bf2cc96  rm dead code
a84ab45  comment out adapt
0794dce  rm direct solver
2532179  add doc string
8203cac  add gpu test examples
7ef9e15  minor fix
869d3db  elementwise operations refinement
9609d59  add reference
8e62678  add block partitioning to doc string + some comments for (CSC/CSR)Format
4a6454c  rm piratical code (only those which were merged into CUDA.jl) + add w…
cce6547  rm dead code
2ca65a6  init gs
1e01857  init forward_sweep
01faea3  minor fixes
df65f07  minor fixes (buggy test)
f40ee47  fix stride issues
c1fc69d  add nonsymmetric unit test
4401f18  add partsize test + fix row offset
4b855fb  add default configs
23315df  test config
9d75776  add gpu tests
255fa15  add docstring
dc57963  rm ThreadPinning
c4a3cd2  add CUDA version
ecae973  fix l1gs dostring
032c06b  minor fix
1536bd2  rm comments
39dd912  fix packages ver
9508ec6  rm cuda_utils to avoid piracy
9304ec8  change l1gs note
1f7602b  hot fix ferrite ver
a0cb91a  clean up some comments used for debugging
b0ced8e  fix ferrite ver (hopefully)
b0e6bb9  Merge branch 'main' into l1-gs-smoother (termi-official)
88d086c  change default nthreads
b62b1a3  add ThreadedCSR tests
New file (35 lines; filename not shown in this view) — CUDA extension glue for the L1 Gauss–Seidel preconditioner:

```julia
#########################################
## CUDA L1 Gauss Seidel Preconditioner ##
#########################################

# PIRACY ALERT: this code is piratical because both `adapt` and its arguments are foreign objects.
# Therefore, the behavior of `adapt` differs depending on whether `Thunderbolt` and `CuThunderboltExt` are loaded.
# Reference: https://juliatesting.github.io/Aqua.jl/stable/piracies/
# Note: the problem is with `AbstractSparseMatrix`, since the default behavior of `adapt` is to
# return the same object regardless of the backend.
# Adapt.adapt(::CUDABackend, A::CUSPARSE.AbstractCuSparseMatrix) = A
# Adapt.adapt(::CUDABackend, A::AbstractSparseMatrix) = A |> cu
# Adapt.adapt(::CUDABackend, x::Vector) = x |> cu # not needed
# Adapt.adapt(::CUDABackend, x::CuVector) = x # not needed

# TODO: remove this function if backward compatibility is not needed
Preconditioners.convert_to_backend(::CUDABackend, A::AbstractSparseMatrix) = adapt(CUDABackend(), A)

# For some reason, these properties are not automatically defined for device arrays.
# TODO: remove the following code when https://github.com/JuliaGPU/CUDA.jl/pull/2738 is merged
#SparseArrays.rowvals(A::CUSPARSE.CuSparseDeviceMatrixCSC{Tv,Ti,1}) where {Tv,Ti} = A.rowVal
#SparseArrays.getcolptr(A::CUSPARSE.CuSparseDeviceMatrixCSC{Tv,Ti,1}) where {Tv,Ti} = A.colPtr
#SparseArrays.getnzval(A::CUSPARSE.CuSparseDeviceMatrixCSC{Tv,Ti,1}) where {Tv,Ti} = A.nzVal
#SparseMatricesCSR.getnzval(A::CUSPARSE.CuSparseDeviceMatrixCSR{Tv,Ti,1}) where {Tv,Ti} = A.nzVal

# PIRACY ALERT: the following code is commented out to avoid piracy
# SparseMatricesCSR.colvals(A::CUSPARSE.CuSparseDeviceMatrixCSR{Tv,Ti,1}) where {Tv,Ti} = A.colVal
# SparseMatricesCSR.getrowptr(A::CUSPARSE.CuSparseDeviceMatrixCSR{Tv,Ti,1}) where {Tv,Ti} = A.rowPtr

# Workaround for the issue with SparseMatricesCSR.
# TODO: find a more robust solution to dispatch the correct function
Preconditioners.colvals(A::CUSPARSE.CuSparseDeviceMatrixCSR{Tv,Ti,1}) where {Tv,Ti} = A.colVal
Preconditioners.getrowptr(A::CUSPARSE.CuSparseDeviceMatrixCSR{Tv,Ti,1}) where {Tv,Ti} = A.rowPtr

Preconditioners.sparsemat_format_type(::CUSPARSE.CuSparseDeviceMatrixCSC{Tv,Ti,1}) where {Tv,Ti} = CSCFormat
Preconditioners.sparsemat_format_type(::CUSPARSE.CuSparseDeviceMatrixCSR{Tv,Ti,1}) where {Tv,Ti} = CSRFormat
```
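For context, the smoother this PR wires up augments each row's diagonal with the l1-norm of that row's off-block entries, which keeps the block sweeps safely convergent for diagonally dominant problems. Below is a minimal dense sketch of one forward sweep; it is illustrative only, and `l1gs_sweep!` and `parts` are made-up names, not the PR's actual API.

```julia
using LinearAlgebra

# One forward L1 Gauss–Seidel sweep over row blocks given by `parts`
# (a hedged dense sketch; the PR operates on sparse CSC/CSR storage).
function l1gs_sweep!(x, A, b, parts)
    for rows in parts
        for i in rows
            # L1 correction: diagonal plus the sum of |a_ij| over columns
            # OUTSIDE the current block.
            di = A[i, i] + sum(abs(A[i, j]) for j in axes(A, 2) if !(j in rows); init = 0.0)
            r  = b[i] - sum(A[i, j] * x[j] for j in axes(A, 2))  # current residual entry
            x[i] += r / di
        end
    end
    return x
end

A = [4.0 1.0; 1.0 3.0]
b = [1.0, 2.0]
x = zeros(2)
for _ in 1:200
    l1gs_sweep!(x, A, b, [1:1, 2:2])  # two single-row blocks
end
# x now approximates A \ b = [1/11, 7/11]
```

Used as a preconditioner, applying this sweep to the residual corresponds to applying the inverse of the block-lower-triangular matrix with the l1-augmented diagonal.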
New file (20 lines) — CPU ↔ GPU sparse matrix conversion utilities:

```julia
# Remove the following code once PR https://github.com/JuliaGPU/CUDA.jl/pull/2720 is merged.
CUDA.CUSPARSE.CuSparseMatrixCSR{T}(Mat::SparseMatrixCSR) where {T} =
    CUDA.CUSPARSE.CuSparseMatrixCSR{T}(CuVector{Cint}(Mat.rowptr), CuVector{Cint}(Mat.colval),
                                       CuVector{T}(Mat.nzval), size(Mat))

CUSPARSE.CuSparseMatrixCSC{T}(Mat::SparseMatrixCSR) where {T} =
    CUSPARSE.CuSparseMatrixCSC{T}(CUSPARSE.CuSparseMatrixCSR(Mat))

SparseMatricesCSR.SparseMatrixCSR(A::CUSPARSE.CuSparseMatrixCSR) =
    SparseMatrixCSR(CUSPARSE.SparseMatrixCSC(A)) # no direct conversion (gpu_CSR -> cpu_CSC -> cpu_CSR)

Adapt.adapt_storage(::Type{CuArray}, xs::SparseMatrixCSR) =
    CUSPARSE.CuSparseMatrixCSR(xs)

Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseMatrixCSR) where {T} =
    CUSPARSE.CuSparseMatrixCSR{T}(xs)

Adapt.adapt_storage(::Type{Array}, mat::CUSPARSE.CuSparseMatrixCSR) =
    SparseMatrixCSR(mat)
```
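The `gpu_CSR -> cpu_CSC -> cpu_CSR` round trip works because CSR and CSC are transposes of each other at the buffer level. A small CPU-only demonstration using just the SparseArrays stdlib (no CUDA required):

```julia
using SparseArrays

A  = sparse([1.0 2.0; 0.0 3.0])
At = sparse(transpose(A))   # materialize Aᵀ in CSC storage

# The colptr/rowval/nzval buffers of CSC(Aᵀ) play exactly the role of the
# rowptr/colval/nzval buffers of CSR(A).
rowptr, colval, nzval = At.colptr, At.rowval, At.nzval
# rowptr == [1, 3, 4], colval == [1, 2, 2], nzval == [1.0, 2.0, 3.0]
```

The same identity is why the symmetry traits in the Preconditioners module pay off: for a symmetric matrix, CSC and CSR buffers coincide, so a kernel can use whichever access pattern is faster.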
New file (48 lines) — the Preconditioners submodule with the generic traits:

```julia
module Preconditioners

using SparseArrays, SparseMatricesCSR
using LinearSolve
import LinearSolve: \
using Adapt
using UnPack
import KernelAbstractions: Backend, @kernel, @index, @ndrange, @groupsize, @print, functional,
    CPU, synchronize
import SparseArrays: getcolptr, getnzval
import SparseMatricesCSR: getnzval
import LinearAlgebra: Symmetric

## Generic Code ##

# CSR and CSC are exactly the same for symmetric matrices, so we need to hold symmetry info
# in order to exploit it in cases where one format has a better access pattern than the other.
abstract type AbstractMatrixSymmetry end
struct SymmetricMatrix <: AbstractMatrixSymmetry end
struct NonSymmetricMatrix <: AbstractMatrixSymmetry end

abstract type AbstractMatrixFormat end
struct CSRFormat <: AbstractMatrixFormat end
struct CSCFormat <: AbstractMatrixFormat end

# Why use these traits?
# We are targeting multiple backends, but unfortunately the CSC/CSR sparse matrix types across
# backends don't share a common supertype (e.g. AbstractSparseMatrixCSC/AbstractSparseMatrixCSR):
#   CUSPARSE.CuSparseDeviceMatrixCSC <: SparseArrays.AbstractSparseMatrixCSC → false
# So we define our own traits to identify the format of a sparse matrix.
sparsemat_format_type(::SparseMatrixCSC) = CSCFormat
sparsemat_format_type(::SparseMatrixCSR) = CSRFormat

# TODO: remove once https://github.com/JuliaGPU/CUDA.jl/pull/2740 is merged
convert_to_backend(backend::Backend, A::AbstractSparseMatrix) =
    adapt(backend, A) # fallback; specific backends are extended in their corresponding extensions.

# Why? Because we want to circumvent piracy when extending these functions for device
# backends (e.g. CuSparseDeviceMatrixCSR).
# TODO: find a more robust solution to dispatch the correct function
colvals(A::SparseMatrixCSR) = SparseMatricesCSR.colvals(A)
getrowptr(A::SparseMatrixCSR) = SparseMatricesCSR.getrowptr(A)

include("l1_gauss_seidel.jl")

export L1GSPrecBuilder

end
```
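To illustrate the trait pattern in isolation, here is a self-contained sketch of the same dispatch idea; `MyDeviceCSR` and `rowmajor` are hypothetical names standing in for a backend matrix type that sits outside the SparseArrays type hierarchy.

```julia
using SparseArrays

abstract type AbstractMatrixFormat end
struct CSRFormat <: AbstractMatrixFormat end
struct CSCFormat <: AbstractMatrixFormat end

# A foreign CSR-like type that does NOT subtype any SparseArrays abstract type,
# mimicking the situation with device matrices such as CuSparseDeviceMatrixCSR.
struct MyDeviceCSR{Tv, Ti}
    rowptr::Vector{Ti}
    colval::Vector{Ti}
    nzval::Vector{Tv}
end

# The trait maps every concrete type onto a shared format vocabulary...
sparsemat_format_type(::SparseMatrixCSC) = CSCFormat
sparsemat_format_type(::MyDeviceCSR)     = CSRFormat

# ...so generic code can branch on the trait instead of the concrete matrix type.
rowmajor(A) = sparsemat_format_type(A) === CSRFormat

csc = sparse([1.0 0.0; 0.0 2.0])
csr = MyDeviceCSR([1, 2, 3], [1, 2], [1.0, 2.0])
# rowmajor(csc) == false, rowmajor(csr) == true
```

This is why the CUDA extension above only needs to add `sparsemat_format_type` methods for the device types: the generic kernels never inspect the concrete type directly.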