diff --git a/examples/solve_sparse_system.py b/examples/solve_sparse_system.py new file mode 100644 index 0000000..c9a17cb --- /dev/null +++ b/examples/solve_sparse_system.py @@ -0,0 +1,79 @@ +# copyright ################################# # +# This file is part of the Xobjects Package. # +# Copyright (c) CERN, 2021. # +# ########################################### # + +import xobjects as xo +import scipy.sparse as sp +import numpy as np +from xobjects.sparse._test_helpers import rel_residual + +''' +The goal of this example is to provide a short user guide for the xo.sparse module. + +The xo.sparse module can be used to solve sparse linear systems of equations: + A*x = b +where A is a sparse matrix. + +Currently this module only supports CPU and Cupy contexts. This module contains a +variety of solvers for different contexts, with consistent APIs. The intended use +is to reuse the same LHS for many solves, so the solvers work as follows: + +solver(A) # Performs decomposition/factorization +solver.solve(b) # Solves Ax = b using precomputed factors + +For optimal performance across backends b should be a column-major (F Contiguous) +array or vector. + +The intended interface for this module is: + +xo.sparse.factorized_sparse_solver() + +The function includes detailed documentation for usage, but in short, it returns +the best performing solver based on the context and available modules. If the +context is not explicitly defined, it is inferred based on the input matrix. + +This is how modules that build upon this functionality within xsuite should interact +with the xo.sparse module, so that cross-platform compatibility is guaranteed. + +For development and convenience purposes xo.sparse provides the: +xo.sparse.solvers module + +which provides the following: +xo.sparse.solvers.CPU. + - scipysplu : Alias for scipy SuperLU + - KLUSuperLU : Alias for PyKLU +xo.sparse.solvers.CPU. + - cuDSS : nvmath.sparse.advanced.DirectSolver Wrapper with a SuperLU-like interface + - cachedSpSM : Rewrite of cupy's SuperLU to cache the SpSM analysis step + offering massive speedups compared to cupy splu when the only + available backend is SpSM + - cupysplu : Alias for scipy SuperLU +''' + +# Example: solve small matrix system +n = 5 +# Create matrix +main = 2.0 + np.abs(np.random.standard_normal(n)) +lower = np.random.standard_normal(n-1) +upper = np.random.standard_normal(n-1) +A = sp.diags( + diagonals=[lower, main, upper], + offsets=[-1, 0, 1], + format="csc" +) + +# Create solver: +print("Solver selection process: ") +solver = xo.sparse.factorized_sparse_solver(A, verbose = True) + +# Generate random vector to solve: +b = np.random.standard_normal(n) + +# Solve system +x = solver.solve(b) + +# Calculate relative residual to assess solver: +res = rel_residual(A,x,b) +print("Relative residual of solution ", res) +print("Residual should be small (<10^-12)") diff --git a/tests/test_sparse_solvers.py b/tests/test_sparse_solvers.py new file mode 100644 index 0000000..eb43633 --- /dev/null +++ b/tests/test_sparse_solvers.py @@ -0,0 +1,199 @@ +# copyright ################################# # +# This file is part of the Xobjects Package. # +# Copyright (c) CERN, 2021. 
# +# ########################################### # + +import numpy as np +import scipy.sparse as sp +import xobjects as xo +from xobjects.test_helpers import fix_random_seed +from xobjects.context import ModuleNotAvailableError +import warnings +import pytest + +''' +The following tests rely on computing the relative residual of the solution +The relative residual can be defined as: + || A * x - b || + η = --------------------- + ||b|| + +Typically, the expected value for this quantity is: +* Ideally: 1e-12 - 1e-14 +* Ill-conditioned systems: 1e-9 - 1e-10 + +In this module, we evaluate the residual as follows: +* We compare the residual of the reference solver (scipy) with ABS_TOL +* We compare the residual of the KLU solver with ABS_TOL +* We ensure that the residual of the KLU Solver is within TOLERANCE_FACTOR + of the reference solver. + +For reference, the machine precision for FP64 is ~2.2e-16 (PRECISION) + +Note: The completely random sparse system is more prone to failing the tests +due to numerical noise, often requires looser tolerances. Still worth including +but if testing larger systems, could potentially be omitted. +''' + +# ---- Helper functions ---- +def issymmetric(A, tol=0): + if A.shape[0] != A.shape[1]: + return False + diff = A - A.T + if tol == 0: + return diff.nnz == 0 + else: + # tolerance-based check + return abs(diff).max() <= tol + + +def assert_residual_ok(res_ref, res_solver, + abs_tol=1e-12, + factor=10): + """ + Check that our solver's residual is both: + - absolutely small enough (abs_tol), + - not catastrophically worse than the reference (factor * res_ref). + """ + # sanity: reference solver itself should be good + assert res_ref < abs_tol, f"Reference residual too large: {res_ref}" + + # absolute bound + assert res_solver < abs_tol, ( + f"Residual {res_solver} exceeds absolute tolerance {abs_tol}" + ) + + # relative bound vs reference + assert res_solver <= factor * res_ref, ( + f"Residual {res_solver} not within factor {factor} of " + f"reference residual {res_ref}" + ) + +# ---- Tests ---- +cpu_tests = [ + ("scipySLU", xo.ContextCpu()), + ("PyKLU", xo.ContextCpu()), + ] + +cupy_tests = [] +try: + cupy_tests = [ + ("cuDSS", xo.ContextCupy()), + ("CachedSLU", xo.ContextCupy()), + ("cupySLU", xo.ContextCupy()), + ] +except ModuleNotAvailableError: + warnings.warn("!!!ContextCupy unavailable. 
" + "Skipping tests for Cupy Solvers!!!") + +PARAMS = cpu_tests + cupy_tests + +SPARSE_SYSTEM_SIZE = 2000 # (n,n) matrix +NUM_BATCHES = 10 +PRECISION = np.finfo(float).eps +ABS_TOL = 1e-12 +TOLERANCE_FACTOR = 2 + +def batch_vectors_as_matrix(vector_list): + return np.asfortranarray(np.moveaxis(np.array(vector_list),0,-1)) + +@fix_random_seed(1337) +def make_random_sparse_system(n, nbatches, density=0.01): + A = sp.random( + n, n, + density=density, + format="csc", + random_state=np.random, + data_rvs=np.random.standard_normal + ) + # Make it nonsingular & better conditioned: + # Add something on the diagonal so pivots aren't tiny/zero + A = A + sp.eye(n, format="csc") * 5.0 # tweak factor as you like + b_array = [] + if nbatches == 0: + b = np.random.standard_normal(n) + b_array.append(b) + else: + for i in range(nbatches): + b = np.cos(2*i/(nbatches-1)*np.pi) * np.random.standard_normal(n) + b_array.append(b) + b = batch_vectors_as_matrix(b_array) + solver = sp.linalg.splu(A) + x = solver.solve(b) + return (A, b, x, b_array) + +@fix_random_seed(1337) +def make_tridiagonal_system(n, nbatches): + main = 2.0 + np.abs(np.random.standard_normal(n)) + lower = np.random.standard_normal(n-1) + upper = np.random.standard_normal(n-1) + A = sp.diags( + diagonals=[lower, main, upper], + offsets=[-1, 0, 1], + format="csc" + ) + b_array = [] + if nbatches == 0: + b = np.random.standard_normal(n) + b_array.append(b) + else: + for i in range(nbatches): + b = np.cos(2*i/(nbatches-1)*np.pi) * np.random.standard_normal(n) + b_array.append(b) + b = batch_vectors_as_matrix(b_array) + solver = sp.linalg.splu(A) + x = solver.solve(b) + return (A, b, x, b_array) + +random_system = make_random_sparse_system(SPARSE_SYSTEM_SIZE, 0) +tridiag_system = make_tridiagonal_system(SPARSE_SYSTEM_SIZE, 0) + +@pytest.mark.parametrize("test_solver,test_context", PARAMS) +@pytest.mark.parametrize("sparse_system", [random_system, tridiag_system]) +def test_vector_solve(test_solver, test_context, sparse_system): + A_sp, b_sp, x_sp, _ = sparse_system + assert not issymmetric(A_sp) + + scipy_residual = xo.sparse.rel_residual(A_sp,x_sp,b_sp) + + if "Cpu" in str(test_context): + A = test_context.splike_lib.sparse.csc_matrix(A_sp) + if "Cupy" in str(test_context): + A = test_context.splike_lib.sparse.csr_matrix(A_sp) + b_test = test_context.nparray_to_context_array(b_sp) + b = test_context.nplike_lib.asfortranarray(b_test) + solver = xo.sparse.factorized_sparse_solver(A, + force_solver = test_solver, + context = test_context + ) + x = solver.solve(b) + + solver_residual = xo.sparse.rel_residual(A,x,b) + assert_residual_ok(scipy_residual,solver_residual, + abs_tol = ABS_TOL, factor = TOLERANCE_FACTOR) + +random_system = make_random_sparse_system(SPARSE_SYSTEM_SIZE, NUM_BATCHES) +tridiag_system = make_tridiagonal_system(SPARSE_SYSTEM_SIZE, NUM_BATCHES) + +@pytest.mark.parametrize("test_solver,test_context", PARAMS) +@pytest.mark.parametrize("sparse_system", [random_system, tridiag_system]) +def test_batched_solve(test_solver, test_context, sparse_system): + A_sp, b_sp, x_sp, _ = sparse_system + assert not issymmetric(A_sp) + scipy_residual = xo.sparse.rel_residual(A_sp,x_sp,b_sp) + if "Cpu" in str(test_context): + A = test_context.splike_lib.sparse.csc_matrix(A_sp) + if "Cupy" in str(test_context): + A = test_context.splike_lib.sparse.csr_matrix(A_sp) + b_test = test_context.nparray_to_context_array(b_sp) + b = test_context.nplike_lib.asfortranarray(b_test) + solver = xo.sparse.factorized_sparse_solver(A, + n_batches = NUM_BATCHES, + 
force_solver = test_solver, + context = test_context + ) + x = solver.solve(b) + + solver_residual = xo.sparse.rel_residual(A,x,b) + assert_residual_ok(scipy_residual,solver_residual, + abs_tol = ABS_TOL, factor = TOLERANCE_FACTOR) \ No newline at end of file diff --git a/xobjects/__init__.py b/xobjects/__init__.py index 0d9f709..8c2f5a7 100644 --- a/xobjects/__init__.py +++ b/xobjects/__init__.py @@ -19,6 +19,7 @@ from .string import String from .struct import Struct, Field from .ref import Ref, UnionRef +from . import sparse from .context_cpu import ContextCpu from .context_pyopencl import ContextPyopencl diff --git a/xobjects/context.py b/xobjects/context.py index effd95a..bbd57de 100644 --- a/xobjects/context.py +++ b/xobjects/context.py @@ -203,6 +203,10 @@ def __init__(self, message="Module not available"): def __getattr__(self, attr): raise NameError(self.message) +class ModuleNotAvailableError(ModuleNotFoundError): + """Raised when an optional dependency is missing.""" + pass + class XContext(ABC): minimum_alignment = 1 diff --git a/xobjects/context_cupy.py b/xobjects/context_cupy.py index 3838376..d1b3b87 100644 --- a/xobjects/context_cupy.py +++ b/xobjects/context_cupy.py @@ -18,6 +18,7 @@ classes_from_kernels, sort_classes, sources_from_classes, + ModuleNotAvailableError, ) from .linkedarray import BaseLinkedArray from .specialize_source import specialize_source @@ -398,6 +399,11 @@ def __init__( default_shared_mem_size_bytes=0, device=None, ): + if not _enabled: + raise ModuleNotAvailableError( + "cupy is not installed. " "ContextCupy is not available!" + ) + if device is not None: cupy.cuda.Device(device).use() diff --git a/xobjects/context_pyopencl.py b/xobjects/context_pyopencl.py index c61e21e..196eb17 100644 --- a/xobjects/context_pyopencl.py +++ b/xobjects/context_pyopencl.py @@ -18,6 +18,7 @@ classes_from_kernels, sort_classes, sources_from_classes, + ModuleNotAvailableError, ) from .linkedarray import BaseLinkedArray from .specialize_source import specialize_source @@ -127,6 +128,11 @@ def __init__( """ + if not _enabled: + raise ModuleNotAvailableError( + "pyopencl is not installed. ContextPyopencl is not available!" + ) + super().__init__() # TODO assume one device only diff --git a/xobjects/sparse/__init__.py b/xobjects/sparse/__init__.py new file mode 100644 index 0000000..9a4cca2 --- /dev/null +++ b/xobjects/sparse/__init__.py @@ -0,0 +1,3 @@ +from ._sparse import factorized_sparse_solver, rel_residual +from . import solvers +__all__ = ["factorized_sparse_solver","solvers", "rel_residual"] diff --git a/xobjects/sparse/_sparse.py b/xobjects/sparse/_sparse.py new file mode 100644 index 0000000..80f80f6 --- /dev/null +++ b/xobjects/sparse/_sparse.py @@ -0,0 +1,345 @@ +# copyright ################################# # +# This file is part of the Xobjects Package. # +# Copyright (c) CERN, 2021. 
# +# ########################################### # + +import scipy.sparse +import numpy.linalg as npl +from numpy import ndarray as nparray +from typing import Optional, Literal, Union +from ..context import XContext +from ..context_cpu import ContextCpu +from ..context_cupy import ContextCupy +from ..context_pyopencl import ContextPyopencl +from .solvers._abstract_solver import SuperLUlikeSolver +from ..general import _print + +try: + from cupy import ndarray as cparray + import cupyx.scipy.sparse + from cupyx import cusparse + _cupy_available = True +except: + _cupy_available = False + pass + +def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix, + scipy.sparse.csc_matrix], + n_batches: int = 0, + force_solver: Optional[ + Literal["scipySLU", + "PyKLU", + "cuDSS", + "CachedSLU", + "cupySLU"] + ] = None, + solverKwargs: dict = None, + context: Union[ + Literal["cpu", + "cupy", + "pyopencl"], + XContext + ] = None, + verbose: bool = False + ) -> SuperLUlikeSolver: + """ + Build and return a factorized sparse linear solver on CPU or GPU. + + This function inspects the provided sparse matrix and execution context + and returns a *factorized* solver object (SuperLU-like). The solver can + then be reused to efficiently solve multiple linear systems with the same + matrix `A`. + + The actual backend is chosen automatically based on: + * The requested/derived `context` ("cpu", "cupy", "pyopencl"), + * The availability of optional libraries (PyKLU, cuDSS, CuPy/CUSPARSE), + * And the optional `force_solver` argument. + + On CPU: + * Default: try PyKLU, fall back to `scipy.sparse.linalg.splu`. + * You may force `"scipySLU"` or `"PyKLU"` explicitly. + + On CUDA/CuPy: + * Default: try cuDSS (`DirectSolverSuperLU`), then fall back to: + - `cupyx.scipy.sparse.linalg.splu` if CUSPARSE `csrsm2` is available, or + - a cached SuperLU-based solver (`luLU`) and finally `cupyx.splu`. + * You may force `"cuDSS"`, `"CachedSLU"`, or `"cupySLU"` explicitly. + + PyOpenCL: + * Currently not supported and will raise `NotImplementedError`. + + Parameters + ---------- + A : scipy.sparse.spmatrix or cupyx.scipy.sparse.spmatrix + Sparse system matrix to factorize. + + The matrix is internally converted to the format expected by the + chosen backend: + + * CPU context: converted to CSC (`scipy.sparse.csc_matrix`). + * CuPy/GPU context: converted to CSR (`cupyx.scipy.sparse.csr_matrix`). + + For **best performance**, you should pass `A` already in the + preferred format to avoid extra conversions: + + * If the context is (or will usually be) **CPU**, provide `A` + as a CSC matrix (`A.tocsc()`). + * If the context is **GPU/CuPy**, provide `A` as a CSR matrix + (`A.tocsr()` or `cupyx.scipy.sparse.csr_matrix`). + + If `context` is `None`, it is still inferred from the type of `A` + and the availability of CuPy, e.g.: + + * SciPy sparse or NumPy array → `"cpu"`, + * CuPy sparse or CuPy array → `"cupy"`, + * otherwise a `TypeError` is raised. + + n_batches : int, optional + Controls the expected shape of the right-hand side (RHS) for GPU + solvers and hence whether solves are treated as single or batched: + + * If ``n_batches == 0`` (default), the solver is configured for + single-RHS solves and expects a vector RHS of shape ``(n,)``. + * If ``n_batches > 0``, the solver is configured for batched solves + and expects a 2D RHS array of shape ``(n, n_batches)`` (i.e. + ``nrhs = n_batches``). + + This argument is primarily used by CUDA-based solvers (e.g. 
cuDSS and + cached SuperLU) to preconfigure internal data structures for batched + solves. It has no effect for CPU-based solvers. + + force_solver : {"scipySLU", "PyKLU", "cuDSS", "CachedSLU", "cupySLU"}, optional + If provided, forces the use of a specific backend instead of the + automatic selection: + + * `"scipySLU"` : Use `scipy.sparse.linalg.splu` (CPU). + * `"PyKLU"` : Use the `PyKLU.Klu` solver (CPU). + * `"cuDSS"` : Use CUDA/cuDSS-based `DirectSolverSuperLU` (GPU). + * `"CachedSLU"`: Use CUDA cached spsm SuperLU (`luLU`) (GPU). + * `"cupySLU"` : Use `cupyx.scipy.sparse.linalg.splu` (GPU). + + Using a solver that does not match the current `context` will result + in a `ValueError`. + + solverKwargs : dict, optional + Extra keyword arguments forwarded to the underlying solver constructor. + If `None`, an empty dict is used. + + Some backends make use of `permc_spec` (matrix permutation strategy). + When not explicitly provided and appropriate, this function sets + `permc_spec="MMD_AT_PLUS_A"` as a sensible default for the matrices that + will typically be encountered in an xobjects workflow. + + context : {"cpu", "cupy", "pyopencl"} or XContext, optional + Execution context. Can be either: + + * A string: + - `"cpu"`: Use CPU-based solvers (SciPy / PyKLU). + - `"cupy"`: Use CuPy/CUDA-based solvers. + - `"pyopencl"`: PyOpenCL context (currently unsupported). + * A context object instance: + - `ContextCpu` + - `ContextCupy` + - `ContextPyopencl` + + If `None`, the context is inferred from `A` as described above. + + verbose : bool, optional + If `True`, prints debug messages describing the solver-selection + process, fallbacks, and the final solver that is returned. + + Returns + ------- + SuperLUlikeSolver + A factorized solver object compatible with SciPy’s `splu`-like + interface (i.e. typically exposing a `solve` method and related + accessors). The exact concrete type depends on the backend: + * CPU: + - `scipy.sparse.linalg.SuperLU` (for `"scipySLU"`), + - `PyKLU.Klu` (for `"PyKLU"`). + * CUDA/CuPy: + - `DirectSolverSuperLU` (cuDSS), + - `luLU` (cached spsm SuperLU), + - `cupyx.scipy.sparse.linalg.SuperLU` (for `"cupySLU"`). + + All returned solver objects implement a `solve(b)` method. + For **optimal performance across all backends**, the right-hand + side `b` should be passed as a **Fortran-contiguous (column-major)** + array: + + * For a single RHS: shape ``(n,)`` or ``(n, 1)`` (Fortran contiguous). + * For multiple RHSs: shape ``(n, nrhs)`` with **Fortran layout**. + + If `b` is not Fortran-contiguous, the solver will internally copy or + transform it, which can incur extra overhead—especially on CUDA/GPU + backends and in batched-solve scenarios. + + Raises + ------ + TypeError + If the type of `A` is unsupported when inferring the context. + + AssertionError + If `A` does not match the required type for the chosen context + + ModuleNotFoundError + If a requested solver backend depends on a module that is not + installed (e.g. CuPy, PyKLU, cuDSS), and no fallback is available. + + RuntimeError + If a requested GPU solver fails during initialization. + + NotImplementedError + If `context` is `"pyopencl"` or `ContextPyopencl`, since no sparse + solver is currently implemented for that backend. + + ValueError + If an invalid `context` string is provided, or if `force_solver` + does not match any known solver for the active context. + + Notes + ----- + - For best performance on CPU, PyKLU is preferred when available. 
+ - For CuPy/CUDA, cuDSS is preferred when available, and this function + will automatically fall back to other solvers if cuDSS is not present + or fails at runtime. + - The returned solver is *factorized* and should be reused to solve + multiple right-hand sides efficiently. + + Examples + -------- + Factorize a SciPy sparse matrix on CPU with automatic solver selection: + + >>> A = scipy.sparse.random(1000, 1000, density=0.01, format="csr") + >>> solver = factorized_sparse_solver(A) + >>> x = solver.solve(b) + + Explicitly request the SciPy SuperLU solver on CPU: + + >>> solver = factorized_sparse_solver(A, force_solver="scipySLU") + + Using CuPy and cuDSS (requires CuPy and cuDSS bindings): + + >>> A_gpu = cupyx.scipy.sparse.csr_matrix(A) + >>> solver = factorized_sparse_solver(A_gpu, context="cupy") + >>> x_gpu = solver.solve(b_gpu) + """ + if solverKwargs is None: + solverKwargs = {} + if context is None: + dbugprint(verbose, "No context provided. " \ + "Context will be inferred from matrix") + if isinstance(A, scipy.sparse.spmatrix) or isinstance(A, nparray): + context = 'cpu' + elif (_cupy_available and + (isinstance(A, cupyx.scipy.sparse.spmatrix) + or isinstance(A, cparray))): + context = 'cupy' + else: + raise TypeError("Unsupported type for A") + if context == "cpu" or isinstance(context, ContextCpu): + assert isinstance(A, scipy.sparse.spmatrix), ( + "When using CPU context A must be a scipy.sparse matrix" + ) + A = A.tocsc() # CPU Solvers require csc format + if 'permc_spec' not in solverKwargs: + solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"} + if force_solver is None: + dbugprint(verbose, "No solver requested. " \ + "Picking best solver for CPU Context") + try: + dbugprint(verbose, "Attempting to use PyKLU") + import PyKLU + solver = PyKLU.Klu(A) + dbugprint(verbose, "PyKLU succeeded") + except (ModuleNotFoundError, RuntimeError) as e: + dbugprint(verbose, "PyKLU failed. " \ + "Falling back to scipy.splu \n" + f"Encountered error: {e}") + + solver = scipy.sparse.linalg.splu(A,**solverKwargs) + elif force_solver == "scipySLU": + solver = scipy.sparse.linalg.splu(A,**solverKwargs) + elif force_solver == "PyKLU": + import PyKLU + solver = PyKLU.Klu(A) + else: + raise ValueError("Unrecognized CPU Sparse solver. Available options: " + "scipySLU, PyKLU") + + elif context == "cupy" or isinstance(context, ContextCupy): + if not _cupy_available: + raise ModuleNotFoundError("No cupy module found. " \ + "ContextCupy unavailable") + assert isinstance(A ,cupyx.scipy.sparse.spmatrix), ( + "When using ContextCupy, input must be " + "cupyx.scipy.sparse matrix") + + A = A.tocsr() # GPU solvers require csr format + if force_solver is not None and force_solver != "cuDSS": + if 'permc_spec' not in solverKwargs: + solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"} + if force_solver is None: + dbugprint(verbose, "No solver requested. " \ + "Picking best solver for Cupy Context") + import warnings + try: + dbugprint(verbose, "Attempting to use cuDSS Solver") + from .solvers.CUDA._cuDSSLU import DirectSolverSuperLU + solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs) + dbugprint(verbose, "cuDSS succeeded") + except (ModuleNotFoundError, RuntimeError) as e: + dbugprint(verbose, "cuDSS failed. \n" + f"Encountered Error: {e}") + warnings.warn("cuDSS not available. 
Performance will be degraded") + if 'permc_spec' not in solverKwargs: + solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"} + if cusparse.check_availability('csrsm2'): + dbugprint(verbose, "csrsm2 available. Using cupyx.splu solver") + solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs) + else: + try: + dbugprint(verbose, "csrms2 unavailable. " \ + "Attempting to use CachedSuperLU (spsm)") + from .solvers.CUDA._luLU import luLU + solver = luLU(A, n_batches = n_batches, **solverKwargs) + dbugprint(verbose, "CachedSuperLU succeeded") + except RuntimeError as e: + dbugprint(verbose, "CachedSuperLU failed. \n" + f"Encountered error: {e} \n" + "Falling back to cupyx.splu with spsm") + solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs) + elif force_solver == "cuDSS": + from .solvers.CUDA._cuDSSLU import DirectSolverSuperLU + solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs) + elif force_solver == "CachedSLU": + from .solvers.CUDA._luLU import luLU + solver = luLU(A, n_batches = n_batches, **solverKwargs) + elif force_solver == "cupySLU": + solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs) + else: + raise ValueError("Unrecognized CUDA Sparse solver. Available options: " + "cuDSS, CachedSLU, cupySLU") + elif context == "pyopencl" or isinstance(context, ContextPyopencl): + raise NotImplementedError("No sparse solver is currently available " + "for PyOpenCL context") + else: + raise ValueError("Invalid context. Available contexts are: " \ + "cpu, cupy, pyopencl") + dbugprint(verbose, "Returning solver: " + str(solver)) + return solver + +def dbugprint(verbose: bool, text: str): + if verbose: + _print("[xo.sparse] "+text) + +def rel_residual(A,x,b): + if hasattr(A, "get"): + A = A.get() + if hasattr(x, "get"): + x = x.get() + if hasattr(b, "get"): + b = b.get() + assert scipy.sparse.issparse(A), ("A must be a sparse matrix") + + return npl.norm(A@x - b) / (npl.norm(b)) \ No newline at end of file diff --git a/xobjects/sparse/solvers/CPU/__init__.py b/xobjects/sparse/solvers/CPU/__init__.py new file mode 100644 index 0000000..07b2c79 --- /dev/null +++ b/xobjects/sparse/solvers/CPU/__init__.py @@ -0,0 +1,11 @@ +from scipy.sparse.linalg import splu as scipysplu +try: + from PyKLU import Klu as KLU +except (ModuleNotFoundError,ImportError) as e: + def KLU(*args, _import_err=e, **kwargs): + from ....context import ModuleNotAvailableError + raise ModuleNotAvailableError( + "KLU is not available. Could not import required backend." + ) from _import_err + +__all__ = ["scipysplu", "KLU"] \ No newline at end of file diff --git a/xobjects/sparse/solvers/CUDA/__init__.py b/xobjects/sparse/solvers/CUDA/__init__.py new file mode 100644 index 0000000..536fe49 --- /dev/null +++ b/xobjects/sparse/solvers/CUDA/__init__.py @@ -0,0 +1,26 @@ +try: + from ._cuDSSLU import DirectSolverSuperLU as cuDSS +except (ModuleNotFoundError,ImportError) as e: + def cuDSS(*args, _import_err=e, **kwargs): + from ....context import ModuleNotAvailableError + raise ModuleNotAvailableError( + "cuDSS is not available. Could not import required backend." + ) from _import_err +try: + from ._luLU import luLU as cachedSpSM +except (ModuleNotFoundError,ImportError) as e: + def cachedSpSM(*args, _import_err=e, **kwargs): + from ....context import ModuleNotAvailableError + raise ModuleNotAvailableError( + "cachedSpSM is not available. Could not import required backend." 
+ ) from _import_err +try: + from cupyx.scipy.sparse.linalg import splu as cupysplu +except (ModuleNotFoundError,ImportError) as e: + def cupysplu(*args, _import_err=e, **kwargs): + from ....context import ModuleNotAvailableError + raise ModuleNotAvailableError( + "cupysplu is not available. Could not import required backend." + ) from _import_err + +__all__ = ["cuDSS", "cachedSpSM", "cupysplu"] \ No newline at end of file diff --git a/xobjects/sparse/solvers/CUDA/_cuDSSLU.py b/xobjects/sparse/solvers/CUDA/_cuDSSLU.py new file mode 100644 index 0000000..31ded5e --- /dev/null +++ b/xobjects/sparse/solvers/CUDA/_cuDSSLU.py @@ -0,0 +1,106 @@ +# copyright ################################# # +# This file is part of the Xobjects Package. # +# Copyright (c) CERN, 2021. # +# ########################################### # + +import cupy as cp +import cupyx.scipy.sparse as sp +import nvmath +from typing import Literal, Union +import warnings + +class DirectSolverSuperLU(nvmath.sparse.advanced.DirectSolver): + """cuDSS-based direct solver; reuse factors for many RHS (SuperLU-style).""" + + def __init__(self, A: sp.csr_matrix, *, + n_batches: int = 0, + assume_general: bool = True, + **kwargs): + # 1) Validate A + if not isinstance(A, sp.csr_matrix): + raise TypeError("A must be cupyx.scipy.sparse.csr_matrix") + if A.shape[0] != A.shape[1]: + raise ValueError("A must be square") + if A.indices.dtype != cp.int32 or A.indptr.dtype != cp.int32: + A = sp.csr_matrix((A.data, + A.indices.astype(cp.int32), + A.indptr.astype(cp.int32)), + shape=A.shape) + # 2) Sample RHS to initialize parent + if n_batches == 0: + b_sample = cp.zeros(A.shape[0], dtype=A.dtype) + else: + b_sample = cp.zeros((A.shape[0],n_batches), + dtype=A.dtype, order = 'F') + self.b_dtype = A.dtype + self.b_shape = b_sample.shape + super().__init__(A, b_sample, **kwargs) + + # 3) Optional: configure plan + pc = self.plan_config + if assume_general: + # LU path (cuDSS will often infer this, but we can be explicit if available) + try: + pc.matrix_type = nvmath.sparse.advanced.DirectSolverMatrixType.GENERAL + except AttributeError: + pass # older wheels infer type; safe to ignore + + # 4) Build & factorize once; warmup solve to build internal caches + super().plan() + self.fac_info = super().factorize() + # Warmup on zeros (uses parent solve()) + super().reset_operands(b=b_sample) + super().solve() + + def _prepare_rhs(self, b): + # Accept 1-D or 2-D; ensure dtype/device and layout + if not isinstance(b, cp.ndarray): + raise TypeError('b must be cupy.ndarray') + if b.dtype != self.b_dtype: + raise TypeError(f"RHS dtype {b.dtype} does not match " + f"matrix dtype {self.b_dtype}") + if b.ndim == 1: + return b # vector is fine + if not b.flags.f_contiguous: + warnings.warn("cuDSS expects a column-major ('F' ordered) " \ + "matrix for the RHS. RHS was autmatically converted " + "by the solver.") + return cp.asfortranarray(b) + + def solve(self, b): + """Solve A x = b using cached factors. Accepts (n,) or (n,k) RHS.""" + assert b.shape == self.b_shape, ( + "Cached solver can only accept RHS with the same shape " + f"as the initialized value. {self.b_shape}. 
" + "The initialized RHS shape can be changed by initializing " + "using a different value for n_batches" + ) + b = self._prepare_rhs(b) + super().reset_operands(b=b) + return super().solve() + + # Optional helpers for SuperLU-like API + def __call__(self, b): # x = solver(b) + return self.solve(b) + + def refactorize_values(self, A_new): + """Refactorize when numerical values change but sparsity pattern is the same.""" + if not isinstance(A_new, sp.csr_matrix): + raise TypeError("A_new must be CSR") + if A_new.shape != self.operands.a_shape: + raise ValueError("Shape mismatch") + if A_new.indices.dtype != cp.int32 or A_new.indptr.dtype != cp.int32: + A_new = sp.csr_matrix((A_new.data, A_new.indices.astype(cp.int32), + A_new.indptr.astype(cp.int32)), shape=A_new.shape) + super().reset_operands(A=A_new) + self.fac_info = self.factorize() + + def close(self): + """Explicitly free resources when not using a context manager.""" + try: + self.__exit__(None, None, None) # DirectSolver is context-manageable + except Exception: + pass + + def __dealloc__(self): + self.close() \ No newline at end of file diff --git a/xobjects/sparse/solvers/CUDA/_luLU.py b/xobjects/sparse/solvers/CUDA/_luLU.py new file mode 100644 index 0000000..25665ec --- /dev/null +++ b/xobjects/sparse/solvers/CUDA/_luLU.py @@ -0,0 +1,327 @@ +# copyright ################################# # +# This file is part of the Xobjects Package. # +# Copyright (c) CERN, 2021. # +# ########################################### # + +import cupy as _cupy +import numpy as _numpy +import cupyx.scipy.sparse +from cupyx.scipy.sparse.linalg import SuperLU +from cupy_backends.cuda.api import driver as _driver +from cupy_backends.cuda.api import runtime as _runtime +from cupy_backends.cuda.libs import cusparse as _cusparse +from cupy._core import _dtype +from cupy.cuda import device as _device +from cupy.cuda import stream as _stream +from cupyx.cusparse import _dtype_to_IndexType, SpMatDescriptor, DnMatDescriptor, check_availability +import scipy.sparse.linalg +try: + import scipy.sparse.linalg + scipy_available = True +except ImportError: + scipy_available = False + +class CachedAbSolver: + + def __init__(self, a, b, alpha=1.0, lower=True, unit_diag=False, transa=False): + if not check_availability('spsm'): + raise RuntimeError('spsm is not available.') + + # Canonicalise transa + if transa is False: + transa = 'N' + elif transa is True: + transa = 'T' + elif transa not in 'NTH': + raise ValueError(f'Unknown transa (actual: {transa})') + + # Check A's type and sparse format + if cupyx.scipy.sparse.isspmatrix_csr(a): + pass + elif cupyx.scipy.sparse.isspmatrix_csc(a): + if transa == 'N': + a = a.T + transa = 'T' + elif transa == 'T': + a = a.T + transa = 'N' + elif transa == 'H': + a = a.conj().T + transa = 'N' + lower = not lower + elif cupyx.scipy.sparse.isspmatrix_coo(a): + pass + else: + raise ValueError('a must be CSR, CSC or COO sparse matrix') + assert a.has_canonical_format + + # Check B's ndim + if b.ndim == 1: + is_b_vector = True + b = b.reshape(-1, 1) + elif b.ndim == 2: + is_b_vector = False + else: + raise ValueError('b.ndim must be 1 or 2') + self.is_b_vector = is_b_vector + + # Check shapes + if not (a.shape[0] == a.shape[1] == b.shape[0]): + raise ValueError('mismatched shape') + + # Check dtypes + dtype = a.dtype + if dtype.char not in 'fdFD': + raise TypeError('Invalid dtype (actual: {})'.format(dtype)) + if dtype != b.dtype: + raise TypeError('dtype mismatch') + self.dtype = dtype + + # Prepare fill mode + if lower is True: + 
fill_mode = _cusparse.CUSPARSE_FILL_MODE_LOWER + elif lower is False: + fill_mode = _cusparse.CUSPARSE_FILL_MODE_UPPER + else: + raise ValueError('Unknown lower (actual: {})'.format(lower)) + self.fill_mode = fill_mode + + # Prepare diag type + if unit_diag is False: + diag_type = _cusparse.CUSPARSE_DIAG_TYPE_NON_UNIT + elif unit_diag is True: + diag_type = _cusparse.CUSPARSE_DIAG_TYPE_UNIT + else: + raise ValueError('Unknown unit_diag (actual: {})'.format(unit_diag)) + self.diag_type = diag_type + self.transa = transa + # Prepare op_a + if transa == 'N': + op_a = _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE + elif transa == 'T': + op_a = _cusparse.CUSPARSE_OPERATION_TRANSPOSE + else: # transa == 'H' + if dtype.char in 'fd': + op_a = _cusparse.CUSPARSE_OPERATION_TRANSPOSE + else: + op_a = _cusparse.CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE + self.op_a = op_a + # Prepare op_b + self.op_b = self._get_opb(b) + + # Allocate space for matrix C. Note that it is known cusparseSpSM requires + # the output matrix zero initialized. + m, _ = a.shape + if self.op_b == _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE: + _, n = b.shape + else: + n, _ = b.shape + c_shape = m, n + self.c_shape = c_shape + + self._perform_analysis(a, b, alpha=alpha) + + def _perform_analysis(self, a, b, alpha=1.0): + """Solves a sparse triangular linear system op(a) * x = alpha * op(b). + + Args: + a (cupyx.scipy.sparse.csr_matrix or cupyx.scipy.sparse.coo_matrix): + Sparse matrix with dimension ``(M, M)``. + b (cupy.ndarray): Dense matrix with dimension ``(M, K)``. + alpha (float or complex): Coefficient. + lower (bool): + True: ``a`` is lower triangle matrix. + False: ``a`` is upper triangle matrix. + unit_diag (bool): + True: diagonal part of ``a`` has unit elements. + False: diagonal part of ``a`` has non-unit elements. + transa (bool or str): True, False, 'N', 'T' or 'H'. + 'N' or False: op(a) == ``a``. + 'T' or True: op(a) == ``a.T``. + 'H': op(a) == ``a.conj().T``. 
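+
+        Note: only the cuSPARSE SpSM *analysis* phase runs here; the matrix
+        descriptor, the SpSM descriptor and the workspace buffer are cached on
+        the instance so that subsequent ``solve`` calls reuse this analysis.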
+ """ + + c = _cupy.zeros(self.c_shape, dtype=a.dtype, order='f') + + # Prepare descriptors and other parameters + self.handle = _device.get_cusparse_handle() + self.mat_a = SpMatDescriptor.create(a) + mat_b = DnMatDescriptor.create(b) + mat_c = DnMatDescriptor.create(c) + self.spsm_descr = _cusparse.spSM_createDescr() + self.alpha = _numpy.array(alpha, dtype=c.dtype).ctypes + self.cuda_dtype = _dtype.to_cuda_dtype(c.dtype) + self.algo = _cusparse.CUSPARSE_SPSM_ALG_DEFAULT + + try: + # Specify Lower|Upper fill mode + self.mat_a.set_attribute(_cusparse.CUSPARSE_SPMAT_FILL_MODE, self.fill_mode) + + # Specify Unit|Non-Unit diagonal type + self.mat_a.set_attribute(_cusparse.CUSPARSE_SPMAT_DIAG_TYPE, self.diag_type) + + # Allocate the workspace needed by the succeeding phases + buff_size = _cusparse.spSM_bufferSize( + self.handle, self.op_a, self.op_b, self.alpha.data, self.mat_a.desc, mat_b.desc, + mat_c.desc, self.cuda_dtype, self.algo, self.spsm_descr) + self.buff = _cupy.empty(buff_size, dtype=_cupy.int8) + + # Perform the analysis phase + _cusparse.spSM_analysis( + self.handle, self.op_a, self.op_b, self.alpha.data, self.mat_a.desc, mat_b.desc, + mat_c.desc, self.cuda_dtype, self.algo, self.spsm_descr, self.buff.data.ptr) + except Exception as e: + raise RuntimeError('spSM_analysis failed.') from e + + def _get_opb(self, b): + # Prepare op_b + if b._f_contiguous: + op_b = _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE + elif b._c_contiguous: + if _cusparse.get_build_version() < 11701: # earlier than CUDA 11.6 + raise ValueError('b must be F-contiguous.') + b = b.T + op_b = _cusparse.CUSPARSE_OPERATION_TRANSPOSE + else: + raise ValueError('b must be F-contiguous or C-contiguous.') + return op_b + + def solve(self, b): + assert b.dtype == self.dtype + assert self._get_opb(b) == self.op_b + + if b.ndim == 1: + is_b_vector = True + b = b.reshape(-1, 1) + elif b.ndim == 2: + is_b_vector = False + else: + raise ValueError('b.ndim must be 1 or 2') + self.is_b_vector = is_b_vector + + c = _cupy.zeros(self.c_shape, dtype=self.dtype, order='f') + mat_b = DnMatDescriptor.create(b) + mat_c = DnMatDescriptor.create(c) + try: + # Executes the solve phase + _cusparse.spSM_solve( + self.handle, self.op_a, self.op_b, self.alpha.data, self.mat_a.desc, mat_b.desc, + mat_c.desc, self.cuda_dtype, self.algo, self.spsm_descr, self.buff.data.ptr) + finally: + _cupy.cuda.get_current_stream().synchronize() + # Reshape back if B was a vector + if self.is_b_vector: + c = c.reshape(-1) + return c + + # def __del__(self): + # # Destroy matrix descriptor + # print("Deleting solver obj...") + # _cusparse.spSM_destroyDescr(self.spsm_descr) + +class luLU(SuperLU): + + def __init__(self, A, + trans = 'N', + permc_spec=None, + n_batches: int = 0, + diag_pivot_thresh=None, + relax=None, + panel_size=None, + options={}): + if not check_availability('spsm'): + raise RuntimeError('spsm is not available.') + if not scipy_available: + raise RuntimeError('scipy is not available') + if not cupyx.scipy.sparse.isspmatrix(A): + raise TypeError('A must be cupyx.scipy.sparse.spmatrix') + if A.shape[0] != A.shape[1]: + raise ValueError('A must be a square matrix (A.shape: {})' + .format(A.shape)) + if A.dtype.char not in 'fdFD': + raise TypeError('Invalid dtype (actual: {})'.format(A.dtype)) + + a = A.get().tocsc() + a_slu = scipy.sparse.linalg.splu( + a, permc_spec=permc_spec, diag_pivot_thresh=diag_pivot_thresh, + relax=relax, panel_size=panel_size, options=options) + super().__init__(a_slu) + self.b_dtype = self.L.dtype + 
self._init_solvers(trans=trans, n_batches = n_batches) + + def _init_solvers(self,trans = 'N', n_batches = 0): + if n_batches == 0: + b_sample = _cupy.zeros(self.shape[0], + dtype=self.b_dtype) + else: + b_sample = _cupy.zeros((self.shape[0],n_batches), + dtype=self.b_dtype, order = 'F') + self.trans = trans + self.b_shape = b_sample.shape + self.Lsolver = CachedAbSolver(self.L, b_sample, lower=True, transa=self.trans) + self.Usolver = CachedAbSolver(self.U, b_sample, lower=False, transa=self.trans) + # self.Usolver = CachedAbSolver(self.U.T, b_sample, lower=True, transa="T") #Can improve performance at times + + def solve(self, rhs, trans='N'): + """Solves linear system of equations with one or several right-hand sides. + + Args: + rhs (cupy.ndarray): Right-hand side(s) of equation with dimension + ``(M)`` or ``(M, K)``. + trans (str): 'N', 'T' or 'H'. + 'N': Solves ``A * x = rhs``. + 'T': Solves ``A.T * x = rhs``. + 'H': Solves ``A.conj().T * x = rhs``. + + Returns: + cupy.ndarray: + Solution vector(s) + """ # NOQA + from cupyx import cusparse + if trans != self.trans: + raise AssertionError("Solve function assumes cached configuration. " \ + "Rebuild cache by calling _init_solvers with desired configuration.") + if not isinstance(rhs, _cupy.ndarray): + raise TypeError('ojb must be cupy.ndarray') + if rhs.ndim not in (1, 2): + raise ValueError('rhs.ndim must be 1 or 2 (actual: {})'. + format(rhs.ndim)) + if rhs.shape[0] != self.shape[0]: + raise ValueError('shape mismatch (self.shape: {}, rhs.shape: {})' + .format(self.shape, rhs.shape)) + assert rhs.shape == self.b_shape, ( + "Cached solver can only accept RHS with the same shape " + f"as the initialized value. {self.b_shape}. " + "The initialized RHS shape can be changed by initializing " + "using a different value for n_batches" + ) + if trans not in ('N', 'T', 'H'): + raise ValueError('trans must be \'N\', \'T\', or \'H\'') + + x = rhs.astype(self.L.dtype) + if trans == 'N': + if self.perm_r is not None: + if x.ndim == 2 and x._f_contiguous: + x = x.T[:, self._perm_r_rev].T # want to keep f-order + else: + x = x[self._perm_r_rev] + x = self.Lsolver.solve(x) + x = self.Usolver.solve(x) + if self.perm_c is not None: + x = x[self.perm_c] + else: + if self.perm_c is not None: + if x.ndim == 2 and x._f_contiguous: + x = x.T[:, self._perm_c_rev].T # want to keep f-order + else: + x = x[self._perm_c_rev] + x = self.Usolver.solve(x) + x = self.Lsolver.solve(x) + if self.perm_r is not None: + x = x[self.perm_r] + + if not x._f_contiguous: + # For compatibility with SciPy + x = x.copy(order='F') + return x + \ No newline at end of file diff --git a/xobjects/sparse/solvers/__init__.py b/xobjects/sparse/solvers/__init__.py new file mode 100644 index 0000000..3721b50 --- /dev/null +++ b/xobjects/sparse/solvers/__init__.py @@ -0,0 +1,3 @@ +from . import CPU +from . import CUDA +__all__ = ["CPU", "CUDA"] diff --git a/xobjects/sparse/solvers/_abstract_solver.py b/xobjects/sparse/solvers/_abstract_solver.py new file mode 100644 index 0000000..9acfe10 --- /dev/null +++ b/xobjects/sparse/solvers/_abstract_solver.py @@ -0,0 +1,16 @@ +# copyright ################################# # +# This file is part of the Xobjects Package. # +# Copyright (c) CERN, 2021. 
# +# ########################################### # + +from abc import ABC, abstractmethod + +class SuperLUlikeSolver(ABC): + + @abstractmethod + def __init__(self, A): + pass + + @abstractmethod + def solve(self,b): + return x \ No newline at end of file diff --git a/xobjects/test_helpers.py b/xobjects/test_helpers.py index 00d244d..7db897c 100644 --- a/xobjects/test_helpers.py +++ b/xobjects/test_helpers.py @@ -98,7 +98,8 @@ def wrapper(*args, **kwargs): rng_state = np.random.get_state() try: np.random.seed(seed) - test_function(*args, **kwargs) + #Return value of function instead of None + return test_function(*args, **kwargs) finally: np.random.set_state(rng_state)