scverse
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/rapids_singlecell/_cuda/spca/kernels_spca.cuh‎
Lines changed: 49 additions & 0 deletions b/‎src/rapids_singlecell/_cuda/spca/kernels_spca.cuh‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎src/rapids_singlecell/_cuda/spca/spca.cu‎
Lines changed: 88 additions & 0 deletions b/‎src/rapids_singlecell/_cuda/spca/spca.cu‎
Lines changed: 88 additions & 0 deletions
diff --git a/‎src/rapids_singlecell/decoupler_gpu/_method_aucell.py‎
Lines changed: 5 additions & 1 deletion b/‎src/rapids_singlecell/decoupler_gpu/_method_aucell.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎src/rapids_singlecell/preprocessing/_sparse_pca/_helper.py‎
Lines changed: 22 additions & 38 deletions b/‎src/rapids_singlecell/preprocessing/_sparse_pca/_helper.py‎
Lines changed: 22 additions & 38 deletions
diff --git a/‎src/rapids_singlecell/preprocessing/_sparse_pca/_kernels/_pca_sparse_kernel.py‎
Lines changed: 0 additions & 77 deletions b/‎src/rapids_singlecell/preprocessing/_sparse_pca/_kernels/_pca_sparse_kernel.py‎
Lines changed: 0 additions & 77 deletions
diff --git a/‎src/rapids_singlecell/preprocessing/_sparse_pca/_sparse_pca.py‎
Lines changed: 21 additions & 34 deletions b/‎src/rapids_singlecell/preprocessing/_sparse_pca/_sparse_pca.py‎
Lines changed: 21 additions & 34 deletions
@@ -54,4 +54,5 @@ if (RSC_BUILD_EXTENSIONS)
   add_nb_cuda_module(_autocorr_cuda     src/rapids_singlecell/_cuda/autocorr/autocorr.cu)
   add_nb_cuda_module(_cooc_cuda         src/rapids_singlecell/_cuda/cooc/cooc.cu)
   add_nb_cuda_module(_aggr_cuda         src/rapids_singlecell/_cuda/aggr/aggr.cu)
+  add_nb_cuda_module(_spca_cuda         src/rapids_singlecell/_cuda/spca/spca.cu)
 endif()
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+// Accumulates per-row outer products of a CSR matrix into a dense
+// ncols x ncols buffer: for every stored entry p of a row and every stored
+// entry q at or after p in storage order, data[p] * data[q] is added to
+// out[index[p] * ncols + index[q]].
+//
+// Launch layout: one block per row (row = blockIdx.x); the threads of a block
+// share the inner loop over q with a stride of blockDim.x.
+// `out` is only ever updated through atomicAdd, never assigned.
+// NOTE(review): contributions land in the upper triangle only if the column
+// indices of each row are stored in ascending order - confirm with callers.
+template <typename T>
+__global__ void gram_csr_upper_kernel(const int* indptr, const int* index, const T* data, int nrows,
+                                      int ncols, T* out) {
+  const int row = blockIdx.x;
+  if (row >= nrows) return;
+
+  const int lane = threadIdx.x;
+  const int row_begin = indptr[row];
+  const int row_end = indptr[row + 1];
+
+  for (int p = row_begin; p < row_end; ++p) {
+    const int col_p = index[p];
+    const T val_p = data[p];
+    for (int q = p + lane; q < row_end; q += blockDim.x) {
+      const int col_q = index[q];
+      const T val_q = data[q];
+      // Offset is formed in size_t so col_p * ncols cannot wrap a 32-bit int.
+      const size_t offset = (size_t)col_p * ncols + col_q;
+      atomicAdd(out + offset, val_p * val_q);
+    }
+  }
+}
+
+// Mirrors the strict upper triangle of a square ncols x ncols matrix (indexed
+// as row * ncols + col) into its strict lower triangle, in place.
+//
+// Launch layout: 2D grid of 2D blocks covering at least ncols threads along
+// both x (column) and y (row); surplus threads exit early. The diagonal and
+// the upper triangle are left untouched.
+template <typename T>
+__global__ void copy_upper_to_lower_kernel(T* output, int ncols) {
+  const int row = blockIdx.y * blockDim.y + threadIdx.y;
+  const int col = blockIdx.x * blockDim.x + threadIdx.x;
+  if (row >= ncols || col >= ncols) return;
+  if (row > col) {
+    // Linear offsets are computed in size_t: with plain int arithmetic
+    // row * ncols wraps as soon as ncols exceeds 46340.
+    const size_t stride = (size_t)ncols;
+    output[(size_t)row * stride + col] = output[(size_t)col * stride + row];
+  }
+}
+
+// Writes cov_values[r, c] = gram_matrix[r, c] - mean_x[r] * mean_y[c] for a
+// square ncols x ncols problem; both matrices are indexed as r * ncols + c.
+//
+// Launch layout: 2D grid of 2D blocks covering at least ncols threads along
+// BOTH x and y (a square launch); surplus threads exit early.
+template <typename T>
+__global__ void cov_from_gram_kernel(T* cov_values, const T* gram_matrix, const T* mean_x,
+                                     const T* mean_y, int ncols) {
+  // The column index follows threadIdx.x so that consecutive threads of a
+  // warp touch consecutive elements of one matrix row (coalesced access).
+  // Every (rid, cid) pair is still visited exactly once under a square launch.
+  const int rid = blockDim.y * blockIdx.y + threadIdx.y;
+  const int cid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (rid >= ncols || cid >= ncols) return;
+
+  // Offset in size_t: rid * ncols wraps a 32-bit int once ncols exceeds 46340.
+  const size_t offset = (size_t)rid * ncols + cid;
+  cov_values[offset] = gram_matrix[offset] - mean_x[rid] * mean_y[cid];
+}
+
+// Counts how often each column index occurs: for every i < nnz, genes[indices[i]]
+// is incremented atomically by one. `genes` is only incremented, never reset.
+//
+// Launch layout: 1D grid of 1D blocks with at least nnz threads in total.
+//
+// Declared `static` because this is a non-template definition inside a header:
+// with external linkage, including the header from a second translation unit
+// would produce duplicate-symbol link errors.
+static __global__ void check_zero_genes_kernel(const int* indices, int* genes, int nnz) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= nnz) return;
+  atomicAdd(&genes[indices[i]], 1);
+}
@@ -0,0 +1,88 @@
+#include <cuda_runtime.h>
+#include <nanobind/nanobind.h>
+
+#include <cstdint>
+#include <stdexcept>
+
+#include "kernels_spca.cuh"
+
+namespace nb = nanobind;
+
+// Launches gram_csr_upper_kernel<T> with one 128-thread block per CSR row.
+//
+// All *_ptr arguments are raw addresses passed as integers and reinterpreted
+// as pointers handed to the kernel; indptr/index are read as `int`, data/out
+// as `T`. The launch uses no explicit stream argument.
+//
+// Throws std::runtime_error if the launch is rejected by the CUDA runtime.
+template <typename T>
+static inline void launch_gram_csr_upper(std::uintptr_t indptr_ptr, std::uintptr_t index_ptr,
+                                         std::uintptr_t data_ptr, int nrows, int ncols,
+                                         std::uintptr_t out_ptr) {
+  // A grid dimension of 0 is an invalid launch configuration; with no rows
+  // there is nothing to accumulate, so skip the launch entirely.
+  if (nrows <= 0) return;
+
+  const dim3 block(128);
+  const dim3 grid(nrows);
+  const int* indptr = reinterpret_cast<const int*>(indptr_ptr);
+  const int* index = reinterpret_cast<const int*>(index_ptr);
+  const T* data = reinterpret_cast<const T*>(data_ptr);
+  T* out = reinterpret_cast<T*>(out_ptr);
+  gram_csr_upper_kernel<T><<<grid, block>>>(indptr, index, data, nrows, ncols, out);
+
+  // Kernel launches do not return a status; launch failures are only visible
+  // through cudaGetLastError().
+  const cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
+}
+
+// Launches copy_upper_to_lower_kernel<T> over an ncols x ncols matrix using
+// 32x32 thread blocks and a grid rounded up to cover every element.
+//
+// out_ptr is a raw address passed as an integer and reinterpreted as `T*`.
+// The launch uses no explicit stream argument.
+//
+// Throws std::runtime_error if the launch is rejected by the CUDA runtime.
+template <typename T>
+static inline void launch_copy_upper_to_lower(std::uintptr_t out_ptr, int ncols) {
+  // An empty matrix would yield a 0x0 grid, which is an invalid launch
+  // configuration; there is nothing to mirror in that case.
+  if (ncols <= 0) return;
+
+  const dim3 block(32, 32);
+  const dim3 grid((ncols + block.x - 1) / block.x, (ncols + block.y - 1) / block.y);
+  T* out = reinterpret_cast<T*>(out_ptr);
+  copy_upper_to_lower_kernel<T><<<grid, block>>>(out, ncols);
+
+  // Kernel launches do not return a status; launch failures are only visible
+  // through cudaGetLastError().
+  const cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
+}
+
+// Launches cov_from_gram_kernel<T> over an ncols x ncols problem using 32x32
+// thread blocks and a square grid rounded up to cover every element.
+//
+// All *_ptr arguments are raw addresses passed as integers and reinterpreted
+// as pointers to `T`; cov_ptr is written, the others are only read. The launch
+// uses no explicit stream argument.
+//
+// Throws std::runtime_error if the launch is rejected by the CUDA runtime.
+template <typename T>
+static inline void launch_cov_from_gram(std::uintptr_t cov_ptr, std::uintptr_t gram_ptr,
+                                        std::uintptr_t meanx_ptr, std::uintptr_t meany_ptr,
+                                        int ncols) {
+  // An empty matrix would yield a 0x0 grid, which is an invalid launch
+  // configuration; there is nothing to compute in that case.
+  if (ncols <= 0) return;
+
+  const dim3 block(32, 32);
+  const dim3 grid((ncols + block.x - 1) / block.x, (ncols + block.y - 1) / block.y);
+  T* cov = reinterpret_cast<T*>(cov_ptr);
+  const T* gram = reinterpret_cast<const T*>(gram_ptr);
+  const T* meanx = reinterpret_cast<const T*>(meanx_ptr);
+  const T* meany = reinterpret_cast<const T*>(meany_ptr);
+  cov_from_gram_kernel<T><<<grid, block>>>(cov, gram, meanx, meany, ncols);
+
+  // Kernel launches do not return a status; launch failures are only visible
+  // through cudaGetLastError().
+  const cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
+}
+
+// Launches check_zero_genes_kernel with 32-thread blocks and enough blocks to
+// give every one of the nnz stored indices its own thread.
+//
+// indices_ptr / genes_ptr are raw addresses passed as integers and
+// reinterpreted as `const int*` / `int*`. The launch uses no explicit stream
+// argument.
+//
+// Throws std::runtime_error if the launch is rejected by the CUDA runtime.
+static inline void launch_check_zero_genes(std::uintptr_t indices_ptr, std::uintptr_t genes_ptr,
+                                           int nnz) {
+  // With no stored entries the grid would have 0 blocks, which is an invalid
+  // launch configuration; the counters simply stay as the caller left them.
+  if (nnz <= 0) return;
+
+  const dim3 block(32);
+  const dim3 grid((nnz + block.x - 1) / block.x);
+  const int* indices = reinterpret_cast<const int*>(indices_ptr);
+  int* genes = reinterpret_cast<int*>(genes_ptr);
+  check_zero_genes_kernel<<<grid, block>>>(indices, genes, nnz);
+
+  // Kernel launches do not return a status; launch failures are only visible
+  // through cudaGetLastError().
+  const cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
+}
+
+// Python bindings for the sparse-PCA helpers. Every binding receives raw
+// addresses as integers; for the templated launchers `itemsize` selects the
+// element type (4 -> float, 8 -> double) and any other value raises ValueError.
+NB_MODULE(_spca_cuda, m) {
+  m.def("gram_csr_upper", [](std::uintptr_t indptr, std::uintptr_t index, std::uintptr_t data,
+                             int nrows, int ncols, std::uintptr_t out, int itemsize) {
+    switch (itemsize) {
+      case 4:
+        launch_gram_csr_upper<float>(indptr, index, data, nrows, ncols, out);
+        break;
+      case 8:
+        launch_gram_csr_upper<double>(indptr, index, data, nrows, ncols, out);
+        break;
+      default:
+        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+    }
+  });
+
+  m.def("copy_upper_to_lower", [](std::uintptr_t out, int ncols, int itemsize) {
+    switch (itemsize) {
+      case 4:
+        launch_copy_upper_to_lower<float>(out, ncols);
+        break;
+      case 8:
+        launch_copy_upper_to_lower<double>(out, ncols);
+        break;
+      default:
+        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+    }
+  });
+
+  m.def("cov_from_gram", [](std::uintptr_t cov, std::uintptr_t gram, std::uintptr_t meanx,
+                            std::uintptr_t meany, int ncols, int itemsize) {
+    switch (itemsize) {
+      case 4:
+        launch_cov_from_gram<float>(cov, gram, meanx, meany, ncols);
+        break;
+      case 8:
+        launch_cov_from_gram<double>(cov, gram, meanx, meany, ncols);
+        break;
+      default:
+        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+    }
+  });
+
+  // Integer-only helper: no element-type dispatch is needed here.
+  m.def("check_zero_genes", [](std::uintptr_t indices, std::uintptr_t genes, int nnz) {
+    launch_check_zero_genes(indices, genes, nnz);
+  });
+}
@@ -3,7 +3,11 @@
 import cupy as cp
 import numpy as np
 
-from rapids_singlecell._cuda import _aucell_cuda as _au
+try:
+    from rapids_singlecell._cuda import _aucell_cuda as _au
+except ImportError:
+    _au = None
+
 from rapids_singlecell.decoupler_gpu._helper._docs import docs
 from rapids_singlecell.decoupler_gpu._helper._log import _log
 from rapids_singlecell.decoupler_gpu._helper._Method import Method, MethodMeta
 
@@ -1,62 +1,46 @@
 from __future__ import annotations
 
-import math
 from typing import TYPE_CHECKING
 
 import cupy as cp
 
-from ._kernels._pca_sparse_kernel import _copy_kernel, _cov_kernel
-
 if TYPE_CHECKING:
     from cupyx.scipy.sparse import spmatrix
+try:
+    from rapids_singlecell._cuda import _spca_cuda as _spca
+except ImportError:
+    _spca = None
 
 
-def _copy_gram(gram_matrix, n_cols):
-    """
-    Flips the upper triangle of the gram matrix to the lower triangle. This is necessary because the kernel only computes the upper triangle.
-    """
-    copy_gram = _copy_kernel(gram_matrix.dtype)
-    block = (32, 32)
-    grid = (math.ceil(n_cols / block[0]), math.ceil(n_cols / block[1]))
-    copy_gram(
-        grid,
-        block,
-        (gram_matrix, n_cols),
+def _copy_gram(gram_matrix: cp.ndarray, n_cols: int) -> cp.ndarray:
+    _spca.copy_upper_to_lower(
+        gram_matrix.data.ptr, int(n_cols), int(cp.dtype(gram_matrix.dtype).itemsize)
     )
     return gram_matrix
 
 
-def _compute_cov(cov_result, gram_matrix, mean_x):
-    compute_cov = _cov_kernel(gram_matrix.dtype)
-
-    block_size = (32, 32)
-    grid_size = (math.ceil(gram_matrix.shape[0] / 8),) * 2
-    compute_cov(
-        grid_size,
-        block_size,
-        (cov_result, gram_matrix, mean_x, mean_x, gram_matrix.shape[0]),
+def _compute_cov(
+    cov_result: cp.ndarray, gram_matrix: cp.ndarray, mean_x: cp.ndarray
+) -> cp.ndarray:
+    _spca.cov_from_gram(
+        cov_result.data.ptr,
+        gram_matrix.data.ptr,
+        mean_x.data.ptr,
+        mean_x.data.ptr,
+        int(gram_matrix.shape[0]),
+        int(cp.dtype(gram_matrix.dtype).itemsize),
     )
     return cov_result
 
 
 def _check_matrix_for_zero_genes(X: spmatrix) -> None:
     gene_ex = cp.zeros(X.shape[1], dtype=cp.int32)
-
-    from ._kernels._pca_sparse_kernel import _zero_genes_kernel
-
-    block = (32,)
-    grid = (int(math.ceil(X.nnz / block[0])),)
-    _zero_genes_kernel(
-        grid,
-        block,
-        (
-            X.indices,
-            gene_ex,
-            X.nnz,
-        ),
+    _spca.check_zero_genes(
+        X.indices.data.ptr,
+        gene_ex.data.ptr,
+        int(X.nnz),
     )
     if cp.any(gene_ex == 0):
         raise ValueError(
-            "There are genes with zero expression. "
-            "Please remove them before running PCA."
+            "There are genes with zero expression. Please remove them before running PCA."
         )
@@ -16,6 +16,11 @@
 
 from ._helper import _check_matrix_for_zero_genes, _compute_cov, _copy_gram
 
+try:
+    from rapids_singlecell._cuda import _spca_cuda as _spca
+except ImportError:
+    _spca = None
+
 
 class PCA_sparse:
     def __init__(self, n_components: int | None, *, zero_center: bool = True) -> None:
@@ -199,50 +204,32 @@ def _cov_sparse(
 
 
 def _create_gram_matrix(x):
-    from ._kernels._pca_sparse_kernel import (
-        _gramm_kernel_csr,
-    )
-
     if isinstance(x, csr_matrix):
         gram_matrix = cp.zeros((x.shape[1], x.shape[1]), dtype=x.data.dtype)
-
-        block = (128,)
-        grid = (x.shape[0],)
-        compute_mean_cov = _gramm_kernel_csr(x.dtype)
-        compute_mean_cov(
-            grid,
-            block,
-            (
-                x.indptr,
-                x.indices,
-                x.data,
-                x.shape[0],
-                x.shape[1],
-                gram_matrix,
-            ),
+        _spca.gram_csr_upper(
+            x.indptr.data.ptr,
+            x.indices.data.ptr,
+            x.data.data.ptr,
+            int(x.shape[0]),
+            int(x.shape[1]),
+            gram_matrix.data.ptr,
+            int(cp.dtype(x.dtype).itemsize),
         )
     elif isinstance(x, DaskArray):
-        compute_mean_cov = _gramm_kernel_csr(x.dtype)
-        compute_mean_cov.compile()
         n_cols = x.shape[1]
         if isinstance(x._meta, csr_matrix):
             # Gram matrix for CSR matrix
             def __gram_block(x_part):
                 gram_matrix = cp.zeros((n_cols, n_cols), dtype=x.dtype)
 
-                block = (128,)
-                grid = (x_part.shape[0],)
-                compute_mean_cov(
-                    grid,
-                    block,
-                    (
-                        x_part.indptr,
-                        x_part.indices,
-                        x_part.data,
-                        x_part.shape[0],
-                        n_cols,
-                        gram_matrix,
-                    ),
+                _spca.gram_csr_upper(
+                    x_part.indptr.data.ptr,
+                    x_part.indices.data.ptr,
+                    x_part.data.data.ptr,
+                    int(x_part.shape[0]),
+                    int(n_cols),
+                    gram_matrix.data.ptr,
+                    int(cp.dtype(x_part.dtype).itemsize),
                 )
                 return gram_matrix[None, ...]  # need new axis for summing
         else: