Skip to content

Commit 0685436

Browse files
committed
add pca and make safe docs
1 parent a529a58 commit 0685436

10 files changed

Lines changed: 199 additions & 153 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,5 @@ if (RSC_BUILD_EXTENSIONS)
5454
add_nb_cuda_module(_autocorr_cuda src/rapids_singlecell/_cuda/autocorr/autocorr.cu)
5555
add_nb_cuda_module(_cooc_cuda src/rapids_singlecell/_cuda/cooc/cooc.cu)
5656
add_nb_cuda_module(_aggr_cuda src/rapids_singlecell/_cuda/aggr/aggr.cu)
57+
add_nb_cuda_module(_spca_cuda src/rapids_singlecell/_cuda/spca/spca.cu)
5758
endif()
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
template <typename T>
6+
__global__ void gram_csr_upper_kernel(const int* indptr, const int* index, const T* data, int nrows,
7+
int ncols, T* out) {
8+
int row = blockIdx.x;
9+
int col_offset = threadIdx.x;
10+
if (row >= nrows) return;
11+
12+
int start = indptr[row];
13+
int end = indptr[row + 1];
14+
15+
for (int idx1 = start; idx1 < end; ++idx1) {
16+
int index1 = index[idx1];
17+
T data1 = data[idx1];
18+
for (int idx2 = idx1 + col_offset; idx2 < end; idx2 += blockDim.x) {
19+
int index2 = index[idx2];
20+
T data2 = data[idx2];
21+
atomicAdd(&out[(size_t)index1 * ncols + index2], data1 * data2);
22+
}
23+
}
24+
}
25+
26+
template <typename T>
27+
__global__ void copy_upper_to_lower_kernel(T* output, int ncols) {
28+
int row = blockIdx.y * blockDim.y + threadIdx.y;
29+
int col = blockIdx.x * blockDim.x + threadIdx.x;
30+
if (row >= ncols || col >= ncols) return;
31+
if (row > col) {
32+
output[row * ncols + col] = output[col * ncols + row];
33+
}
34+
}
35+
36+
template <typename T>
37+
__global__ void cov_from_gram_kernel(T* cov_values, const T* gram_matrix, const T* mean_x,
38+
const T* mean_y, int ncols) {
39+
int rid = blockDim.x * blockIdx.x + threadIdx.x;
40+
int cid = blockDim.y * blockIdx.y + threadIdx.y;
41+
if (rid >= ncols || cid >= ncols) return;
42+
cov_values[rid * ncols + cid] = gram_matrix[rid * ncols + cid] - mean_x[rid] * mean_y[cid];
43+
}
44+
45+
__global__ void check_zero_genes_kernel(const int* indices, int* genes, int nnz) {
46+
int value = blockIdx.x * blockDim.x + threadIdx.x;
47+
if (value >= nnz) return;
48+
atomicAdd(&genes[indices[value]], 1);
49+
}
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#include <cuda_runtime.h>
2+
#include <nanobind/nanobind.h>
3+
#include <cstdint>
4+
5+
#include "kernels_spca.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
template <typename T>
10+
static inline void launch_gram_csr_upper(std::uintptr_t indptr_ptr, std::uintptr_t index_ptr,
11+
std::uintptr_t data_ptr, int nrows, int ncols,
12+
std::uintptr_t out_ptr) {
13+
dim3 block(128);
14+
dim3 grid(nrows);
15+
const int* indptr = reinterpret_cast<const int*>(indptr_ptr);
16+
const int* index = reinterpret_cast<const int*>(index_ptr);
17+
const T* data = reinterpret_cast<const T*>(data_ptr);
18+
T* out = reinterpret_cast<T*>(out_ptr);
19+
gram_csr_upper_kernel<T><<<grid, block>>>(indptr, index, data, nrows, ncols, out);
20+
}
21+
22+
template <typename T>
23+
static inline void launch_copy_upper_to_lower(std::uintptr_t out_ptr, int ncols) {
24+
dim3 block(32, 32);
25+
dim3 grid((ncols + block.x - 1) / block.x, (ncols + block.y - 1) / block.y);
26+
T* out = reinterpret_cast<T*>(out_ptr);
27+
copy_upper_to_lower_kernel<T><<<grid, block>>>(out, ncols);
28+
}
29+
30+
template <typename T>
31+
static inline void launch_cov_from_gram(std::uintptr_t cov_ptr, std::uintptr_t gram_ptr,
32+
std::uintptr_t meanx_ptr, std::uintptr_t meany_ptr,
33+
int ncols) {
34+
dim3 block(32, 32);
35+
dim3 grid((ncols + 31) / 32, (ncols + 31) / 32);
36+
T* cov = reinterpret_cast<T*>(cov_ptr);
37+
const T* gram = reinterpret_cast<const T*>(gram_ptr);
38+
const T* meanx = reinterpret_cast<const T*>(meanx_ptr);
39+
const T* meany = reinterpret_cast<const T*>(meany_ptr);
40+
cov_from_gram_kernel<T><<<grid, block>>>(cov, gram, meanx, meany, ncols);
41+
}
42+
43+
static inline void launch_check_zero_genes(std::uintptr_t indices_ptr, std::uintptr_t genes_ptr,
44+
int nnz) {
45+
dim3 block(32);
46+
dim3 grid((nnz + block.x - 1) / block.x);
47+
const int* indices = reinterpret_cast<const int*>(indices_ptr);
48+
int* genes = reinterpret_cast<int*>(genes_ptr);
49+
check_zero_genes_kernel<<<grid, block>>>(indices, genes, nnz);
50+
}
51+
52+
NB_MODULE(_spca_cuda, m) {
53+
m.def("gram_csr_upper", [](std::uintptr_t indptr, std::uintptr_t index, std::uintptr_t data,
54+
int nrows, int ncols, std::uintptr_t out, int itemsize) {
55+
if (itemsize == 4) {
56+
launch_gram_csr_upper<float>(indptr, index, data, nrows, ncols, out);
57+
} else if (itemsize == 8) {
58+
launch_gram_csr_upper<double>(indptr, index, data, nrows, ncols, out);
59+
} else {
60+
throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
61+
}
62+
});
63+
64+
m.def("copy_upper_to_lower", [](std::uintptr_t out, int ncols, int itemsize) {
65+
if (itemsize == 4) {
66+
launch_copy_upper_to_lower<float>(out, ncols);
67+
} else if (itemsize == 8) {
68+
launch_copy_upper_to_lower<double>(out, ncols);
69+
} else {
70+
throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
71+
}
72+
});
73+
74+
m.def("cov_from_gram", [](std::uintptr_t cov, std::uintptr_t gram, std::uintptr_t meanx,
75+
std::uintptr_t meany, int ncols, int itemsize) {
76+
if (itemsize == 4) {
77+
launch_cov_from_gram<float>(cov, gram, meanx, meany, ncols);
78+
} else if (itemsize == 8) {
79+
launch_cov_from_gram<double>(cov, gram, meanx, meany, ncols);
80+
} else {
81+
throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
82+
}
83+
});
84+
85+
m.def("check_zero_genes", [](std::uintptr_t indices, std::uintptr_t genes, int nnz) {
86+
launch_check_zero_genes(indices, genes, nnz);
87+
});
88+
}

src/rapids_singlecell/decoupler_gpu/_method_aucell.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
import cupy as cp
44
import numpy as np
55

6-
from rapids_singlecell._cuda import _aucell_cuda as _au
6+
try:
7+
from rapids_singlecell._cuda import _aucell_cuda as _au
8+
except ImportError:
9+
_au = None
10+
711
from rapids_singlecell.decoupler_gpu._helper._docs import docs
812
from rapids_singlecell.decoupler_gpu._helper._log import _log
913
from rapids_singlecell.decoupler_gpu._helper._Method import Method, MethodMeta
Lines changed: 22 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,46 @@
11
from __future__ import annotations
22

3-
import math
43
from typing import TYPE_CHECKING
54

65
import cupy as cp
76

8-
from ._kernels._pca_sparse_kernel import _copy_kernel, _cov_kernel
9-
107
if TYPE_CHECKING:
118
from cupyx.scipy.sparse import spmatrix
9+
try:
10+
from rapids_singlecell._cuda import _spca_cuda as _spca
11+
except ImportError:
12+
_spca = None
1213

1314

14-
def _copy_gram(gram_matrix, n_cols):
15-
"""
16-
Flips the upper triangle of the gram matrix to the lower triangle. This is necessary because the kernel only computes the upper triangle.
17-
"""
18-
copy_gram = _copy_kernel(gram_matrix.dtype)
19-
block = (32, 32)
20-
grid = (math.ceil(n_cols / block[0]), math.ceil(n_cols / block[1]))
21-
copy_gram(
22-
grid,
23-
block,
24-
(gram_matrix, n_cols),
15+
def _copy_gram(gram_matrix: cp.ndarray, n_cols: int) -> cp.ndarray:
16+
_spca.copy_upper_to_lower(
17+
gram_matrix.data.ptr, int(n_cols), int(cp.dtype(gram_matrix.dtype).itemsize)
2518
)
2619
return gram_matrix
2720

2821

29-
def _compute_cov(cov_result, gram_matrix, mean_x):
30-
compute_cov = _cov_kernel(gram_matrix.dtype)
31-
32-
block_size = (32, 32)
33-
grid_size = (math.ceil(gram_matrix.shape[0] / 8),) * 2
34-
compute_cov(
35-
grid_size,
36-
block_size,
37-
(cov_result, gram_matrix, mean_x, mean_x, gram_matrix.shape[0]),
22+
def _compute_cov(
23+
cov_result: cp.ndarray, gram_matrix: cp.ndarray, mean_x: cp.ndarray
24+
) -> cp.ndarray:
25+
_spca.cov_from_gram(
26+
cov_result.data.ptr,
27+
gram_matrix.data.ptr,
28+
mean_x.data.ptr,
29+
mean_x.data.ptr,
30+
int(gram_matrix.shape[0]),
31+
int(cp.dtype(gram_matrix.dtype).itemsize),
3832
)
3933
return cov_result
4034

4135

4236
def _check_matrix_for_zero_genes(X: spmatrix) -> None:
4337
gene_ex = cp.zeros(X.shape[1], dtype=cp.int32)
44-
45-
from ._kernels._pca_sparse_kernel import _zero_genes_kernel
46-
47-
block = (32,)
48-
grid = (int(math.ceil(X.nnz / block[0])),)
49-
_zero_genes_kernel(
50-
grid,
51-
block,
52-
(
53-
X.indices,
54-
gene_ex,
55-
X.nnz,
56-
),
38+
_spca.check_zero_genes(
39+
X.indices.data.ptr,
40+
gene_ex.data.ptr,
41+
int(X.nnz),
5742
)
5843
if cp.any(gene_ex == 0):
5944
raise ValueError(
60-
"There are genes with zero expression. "
61-
"Please remove them before running PCA."
45+
"There are genes with zero expression. Please remove them before running PCA."
6246
)

src/rapids_singlecell/preprocessing/_sparse_pca/_kernels/_pca_sparse_kernel.py

Lines changed: 0 additions & 77 deletions
This file was deleted.

src/rapids_singlecell/preprocessing/_sparse_pca/_sparse_pca.py

Lines changed: 21 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@
1616

1717
from ._helper import _check_matrix_for_zero_genes, _compute_cov, _copy_gram
1818

19+
try:
20+
from rapids_singlecell._cuda import _spca_cuda as _spca
21+
except ImportError:
22+
_spca = None
23+
1924

2025
class PCA_sparse:
2126
def __init__(self, n_components: int | None, *, zero_center: bool = True) -> None:
@@ -199,50 +204,32 @@ def _cov_sparse(
199204

200205

201206
def _create_gram_matrix(x):
202-
from ._kernels._pca_sparse_kernel import (
203-
_gramm_kernel_csr,
204-
)
205-
206207
if isinstance(x, csr_matrix):
207208
gram_matrix = cp.zeros((x.shape[1], x.shape[1]), dtype=x.data.dtype)
208-
209-
block = (128,)
210-
grid = (x.shape[0],)
211-
compute_mean_cov = _gramm_kernel_csr(x.dtype)
212-
compute_mean_cov(
213-
grid,
214-
block,
215-
(
216-
x.indptr,
217-
x.indices,
218-
x.data,
219-
x.shape[0],
220-
x.shape[1],
221-
gram_matrix,
222-
),
209+
_spca.gram_csr_upper(
210+
x.indptr.data.ptr,
211+
x.indices.data.ptr,
212+
x.data.data.ptr,
213+
int(x.shape[0]),
214+
int(x.shape[1]),
215+
gram_matrix.data.ptr,
216+
int(cp.dtype(x.dtype).itemsize),
223217
)
224218
elif isinstance(x, DaskArray):
225-
compute_mean_cov = _gramm_kernel_csr(x.dtype)
226-
compute_mean_cov.compile()
227219
n_cols = x.shape[1]
228220
if isinstance(x._meta, csr_matrix):
229221
# Gram matrix for CSR matrix
230222
def __gram_block(x_part):
231223
gram_matrix = cp.zeros((n_cols, n_cols), dtype=x.dtype)
232224

233-
block = (128,)
234-
grid = (x_part.shape[0],)
235-
compute_mean_cov(
236-
grid,
237-
block,
238-
(
239-
x_part.indptr,
240-
x_part.indices,
241-
x_part.data,
242-
x_part.shape[0],
243-
n_cols,
244-
gram_matrix,
245-
),
225+
_spca.gram_csr_upper(
226+
x_part.indptr.data.ptr,
227+
x_part.indices.data.ptr,
228+
x_part.data.data.ptr,
229+
int(x_part.shape[0]),
230+
int(n_cols),
231+
gram_matrix.data.ptr,
232+
int(cp.dtype(x_part.dtype).itemsize),
246233
)
247234
return gram_matrix[None, ...] # need new axis for summing
248235
else:

0 commit comments

Comments
 (0)