Skip to content

Commit df69917

Browse files
committed
first draft for 64 bit support
1 parent 51817a0 commit df69917

27 files changed

Lines changed: 1085 additions & 738 deletions

src/rapids_singlecell/_cuda/aggr/aggr.cu

Lines changed: 58 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -8,28 +8,28 @@ using namespace nb::literals;
88
constexpr int BLOCK_SIZE_SPARSE = 64;
99
constexpr int BLOCK_SIZE_DENSE = 256;
1010

11-
template <typename T>
12-
static inline void launch_csr_aggr(const int* indptr, const int* index,
11+
template <typename T, typename IdxT>
12+
static inline void launch_csr_aggr(const IdxT* indptr, const IdxT* index,
1313
const T* data, double* out, const int* cats,
1414
const bool* mask, size_t n_cells,
1515
size_t n_genes, size_t n_groups,
1616
cudaStream_t stream) {
1717
dim3 grid((unsigned)n_cells);
1818
dim3 block(BLOCK_SIZE_SPARSE);
19-
csr_aggr_kernel<T><<<grid, block, 0, stream>>>(
19+
csr_aggr_kernel<T, IdxT><<<grid, block, 0, stream>>>(
2020
indptr, index, data, out, cats, mask, n_cells, n_genes, n_groups);
2121
CUDA_CHECK_LAST_ERROR(csr_aggr_kernel);
2222
}
2323

24-
template <typename T>
25-
static inline void launch_csc_aggr(const int* indptr, const int* index,
24+
template <typename T, typename IdxT>
25+
static inline void launch_csc_aggr(const IdxT* indptr, const IdxT* index,
2626
const T* data, double* out, const int* cats,
2727
const bool* mask, size_t n_cells,
2828
size_t n_genes, size_t n_groups,
2929
cudaStream_t stream) {
3030
dim3 grid((unsigned)n_genes);
3131
dim3 block(BLOCK_SIZE_SPARSE);
32-
csc_aggr_kernel<T><<<grid, block, 0, stream>>>(
32+
csc_aggr_kernel<T, IdxT><<<grid, block, 0, stream>>>(
3333
indptr, index, data, out, cats, mask, n_cells, n_genes, n_groups);
3434
CUDA_CHECK_LAST_ERROR(csc_aggr_kernel);
3535
}
@@ -58,50 +58,51 @@ static inline void launch_dense_aggr_F(const T* data, double* out,
5858
CUDA_CHECK_LAST_ERROR(dense_aggr_kernel_F);
5959
}
6060

61-
template <typename T>
62-
static inline void launch_csr_to_coo(const int* indptr, const int* index,
61+
template <typename T, typename IdxT>
62+
static inline void launch_csr_to_coo(const IdxT* indptr, const IdxT* index,
6363
const T* data, int* row, int* col,
6464
double* ndata, const int* cats,
6565
const bool* mask, int n_cells,
6666
cudaStream_t stream) {
6767
dim3 grid((unsigned)n_cells);
6868
dim3 block(BLOCK_SIZE_SPARSE);
69-
csr_to_coo_kernel<T><<<grid, block, 0, stream>>>(
69+
csr_to_coo_kernel<T, IdxT><<<grid, block, 0, stream>>>(
7070
indptr, index, data, row, col, ndata, cats, mask, n_cells);
7171
CUDA_CHECK_LAST_ERROR(csr_to_coo_kernel);
7272
}
7373

74-
static inline void launch_sparse_var(const int* indptr, const int* index,
74+
template <typename IdxT>
75+
static inline void launch_sparse_var(const IdxT* indptr, const IdxT* index,
7576
double* data, const double* mean_data,
7677
double* n_cells, int dof, int n_groups,
7778
cudaStream_t stream) {
7879
dim3 grid((unsigned)n_groups);
7980
dim3 block(BLOCK_SIZE_SPARSE);
80-
sparse_var_kernel<<<grid, block, 0, stream>>>(
81+
sparse_var_kernel<IdxT><<<grid, block, 0, stream>>>(
8182
indptr, index, data, mean_data, n_cells, dof, n_groups);
8283
CUDA_CHECK_LAST_ERROR(sparse_var_kernel);
8384
}
8485

85-
template <typename T, typename Device>
86+
template <typename T, typename IdxT, typename Device>
8687
void def_sparse_aggr(nb::module_& m) {
8788
m.def(
8889
"sparse_aggr",
89-
[](gpu_array_c<const int, Device> indptr,
90-
gpu_array_c<const int, Device> index,
90+
[](gpu_array_c<const IdxT, Device> indptr,
91+
gpu_array_c<const IdxT, Device> index,
9192
gpu_array_c<const T, Device> data, gpu_array_c<double, Device> out,
9293
gpu_array_c<const int, Device> cats,
9394
gpu_array_c<const bool, Device> mask, size_t n_cells, size_t n_genes,
9495
size_t n_groups, bool is_csc, std::uintptr_t stream) {
9596
if (is_csc) {
96-
launch_csc_aggr<T>(indptr.data(), index.data(), data.data(),
97-
out.data(), cats.data(), mask.data(),
98-
n_cells, n_genes, n_groups,
99-
(cudaStream_t)stream);
97+
launch_csc_aggr<T, IdxT>(indptr.data(), index.data(),
98+
data.data(), out.data(), cats.data(),
99+
mask.data(), n_cells, n_genes,
100+
n_groups, (cudaStream_t)stream);
100101
} else {
101-
launch_csr_aggr<T>(indptr.data(), index.data(), data.data(),
102-
out.data(), cats.data(), mask.data(),
103-
n_cells, n_genes, n_groups,
104-
(cudaStream_t)stream);
102+
launch_csr_aggr<T, IdxT>(indptr.data(), index.data(),
103+
data.data(), out.data(), cats.data(),
104+
mask.data(), n_cells, n_genes,
105+
n_groups, (cudaStream_t)stream);
105106
}
106107
},
107108
"indptr"_a, "index"_a, "data"_a, nb::kw_only(), "out"_a, "cats"_a,
@@ -131,56 +132,66 @@ void def_dense_aggr(nb::module_& m) {
131132
"n_genes"_a, "n_groups"_a, "is_fortran"_a, "stream"_a = 0);
132133
}
133134

134-
template <typename T, typename Device>
135+
template <typename T, typename IdxT, typename Device>
135136
void def_csr_to_coo(nb::module_& m) {
136137
m.def(
137138
"csr_to_coo",
138-
[](gpu_array_c<const int, Device> indptr,
139-
gpu_array_c<const int, Device> index,
139+
[](gpu_array_c<const IdxT, Device> indptr,
140+
gpu_array_c<const IdxT, Device> index,
140141
gpu_array_c<const T, Device> data, gpu_array_c<int, Device> out_row,
141142
gpu_array_c<int, Device> out_col,
142143
gpu_array_c<double, Device> out_data,
143144
gpu_array_c<const int, Device> cats,
144145
gpu_array_c<const bool, Device> mask, int n_cells,
145146
std::uintptr_t stream) {
146-
launch_csr_to_coo<T>(indptr.data(), index.data(), data.data(),
147-
out_row.data(), out_col.data(),
148-
out_data.data(), cats.data(), mask.data(),
149-
n_cells, (cudaStream_t)stream);
147+
launch_csr_to_coo<T, IdxT>(
148+
indptr.data(), index.data(), data.data(), out_row.data(),
149+
out_col.data(), out_data.data(), cats.data(), mask.data(),
150+
n_cells, (cudaStream_t)stream);
150151
},
151152
"indptr"_a, "index"_a, "data"_a, nb::kw_only(), "out_row"_a,
152153
"out_col"_a, "out_data"_a, "cats"_a, "mask"_a, "n_cells"_a,
153154
"stream"_a = 0);
154155
}
155156

157+
template <typename IdxT, typename Device>
158+
void def_sparse_var(nb::module_& m) {
159+
m.def(
160+
"sparse_var",
161+
[](gpu_array_c<const IdxT, Device> indptr,
162+
gpu_array_c<const IdxT, Device> index,
163+
gpu_array_c<double, Device> data,
164+
gpu_array_c<const double, Device> means,
165+
gpu_array_c<double, Device> n_cells, int dof, int n_groups,
166+
std::uintptr_t stream) {
167+
launch_sparse_var<IdxT>(indptr.data(), index.data(), data.data(),
168+
means.data(), n_cells.data(), dof, n_groups,
169+
(cudaStream_t)stream);
170+
},
171+
"indptr"_a, "index"_a, "data"_a, nb::kw_only(), "means"_a, "n_cells"_a,
172+
"dof"_a, "n_groups"_a, "stream"_a = 0);
173+
}
174+
156175
template <typename Device>
157176
void register_bindings(nb::module_& m) {
158-
def_sparse_aggr<float, Device>(m);
159-
def_sparse_aggr<double, Device>(m);
177+
def_sparse_aggr<float, int, Device>(m);
178+
def_sparse_aggr<float, long long, Device>(m);
179+
def_sparse_aggr<double, int, Device>(m);
180+
def_sparse_aggr<double, long long, Device>(m);
160181

161182
// F-order must come before C-order for proper dispatch
162183
def_dense_aggr<float, nb::f_contig, Device>(m);
163184
def_dense_aggr<float, nb::c_contig, Device>(m);
164185
def_dense_aggr<double, nb::f_contig, Device>(m);
165186
def_dense_aggr<double, nb::c_contig, Device>(m);
166187

167-
def_csr_to_coo<float, Device>(m);
168-
def_csr_to_coo<double, Device>(m);
188+
def_csr_to_coo<float, int, Device>(m);
189+
def_csr_to_coo<float, long long, Device>(m);
190+
def_csr_to_coo<double, int, Device>(m);
191+
def_csr_to_coo<double, long long, Device>(m);
169192

170-
m.def(
171-
"sparse_var",
172-
[](gpu_array_c<const int, Device> indptr,
173-
gpu_array_c<const int, Device> index,
174-
gpu_array_c<double, Device> data,
175-
gpu_array_c<const double, Device> means,
176-
gpu_array_c<double, Device> n_cells, int dof, int n_groups,
177-
std::uintptr_t stream) {
178-
launch_sparse_var(indptr.data(), index.data(), data.data(),
179-
means.data(), n_cells.data(), dof, n_groups,
180-
(cudaStream_t)stream);
181-
},
182-
"indptr"_a, "index"_a, "data"_a, nb::kw_only(), "means"_a, "n_cells"_a,
183-
"dof"_a, "n_groups"_a, "stream"_a = 0);
193+
def_sparse_var<int, Device>(m);
194+
def_sparse_var<long long, Device>(m);
184195
}
185196

186197
NB_MODULE(_aggr_cuda, m) {

src/rapids_singlecell/_cuda/aggr/kernels_aggr.cuh

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
#include <cuda_runtime.h>
44

55
// sparse -> dense aggregate (CSR by cells), mask per cell, cats per cell
6-
template <typename T>
7-
__global__ void csr_aggr_kernel(const int* __restrict__ indptr,
8-
const int* __restrict__ index,
6+
template <typename T, typename IdxT>
7+
__global__ void csr_aggr_kernel(const IdxT* __restrict__ indptr,
8+
const IdxT* __restrict__ index,
99
const T* __restrict__ data,
1010
double* __restrict__ out,
1111
const int* __restrict__ cats,
1212
const bool* __restrict__ mask, size_t n_cells,
1313
size_t n_genes, size_t n_groups) {
1414
size_t cell = blockIdx.x;
1515
if (cell >= n_cells || !mask[cell]) return;
16-
int cell_start = indptr[cell];
17-
int cell_end = indptr[cell + 1];
16+
IdxT cell_start = indptr[cell];
17+
IdxT cell_end = indptr[cell + 1];
1818
size_t group = static_cast<size_t>(cats[cell]);
19-
for (int p = cell_start + threadIdx.x; p < cell_end; p += blockDim.x) {
19+
for (IdxT p = cell_start + threadIdx.x; p < cell_end; p += blockDim.x) {
2020
size_t gene_pos = static_cast<size_t>(index[p]);
2121
double v = static_cast<double>(data[p]);
2222
atomicAdd(&out[group * n_genes + gene_pos], v);
@@ -27,19 +27,19 @@ __global__ void csr_aggr_kernel(const int* __restrict__ indptr,
2727
}
2828

2929
// sparse -> dense aggregate (CSC by genes), mask per cell, cats per cell
30-
template <typename T>
31-
__global__ void csc_aggr_kernel(const int* __restrict__ indptr,
32-
const int* __restrict__ index,
30+
template <typename T, typename IdxT>
31+
__global__ void csc_aggr_kernel(const IdxT* __restrict__ indptr,
32+
const IdxT* __restrict__ index,
3333
const T* __restrict__ data,
3434
double* __restrict__ out,
3535
const int* __restrict__ cats,
3636
const bool* __restrict__ mask, size_t n_cells,
3737
size_t n_genes, size_t n_groups) {
3838
size_t gene = blockIdx.x;
3939
if (gene >= n_genes) return;
40-
int gene_start = indptr[gene];
41-
int gene_end = indptr[gene + 1];
42-
for (int p = gene_start + threadIdx.x; p < gene_end; p += blockDim.x) {
40+
IdxT gene_start = indptr[gene];
41+
IdxT gene_end = indptr[gene + 1];
42+
for (IdxT p = gene_start + threadIdx.x; p < gene_end; p += blockDim.x) {
4343
size_t cell = static_cast<size_t>(index[p]);
4444
if (!mask[cell]) continue;
4545
size_t group = static_cast<size_t>(cats[cell]);
@@ -52,41 +52,42 @@ __global__ void csc_aggr_kernel(const int* __restrict__ indptr,
5252

5353
// sparse -> sparse copy (CSR by cells) row/col/value from one to another by
5454
// cats/mask
55-
template <typename T>
56-
__global__ void csr_to_coo_kernel(const int* __restrict__ indptr,
57-
const int* __restrict__ index,
55+
template <typename T, typename IdxT>
56+
__global__ void csr_to_coo_kernel(const IdxT* __restrict__ indptr,
57+
const IdxT* __restrict__ index,
5858
const T* __restrict__ data,
5959
int* __restrict__ row, int* __restrict__ col,
6060
double* __restrict__ ndata,
6161
const int* __restrict__ cats,
6262
const bool* __restrict__ mask, int n_cells) {
6363
int cell = blockIdx.x;
6464
if (cell >= n_cells || !mask[cell]) return;
65-
int start = indptr[cell];
66-
int end = indptr[cell + 1];
65+
IdxT start = indptr[cell];
66+
IdxT end = indptr[cell + 1];
6767
int group = cats[cell];
68-
for (int p = start + threadIdx.x; p < end; p += blockDim.x) {
69-
int g = index[p];
68+
for (IdxT p = start + threadIdx.x; p < end; p += blockDim.x) {
69+
int g = static_cast<int>(index[p]);
7070
ndata[p] = static_cast<double>(data[p]);
7171
row[p] = group;
7272
col[p] = g;
7373
}
7474
}
7575

7676
// variance adjust per group (CSR-like segment)
77-
__global__ void sparse_var_kernel(const int* __restrict__ indptr,
78-
const int* __restrict__ index,
77+
template <typename IdxT>
78+
__global__ void sparse_var_kernel(const IdxT* __restrict__ indptr,
79+
const IdxT* __restrict__ index,
7980
double* __restrict__ data,
8081
const double* __restrict__ mean_data,
8182
double* __restrict__ n_cells, int dof,
8283
int n_groups) {
8384
int group = blockIdx.x;
8485
if (group >= n_groups) return;
85-
int start = indptr[group];
86-
int end = indptr[group + 1];
86+
IdxT start = indptr[group];
87+
IdxT end = indptr[group + 1];
8788
double doffer =
8889
n_cells[group] / (n_cells[group] - static_cast<double>(dof));
89-
for (int p = start + threadIdx.x; p < end; p += blockDim.x) {
90+
for (IdxT p = start + threadIdx.x; p < end; p += blockDim.x) {
9091
double var = data[p];
9192
double mean_sq = mean_data[p] * mean_data[p];
9293
var = var - mean_sq;

src/rapids_singlecell/_cuda/autocorr/autocorr.cu

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,16 @@ static inline void launch_gearys_sparse(
6464
CUDA_CHECK_LAST_ERROR(gearys_C_num_sparse_kernel);
6565
}
6666

67-
template <typename T>
68-
static inline void launch_pre_den_sparse(const int* data_col_ind,
69-
const T* data_values, int nnz,
67+
template <typename T, typename IdxT>
68+
static inline void launch_pre_den_sparse(const IdxT* data_col_ind,
69+
const T* data_values, long long nnz,
7070
const T* mean_array, T* den,
7171
int* counter, cudaStream_t stream) {
7272
dim3 block(ELEMENTWISE_BLOCK_SIZE);
73-
dim3 grid((nnz + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE);
74-
pre_den_sparse_kernel<<<grid, block, 0, stream>>>(
73+
long long grid_size =
74+
(nnz + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE;
75+
dim3 grid(grid_size);
76+
pre_den_sparse_kernel<T, IdxT><<<grid, block, 0, stream>>>(
7577
data_col_ind, data_values, nnz, mean_array, den, counter);
7678
CUDA_CHECK_LAST_ERROR(pre_den_sparse_kernel);
7779
}
@@ -220,31 +222,59 @@ void register_bindings(nb::module_& m) {
220222
"data_row_ptr"_a, "data_col_ind"_a, "data_values"_a, "n_samples"_a,
221223
"n_features"_a, "num"_a, "stream"_a = 0);
222224

223-
// pre_den_sparse - float32
225+
// pre_den_sparse - float32, int32
224226
m.def(
225227
"pre_den_sparse",
226228
[](gpu_array_c<const int, Device> data_col_ind,
227-
gpu_array_c<const float, Device> data_values, int nnz,
229+
gpu_array_c<const float, Device> data_values, long long nnz,
228230
gpu_array_c<const float, Device> mean_array,
229231
gpu_array_c<float, Device> den, gpu_array_c<int, Device> counter,
230232
std::uintptr_t stream) {
231-
launch_pre_den_sparse(data_col_ind.data(), data_values.data(), nnz,
232-
mean_array.data(), den.data(), counter.data(),
233-
(cudaStream_t)stream);
233+
launch_pre_den_sparse<float, int>(
234+
data_col_ind.data(), data_values.data(), nnz, mean_array.data(),
235+
den.data(), counter.data(), (cudaStream_t)stream);
234236
},
235237
"data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
236238
"mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);
237-
// pre_den_sparse - float64
239+
// pre_den_sparse - float64, int32
238240
m.def(
239241
"pre_den_sparse",
240242
[](gpu_array_c<const int, Device> data_col_ind,
241-
gpu_array_c<const double, Device> data_values, int nnz,
243+
gpu_array_c<const double, Device> data_values, long long nnz,
244+
gpu_array_c<const double, Device> mean_array,
245+
gpu_array_c<double, Device> den, gpu_array_c<int, Device> counter,
246+
std::uintptr_t stream) {
247+
launch_pre_den_sparse<double, int>(
248+
data_col_ind.data(), data_values.data(), nnz, mean_array.data(),
249+
den.data(), counter.data(), (cudaStream_t)stream);
250+
},
251+
"data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
252+
"mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);
253+
// pre_den_sparse - float32, int64
254+
m.def(
255+
"pre_den_sparse",
256+
[](gpu_array_c<const long long, Device> data_col_ind,
257+
gpu_array_c<const float, Device> data_values, long long nnz,
258+
gpu_array_c<const float, Device> mean_array,
259+
gpu_array_c<float, Device> den, gpu_array_c<int, Device> counter,
260+
std::uintptr_t stream) {
261+
launch_pre_den_sparse<float, long long>(
262+
data_col_ind.data(), data_values.data(), nnz, mean_array.data(),
263+
den.data(), counter.data(), (cudaStream_t)stream);
264+
},
265+
"data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
266+
"mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);
267+
// pre_den_sparse - float64, int64
268+
m.def(
269+
"pre_den_sparse",
270+
[](gpu_array_c<const long long, Device> data_col_ind,
271+
gpu_array_c<const double, Device> data_values, long long nnz,
242272
gpu_array_c<const double, Device> mean_array,
243273
gpu_array_c<double, Device> den, gpu_array_c<int, Device> counter,
244274
std::uintptr_t stream) {
245-
launch_pre_den_sparse(data_col_ind.data(), data_values.data(), nnz,
246-
mean_array.data(), den.data(), counter.data(),
247-
(cudaStream_t)stream);
275+
launch_pre_den_sparse<double, long long>(
276+
data_col_ind.data(), data_values.data(), nnz, mean_array.data(),
277+
den.data(), counter.data(), (cudaStream_t)stream);
248278
},
249279
"data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
250280
"mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);

0 commit comments

Comments
 (0)