11from __future__ import annotations
22
3+ try :
4+ from rapids_singlecell ._cuda import _pv_cuda as _pv
5+ except ImportError :
6+ _pv = None
37import cupy as cp
48import numba as nb
59import numpy as np
610
7- # Reverse cumulative min along the last axis, per row (float64)
8- _rev_cummin64 = cp .RawKernel (
9- r"""
10- extern "C" __global__
11- void rev_cummin64(const double* __restrict__ x,
12- double* __restrict__ y,
13- const int n_rows,
14- const int m)
15- {
16- int r = blockDim.x * blockIdx.x + threadIdx.x;
17- if (r >= n_rows) return;
18-
19- const double* xr = x + (size_t)r * m;
20- double* yr = y + (size_t)r * m;
21-
22- double cur = xr[m - 1];
23- yr[m - 1] = cur;
24-
25- // right -> left
26- for (int j = m - 2; j >= 0; --j) {
27- double v = xr[j];
28- cur = (v < cur) ? v : cur;
29- yr[j] = cur;
30- }
31- }
32- """ ,
33- "rev_cummin64" ,
34- )
11+
12+ def _rev_cummin64 (x , n_rows , m ):
13+ y = cp .empty_like (x )
14+
15+ _pv .rev_cummin64 (x .data .ptr , y .data .ptr , int (n_rows ), int (m ))
16+ return y
3517
3618
3719def fdr_bh_axis1_cupy_optimized (ps , * , mem_gb : float = 4.0 ) -> cp .ndarray :
@@ -78,7 +60,6 @@ def fdr_bh_axis1_cupy_optimized(ps, *, mem_gb: float = 4.0) -> cp.ndarray:
7860
7961 out = cp .empty_like (ps , dtype = cp .float64 )
8062
81- threads = 256 # for the rev_cummin kernel
8263 for s in range (0 , n_rows , B ):
8364 e = min (n_rows , s + B )
8465 R = e - s
@@ -97,9 +78,7 @@ def fdr_bh_axis1_cupy_optimized(ps, *, mem_gb: float = 4.0) -> cp.ndarray:
9778 ps_bh = ps_sorted * scale # (R, m) float64
9879
9980 # 4) reverse cumulative min via custom kernel
100- ps_mon = cp .empty_like (ps_bh )
101- blocks = (R + threads - 1 ) // threads
102- _rev_cummin64 ((blocks ,), (threads ,), (ps_bh , ps_mon , R , m ))
81+ ps_mon = _rev_cummin64 (ps_bh , R , m )
10382
10483 # 5) build inverse permutation without argsort (scatter)
10584 inv_order = cp .empty_like (order , dtype = cp .int32 ) # (R, m) int32
0 commit comments